Skip to content

Commit

Permalink
Write a virtual machine for C++ regexes (#502)
Browse files Browse the repository at this point in the history
The C++ standard library comes with a very slow engine for regular
expressions in most implementations. For example, it seems that the regex
engine shipped with the GNU C++ standard library runs in exponential time;
see this bug report:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93502

This is impractical for most real-world inputs. Therefore, we implement
a virtual machine for matching regular expressions which runs in ``O(m *
n)`` time (where ``m`` is the number of instructions in the regular
expression and ``n`` is the length of the input).

The original approach has been described in:
Thompson, K., "Regular expression search algorithm", Communications of the ACM 11(6), 1968

while we follow this blog post:
https://swtch.com/~rsc/regexp/regexp2.html
  • Loading branch information
mristin authored Jun 20, 2024
1 parent 1a50e13 commit a2f329f
Show file tree
Hide file tree
Showing 73 changed files with 75,798 additions and 2,285 deletions.
82 changes: 77 additions & 5 deletions aas_core_codegen/cpp/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,13 @@ def wstring_literal(text: str) -> Stripped:
elif code_point < 32:
# Non-printable ASCII characters
escaped.append(f"\\x{ord(character):x}")
elif 255 < code_point < 65536:
elif code_point <= 127:
# ASCII
escaped.append(character)
elif 127 < code_point < 255:
# Above ASCII, but can be encoded as a single byte
escaped.append(f"\\x{ord(character):x}")
elif 255 <= code_point < 65536:
# Above ASCII
escaped.append(f"\\u{ord(character):04x}")
elif code_point >= 65536:
Expand All @@ -79,6 +85,64 @@ def wstring_literal(text: str) -> Stripped:
return Stripped('L"{}"'.format("".join(escaped)))


@require(lambda character: len(character) == 1)
def wchar_literal(character: str) -> Stripped:
    """
    Render the single ``character`` as a C++ wide character literal.

    Characters with a dedicated C++ escape sequence, the single quote and
    the backslash are written with their named escapes. The remaining code
    points are written verbatim, as a hexadecimal escape, as a universal
    character name or, for surrogate points, as a cast from the numeric value.

    :param character: exactly one character to be represented in C++
    :return: C++ expression denoting the wide character
    """
    # Characters for which C++ provides a dedicated escape sequence, plus
    # the two characters which must be escaped inside a character literal.
    named_escapes = {
        "\a": "L'\\a'",
        "\b": "L'\\b'",
        "\f": "L'\\f'",
        "\n": "L'\\n'",
        "\r": "L'\\r'",
        "\t": "L'\\t'",
        "\v": "L'\\v'",
        "'": "L'\\''",
        "\\": "L'\\\\'",
    }

    code_point = ord(character)

    named = named_escapes.get(character)
    if named is not None:
        return Stripped(named)

    if code_point < 32:
        # Remaining non-printable ASCII characters
        return Stripped(f"L'\\x{code_point:x}'")

    if code_point <= 127:
        # ASCII, written as-is
        return Stripped(f"L'{character}'")

    if code_point < 255:
        # Above ASCII, but can be encoded as a single byte
        return Stripped(f"L'\\x{code_point:x}'")

    # NOTE (mristin):
    # The surrogate points lie within the range of the universal character
    # names handled below, so this check has to come first. Mind this order
    # if you ever convert these checks into a mapping or pattern matching.
    if 0xD800 <= code_point <= 0xDFFF:
        # The surrogate points can not be represented as wide character
        # literals directly as common compilers such as MSVC++ will complain.
        # We deliberately want to model the surrogate point, so we fool
        # the compiler with a cast from the numeric value.
        return Stripped(f"static_cast<wchar_t>(0x{code_point:04x})")

    if code_point < 65536:
        # Within the Unicode Basic Multilingual Plane
        return Stripped(f"L'\\u{code_point:04x}'")

    # Above the Unicode Basic Multilingual Plane
    return Stripped(f"L'\\U{code_point:08x}'")


# fmt: off
# NOTE (mristin, 2023-09-24):
# We use a pre-condition here to simplify the client code. The client must check
Expand All @@ -87,7 +151,7 @@ def wstring_literal(text: str) -> Stripped:
@require(
lambda text:
all(
ord(character) <= 255
ord(character) <= 127
for character in text
),
"Only ASCII text can be converted to a C++ string literal, otherwise encoding "
Expand Down Expand Up @@ -124,7 +188,7 @@ def string_literal(text: str) -> Stripped:
elif code_point < 32:
# Non-printable ASCII characters
escaped.append(f"\\x{ord(character):x}")
elif code_point <= 255:
elif code_point <= 127:
escaped.append(character)
else:
# Above ASCII
Expand Down Expand Up @@ -159,14 +223,20 @@ def needs_escaping(text: str) -> bool:
elif code_point < 32:
# Non-printable ASCII characters
return True
elif 255 < code_point < 65536:
elif code_point <= 127:
# ASCII
continue
elif 127 < code_point < 255:
# Above ASCII, but can be encoded as a single byte
return True
elif 255 <= code_point < 65536:
# Above ASCII
return True
elif code_point >= 65536:
# Above Unicode Basic Multilingual Plane
return True
else:
pass
raise AssertionError(f"Unexpected unhandled character: {character!r}")

return False

Expand Down Expand Up @@ -286,6 +356,8 @@ def _assert_all_primitive_types_are_mapped() -> None:
VERIFICATION_NAMESPACE = Identifier("verification")
JSONIZATION_NAMESPACE = Identifier("jsonization")
XMLIZATION_NAMESPACE = Identifier("xmlization")
REVM_NAMESPACE = Identifier("revm")
PATTERN_NAMESPACE = Identifier("pattern")


def generate_primitive_type(primitive_type: intermediate.PrimitiveType) -> Stripped:
Expand Down
81 changes: 81 additions & 0 deletions aas_core_codegen/cpp/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
enhancing as cpp_enhancing,
iteration as cpp_iteration,
jsonization as cpp_jsonization,
pattern as cpp_pattern,
revm as cpp_revm,
stringification as cpp_stringification,
structure as cpp_structure,
verification as cpp_verification,
Expand Down Expand Up @@ -320,6 +322,85 @@ def execute(context: run.Context, stdout: TextIO, stderr: TextIO) -> int:
return 1
# endregion

# region Pattern
code = cpp_pattern.generate_header(
symbol_table=context.symbol_table, library_namespace=namespace
)

pth = context.output_dir / "pattern.hpp"
try:
pth.write_text(code, encoding="utf-8")
except Exception as exception:
run.write_error_report(
message=(f"Failed to write the header for the C++ pattern code to {pth}"),
errors=[str(exception)],
stderr=stderr,
)
return 1

code, errors = cpp_pattern.generate_implementation(
symbol_table=context.symbol_table,
library_namespace=namespace,
)

if errors is not None:
run.write_error_report(
message=(
f"Failed to generate the implementation of the C++ pattern code "
f"based on {context.model_path}"
),
errors=[context.lineno_columner.error_message(error) for error in errors],
stderr=stderr,
)
return 1
assert code is not None

pth = context.output_dir / "pattern.cpp"
try:
pth.write_text(code, encoding="utf-8")
except Exception as exception:
run.write_error_report(
message=(
f"Failed to write the implementation of the C++ pattern code to {pth}"
),
errors=[str(exception)],
stderr=stderr,
)
return 1
# endregion

# region REVM
code = cpp_revm.generate_header(library_namespace=namespace)

pth = context.output_dir / "revm.hpp"
try:
pth.write_text(code, encoding="utf-8")
except Exception as exception:
run.write_error_report(
message=(f"Failed to write the header for the C++ REVM code to {pth}"),
errors=[str(exception)],
stderr=stderr,
)
return 1

code = cpp_revm.generate_implementation(
library_namespace=namespace,
)

pth = context.output_dir / "revm.cpp"
try:
pth.write_text(code, encoding="utf-8")
except Exception as exception:
run.write_error_report(
message=(
f"Failed to write the implementation of the C++ REVM code to {pth}"
),
errors=[str(exception)],
stderr=stderr,
)
return 1
# endregion

# region Stringification
code = cpp_stringification.generate_header(
symbol_table=context.symbol_table, library_namespace=namespace
Expand Down
6 changes: 6 additions & 0 deletions aas_core_codegen/cpp/pattern/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"""Generate instructions for a virtual machine for matching regular expressions."""

from aas_core_codegen.cpp.pattern import _generate

# Re-export the entry points of the private ``_generate`` module so that
# they are available directly on the package.
generate_header = _generate.generate_header
generate_implementation = _generate.generate_implementation
Loading

0 comments on commit a2f329f

Please sign in to comment.