From 9f85a883e4809152ea4016105f81755c569a34c3 Mon Sep 17 00:00:00 2001
From: Bingran Hu <bingranhu98@gmail.com>
Date: Wed, 24 Jul 2024 13:49:15 -0400
Subject: [PATCH] regex-utils: Add support for handling escaped regex
 metacharacters. (#487)

---
 components/core/.clang-format                 |  2 +-
 .../core/src/clp/regex_utils/ErrorCode.cpp    |  4 ++
 .../core/src/clp/regex_utils/ErrorCode.hpp    |  1 +
 .../core/src/clp/regex_utils/constants.hpp    | 30 +++++++++
 .../regex_utils/regex_translation_utils.cpp   | 67 ++++++++++++++-----
 components/core/tests/test-regex_utils.cpp    | 30 +++++++--
 .../dev-guide/components-core/regex-utils.md  | 15 +++++
 7 files changed, 125 insertions(+), 24 deletions(-)
diff --git a/components/core/.clang-format b/components/core/.clang-format
index c8e66579c..35934f594 100644
--- a/components/core/.clang-format
+++ b/components/core/.clang-format
@@ -75,7 +75,7 @@ IncludeCategories:
   # Library headers. Update when adding new libraries.
   # NOTE: clang-format retains leading white-space on a line in violation of the YAML spec.
   - Regex: "<(absl|antlr4|archive|boost|bsoncxx|catch2|curl|date|fmt|json|log_surgeon|mariadb\
-|mongocxx|msgpack|outcome|simdjson|spdlog|sqlite3|string_utils|yaml-cpp|zstd)"
+|mongocxx|msgpack|outcome|regex_utils|simdjson|spdlog|sqlite3|string_utils|yaml-cpp|zstd)"
     Priority: 3
   # C system headers
   - Regex: "^<.+\\.h>"
diff --git a/components/core/src/clp/regex_utils/ErrorCode.cpp b/components/core/src/clp/regex_utils/ErrorCode.cpp
index 5779263e8..112ede242 100644
--- a/components/core/src/clp/regex_utils/ErrorCode.cpp
+++ b/components/core/src/clp/regex_utils/ErrorCode.cpp
@@ -65,6 +65,10 @@ auto ErrorCodeCategory::message(int ev) const -> string {
         case ErrorCode::IllegalDollarSign:
             return "Failed to translate due to end anchor `$` in the middle of the string.";
 
+        case ErrorCode::IllegalEscapeSequence:
+            return "Currently only supports escape sequences that are used to suppress special "
+                   "meanings of regex metacharacters. Alphanumeric characters are disallowed.";
+
         case ErrorCode::UnmatchedParenthesis:
             return "Unmatched opening `(` or closing `)`.";
 
diff --git a/components/core/src/clp/regex_utils/ErrorCode.hpp b/components/core/src/clp/regex_utils/ErrorCode.hpp
index 1babb2fec..77a52cf58 100644
--- a/components/core/src/clp/regex_utils/ErrorCode.hpp
+++ b/components/core/src/clp/regex_utils/ErrorCode.hpp
@@ -19,6 +19,7 @@ enum class ErrorCode : uint8_t {
     UnsupportedPipe,
     IllegalCaret,
     IllegalDollarSign,
+    IllegalEscapeSequence,
     UnmatchedParenthesis,
 };
 
diff --git a/components/core/src/clp/regex_utils/constants.hpp b/components/core/src/clp/regex_utils/constants.hpp
index 879e7641d..9833543fc 100644
--- a/components/core/src/clp/regex_utils/constants.hpp
+++ b/components/core/src/clp/regex_utils/constants.hpp
@@ -1,7 +1,29 @@
 #ifndef CLP_REGEX_UTILS_CONSTANTS_HPP
 #define CLP_REGEX_UTILS_CONSTANTS_HPP
 
+#include <array>
+#include <cstddef>
+#include <string_view>
+
 namespace clp::regex_utils {
+constexpr size_t cCharBitarraySize = 128;
+
+/**
+ * Creates an ASCII character lookup table at compile time.
+ *
+ * @param char_str The characters to mark in the table. Each must be an ASCII character (code
+ * below `cCharBitarraySize`); anything else makes `at()` throw, which fails constant evaluation.
+ * @return The lookup table, indexed by character code, with only the given characters set.
+ */
+[[nodiscard]] constexpr auto create_char_bit_array(std::string_view char_str
+) -> std::array<bool, cCharBitarraySize> {
+    std::array<bool, cCharBitarraySize> bit_array{};  // Value-initialized: every entry is false.
+    for (auto const ch : char_str) {
+        bit_array.at(static_cast<unsigned char>(ch)) = true;
+    }
+    return bit_array;
+}
+
 // Wildcard meta characters
 constexpr char cZeroOrMoreCharsWildcard{'*'};
 constexpr char cSingleCharWildcard{'?'};
@@ -14,6 +36,14 @@ constexpr char cRegexStartAnchor{'^'};
 constexpr char cRegexEndAnchor{'$'};
 constexpr char cEscapeChar{'\\'};
 constexpr char cCharsetNegate{'^'};
+
+// Character bitmaps
+// The set of regex metacharacters that can be preceded with an escape backslash to be treated as a
+// literal.
+constexpr auto cRegexEscapeSeqMetaCharsLut = create_char_bit_array("*+?|^$.{}[]()<>-_/=!\\");
+// The set of wildcard metacharacters that must remain escaped in the translated string to be
+// treated as a literal.
+constexpr auto cWildcardMetaCharsLut = create_char_bit_array("?*\\");
 }  // namespace clp::regex_utils
 
 #endif  // CLP_REGEX_UTILS_CONSTANTS_HPP
diff --git a/components/core/src/clp/regex_utils/regex_translation_utils.cpp b/components/core/src/clp/regex_utils/regex_translation_utils.cpp
index 349c106f4..f26d70521 100644
--- a/components/core/src/clp/regex_utils/regex_translation_utils.cpp
+++ b/components/core/src/clp/regex_utils/regex_translation_utils.cpp
@@ -27,17 +27,19 @@ class TranslatorState {
      *
      * This list may be expanded as the translator supports translating more regex patterns.
      * <ul>
-     *   <li>NORMAL: The initial state, where characters have no special meanings and are treated
+     *   <li>Normal: The initial state, where characters have no special meanings and are treated
      * literally.</li>
-     *   <li>DOT: Encountered a period `.`. Expecting wildcard expression.</li>
-     *   <li>END: Encountered a dollar sign `$`, meaning the regex string has reached the end
+     *   <li>Dot: Encountered a period `.`. Expecting wildcard expression.</li>
+     *   <li>Escaped: Encountered a backslash `\`. Expecting an escape sequence.</li>
+     *   <li>End: Encountered a dollar sign `$`, meaning the regex string has reached the end
      * anchor.</li>
      * </ul>
      */
     enum class RegexPatternState : uint8_t {
-        NORMAL = 0,
-        DOT,
-        END,
+        Normal = 0,
+        Dot,
+        Escaped,
+        End,
     };
 
     // Constructor
@@ -51,7 +53,7 @@ class TranslatorState {
 
 private:
     // Members
-    RegexPatternState m_state{RegexPatternState::NORMAL};
+    RegexPatternState m_state{RegexPatternState::Normal};
 };
 
 /**
@@ -92,13 +94,22 @@ using StateTransitionFuncSig
  */
 [[nodiscard]] StateTransitionFuncSig dot_state_transition;
 
+/**
+ * Appends an escaped regex metacharacter as a literal character to the wildcard string by
+ * discarding its preceding backslash.
+ *
+ * The preceding backslash must be kept for characters that also have special meanings in the
+ * wildcard syntax, e.g. `abc.\*xyz` should be translated into `abc?\*xyz` instead of `abc?*xyz`.
+ */
+[[nodiscard]] StateTransitionFuncSig escaped_state_transition;
+
 /**
  * Disallows the appearances of other characters after encountering an end anchor in the string.
  */
 [[nodiscard]] StateTransitionFuncSig end_state_transition;
 
 /**
- * States other than the NORMAL state may require special handling after the whole regex string has
+ * States other than the Normal state may require special handling after the whole regex string has
  * been scanned and processed.
  */
 [[nodiscard]] StateTransitionFuncSig final_state_cleanup;
@@ -112,10 +123,13 @@ auto normal_state_transition(
     auto const ch{*it};
     switch (ch) {
         case '.':
-            state.set_next_state(TranslatorState::RegexPatternState::DOT);
+            state.set_next_state(TranslatorState::RegexPatternState::Dot);
+            break;
+        case cEscapeChar:
+            state.set_next_state(TranslatorState::RegexPatternState::Escaped);
             break;
         case cRegexEndAnchor:
-            state.set_next_state(TranslatorState::RegexPatternState::END);
+            state.set_next_state(TranslatorState::RegexPatternState::End);
             break;
         case cRegexZeroOrMore:
             return ErrorCode::UntranslatableStar;
@@ -155,7 +169,25 @@ auto dot_state_transition(
             --it;
             break;
     }
-    state.set_next_state(TranslatorState::RegexPatternState::NORMAL);
+    state.set_next_state(TranslatorState::RegexPatternState::Normal);
+    return ErrorCode::Success;
+}
+
+auto escaped_state_transition(
+        TranslatorState& state,
+        string_view::const_iterator& it,
+        string& wildcard_str,
+        [[maybe_unused]] RegexToWildcardTranslatorConfig const& config
+) -> error_code {
+    auto const ch_idx{static_cast<unsigned char>(*it)};  // Table index of the escaped character.
+    if (ch_idx >= cCharBitarraySize || false == cRegexEscapeSeqMetaCharsLut.at(ch_idx)) {
+        return ErrorCode::IllegalEscapeSequence;  // Non-ASCII bytes are rejected here too.
+    }
+    if (cWildcardMetaCharsLut.at(ch_idx)) {
+        wildcard_str += cEscapeChar;  // Still a wildcard metacharacter, so it must stay escaped.
+    }
+    wildcard_str += *it;
+    state.set_next_state(TranslatorState::RegexPatternState::Normal);
     return ErrorCode::Success;
 }
 
@@ -178,7 +210,7 @@ auto final_state_cleanup(
         RegexToWildcardTranslatorConfig const& config
 ) -> error_code {
     switch (state.get_state()) {
-        case TranslatorState::RegexPatternState::DOT:
+        case TranslatorState::RegexPatternState::Dot:
             // The last character is a single `.`, without the possibility of becoming a
             // multichar wildcard
             wildcard_str += cSingleCharWildcard;
@@ -187,7 +219,7 @@ auto final_state_cleanup(
             break;
     }
 
-    if (TranslatorState::RegexPatternState::END != state.get_state()
+    if (TranslatorState::RegexPatternState::End != state.get_state()
         && config.add_prefix_suffix_wildcards())
     {
         wildcard_str += cZeroOrMoreCharsWildcard;
@@ -220,13 +252,16 @@ auto regex_to_wildcard(string_view regex_str, RegexToWildcardTranslatorConfig co
     error_code ec{};
     while (it != regex_str.cend()) {
         switch (state.get_state()) {
-            case TranslatorState::RegexPatternState::NORMAL:
+            case TranslatorState::RegexPatternState::Normal:
                 ec = normal_state_transition(state, it, wildcard_str, config);
                 break;
-            case TranslatorState::RegexPatternState::DOT:
+            case TranslatorState::RegexPatternState::Dot:
                 ec = dot_state_transition(state, it, wildcard_str, config);
                 break;
-            case TranslatorState::RegexPatternState::END:
+            case TranslatorState::RegexPatternState::Escaped:
+                ec = escaped_state_transition(state, it, wildcard_str, config);
+                break;
+            case TranslatorState::RegexPatternState::End:
                 ec = end_state_transition(state, it, wildcard_str, config);
                 break;
             default:
diff --git a/components/core/tests/test-regex_utils.cpp b/components/core/tests/test-regex_utils.cpp
index fc79b966a..9defd7d08 100644
--- a/components/core/tests/test-regex_utils.cpp
+++ b/components/core/tests/test-regex_utils.cpp
@@ -1,31 +1,47 @@
+#include <Catch2/single_include/catch2/catch.hpp>
 #include <regex_utils/ErrorCode.hpp>
 #include <regex_utils/regex_translation_utils.hpp>
 #include <regex_utils/RegexToWildcardTranslatorConfig.hpp>
 
-#include <Catch2/single_include/catch2/catch.hpp>
-
 using clp::regex_utils::ErrorCode;
 using clp::regex_utils::regex_to_wildcard;
 using clp::regex_utils::RegexToWildcardTranslatorConfig;
 
-TEST_CASE("regex_to_wildcard", "[regex_utils][regex_to_wildcard]") {
-    // Test empty string
+TEST_CASE("regex_to_wildcard_simple_translations", "[regex_utils][re2wc][simple_translations]") {
     REQUIRE(regex_to_wildcard("").value().empty());
 
-    // Test simple wildcard translations
     REQUIRE((regex_to_wildcard("xyz").value() == "xyz"));
     REQUIRE((regex_to_wildcard(". xyz .* zyx .").value() == "? xyz * zyx ?"));
     REQUIRE((regex_to_wildcard(". xyz .+ zyx .*").value() == "? xyz ?* zyx *"));
+}
 
-    // Test unescaped meta characters
+TEST_CASE("regex_to_wildcard_unescaped_metachar", "[regex_utils][re2wc][unescaped_metachar]") {
     REQUIRE((regex_to_wildcard(".? xyz .* zyx .").error() == ErrorCode::UnsupportedQuestionMark));
     REQUIRE((regex_to_wildcard(". xyz .** zyx .").error() == ErrorCode::UntranslatableStar));
     REQUIRE((regex_to_wildcard(". xyz .*+ zyx .").error() == ErrorCode::UntranslatablePlus));
     REQUIRE((regex_to_wildcard(". xyz |.* zyx .").error() == ErrorCode::UnsupportedPipe));
     REQUIRE((regex_to_wildcard(". xyz ^.* zyx .").error() == ErrorCode::IllegalCaret));
+    REQUIRE((regex_to_wildcard(". xyz $.* zyx .").error() == ErrorCode::IllegalDollarSign));
+}
+
+TEST_CASE("regex_to_wildcard_escaped_metachar", "[regex_utils][re2wc][escaped_metachar]") {
+    // Escape backslash is superfluous for the following set of characters
+    REQUIRE((regex_to_wildcard("<>-_/=!").value() == "<>-_/=!"));
+    REQUIRE((regex_to_wildcard("\\<\\>\\-\\_\\/\\=\\!").value() == "<>-_/=!"));
+    // Test the full escape sequences set; only `*`, `?` and `\` keep their backslash
+    REQUIRE(
+            (regex_to_wildcard("\\*\\+\\?\\|\\^\\$\\.\\{\\}\\[\\]\\(\\)\\<\\>\\-\\_\\/\\=\\!\\\\")
+                     .value()
+             == "\\*+\\?|^$.{}[]()<>-_/=!\\\\")
+    );
+    // Wildcard metacharacters stay escaped so that they remain literals after translation
+    REQUIRE((regex_to_wildcard("a\\*b\\?c\\\\d").value() == "a\\*b\\?c\\\\d"));
+    // Test unsupported escape sequences: alphanumeric characters cannot be escaped
+    REQUIRE((regex_to_wildcard("abc\\Qdefghi\\Ejkl").error() == ErrorCode::IllegalEscapeSequence));
+    REQUIRE((regex_to_wildcard("abc\\1").error() == ErrorCode::IllegalEscapeSequence));
+}
 
-TEST_CASE("regex_to_wildcard_anchor_config", "[regex_utils][regex_to_wildcard][anchor_config]") {
+TEST_CASE("regex_to_wildcard_anchor_config", "[regex_utils][re2wc][anchor_config]") {
     // Test anchors and prefix/suffix wildcards
     RegexToWildcardTranslatorConfig const config{false, true};
     REQUIRE(((regex_to_wildcard("^", config).value() == "*")));
diff --git a/docs/src/dev-guide/components-core/regex-utils.md b/docs/src/dev-guide/components-core/regex-utils.md
index a7ec16774..f7af037df 100644
--- a/docs/src/dev-guide/components-core/regex-utils.md
+++ b/docs/src/dev-guide/components-core/regex-utils.md
@@ -62,6 +62,21 @@ For a detailed description on the options order and usage, see the
   * Turn `.*` into `*`
   * Turn `.+` into `?*`
   * E.g. `abc.*def.ghi.+` will get translated to `abc*def?ghi?*`
+* Metacharacter escape sequences
+  * An escaped regex metacharacter is treated as a literal and appended to the wildcard output.
+    * The list of characters that require escaping to have their special meanings suppressed is
+      `[\/^$.|?*+(){}`.
+    * Superfluous escape characters are ignored for the following characters: `]<>-_=!`.
+    * E.g. `a\[\+b\-\_c-_d` will get translated to `a[+b-_c-_d`
+    * Note: generally, any non-alphanumeric character can be escaped to use it as a literal. The
+      list this utils library supports is non-exhaustive and can be expanded when necessary.
+  * For metacharacters shared by both syntaxes, keep the escape backslashes.
+    * The list of characters that fall into this category is `*?\`. All wildcard metacharacters are
+      also regex metacharacters.
+    * E.g. `a\*b\?c\\d` will get translated to `a\*b\?c\\d` (no change)
+  * Escape sequences with alphanumeric characters are disallowed.
+    * E.g. Special utility escape sequences `\Q`, `\E`, `\A` etc. and back references `\1` `\2` etc.
+      cannot be translated.
 
 ### Custom configuration