From e74f04371af35c58c28098b9a31ee85758115994 Mon Sep 17 00:00:00 2001
From: Bingran Hu <bingran.hu@yscope.com>
Date: Thu, 25 Jul 2024 01:29:04 -0400
Subject: [PATCH] Implement translator logic to reduce regex character sets
 into wildcards.

---
 .../core/src/clp/regex_utils/ErrorCode.cpp    |   6 +
 .../core/src/clp/regex_utils/ErrorCode.hpp    |   2 +
 .../core/src/clp/regex_utils/constants.hpp    |   3 +
 .../regex_utils/regex_translation_utils.cpp   | 152 +++++++++++++++++-
 components/core/tests/test-regex_utils.cpp    |  43 ++++-
 .../dev-guide/components-core/regex-utils.md  |  11 +-
 6 files changed, 207 insertions(+), 10 deletions(-)
diff --git a/components/core/src/clp/regex_utils/ErrorCode.cpp b/components/core/src/clp/regex_utils/ErrorCode.cpp
index 112ede242..c160c0396 100644
--- a/components/core/src/clp/regex_utils/ErrorCode.cpp
+++ b/components/core/src/clp/regex_utils/ErrorCode.cpp
@@ -72,6 +72,12 @@ auto ErrorCodeCategory::message(int ev) const -> string {
         case ErrorCode::UnmatchedParenthesis:
             return "Unmatched opening `(` or closing `)`.";
 
+        case ErrorCode::IncompleteCharsetStructure:
+            return "Missing closing `]` for an opening `[` by the end of the string.";
+
+        case ErrorCode::UnsupportedCharsetPattern:
+            return "Currently only supports character set that contains a single character.";
+
         default:
             return "(unrecognized error)";
     }
diff --git a/components/core/src/clp/regex_utils/ErrorCode.hpp b/components/core/src/clp/regex_utils/ErrorCode.hpp
index 77a52cf58..9b4fbf8f2 100644
--- a/components/core/src/clp/regex_utils/ErrorCode.hpp
+++ b/components/core/src/clp/regex_utils/ErrorCode.hpp
@@ -21,6 +21,8 @@ enum class ErrorCode : uint8_t {
     IllegalDollarSign,
     IllegalEscapeSequence,
     UnmatchedParenthesis,
+    IncompleteCharsetStructure,
+    UnsupportedCharsetPattern,
 };
 
 /**
diff --git a/components/core/src/clp/regex_utils/constants.hpp b/components/core/src/clp/regex_utils/constants.hpp
index 9833543fc..ff2eb5b10 100644
--- a/components/core/src/clp/regex_utils/constants.hpp
+++ b/components/core/src/clp/regex_utils/constants.hpp
@@ -44,6 +44,9 @@ constexpr auto cRegexEscapeSeqMetaCharsLut = create_char_bit_array("*+?|^$.{}[](
 // The set of wildcard metacharacters that must remain escaped in the translated string to be
 // treated as a literal.
 constexpr auto cWildcardMetaCharsLut = create_char_bit_array("?*\\");
+// The set of regex metacharacters that may follow an escape backslash inside a character set
+// (`[...]`), in which case they are treated as literals.
+constexpr auto cRegexCharsetEscapeSeqMetaCharsLut = create_char_bit_array("^-]\\");
 }  // namespace clp::regex_utils
 
 #endif  // CLP_REGEX_UTILS_CONSTANTS_HPP
diff --git a/components/core/src/clp/regex_utils/regex_translation_utils.cpp b/components/core/src/clp/regex_utils/regex_translation_utils.cpp
index f26d70521..e28f1d36a 100644
--- a/components/core/src/clp/regex_utils/regex_translation_utils.cpp
+++ b/components/core/src/clp/regex_utils/regex_translation_utils.cpp
@@ -6,12 +6,14 @@
 #include <system_error>
 
 #include <outcome/single-header/outcome.hpp>
+#include <string_utils/string_utils.hpp>
 
 #include "regex_utils/constants.hpp"
 #include "regex_utils/ErrorCode.hpp"
 #include "regex_utils/RegexToWildcardTranslatorConfig.hpp"
 
 namespace clp::regex_utils {
+using clp::string_utils::is_alphabet;
 using std::error_code;
 using std::string;
 using std::string_view;
@@ -31,6 +33,8 @@ class TranslatorState {
      * literally.</li>
      *   <li>Dot: Encountered a period `.`. Expecting wildcard expression.</li>
      *   <li>Escaped: Encountered a backslash `\`. Expecting an escape sequence.</li>
+     *   <li>Charset: Encountered an opening square bracket `[`. Expecting a character set.</li>
+     *   <li>CharsetEscaped: Encountered an escape backslash in the character set.</li>
      *   <li>End: Encountered a dollar sign `$`, meaning the regex string has reached the end
      * anchor.</li>
      * </ul>
@@ -39,21 +43,28 @@ class TranslatorState {
         Normal = 0,
         Dot,
         Escaped,
+        Charset,
+        CharsetEscaped,
         End,
     };
 
     // Constructor
-    TranslatorState() = default;
+    explicit TranslatorState(string_view::const_iterator it) : m_it{it} {}
 
     // Getters
     [[nodiscard]] auto get_state() const -> RegexPatternState { return m_state; }
 
+    [[nodiscard]] auto get_marked_iterator() const -> string_view::const_iterator { return m_it; }
+
     // Setters
     auto set_next_state(RegexPatternState const& state) -> void { m_state = state; }
 
+    auto mark_iterator(string_view::const_iterator it) -> void { m_it = it; }
+
 private:
     // Members
     RegexPatternState m_state{RegexPatternState::Normal};
+    string_view::const_iterator m_it;
 };
 
 /**
@@ -65,7 +76,7 @@ class TranslatorState {
  * @param[in, out] it The iterator that represents the current regex string scan position. May be
  * updated to advance or backtrack the scan position.
  * @param[out] wildcard_str The translated wildcard string. May or may not be updated.
- * @param[in] config The translator config.
+ * @param[in] config The translator config predefined by the user.
  * @return clp::regex_utils::ErrorCode
  */
 using StateTransitionFuncSig
@@ -103,6 +114,25 @@ using StateTransitionFuncSig
  */
 [[nodiscard]] StateTransitionFuncSig escaped_state_transition;
 
+/**
+ * Attempts to reduce regex character sets into a single character so that the regex string is still
+ * translatable to wildcard.
+ *
+ * In most cases, only a trivial character set containing a single character is reducible. However,
+ * if the output wildcard query will be analyzed in case-insensitive mode, character set patterns
+ * such as [aA] [Bb] are also reducible.
+ * Reports one of two possible error codes, with IncompleteCharsetStructure having a higher
+ * precedence over UnsupportedCharsetPattern.
+ */
+[[nodiscard]] StateTransitionFuncSig charset_state_transition;
+
+/**
+ * A transient state used to defer handling of escape sequences in a charset pattern.
+ *
+ * Allows the charset state to accurately capture the appearance of a closing bracket `]`.
+ */
+[[nodiscard]] StateTransitionFuncSig charsetescaped_state_transition;
+
 /**
  * Disallows the appearances of other characters after encountering an end anchor in the string.
  */
@@ -114,6 +144,36 @@ using StateTransitionFuncSig
  */
 [[nodiscard]] StateTransitionFuncSig final_state_cleanup;
 
+// Other helpers
+/**
+ * Writes `ch` to the end of the wildcard string so that it is matched literally.
+ *
+ * Characters that are wildcard metacharacters are emitted with a leading escape backslash.
+ * @param ch The literal to be appended.
+ * @param wildcard_str The wildcard string to be updated.
+ */
+inline auto append_single_char_to_wildcard(char const ch, string& wildcard_str) -> void {
+    auto const is_wildcard_meta_char{cWildcardMetaCharsLut.at(ch)};
+    if (is_wildcard_meta_char) {
+        wildcard_str.push_back(cEscapeChar);
+    }
+    wildcard_str.push_back(ch);
+}
+
+/**
+ * Checks whether two characters are an upper/lowercase pair, given in either order.
+ * @param ch0
+ * @param ch1
+ * @return True if the input is a matching pair.
+ */
+inline auto matching_upper_lower_case_char_pair(char const ch0, char const ch1) -> bool {
+    if (false == is_alphabet(ch0) || false == is_alphabet(ch1)) {
+        return false;
+    }
+    constexpr int cCaseOffset{'a' - 'A'};
+    return (ch0 - ch1) == cCaseOffset || (ch1 - ch0) == cCaseOffset;
+}
+
 auto normal_state_transition(
         TranslatorState& state,
         string_view::const_iterator& it,
@@ -128,6 +188,10 @@ auto normal_state_transition(
         case cEscapeChar:
             state.set_next_state(TranslatorState::RegexPatternState::Escaped);
             break;
+        case '[':
+            state.mark_iterator(it + 1);  // Mark the first character of character set
+            state.set_next_state(TranslatorState::RegexPatternState::Charset);
+            break;
         case cRegexEndAnchor:
             state.set_next_state(TranslatorState::RegexPatternState::End);
             break;
@@ -183,14 +247,74 @@ auto escaped_state_transition(
     if (false == cRegexEscapeSeqMetaCharsLut.at(ch)) {
         return ErrorCode::IllegalEscapeSequence;
     }
-    if (cWildcardMetaCharsLut.at(ch)) {
-        wildcard_str += cEscapeChar;
+    append_single_char_to_wildcard(ch, wildcard_str);
+    state.set_next_state(TranslatorState::RegexPatternState::Normal);
+    return ErrorCode::Success;
+}
+
+auto charset_state_transition(
+        TranslatorState& state,
+        string_view::const_iterator& it,
+        string& wildcard_str,
+        RegexToWildcardTranslatorConfig const& config
+) -> error_code {
+    auto const ch{*it};
+    string_view::const_iterator charset_start{state.get_marked_iterator()};  // avoid casting to ptr
+    auto const charset_len{it - charset_start};
+
+    if (']' != ch) {
+        // Defer processing until the closing bracket; only track escapes until then.
+        if (cEscapeChar == ch) {
+            state.set_next_state(TranslatorState::RegexPatternState::CharsetEscaped);
+        }
+        return ErrorCode::Success;
     }
-    wildcard_str += ch;
+
+    if (0 == charset_len || charset_len > 2) {
+        // Empty charsets and charsets longer than two characters are not supported.
+        return ErrorCode::UnsupportedCharsetPattern;
+    }
+
+    // Passed the length check. Now check for accepted charset patterns.
+    auto const ch0{*charset_start};
+    auto const ch1{*(charset_start + 1)};  // If 1 == charset_len, this is the closing `]`.
+    char parsed_char{};
+
+    if (1 == charset_len) {
+        if (cCharsetNegate == ch0 || cEscapeChar == ch0) {
+            return ErrorCode::UnsupportedCharsetPattern;
+        }
+        parsed_char = ch0;
+    } else {  // 2 == charset_len
+        if (cEscapeChar == ch0 && cRegexCharsetEscapeSeqMetaCharsLut.at(ch1)) {
+            // 2-char escape sequence: keep only the escaped metacharacter
+            parsed_char = ch1;
+        } else if (config.case_insensitive_wildcard()
+                   && matching_upper_lower_case_char_pair(ch0, ch1))
+        {
+            // case-insensitive patterns like [aA] [Bb] etc.
+            parsed_char = ch0 > ch1 ? ch0 : ch1;  // choose the lowercase character
+        } else {
+            return ErrorCode::UnsupportedCharsetPattern;
+        }
+    }
+
+    append_single_char_to_wildcard(parsed_char, wildcard_str);
     state.set_next_state(TranslatorState::RegexPatternState::Normal);
     return ErrorCode::Success;
 }
 
+auto charsetescaped_state_transition(
+        TranslatorState& state,
+        [[maybe_unused]] string_view::const_iterator& it,
+        [[maybe_unused]] string& wildcard_str,
+        [[maybe_unused]] RegexToWildcardTranslatorConfig const& config
+) -> error_code {
+    // Defer the handling of the escape sequence to the analysis of the entire character set.
+    state.set_next_state(TranslatorState::RegexPatternState::Charset);
+    return ErrorCode::Success;
+}
+
 auto end_state_transition(
         [[maybe_unused]] TranslatorState& state,
         string_view::const_iterator& it,
@@ -215,6 +339,10 @@ auto final_state_cleanup(
             // multichar wildcard
             wildcard_str += cSingleCharWildcard;
             break;
+        case TranslatorState::RegexPatternState::Charset:
+        case TranslatorState::RegexPatternState::CharsetEscaped:
+            return ErrorCode::IncompleteCharsetStructure;
+            break;
         default:
             break;
     }
@@ -226,10 +354,14 @@ auto final_state_cleanup(
     }
     return ErrorCode::Success;
 }
+
 }  // namespace
 
 auto regex_to_wildcard(string_view regex_str) -> OUTCOME_V2_NAMESPACE::std_result<string> {
-    return regex_to_wildcard(regex_str, {false, false});
+    return regex_to_wildcard(
+            regex_str,
+            {/*case_insensitive_wildcard=*/false, /*add_prefix_suffix_wildcards=*/false}
+    );
 }
 
 auto regex_to_wildcard(string_view regex_str, RegexToWildcardTranslatorConfig const& config)
@@ -238,9 +370,9 @@ auto regex_to_wildcard(string_view regex_str, RegexToWildcardTranslatorConfig co
         return string{};
     }
 
-    TranslatorState state;
     string_view::const_iterator it{regex_str.cbegin()};
     string wildcard_str;
+    TranslatorState state{it};
 
     // If there is no starting anchor character, append multichar wildcard prefix
     if (cRegexStartAnchor == *it) {
@@ -261,6 +393,12 @@ auto regex_to_wildcard(string_view regex_str, RegexToWildcardTranslatorConfig co
             case TranslatorState::RegexPatternState::Escaped:
                 ec = escaped_state_transition(state, it, wildcard_str, config);
                 break;
+            case TranslatorState::RegexPatternState::Charset:
+                ec = charset_state_transition(state, it, wildcard_str, config);
+                break;
+            case TranslatorState::RegexPatternState::CharsetEscaped:
+                ec = charsetescaped_state_transition(state, it, wildcard_str, config);
+                break;
             case TranslatorState::RegexPatternState::End:
                 ec = end_state_transition(state, it, wildcard_str, config);
                 break;
diff --git a/components/core/tests/test-regex_utils.cpp b/components/core/tests/test-regex_utils.cpp
index 9defd7d08..64af60318 100644
--- a/components/core/tests/test-regex_utils.cpp
+++ b/components/core/tests/test-regex_utils.cpp
@@ -41,9 +41,50 @@ TEST_CASE("regex_to_wildcard_escaped_metachar", "[regex_utils][re2wc][escaped_me
     );
 }
 
+TEST_CASE("regex_to_wildcard_charset", "[regex_utils][re2wc][charset]") {
+    REQUIRE((regex_to_wildcard("x[y]z").value() == "xyz"));
+    REQUIRE((regex_to_wildcard("x[\\^]z").value() == "x^z"));
+    REQUIRE((regex_to_wildcard("x[\\]]z").value() == "x]z"));
+    REQUIRE((regex_to_wildcard("x[-]z").value() == "x-z"));
+    REQUIRE((regex_to_wildcard("x[\\-]z").value() == "x-z"));
+    REQUIRE((regex_to_wildcard("x[\\\\]z").value() == "x\\\\z"));
+    REQUIRE((regex_to_wildcard("[a][b][\\^][-][\\-][\\]][\\\\][c][d]").value() == "ab^--]\\\\cd"));
+    // Character sets that cannot be reduced to a single character with the default config
+    REQUIRE((regex_to_wildcard("x[]y").error() == ErrorCode::UnsupportedCharsetPattern));
+    REQUIRE((regex_to_wildcard("x[a-z]y").error() == ErrorCode::UnsupportedCharsetPattern));
+    REQUIRE((regex_to_wildcard("x[^^]y").error() == ErrorCode::UnsupportedCharsetPattern));
+    REQUIRE((regex_to_wildcard("x[^0-9]y").error() == ErrorCode::UnsupportedCharsetPattern));
+    REQUIRE((regex_to_wildcard("[xX][yY]").error() == ErrorCode::UnsupportedCharsetPattern));
+    REQUIRE((regex_to_wildcard("ch:[a-zA-Z0-9]").error() == ErrorCode::UnsupportedCharsetPattern));
+    // Character sets that are still open when the regex string ends
+    REQUIRE((regex_to_wildcard("[\\").error() == ErrorCode::IncompleteCharsetStructure));
+    REQUIRE((regex_to_wildcard("[\\\\").error() == ErrorCode::IncompleteCharsetStructure));
+    REQUIRE((regex_to_wildcard("[xX").error() == ErrorCode::IncompleteCharsetStructure));
+    REQUIRE((regex_to_wildcard("ch:[a-zA-Z0-9").error() == ErrorCode::IncompleteCharsetStructure));
+}
+
+TEST_CASE("regex_to_wildcard_case_insensitive_config", "[regex_utils][re2wc][case_insensitive]") {
+    RegexToWildcardTranslatorConfig const config{/*case_insensitive_wildcard=*/true, false};
+    REQUIRE((regex_to_wildcard("[xX][yY]", config).value() == "xy"));
+    REQUIRE((regex_to_wildcard("[Yy][Xx]", config).value() == "yx"));
+    REQUIRE((regex_to_wildcard("[aA][Bb][Cc]", config).value() == "abc"));
+    REQUIRE((regex_to_wildcard("[aA][Bb][\\^][-][\\]][Cc][dD]", config).value() == "ab^-]cd"));
+    // Error cases must be exercised with the case-insensitive config too, not the default one.
+    REQUIRE((regex_to_wildcard("[xX", config).error() == ErrorCode::IncompleteCharsetStructure));
+    REQUIRE(
+            (regex_to_wildcard("[aA][Bb][^[-[\\[Cc[dD", config).error()
+             == ErrorCode::IncompleteCharsetStructure)
+    );
+    REQUIRE((regex_to_wildcard("[aB]", config).error() == ErrorCode::UnsupportedCharsetPattern));
+    REQUIRE(
+            (regex_to_wildcard("[aA][Bb][^[-[\\[Cc[dD]", config).error()
+             == ErrorCode::UnsupportedCharsetPattern)
+    );
+}
+
 TEST_CASE("regex_to_wildcard_anchor_config", "[regex_utils][re2wc][anchor_config]") {
     // Test anchors and prefix/suffix wildcards
-    RegexToWildcardTranslatorConfig const config{false, true};
+    RegexToWildcardTranslatorConfig const config{false, /*add_prefix_suffix_wildcards=*/true};
     REQUIRE(((regex_to_wildcard("^", config).value() == "*")));
     REQUIRE((regex_to_wildcard("$", config).value() == "*"));
     REQUIRE((regex_to_wildcard("^xyz$", config).value() == "xyz"));
diff --git a/docs/src/dev-guide/components-core/regex-utils.md b/docs/src/dev-guide/components-core/regex-utils.md
index f7af037df..c0f022a7a 100644
--- a/docs/src/dev-guide/components-core/regex-utils.md
+++ b/docs/src/dev-guide/components-core/regex-utils.md
@@ -77,14 +77,21 @@ For a detailed description on the options order and usage, see the
   * Escape sequences with alphanumeric characters are disallowed.
     * E.g. Special utility escape sequences `\Q`, `\E`, `\A` etc. and back references `\1` `\2` etc.
       cannot be translated.
+* Character set
+  * Reduces a character set into a single character if possible.
+    * A trivial character set containing a single character or a single escaped metacharacter.
+      * E.g. `[a]` into `a`, `[\^]` into `^`
+    * If the `case_insensitive_wildcard` config is turned on, the translator can also reduce the
+      following patterns into a single lowercase character:
+      * E.g. `[aA]` into `a`, `[Bb]` into `b`, `[xX][Yy][zZ]` into `xyz`
 
 ### Custom configuration
 
 The `RegexToWildcardTranslatorConfig` class objects are currently immutable once instantiated. The
 constructor takes the following arguments in order:
 
-* `case_insensitive_wildcard`: to be added later along with the character set translation
-  implementation.
+* `case_insensitive_wildcard`: see the *Character set* bullet point in the
+  [Functionalities](#functionalities) section.
 
 * `add_prefix_suffix_wildcards`: in the absence of regex anchors, add prefix or suffix wildcards so
   the query becomes a substring query.