From e74f04371af35c58c28098b9a31ee85758115994 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Thu, 25 Jul 2024 01:29:04 -0400 Subject: [PATCH] Implement translator logic to reduce regex character sets into wildcards. --- .../core/src/clp/regex_utils/ErrorCode.cpp | 6 + .../core/src/clp/regex_utils/ErrorCode.hpp | 2 + .../core/src/clp/regex_utils/constants.hpp | 3 + .../regex_utils/regex_translation_utils.cpp | 152 +++++++++++++++++- components/core/tests/test-regex_utils.cpp | 43 ++++- .../dev-guide/components-core/regex-utils.md | 11 +- 6 files changed, 207 insertions(+), 10 deletions(-) diff --git a/components/core/src/clp/regex_utils/ErrorCode.cpp b/components/core/src/clp/regex_utils/ErrorCode.cpp index 112ede242..c160c0396 100644 --- a/components/core/src/clp/regex_utils/ErrorCode.cpp +++ b/components/core/src/clp/regex_utils/ErrorCode.cpp @@ -72,6 +72,12 @@ auto ErrorCodeCategory::message(int ev) const -> string { case ErrorCode::UnmatchedParenthesis: return "Unmatched opening `(` or closing `)`."; + case ErrorCode::IncompleteCharsetStructure: + return "Unmatched closing `]` at the end of the string."; + + case ErrorCode::UnsupportedCharsetPattern: + return "Currently only supports character set that contains a single character."; + default: return "(unrecognized error)"; } diff --git a/components/core/src/clp/regex_utils/ErrorCode.hpp b/components/core/src/clp/regex_utils/ErrorCode.hpp index 77a52cf58..9b4fbf8f2 100644 --- a/components/core/src/clp/regex_utils/ErrorCode.hpp +++ b/components/core/src/clp/regex_utils/ErrorCode.hpp @@ -21,6 +21,8 @@ enum class ErrorCode : uint8_t { IllegalDollarSign, IllegalEscapeSequence, UnmatchedParenthesis, + IncompleteCharsetStructure, + UnsupportedCharsetPattern, }; /** diff --git a/components/core/src/clp/regex_utils/constants.hpp b/components/core/src/clp/regex_utils/constants.hpp index 9833543fc..ff2eb5b10 100644 --- a/components/core/src/clp/regex_utils/constants.hpp +++ 
b/components/core/src/clp/regex_utils/constants.hpp @@ -44,6 +44,9 @@ constexpr auto cRegexEscapeSeqMetaCharsLut = create_char_bit_array("*+?|^$.{}[]( // The set of wildcard metacharacters that must remain escaped in the translated string to be // treated as a literal. constexpr auto cWildcardMetaCharsLut = create_char_bit_array("?*\\"); +// The set of metacharacters that can be preceded with an escape backslash in the regex character +// set to be treated as a literal. +constexpr auto cRegexCharsetEscapeSeqMetaCharsLut = create_char_bit_array("^-]\\"); } // namespace clp::regex_utils #endif // CLP_REGEX_UTILS_CONSTANTS_HPP diff --git a/components/core/src/clp/regex_utils/regex_translation_utils.cpp b/components/core/src/clp/regex_utils/regex_translation_utils.cpp index f26d70521..e28f1d36a 100644 --- a/components/core/src/clp/regex_utils/regex_translation_utils.cpp +++ b/components/core/src/clp/regex_utils/regex_translation_utils.cpp @@ -6,12 +6,14 @@ #include #include +#include #include "regex_utils/constants.hpp" #include "regex_utils/ErrorCode.hpp" #include "regex_utils/RegexToWildcardTranslatorConfig.hpp" namespace clp::regex_utils { +using clp::string_utils::is_alphabet; using std::error_code; using std::string; using std::string_view; @@ -31,6 +33,8 @@ class TranslatorState { * literally. *
  • Dot: Encountered a period `.`. Expecting wildcard expression.
  • *
  • Escaped: Encountered a backslash `\`. Expecting an escape sequence.
  • + *
  • Charset: Encountered an opening square bracket `[`. Expecting a character set.
  • + *
  • CharsetEscaped: Encountered an escape backslash in the character set.
  • *
  • End: Encountered a dollar sign `$`, meaning the regex string has reached the end * anchor.
  • * @@ -39,21 +43,28 @@ class TranslatorState { Normal = 0, Dot, Escaped, + Charset, + CharsetEscaped, End, }; // Constructor - TranslatorState() = default; + TranslatorState(string_view::const_iterator it) : m_it{it} {}; // Getters [[nodiscard]] auto get_state() const -> RegexPatternState { return m_state; } + [[nodiscard]] auto get_marked_iterator() const -> string_view::const_iterator { return m_it; } + // Setters auto set_next_state(RegexPatternState const& state) -> void { m_state = state; } + void mark_iterator(string_view::const_iterator it) { m_it = it; } + private: // Members RegexPatternState m_state{RegexPatternState::Normal}; + string_view::const_iterator m_it; }; /** @@ -65,7 +76,7 @@ class TranslatorState { * @param[in, out] it The iterator that represents the current regex string scan position. May be * updated to advance or backtrack the scan position. * @param[out] wildcard_str The translated wildcard string. May or may not be updated. - * @param[in] config The translator config. + * @param[in] config The translator config predefined by the user. * @return clp::regex_utils::ErrorCode */ using StateTransitionFuncSig @@ -103,6 +114,25 @@ using StateTransitionFuncSig */ [[nodiscard]] StateTransitionFuncSig escaped_state_transition; +/** + * Attempts to reduce regex character sets into a single character so that the regex string is still + * translatable to wildcard. + * + * In most cases, only a trivial character set containing a single character is reducible. However, + * if the output wildcard query will be analyzed in case-insensitive mode, character set patterns + * such as [aA] [Bb] are also reducible. + * Throws two possible kinds of error codes, with IncompleteCharsetStructure having a higher + * precedence over UnsupportedCharsetPattern. + */ +[[nodiscard]] StateTransitionFuncSig charset_state_transition; + +/** + * A transient state used to defer handling of escape sequences in a charset pattern. 
+ * + * Allows the charset state to accurately capture the appearance of a closing bracket `]`. + */ +[[nodiscard]] StateTransitionFuncSig charsetescaped_state_transition; + /** * Disallows the appearances of other characters after encountering an end anchor in the string. */ @@ -114,6 +144,36 @@ using StateTransitionFuncSig */ [[nodiscard]] StateTransitionFuncSig final_state_cleanup; +// Other helpers +/** + * Appends a single character as a literal to the wildcard string. + * + * If the literal is a metacharacter in the wildcard syntax, prepend the literal with an escape + * backslash. + * @param ch The literal to be appended. + * @param wildcard_str The wildcard string to be updated. + */ +inline auto append_single_char_to_wildcard(char const ch, string& wildcard_str) -> void { + if (cWildcardMetaCharsLut.at(ch)) { + wildcard_str += cEscapeChar; + } + wildcard_str += ch; +} + +/** + * Detects if the two input arguments are a matching pair of upper and lowercase characters. + * + * @param ch0 + * @param ch1 + * @return True if the input is a matching pair. 
+ */ +inline auto matching_upper_lower_case_char_pair(char const ch0, char const ch1) -> bool { + int const upper_lower_case_ascii_offset{'a' - 'A'}; + return (is_alphabet(ch0) && is_alphabet(ch1) + && (((ch0 - ch1) == upper_lower_case_ascii_offset) + || ((ch1 - ch0) == upper_lower_case_ascii_offset))); +} + auto normal_state_transition( TranslatorState& state, string_view::const_iterator& it, @@ -128,6 +188,10 @@ auto normal_state_transition( case cEscapeChar: state.set_next_state(TranslatorState::RegexPatternState::Escaped); break; + case '[': + state.mark_iterator(it + 1); // Mark the first character of character set + state.set_next_state(TranslatorState::RegexPatternState::Charset); + break; case cRegexEndAnchor: state.set_next_state(TranslatorState::RegexPatternState::End); break; @@ -183,14 +247,74 @@ auto escaped_state_transition( if (false == cRegexEscapeSeqMetaCharsLut.at(ch)) { return ErrorCode::IllegalEscapeSequence; } - if (cWildcardMetaCharsLut.at(ch)) { - wildcard_str += cEscapeChar; + append_single_char_to_wildcard(ch, wildcard_str); + state.set_next_state(TranslatorState::RegexPatternState::Normal); + return ErrorCode::Success; +} + +auto charset_state_transition( + TranslatorState& state, + string_view::const_iterator& it, + string& wildcard_str, + RegexToWildcardTranslatorConfig const& config +) -> error_code { + auto const ch{*it}; + string_view::const_iterator charset_start{state.get_marked_iterator()}; // avoid casting to ptr + auto const charset_len{it - charset_start}; + + if (']' != ch) { + // Only process charset until a closing bracket is reached. + if (cEscapeChar == ch) { + state.set_next_state(TranslatorState::RegexPatternState::CharsetEscaped); + } + return ErrorCode::Success; } - wildcard_str += ch; + + if (0 == charset_len || charset_len > 2) { + // Does not support empty charset or pattern that is longer than two characters. + return ErrorCode::UnsupportedCharsetPattern; + } + + // Passed the length check. 
Now check for accepted charset patterns. + auto const ch0{*charset_start}; + auto const ch1{*(charset_start + 1)}; + char parsed_char{}; + + if (1 == charset_len) { + if (cCharsetNegate == ch0 || cEscapeChar == ch0) { + return ErrorCode::UnsupportedCharsetPattern; + } + parsed_char = ch0; + } else { // 2 == charset_len + if (cEscapeChar == ch0 && cRegexCharsetEscapeSeqMetaCharsLut.at(ch1)) { + // 2-char escape sequence + parsed_char = ch1; + } else if (config.case_insensitive_wildcard() + && matching_upper_lower_case_char_pair(ch0, ch1)) + { + // case-insensitive patterns like [aA] [Bb] etc. + parsed_char = ch0 > ch1 ? ch0 : ch1; // choose the lower case character + } else { + return ErrorCode::UnsupportedCharsetPattern; + } + } + + append_single_char_to_wildcard(parsed_char, wildcard_str); state.set_next_state(TranslatorState::RegexPatternState::Normal); return ErrorCode::Success; } +auto charsetescaped_state_transition( + TranslatorState& state, + [[maybe_unused]] string_view::const_iterator& it, + [[maybe_unused]] string& wildcard_str, + [[maybe_unused]] RegexToWildcardTranslatorConfig const& config +) -> error_code { + // Defer the handling of escape sequences to the entire character set analysis. 
+ state.set_next_state(TranslatorState::RegexPatternState::Charset); + return ErrorCode::Success; +} + auto end_state_transition( [[maybe_unused]] TranslatorState& state, string_view::const_iterator& it, @@ -215,6 +339,10 @@ auto final_state_cleanup( // multichar wildcard wildcard_str += cSingleCharWildcard; break; + case TranslatorState::RegexPatternState::Charset: + case TranslatorState::RegexPatternState::CharsetEscaped: + return ErrorCode::IncompleteCharsetStructure; + break; default: break; } @@ -226,10 +354,14 @@ auto final_state_cleanup( } return ErrorCode::Success; } + } // namespace auto regex_to_wildcard(string_view regex_str) -> OUTCOME_V2_NAMESPACE::std_result { - return regex_to_wildcard(regex_str, {false, false}); + return regex_to_wildcard( + regex_str, + {/*case_insensitive_wildcard=*/false, /*add_prefix_suffix_wildcards=*/false} + ); } auto regex_to_wildcard(string_view regex_str, RegexToWildcardTranslatorConfig const& config) @@ -238,9 +370,9 @@ auto regex_to_wildcard(string_view regex_str, RegexToWildcardTranslatorConfig co return string{}; } - TranslatorState state; string_view::const_iterator it{regex_str.cbegin()}; string wildcard_str; + TranslatorState state{it}; // If there is no starting anchor character, append multichar wildcard prefix if (cRegexStartAnchor == *it) { @@ -261,6 +393,12 @@ auto regex_to_wildcard(string_view regex_str, RegexToWildcardTranslatorConfig co case TranslatorState::RegexPatternState::Escaped: ec = escaped_state_transition(state, it, wildcard_str, config); break; + case TranslatorState::RegexPatternState::Charset: + ec = charset_state_transition(state, it, wildcard_str, config); + break; + case TranslatorState::RegexPatternState::CharsetEscaped: + ec = charsetescaped_state_transition(state, it, wildcard_str, config); + break; case TranslatorState::RegexPatternState::End: ec = end_state_transition(state, it, wildcard_str, config); break; diff --git a/components/core/tests/test-regex_utils.cpp 
b/components/core/tests/test-regex_utils.cpp index 9defd7d08..64af60318 100644 --- a/components/core/tests/test-regex_utils.cpp +++ b/components/core/tests/test-regex_utils.cpp @@ -41,9 +41,50 @@ TEST_CASE("regex_to_wildcard_escaped_metachar", "[regex_utils][re2wc][escaped_me ); } +TEST_CASE("regex_to_wildcard_charset", "[regex_utils][re2wc][charset]") { + REQUIRE((regex_to_wildcard("x[y]z").value() == "xyz")); + REQUIRE((regex_to_wildcard("x[\\^]z").value() == "x^z")); + REQUIRE((regex_to_wildcard("x[\\]]z").value() == "x]z")); + REQUIRE((regex_to_wildcard("x[-]z").value() == "x-z")); + REQUIRE((regex_to_wildcard("x[\\-]z").value() == "x-z")); + REQUIRE((regex_to_wildcard("x[\\\\]z").value() == "x\\\\z")); + REQUIRE((regex_to_wildcard("[a][b][\\^][-][\\-][\\]][\\\\][c][d]").value() == "ab^--]\\\\cd")); + + REQUIRE((regex_to_wildcard("x[]y").error() == ErrorCode::UnsupportedCharsetPattern)); + REQUIRE((regex_to_wildcard("x[a-z]y").error() == ErrorCode::UnsupportedCharsetPattern)); + REQUIRE((regex_to_wildcard("x[^^]y").error() == ErrorCode::UnsupportedCharsetPattern)); + REQUIRE((regex_to_wildcard("x[^0-9]y").error() == ErrorCode::UnsupportedCharsetPattern)); + REQUIRE((regex_to_wildcard("[xX][yY]").error() == ErrorCode::UnsupportedCharsetPattern)); + REQUIRE((regex_to_wildcard("ch:[a-zA-Z0-9]").error() == ErrorCode::UnsupportedCharsetPattern)); + + REQUIRE((regex_to_wildcard("[\\").error() == ErrorCode::IncompleteCharsetStructure)); + REQUIRE((regex_to_wildcard("[\\\\").error() == ErrorCode::IncompleteCharsetStructure)); + REQUIRE((regex_to_wildcard("[xX").error() == ErrorCode::IncompleteCharsetStructure)); + REQUIRE((regex_to_wildcard("ch:[a-zA-Z0-9").error() == ErrorCode::IncompleteCharsetStructure)); +} + +TEST_CASE("regex_to_wildcard_case_insensitive_config", "[regex_utils][re2wc][case_insensitive]") { + RegexToWildcardTranslatorConfig const config{/*case_insensitive_wildcard=*/true, false}; + REQUIRE((regex_to_wildcard("[xX][yY]", config).value() == "xy")); + 
REQUIRE((regex_to_wildcard("[Yy][Xx]", config).value() == "yx")); + REQUIRE((regex_to_wildcard("[aA][Bb][Cc]", config).value() == "abc")); + REQUIRE((regex_to_wildcard("[aA][Bb][\\^][-][\\]][Cc][dD]", config).value() == "ab^-]cd")); + + REQUIRE((regex_to_wildcard("[xX").error() == ErrorCode::IncompleteCharsetStructure)); + REQUIRE( + (regex_to_wildcard("[aA][Bb][^[-[\\[Cc[dD", config).error() + == ErrorCode::IncompleteCharsetStructure) + ); + REQUIRE((regex_to_wildcard("ch:[a-zA-Z0-9]").error() == ErrorCode::UnsupportedCharsetPattern)); + REQUIRE( + (regex_to_wildcard("[aA][Bb][^[-[\\[Cc[dD]", config).error() + == ErrorCode::UnsupportedCharsetPattern) + ); +} + TEST_CASE("regex_to_wildcard_anchor_config", "[regex_utils][re2wc][anchor_config]") { // Test anchors and prefix/suffix wildcards - RegexToWildcardTranslatorConfig const config{false, true}; + RegexToWildcardTranslatorConfig const config{false, /*add_prefix_suffix_wildcards=*/true}; REQUIRE(((regex_to_wildcard("^", config).value() == "*"))); REQUIRE((regex_to_wildcard("$", config).value() == "*")); REQUIRE((regex_to_wildcard("^xyz$", config).value() == "xyz")); diff --git a/docs/src/dev-guide/components-core/regex-utils.md b/docs/src/dev-guide/components-core/regex-utils.md index f7af037df..c0f022a7a 100644 --- a/docs/src/dev-guide/components-core/regex-utils.md +++ b/docs/src/dev-guide/components-core/regex-utils.md @@ -77,14 +77,21 @@ For a detailed description on the options order and usage, see the * Escape sequences with alphanumeric characters are disallowed. * E.g. Special utility escape sequences `\Q`, `\E`, `\A` etc. and back references `\1` `\2` etc. cannot be translated. +* Character set + * Reduces a character set into a single character if possible. + * A trivial character set containing a single character or a single escaped metacharacter. + * E.g. 
`[a]` into `a`, `[\^]` into `^` + * If the `case_insensitive_wildcard` config is turned on, the translator can also reduce the + following patterns into a single lowercase character: + * E.g. `[aA]` into `a`, `[Bb]` into `b`, `[xX][Yy][zZ]` into `xyz` ### Custom configuration The `RegexToWildcardTranslatorConfig` class objects are currently immutable once instantiated. The constructor takes the following arguments in order: -* `case_insensitive_wildcard`: to be added later along with the character set translation - implementation. +* `case_insensitive_wildcard`: see the *Character set* bullet point in the + [Functionalities](#functionalities) section. * `add_prefix_suffix_wildcards`: in the absence of regex anchors, add prefix or suffix wildcards so the query becomes a substring query.