From 9f85a883e4809152ea4016105f81755c569a34c3 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Wed, 24 Jul 2024 13:49:15 -0400 Subject: [PATCH] regex-utils: Add support for handling escaped regex metacharacters. (#487) --- components/core/.clang-format | 2 +- .../core/src/clp/regex_utils/ErrorCode.cpp | 4 ++ .../core/src/clp/regex_utils/ErrorCode.hpp | 1 + .../core/src/clp/regex_utils/constants.hpp | 30 +++++++++ .../regex_utils/regex_translation_utils.cpp | 67 ++++++++++++++----- components/core/tests/test-regex_utils.cpp | 30 +++++++-- .../dev-guide/components-core/regex-utils.md | 15 +++++ 7 files changed, 125 insertions(+), 24 deletions(-) diff --git a/components/core/.clang-format b/components/core/.clang-format index c8e66579c..35934f594 100644 --- a/components/core/.clang-format +++ b/components/core/.clang-format @@ -75,7 +75,7 @@ IncludeCategories: # Library headers. Update when adding new libraries. # NOTE: clang-format retains leading white-space on a line in violation of the YAML spec. - Regex: "<(absl|antlr4|archive|boost|bsoncxx|catch2|curl|date|fmt|json|log_surgeon|mariadb\ -|mongocxx|msgpack|outcome|simdjson|spdlog|sqlite3|string_utils|yaml-cpp|zstd)" +|mongocxx|msgpack|outcome|regex_utils|simdjson|spdlog|sqlite3|string_utils|yaml-cpp|zstd)" Priority: 3 # C system headers - Regex: "^<.+\\.h>" diff --git a/components/core/src/clp/regex_utils/ErrorCode.cpp b/components/core/src/clp/regex_utils/ErrorCode.cpp index 5779263e8..112ede242 100644 --- a/components/core/src/clp/regex_utils/ErrorCode.cpp +++ b/components/core/src/clp/regex_utils/ErrorCode.cpp @@ -65,6 +65,10 @@ auto ErrorCodeCategory::message(int ev) const -> string { case ErrorCode::IllegalDollarSign: return "Failed to translate due to end anchor `$` in the middle of the string."; + case ErrorCode::IllegalEscapeSequence: + return "Currently only supports escape sequences that are used to suppress special " + "meanings of regex metacharacters. 
Alphanumeric characters are disallowed."; + case ErrorCode::UnmatchedParenthesis: return "Unmatched opening `(` or closing `)`."; diff --git a/components/core/src/clp/regex_utils/ErrorCode.hpp b/components/core/src/clp/regex_utils/ErrorCode.hpp index 1babb2fec..77a52cf58 100644 --- a/components/core/src/clp/regex_utils/ErrorCode.hpp +++ b/components/core/src/clp/regex_utils/ErrorCode.hpp @@ -19,6 +19,7 @@ enum class ErrorCode : uint8_t { UnsupportedPipe, IllegalCaret, IllegalDollarSign, + IllegalEscapeSequence, UnmatchedParenthesis, }; diff --git a/components/core/src/clp/regex_utils/constants.hpp b/components/core/src/clp/regex_utils/constants.hpp index 879e7641d..9833543fc 100644 --- a/components/core/src/clp/regex_utils/constants.hpp +++ b/components/core/src/clp/regex_utils/constants.hpp @@ -1,7 +1,29 @@ #ifndef CLP_REGEX_UTILS_CONSTANTS_HPP #define CLP_REGEX_UTILS_CONSTANTS_HPP +#include <array> +#include <cstddef> +#include <string_view> + namespace clp::regex_utils { +constexpr size_t cCharBitarraySize = 128; + +/** + * Creates an ASCII character lookup table at compile time. + * + * @param char_str A string that contains the characters to look up. + * @return The lookup table as a bit array. + */ +[[nodiscard]] constexpr auto create_char_bit_array(std::string_view char_str +) -> std::array<bool, cCharBitarraySize> { + std::array<bool, cCharBitarraySize> bit_array{}; + bit_array.fill(false); + for (auto const ch : char_str) { + bit_array.at(ch) = true; + } + return bit_array; +} + // Wildcard meta characters constexpr char cZeroOrMoreCharsWildcard{'*'}; constexpr char cSingleCharWildcard{'?'}; @@ -14,6 +36,14 @@ constexpr char cRegexStartAnchor{'^'}; constexpr char cRegexEndAnchor{'$'}; constexpr char cEscapeChar{'\\'}; constexpr char cCharsetNegate{'^'}; + +// Character bitmaps +// The set of regex metacharacters that can be preceded with an escape backslash to be treated as a +// literal. 
+constexpr auto cRegexEscapeSeqMetaCharsLut = create_char_bit_array("*+?|^$.{}[]()<>-_/=!\\"); +// The set of wildcard metacharacters that must remain escaped in the translated string to be +// treated as a literal. +constexpr auto cWildcardMetaCharsLut = create_char_bit_array("?*\\"); } // namespace clp::regex_utils #endif // CLP_REGEX_UTILS_CONSTANTS_HPP diff --git a/components/core/src/clp/regex_utils/regex_translation_utils.cpp b/components/core/src/clp/regex_utils/regex_translation_utils.cpp index 349c106f4..f26d70521 100644 --- a/components/core/src/clp/regex_utils/regex_translation_utils.cpp +++ b/components/core/src/clp/regex_utils/regex_translation_utils.cpp @@ -27,17 +27,19 @@ class TranslatorState { * * This list may be expanded as the translator supports translating more regex patterns. *
    <ul> - * <li>NORMAL: The initial state, where characters have no special meanings and are treated + * <li>Normal: The initial state, where characters have no special meanings and are treated * literally.</li> - * <li>DOT: Encountered a period `.`. Expecting wildcard expression.</li> - * <li>END: Encountered a dollar sign `$`, meaning the regex string has reached the end + * <li>Dot: Encountered a period `.`. Expecting wildcard expression.</li> + * <li>Escaped: Encountered a backslash `\`. Expecting an escape sequence.</li> + * <li>End: Encountered a dollar sign `$`, meaning the regex string has reached the end * anchor.</li> * </ul>
*/ enum class RegexPatternState : uint8_t { - NORMAL = 0, - DOT, - END, + Normal = 0, + Dot, + Escaped, + End, }; // Constructor @@ -51,7 +53,7 @@ class TranslatorState { private: // Members - RegexPatternState m_state{RegexPatternState::NORMAL}; + RegexPatternState m_state{RegexPatternState::Normal}; }; /** @@ -92,13 +94,22 @@ using StateTransitionFuncSig */ [[nodiscard]] StateTransitionFuncSig dot_state_transition; +/** + * Appends an escaped regex metacharacter as a literal character to the wildcard string by + * discarding its preceding backslash. + * + * The preceding backslash must be kept for characters that also have special meanings in the + * wildcard syntax, e.g. `abc.\*xyz` should be translated into `abc?\*xyz` instead of `abc?*xyz`. + */ +[[nodiscard]] StateTransitionFuncSig escaped_state_transition; + /** * Disallows the appearances of other characters after encountering an end anchor in the string. */ [[nodiscard]] StateTransitionFuncSig end_state_transition; /** - * States other than the NORMAL state may require special handling after the whole regex string has + * States other than the Normal state may require special handling after the whole regex string has * been scanned and processed. 
*/ [[nodiscard]] StateTransitionFuncSig final_state_cleanup; @@ -112,10 +123,13 @@ auto normal_state_transition( auto const ch{*it}; switch (ch) { case '.': - state.set_next_state(TranslatorState::RegexPatternState::DOT); + state.set_next_state(TranslatorState::RegexPatternState::Dot); + break; + case cEscapeChar: + state.set_next_state(TranslatorState::RegexPatternState::Escaped); break; case cRegexEndAnchor: - state.set_next_state(TranslatorState::RegexPatternState::END); + state.set_next_state(TranslatorState::RegexPatternState::End); break; case cRegexZeroOrMore: return ErrorCode::UntranslatableStar; @@ -155,7 +169,25 @@ auto dot_state_transition( --it; break; } - state.set_next_state(TranslatorState::RegexPatternState::NORMAL); + state.set_next_state(TranslatorState::RegexPatternState::Normal); + return ErrorCode::Success; +} + +auto escaped_state_transition( + TranslatorState& state, + string_view::const_iterator& it, + string& wildcard_str, + [[maybe_unused]] RegexToWildcardTranslatorConfig const& config +) -> error_code { + auto const ch{*it}; + if (false == cRegexEscapeSeqMetaCharsLut.at(ch)) { + return ErrorCode::IllegalEscapeSequence; + } + if (cWildcardMetaCharsLut.at(ch)) { + wildcard_str += cEscapeChar; + } + wildcard_str += ch; + state.set_next_state(TranslatorState::RegexPatternState::Normal); return ErrorCode::Success; } @@ -178,7 +210,7 @@ auto final_state_cleanup( RegexToWildcardTranslatorConfig const& config ) -> error_code { switch (state.get_state()) { - case TranslatorState::RegexPatternState::DOT: + case TranslatorState::RegexPatternState::Dot: // The last character is a single `.`, without the possibility of becoming a // multichar wildcard wildcard_str += cSingleCharWildcard; @@ -187,7 +219,7 @@ auto final_state_cleanup( break; } - if (TranslatorState::RegexPatternState::END != state.get_state() + if (TranslatorState::RegexPatternState::End != state.get_state() && config.add_prefix_suffix_wildcards()) { wildcard_str += 
cZeroOrMoreCharsWildcard; @@ -220,13 +252,16 @@ auto regex_to_wildcard(string_view regex_str, RegexToWildcardTranslatorConfig co error_code ec{}; while (it != regex_str.cend()) { switch (state.get_state()) { - case TranslatorState::RegexPatternState::NORMAL: + case TranslatorState::RegexPatternState::Normal: ec = normal_state_transition(state, it, wildcard_str, config); break; - case TranslatorState::RegexPatternState::DOT: + case TranslatorState::RegexPatternState::Dot: ec = dot_state_transition(state, it, wildcard_str, config); break; - case TranslatorState::RegexPatternState::END: + case TranslatorState::RegexPatternState::Escaped: + ec = escaped_state_transition(state, it, wildcard_str, config); + break; + case TranslatorState::RegexPatternState::End: ec = end_state_transition(state, it, wildcard_str, config); break; default: diff --git a/components/core/tests/test-regex_utils.cpp b/components/core/tests/test-regex_utils.cpp index fc79b966a..9defd7d08 100644 --- a/components/core/tests/test-regex_utils.cpp +++ b/components/core/tests/test-regex_utils.cpp @@ -1,31 +1,47 @@ +#include #include #include #include -#include - using clp::regex_utils::ErrorCode; using clp::regex_utils::regex_to_wildcard; using clp::regex_utils::RegexToWildcardTranslatorConfig; -TEST_CASE("regex_to_wildcard", "[regex_utils][regex_to_wildcard]") { - // Test empty string +TEST_CASE("regex_to_wildcard_simple_translations", "[regex_utils][re2wc][simple_translations]") { REQUIRE(regex_to_wildcard("").value().empty()); - // Test simple wildcard translations REQUIRE((regex_to_wildcard("xyz").value() == "xyz")); REQUIRE((regex_to_wildcard(". xyz .* zyx .").value() == "? xyz * zyx ?")); REQUIRE((regex_to_wildcard(". xyz .+ zyx .*").value() == "? xyz ?* zyx *")); +} - // Test unescaped meta characters +TEST_CASE("regex_to_wildcard_unescaped_metachar", "[regex_utils][re2wc][unescaped_metachar]") { REQUIRE((regex_to_wildcard(".? 
xyz .* zyx .").error() == ErrorCode::UnsupportedQuestionMark)); REQUIRE((regex_to_wildcard(". xyz .** zyx .").error() == ErrorCode::UntranslatableStar)); REQUIRE((regex_to_wildcard(". xyz .*+ zyx .").error() == ErrorCode::UntranslatablePlus)); REQUIRE((regex_to_wildcard(". xyz |.* zyx .").error() == ErrorCode::UnsupportedPipe)); REQUIRE((regex_to_wildcard(". xyz ^.* zyx .").error() == ErrorCode::IllegalCaret)); + REQUIRE((regex_to_wildcard(". xyz $.* zyx .").error() == ErrorCode::IllegalDollarSign)); +} + +TEST_CASE("regex_to_wildcard_escaped_metachar", "[regex_utils][re2wc][escaped_metachar]") { + // Escape backslash is superfluous for the following set of characters + REQUIRE((regex_to_wildcard("<>-_/=!").value() == "<>-_/=!")); + REQUIRE((regex_to_wildcard("\\<\\>\\-\\_\\/\\=\\!").value() == "<>-_/=!")); + // Test the full escape sequences set + REQUIRE( + (regex_to_wildcard("\\*\\+\\?\\|\\^\\$\\.\\{\\}\\[\\]\\(\\)\\<\\>\\-\\_\\/\\=\\!\\\\") + .value() + == "\\*+\\?|^$.{}[]()<>-_/=!\\\\") + ); + // Test unsupported escape sequences + REQUIRE( + (regex_to_wildcard("abc\\Qdefghi\\Ejkl").error() + == clp::regex_utils::ErrorCode::IllegalEscapeSequence) + ); } -TEST_CASE("regex_to_wildcard_anchor_config", "[regex_utils][regex_to_wildcard][anchor_config]") { +TEST_CASE("regex_to_wildcard_anchor_config", "[regex_utils][re2wc][anchor_config]") { // Test anchors and prefix/suffix wildcards RegexToWildcardTranslatorConfig const config{false, true}; REQUIRE(((regex_to_wildcard("^", config).value() == "*"))); diff --git a/docs/src/dev-guide/components-core/regex-utils.md b/docs/src/dev-guide/components-core/regex-utils.md index a7ec16774..f7af037df 100644 --- a/docs/src/dev-guide/components-core/regex-utils.md +++ b/docs/src/dev-guide/components-core/regex-utils.md @@ -62,6 +62,21 @@ For a detailed description on the options order and usage, see the * Turn `.*` into `*` * Turn `.+` into `?*` * E.g. 
`abc.*def.ghi.+` will get translated to `abc*def?ghi?*` +* Metacharacter escape sequences + * An escaped regex metacharacter is treated as a literal and appended to the wildcard output. + * The list of characters that require escaping to have their special meanings suppressed is + `[\/^$.|?*+(){}`. + * Superfluous escape characters are ignored for the following characters: `]<>-_=!`. + * E.g. `a\[\+b\-\_c-_d` will get translated to `a[+b-_c-_d` + * Note: generally, any non-alphanumeric character can be escaped to use it as a literal. The + list this utils library supports is non-exhaustive and can be expanded when necessary. + * For metacharacters shared by both syntaxes, keep the escape backslashes. + * The list of characters that fall into this category is `*?\`. All wildcard metacharacters are + also regex metacharacters. + * E.g. `a\*b\?c\\d` will get translated to `a\*b\?c\\d` (no change) + * Escape sequences with alphanumeric characters are disallowed. + * E.g. Special utility escape sequences `\Q`, `\E`, `\A` etc. and back references `\1` `\2` etc. + cannot be translated. ### Custom configuration