From 9f6b02f607ba2a01f71279532442881a7a8d4629 Mon Sep 17 00:00:00 2001
From: Bingran Hu
Date: Fri, 19 Jul 2024 15:27:06 -0400
Subject: [PATCH] Add support for translating escaped regex metacharacters.

---
 .../core/src/clp/regex_utils/ErrorCode.cpp    |  4 +++
 .../core/src/clp/regex_utils/ErrorCode.hpp    |  1 +
 .../core/src/clp/regex_utils/constants.hpp    | 29 ++++++++++++++++
 .../regex_utils/regex_translation_utils.cpp   | 34 +++++++++++++++++++
 components/core/tests/test-regex_utils.cpp    | 14 ++++++++
 5 files changed, 82 insertions(+)

diff --git a/components/core/src/clp/regex_utils/ErrorCode.cpp b/components/core/src/clp/regex_utils/ErrorCode.cpp
index 5779263e8..112ede242 100644
--- a/components/core/src/clp/regex_utils/ErrorCode.cpp
+++ b/components/core/src/clp/regex_utils/ErrorCode.cpp
@@ -65,6 +65,10 @@ auto ErrorCodeCategory::message(int ev) const -> string {
         case ErrorCode::IllegalDollarSign:
             return "Failed to translate due to end anchor `$` in the middle of the string.";
 
+        case ErrorCode::IllegalEscapeSequence:
+            return "Currently only supports escape sequences that are used to suppress special "
+                   "meanings of regex metacharacters. Alphanumeric characters are disallowed.";
+
         case ErrorCode::UnmatchedParenthesis:
             return "Unmatched opening `(` or closing `)`.";
 
diff --git a/components/core/src/clp/regex_utils/ErrorCode.hpp b/components/core/src/clp/regex_utils/ErrorCode.hpp
index 1babb2fec..77a52cf58 100644
--- a/components/core/src/clp/regex_utils/ErrorCode.hpp
+++ b/components/core/src/clp/regex_utils/ErrorCode.hpp
@@ -19,6 +19,7 @@ enum class ErrorCode : uint8_t {
     UnsupportedPipe,
     IllegalCaret,
     IllegalDollarSign,
+    IllegalEscapeSequence,
     UnmatchedParenthesis,
 };
 
diff --git a/components/core/src/clp/regex_utils/constants.hpp b/components/core/src/clp/regex_utils/constants.hpp
index 879e7641d..aff77126a 100644
--- a/components/core/src/clp/regex_utils/constants.hpp
+++ b/components/core/src/clp/regex_utils/constants.hpp
@@ -1,7 +1,29 @@
 #ifndef CLP_REGEX_UTILS_CONSTANTS_HPP
 #define CLP_REGEX_UTILS_CONSTANTS_HPP
 
+#include <array>
+#include <cstddef>
+#include <string_view>
+
 namespace clp::regex_utils {
+constexpr size_t cCharBitarraySize = 128;
+
+/**
+ * Create an ASCII character lookup table at compile time.
+ *
+ * @param char_str A string that contains the characters to look up.
+ * @return The lookup table as bit array.
+ */
+[[nodiscard]] constexpr auto create_char_bit_array(std::string_view char_str
+) -> std::array<bool, cCharBitarraySize> {
+    std::array<bool, cCharBitarraySize> bit_array{};
+    bit_array.fill(false);
+    for (char const ch : char_str) {
+        bit_array.at(ch) = true;
+    }
+    return bit_array;
+}
+
 // Wildcard meta characters
 constexpr char cZeroOrMoreCharsWildcard{'*'};
 constexpr char cSingleCharWildcard{'?'};
@@ -14,6 +36,13 @@ constexpr char cRegexStartAnchor{'^'};
 constexpr char cRegexEndAnchor{'$'};
 constexpr char cEscapeChar{'\\'};
 constexpr char cCharsetNegate{'^'};
+
+// Character bitmaps
+// This is a more complete set of meta characters than necessary, as the user might not be fully
+// knowledgeable on which meta characters to escape, and may introduce unnecessary escape sequences.
+constexpr auto cRegexEscapeSeqMetaChars = create_char_bit_array("*+?|^$.{}[]()<>-_/=!\\");
+// This is the set of meta characters that need to be escaped in the wildcard syntax.
+constexpr auto cWildcardMetaChars = create_char_bit_array("?*\\");
 }  // namespace clp::regex_utils
 
 #endif  // CLP_REGEX_UTILS_CONSTANTS_HPP
diff --git a/components/core/src/clp/regex_utils/regex_translation_utils.cpp b/components/core/src/clp/regex_utils/regex_translation_utils.cpp
index 349c106f4..acccc2bfb 100644
--- a/components/core/src/clp/regex_utils/regex_translation_utils.cpp
+++ b/components/core/src/clp/regex_utils/regex_translation_utils.cpp
@@ -37,6 +37,7 @@ class TranslatorState {
     enum class RegexPatternState : uint8_t {
         NORMAL = 0,
         DOT,
+        ESCAPED,
         END,
     };
 
@@ -92,6 +93,14 @@ using StateTransitionFuncSig
  */
 [[nodiscard]] StateTransitionFuncSig dot_state_transition;
 
+/**
+ * Appends regex metacharacters literally to the wildcard string.
+ *
+ * These metacharacters are escaped by backslashes, so they have their special meanings suppressed.
+ * For metacharacters shared by the regex and the wildcard syntax, keep the escape backslashes.
+ */
+[[nodiscard]] StateTransitionFuncSig escaped_state_transition;
+
 /**
  * Disallows the appearances of other characters after encountering an end anchor in the string.
  */
@@ -114,6 +123,9 @@ auto normal_state_transition(
         case '.':
             state.set_next_state(TranslatorState::RegexPatternState::DOT);
             break;
+        case cEscapeChar:
+            state.set_next_state(TranslatorState::RegexPatternState::ESCAPED);
+            break;
         case cRegexEndAnchor:
             state.set_next_state(TranslatorState::RegexPatternState::END);
             break;
@@ -159,6 +171,25 @@ auto dot_state_transition(
     return ErrorCode::Success;
 }
 
+auto escaped_state_transition(
+        TranslatorState& state,
+        string_view::const_iterator& it,
+        string& wildcard_str,
+        [[maybe_unused]] RegexToWildcardTranslatorConfig const& config
+) -> error_code {
+    auto const ch{*it};
+    // Non-ASCII bytes lie outside the lookup tables; reject them rather than let `at()` throw.
+    if (static_cast<unsigned char>(ch) >= cCharBitarraySize || !cRegexEscapeSeqMetaChars.at(ch)) {
+        return ErrorCode::IllegalEscapeSequence;
+    }
+    if (cWildcardMetaChars.at(ch)) {
+        wildcard_str += cEscapeChar;
+    }
+    wildcard_str += ch;
+    state.set_next_state(TranslatorState::RegexPatternState::NORMAL);
+    return ErrorCode::Success;
+}
+
 auto end_state_transition(
         [[maybe_unused]] TranslatorState& state,
         string_view::const_iterator& it,
@@ -226,6 +257,9 @@ auto regex_to_wildcard(string_view regex_str, RegexToWildcardTranslatorConfig co
             case TranslatorState::RegexPatternState::DOT:
                 ec = dot_state_transition(state, it, wildcard_str, config);
                 break;
+            case TranslatorState::RegexPatternState::ESCAPED:
+                ec = escaped_state_transition(state, it, wildcard_str, config);
+                break;
             case TranslatorState::RegexPatternState::END:
                 ec = end_state_transition(state, it, wildcard_str, config);
                 break;
diff --git a/components/core/tests/test-regex_utils.cpp b/components/core/tests/test-regex_utils.cpp
index fc79b966a..633eb5c64 100644
--- a/components/core/tests/test-regex_utils.cpp
+++ b/components/core/tests/test-regex_utils.cpp
@@ -23,6 +23,20 @@ TEST_CASE("regex_to_wildcard", "[regex_utils][regex_to_wildcard]") {
     REQUIRE((regex_to_wildcard(". xyz .*+ zyx .").error() == ErrorCode::UntranslatablePlus));
     REQUIRE((regex_to_wildcard(". xyz |.* zyx .").error() == ErrorCode::UnsupportedPipe));
     REQUIRE((regex_to_wildcard(". xyz ^.* zyx .").error() == ErrorCode::IllegalCaret));
+
+    // Test escaped meta characters
+    REQUIRE((regex_to_wildcard("<>-_/=!").value() == "<>-_/=!"));
+    REQUIRE((regex_to_wildcard("\\<\\>\\-\\_\\/\\=\\!").value() == "<>-_/=!"));
+    REQUIRE(
+            (regex_to_wildcard("\\*\\+\\?\\|\\^\\$\\.\\{\\}\\[\\]\\(\\)\\<\\>\\-\\_\\/\\=\\!\\\\")
+                     .value()
+             == "\\*+\\?|^$.{}[]()<>-_/=!\\\\")
+    );
+    REQUIRE(
+            (regex_to_wildcard("abc\\Qdefghi\\Ejkl").error()
+             == clp::regex_utils::ErrorCode::IllegalEscapeSequence)
+    );
+    REQUIRE((regex_to_wildcard("\\\xC3\xA9").error() == ErrorCode::IllegalEscapeSequence));
 }
 
 TEST_CASE("regex_to_wildcard_anchor_config", "[regex_utils][regex_to_wildcard][anchor_config]") {