From 58657e0d1490249f7bbe81c005da568cafcb3316 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Sat, 13 Jul 2024 22:25:55 -0400 Subject: [PATCH] Add regex utils including regex to wildcard translation --- components/core/CMakeLists.txt | 3 + .../core/src/clp/regex_utils/CMakeLists.txt | 22 + .../core/src/clp/regex_utils/ErrorCode.cpp | 93 +++ .../core/src/clp/regex_utils/ErrorCode.hpp | 46 ++ .../RegexToWildcardTranslatorConfig.hpp | 42 ++ .../core/src/clp/regex_utils/constants.hpp | 48 ++ .../core/src/clp/regex_utils/regex_utils.hpp | 49 ++ .../clp/regex_utils/regex_utils_anchors.cpp | 64 ++ .../regex_utils_regex_to_wildcard.cpp | 614 ++++++++++++++++++ components/core/tests/test-regex_utils.cpp | 297 +++++++++ 10 files changed, 1278 insertions(+) create mode 100644 components/core/src/clp/regex_utils/CMakeLists.txt create mode 100644 components/core/src/clp/regex_utils/ErrorCode.cpp create mode 100644 components/core/src/clp/regex_utils/ErrorCode.hpp create mode 100644 components/core/src/clp/regex_utils/RegexToWildcardTranslatorConfig.hpp create mode 100644 components/core/src/clp/regex_utils/constants.hpp create mode 100644 components/core/src/clp/regex_utils/regex_utils.hpp create mode 100644 components/core/src/clp/regex_utils/regex_utils_anchors.cpp create mode 100644 components/core/src/clp/regex_utils/regex_utils_regex_to_wildcard.cpp create mode 100644 components/core/tests/test-regex_utils.cpp diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 7cba49acb..e3d73843a 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -210,6 +210,7 @@ include(cmake/Modules/FindLibraryDependencies.cmake) FindDynamicLibraryDependencies(sqlite "${sqlite_DYNAMIC_LIBS}") add_subdirectory(src/clp/string_utils) +add_subdirectory(src/clp/regex_utils) add_subdirectory(src/clp/clg) add_subdirectory(src/clp/clo) @@ -475,6 +476,7 @@ set(SOURCE_FILES_unitTest tests/test-Stopwatch.cpp tests/test-StreamingCompression.cpp 
tests/test-string_utils.cpp + tests/test-regex_utils.cpp tests/test-TimestampPattern.cpp tests/test-utf8_utils.cpp tests/test-Utils.cpp @@ -498,6 +500,7 @@ target_link_libraries(unitTest ${sqlite_LIBRARY_DEPENDENCIES} ${STD_FS_LIBS} clp::string_utils + clp::regex_utils yaml-cpp::yaml-cpp ZStd::ZStd ) diff --git a/components/core/src/clp/regex_utils/CMakeLists.txt b/components/core/src/clp/regex_utils/CMakeLists.txt new file mode 100644 index 000000000..39a290a9c --- /dev/null +++ b/components/core/src/clp/regex_utils/CMakeLists.txt @@ -0,0 +1,22 @@ +set( + REGEX_UTILS_HEADER_LIST + "ErrorCode.hpp" + "RegexToWildcardTranslatorConfig.hpp" + "constants.hpp" + "regex_utils.hpp" +) +add_library( + regex_utils + regex_utils_regex_to_wildcard.cpp + regex_utils_anchors.cpp + ErrorCode.cpp + ${REGEX_UTILS_HEADER_LIST} +) +add_library(clp::regex_utils ALIAS regex_utils) +target_include_directories(regex_utils + PUBLIC + ../ + PRIVATE + "${PROJECT_SOURCE_DIR}/submodules" +) +target_compile_features(regex_utils PRIVATE cxx_std_20) diff --git a/components/core/src/clp/regex_utils/ErrorCode.cpp b/components/core/src/clp/regex_utils/ErrorCode.cpp new file mode 100644 index 000000000..acc59abeb --- /dev/null +++ b/components/core/src/clp/regex_utils/ErrorCode.cpp @@ -0,0 +1,93 @@ +#include "regex_utils/ErrorCode.hpp" + +#include +#include +#include + +using std::error_category; +using std::error_code; +using std::string; +using std::string_view; + +namespace clp::regex_utils { + +/** + * Class for giving the error codes more detailed string descriptions. + * This class does not need to be seen outside the std error code wrapper implementation. + */ +class ErrorCodeCategory : public error_category { +public: + /** + * @return The class of errors. + */ + [[nodiscard]] char const* name() const noexcept override; + + /** + * @param The error code encoded in int. + * @return The descriptive message for the error. 
+ */ + [[nodiscard]] string message(int ev) const override; +}; + +auto ErrorCodeCategory::name() const noexcept -> char const* { + return "regex utility"; +} + +auto ErrorCodeCategory::message(int ev) const -> string { + switch (static_cast(ev)) { + case ErrorCode::Success: + return "Success."; + + case ErrorCode::IllegalState: + return "Unrecognized state."; + + case ErrorCode::Star: + return "Failed to translate due to metachar `*` (zero or more occurences)."; + + case ErrorCode::Plus: + return "Failed to translate due to metachar `+` (one or more occurences)."; + + case ErrorCode::Question: + return "Currently does not support returning a list of wildcard translations. The " + "metachar `?` (lazy match) may be supported in the future."; + + case ErrorCode::Pipe: + return "Currently does not support returning a list of wildcard translations. The " + "regex OR condition feature may be supported in the future."; + + case ErrorCode::Caret: + return "Failed to translate due to start anchor `^` in the middle of the string."; + + case ErrorCode::Dollar: + return "Failed to translate due to end anchor `$` in the middle of the string."; + + case ErrorCode::DisallowedEscapeSequence: + return "Disallowed escape sequence."; + + case ErrorCode::UnmatchedParenthesis: + return "Unmatched opening `(` or closing `)`."; + + case ErrorCode::UnsupportedCharsets: + return "Currently only supports case-insensitive single-char charset (i.e. 
[aA] [bB])."; + + case ErrorCode::IncompleteCharsetStructure: + return "Unmatched closing `]` at the end of the string."; + + case ErrorCode::UnsupportedQuantifier: + return "Currently only supports exact positive number of repetitions in regex syntax."; + + case ErrorCode::TokenUnquantifiable: + return "The preceding token is not quantifiable."; + + default: + return "(unrecognized error)"; + } +} + +ErrorCodeCategory const cTheErrorCodeCategory{}; + +auto make_error_code(ErrorCode e) -> error_code { + return {static_cast(e), cTheErrorCodeCategory}; +} + +} // namespace clp::regex_utils diff --git a/components/core/src/clp/regex_utils/ErrorCode.hpp b/components/core/src/clp/regex_utils/ErrorCode.hpp new file mode 100644 index 000000000..4fa9204fc --- /dev/null +++ b/components/core/src/clp/regex_utils/ErrorCode.hpp @@ -0,0 +1,46 @@ +#ifndef CLP_REGEX_UTILS_ERRORCODE_HPP +#define CLP_REGEX_UTILS_ERRORCODE_HPP + +#include +#include +#include + +namespace clp::regex_utils { + +/** + * Enum class for propagating and handling various regex utility errors. + * More detailed descriptions can be found in ErrorCode.cpp. + */ +enum class ErrorCode : uint8_t { + Success = 0, + IllegalState, + Star, + Plus, + Question, + Pipe, + Caret, + Dollar, + DisallowedEscapeSequence, + UnmatchedParenthesis, + UnsupportedCharsets, + IncompleteCharsetStructure, + UnsupportedQuantifier, + TokenUnquantifiable, +}; + +/** + * Wrapper function to turn a regular enum class into an std::error_code. + * + * @param An error code enum. + * @return The corresponding std::error_code type variable. 
+ */ +[[nodiscard]] auto make_error_code(ErrorCode ec) -> std::error_code; + +} // namespace clp::regex_utils + +namespace std { +template <> +struct is_error_code_enum : true_type {}; +} // namespace std + +#endif // CLP_REGEX_UTILS_ERRORCODE_HPP diff --git a/components/core/src/clp/regex_utils/RegexToWildcardTranslatorConfig.hpp b/components/core/src/clp/regex_utils/RegexToWildcardTranslatorConfig.hpp new file mode 100644 index 000000000..379b327e5 --- /dev/null +++ b/components/core/src/clp/regex_utils/RegexToWildcardTranslatorConfig.hpp @@ -0,0 +1,42 @@ +#ifndef CLP_REGEX_UTILS_REGEXTOWILDCARDTRANSLATORCONFIG_HPP +#define CLP_REGEX_UTILS_REGEXTOWILDCARDTRANSLATORCONFIG_HPP + +namespace clp::regex_utils { + +class RegexToWildcardTranslatorConfig { +public: + // Constructors + RegexToWildcardTranslatorConfig() = default; + + // Getters + [[nodiscard]] auto case_insensitive_wildcard() const -> bool { + return m_case_insensitive_wildcard; + } + + [[nodiscard]] auto allow_anchors() const -> bool { return m_allow_anchors; } + + [[nodiscard]] auto add_prefix_suffix_wildcards() const -> bool { + return m_add_prefix_suffix_wildcards; + } + + // Setters + void set_case_insensitive_wildcard(bool case_insensitive_wildcard) { + m_case_insensitive_wildcard = case_insensitive_wildcard; + } + + void set_allow_anchors(bool allow_anchors) { m_allow_anchors = allow_anchors; } + + void set_add_prefix_suffix_wildcards(bool add_prefix_suffix_wildcards) { + m_add_prefix_suffix_wildcards = add_prefix_suffix_wildcards; + } + +private: + // Variables + bool m_case_insensitive_wildcard = false; + bool m_allow_anchors = true; + bool m_add_prefix_suffix_wildcards = false; +}; + +} // namespace clp::regex_utils + +#endif // CLP_REGEX_UTILS_REGEXTOWILDCARDTRANSLATORCONFIG_HPP diff --git a/components/core/src/clp/regex_utils/constants.hpp b/components/core/src/clp/regex_utils/constants.hpp new file mode 100644 index 000000000..e05ccfe83 --- /dev/null +++ 
b/components/core/src/clp/regex_utils/constants.hpp @@ -0,0 +1,48 @@ +#ifndef CLP_REGEX_UTILS_CONSTANTS_HPP +#define CLP_REGEX_UTILS_CONSTANTS_HPP + +#include +#include +#include + +namespace clp::regex_utils { + +constexpr size_t cCharBitarraySize = 128; + +/** + * Create an ASCII character lookup table (bit array) at compile time. + * + * @param char_str A string that contains the characters to look up. + * @return The lookup table as bit array + */ +[[nodiscard]] constexpr auto create_char_bit_array(std::string_view char_str +) -> std::array { + std::array bit_array{}; + bit_array.fill(false); + for (char const ch : char_str) { + bit_array.at(ch) = true; + } + return bit_array; +} + +constexpr char cZeroOrMoreCharsWildcard{'*'}; +constexpr char cSingleCharWildcard{'?'}; +constexpr char cRegexZeroOrMore{'*'}; +constexpr char cRegexOneOrMore{'+'}; +constexpr char cRegexZeroOrOne{'?'}; +constexpr char cRegexStartAnchor{'^'}; +constexpr char cRegexEndAnchor{'$'}; +constexpr char cEscapeChar{'\\'}; +constexpr char cCharsetNegate{'^'}; + +// This is a more complete set of meta characters than necessary, as the user might not be fully +// knowledgeable on which meta characters to escape, and may introduce unnecessary escape sequences. +constexpr auto cRegexEscapeSeqAcceptedMetaChars = create_char_bit_array("^$.*{}[]()+|?<>-_/=!\\"); +// This is the set of meta characters that need escaping in the wildcard syntax. +constexpr auto cRegexEscapeSeqWildcardOnlyMetaChars = create_char_bit_array("?*\\"); +// This is the set of meta characters that need escaping in the character set. 
+constexpr auto cRegexCharsetEscapeSeqMetaChars = create_char_bit_array("^-]\\"); + +} // namespace clp::regex_utils + +#endif // CLP_REGEX_UTILS_CONSTANTS_HPP diff --git a/components/core/src/clp/regex_utils/regex_utils.hpp b/components/core/src/clp/regex_utils/regex_utils.hpp new file mode 100644 index 000000000..2d1bf43f0 --- /dev/null +++ b/components/core/src/clp/regex_utils/regex_utils.hpp @@ -0,0 +1,49 @@ +#ifndef CLP_REGEX_UTILS_REGEX_UTILS_HPP +#define CLP_REGEX_UTILS_REGEX_UTILS_HPP + +#include +#include + +#include +#include + +#include "regex_utils/RegexToWildcardTranslatorConfig.hpp" + +namespace clp::regex_utils { + +[[nodiscard]] auto regex_to_wildcard(std::string_view regex_str +) -> BOOST_OUTCOME_V2_NAMESPACE::std_result; + +[[nodiscard]] auto regex_to_wildcard( + std::string_view regex_str, + RegexToWildcardTranslatorConfig const& config +) -> BOOST_OUTCOME_V2_NAMESPACE::std_result; + +/** + * If a regex expression contains multiple starting or ending anchors, remove the duplicates. + * + * @param regex_str + * @return The trimmed regex string, leaving at most one starting or ending anchor. + */ +[[nodiscard]] auto regex_trim_line_anchors(std::string_view regex_str) -> std::string; + +/** + * Check if a regex string has a starting anchor character `^` (caret). + * + * @param regex_str + * @return True if the regex string begins with `^`, false otherwise. + */ +[[nodiscard]] auto regex_has_start_anchor(std::string_view regex_str) -> bool; + +/** + * Check if a regex string has an ending anchor character `$` (dollar sign). + * Note that the regex string may end with an escaped `$`, in which case the `$` character retains + * its literal meaning. + * + * @param regex_str + * @return True if the regex string ends with an unescaped `$`, false otherwise. 
+ */ +[[nodiscard]] auto regex_has_end_anchor(std::string_view regex_str) -> bool; +} // namespace clp::regex_utils + +#endif // CLP_REGEX_UTILS_REGEX_UTILS_HPP diff --git a/components/core/src/clp/regex_utils/regex_utils_anchors.cpp b/components/core/src/clp/regex_utils/regex_utils_anchors.cpp new file mode 100644 index 000000000..a204a3cfc --- /dev/null +++ b/components/core/src/clp/regex_utils/regex_utils_anchors.cpp @@ -0,0 +1,64 @@ +#include +#include + +#include "regex_utils/constants.hpp" +#include "regex_utils/regex_utils.hpp" + +using std::string; +using std::string_view; + +namespace clp::regex_utils { + +auto regex_trim_line_anchors(string_view regex_str) -> string { + string_view::const_iterator left(regex_str.begin()); + string_view::const_iterator right(regex_str.end()); + + // Find the position of the first non-caret character + while (left != right && cRegexStartAnchor == *left) { + ++left; + } + // Backtrack one char to include at least one start anchor, if there was any. + if (left != regex_str.begin()) { + --left; + } + + // Find the position of the last non-dollar-sign character + while (left != right && cRegexEndAnchor == *(right - 1)) { + --right; + } + if (left != right && right != regex_str.end()) { + // There was at least one end anchor so we include it by advancing one char + ++right; + } + + // If there was more than one end anchor, we need to check if the current end anchor is escaped. + // If so, it's not a real end anchor, and we need to advance the end position once more to + // append a real end anchor. 
+ string trimmed_regex_str(left, right); + if (right != regex_str.end() && !regex_has_end_anchor(trimmed_regex_str)) { + trimmed_regex_str += cRegexEndAnchor; + } + return trimmed_regex_str; +} + +auto regex_has_start_anchor(string_view regex_str) -> bool { + return !regex_str.empty() && cRegexStartAnchor == regex_str.at(0); +} + +auto regex_has_end_anchor(string_view regex_str) -> bool { + auto it{regex_str.rbegin()}; + if (it == regex_str.rend() || cRegexEndAnchor != *it) { + return false; + } + + // Check that ending regex dollar sign char is unescaped. + // We need to scan the suffix until we encounter a character that is not an + // escape char, since escape chars can escape themselves. + bool escaped{false}; + for (++it; it != regex_str.rend() && cEscapeChar == *it; ++it) { + escaped = !escaped; + } + return !escaped; +} + +} // namespace clp::regex_utils diff --git a/components/core/src/clp/regex_utils/regex_utils_regex_to_wildcard.cpp b/components/core/src/clp/regex_utils/regex_utils_regex_to_wildcard.cpp new file mode 100644 index 000000000..5435c9ab6 --- /dev/null +++ b/components/core/src/clp/regex_utils/regex_utils_regex_to_wildcard.cpp @@ -0,0 +1,614 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "regex_utils/constants.hpp" +#include "regex_utils/ErrorCode.hpp" +#include "regex_utils/regex_utils.hpp" +#include "regex_utils/RegexToWildcardTranslatorConfig.hpp" + +using clp::string_utils::is_alphabet; +using clp::string_utils::is_decimal_digit; +using std::error_code; +using std::get; +using std::make_pair; +using std::monostate; +using std::pair; +using std::string; +using std::string_view; +using std::variant; + +namespace clp::regex_utils { + +/** + * Class for storing regex translation config, states, capture group and quantifier information. 
+ */ +class TranslatorState { +public: + enum class RegexPatternState : uint8_t { + // The initial state, where characters have no special meanings and are treated literally. + NORMAL = 0, + // Encountered a period `.`. Expecting wildcard expression. + DOT, + // Encountered a backslash `\`, used to suppress special meanings of regex meta characters. + ESCAPED, + // Enclosed by parenthesis `()`, used to specify a capture group. + GROUP, + // Encountered a backslash `\` in the capture group. + GROUPESCAPED, + // Enclosed by square brackets `[]`, used to specify a character set. + CHARSET, + // Encountered a backslash `\` in the character set.. + CHARSETESCAPED, + // Enclosed by curly brackets `{}`, used to specify a quantity to repeat. + QUANTIFIER, + // Encountered a dollar sign `$`, meaning the regex string has reached the end anchor. + END, + }; + + // Constructor + TranslatorState(RegexToWildcardTranslatorConfig const& config, string_view regex_str) + : m_config(config), + m_it(regex_str.begin()) {} + + // Getters + [[nodiscard]] auto get_config() const -> RegexToWildcardTranslatorConfig const& { + return m_config; + } + + [[nodiscard]] auto get_state() const -> RegexPatternState const& { return m_state; } + + [[nodiscard]] auto get_marked_iterator() const -> string_view::const_iterator const& { + return m_it; + } + + [[nodiscard]] auto get_preceding_token( + ) const -> BOOST_OUTCOME_V2_NAMESPACE::std_result; + [[nodiscard]] auto get_quantifier() const -> BOOST_OUTCOME_V2_NAMESPACE::std_result; + + [[nodiscard]] auto get_quantifier_as_str() const -> string { return m_quantifier_str; } + + [[nodiscard]] auto quantifier_number_start() const -> bool { + return m_quantifier_str.empty() || ',' == m_quantifier_str.back(); + } + + // Setters + void set_next_state(RegexPatternState const& state) { m_state = state; } + + void mark_iterator(string_view::const_iterator const& it) { m_it = it; } + + void invalidate_preceding_token() { m_preceding_token = monostate{}; } + + 
void set_preceding_token(char ch) { m_preceding_token = ch; } + + void set_preceding_token(string const& s) { m_preceding_token = s; } + + void reset_quantifiers() { + m_quantifier = size_t{0}; + m_quantifier_str.clear(); + } + + void add_to_quantifier(char ch); + + void switch_to_second_quantifier() { + m_quantifier = make_pair(get(m_quantifier), 0); + m_quantifier_str += ','; + } + + void inc_nested_group_count() { ++m_nested_group_count; } + + [[nodiscard]] auto dec_nested_group_count() -> BOOST_OUTCOME_V2_NAMESPACE::std_result; + +private: + // Variables + RegexToWildcardTranslatorConfig m_config; + RegexPatternState m_state = RegexPatternState::NORMAL; + string_view::const_iterator m_it; + variant m_preceding_token; + variant> m_quantifier; + string m_quantifier_str; + size_t m_nested_group_count = 0; +}; + +auto TranslatorState::get_preceding_token( +) const -> BOOST_OUTCOME_V2_NAMESPACE::std_result { + switch (m_preceding_token.index()) { + case 0: + return ErrorCode::TokenUnquantifiable; + case 1: + return string{get(m_preceding_token)}; + case 2: + return get(m_preceding_token); + default: + return ErrorCode::IllegalState; + } +} + +auto TranslatorState::get_quantifier() const -> BOOST_OUTCOME_V2_NAMESPACE::std_result { + switch (m_quantifier.index()) { + case 0: + return get(m_quantifier); + case 1: + // Maybe we can support a ranged pair of quantifiers in the future + return ErrorCode::UnsupportedQuantifier; + default: + return ErrorCode::IllegalState; + } +} + +void TranslatorState::add_to_quantifier(char ch) { + int const num{ch - '0'}; + int const base = 10; + switch (m_quantifier.index()) { + case 0: + m_quantifier = get<0>(m_quantifier) * base + num; + break; + case 1: + get<1>(m_quantifier).second = get<1>(m_quantifier).second * base + num; + break; + default: + break; + } + m_quantifier_str += ch; +} + +auto TranslatorState::dec_nested_group_count() -> BOOST_OUTCOME_V2_NAMESPACE::std_result { + if (0 == m_nested_group_count) { + return 
ErrorCode::UnmatchedParenthesis; + } + --m_nested_group_count; + return m_nested_group_count; +} + +// State transition functions common signature +// typedef [[nodiscard]] auto +// StateTransitionFunc(TranslatorState&, string_view::const_iterator&, string&) -> error_code; + +using StateTransitionFunc + = auto(TranslatorState&, string_view::const_iterator&, string&) -> error_code; + +// State transition functions +[[nodiscard]] StateTransitionFunc normal_state_transition; +[[nodiscard]] StateTransitionFunc dot_state_transition; +[[nodiscard]] StateTransitionFunc escaped_state_transition; +[[nodiscard]] StateTransitionFunc group_state_transition; +[[nodiscard]] StateTransitionFunc group_escaped_state_transition; +[[nodiscard]] StateTransitionFunc charset_state_transition; +[[nodiscard]] StateTransitionFunc charset_escaped_state_transition; +[[nodiscard]] StateTransitionFunc quantifier_state_transition; +[[nodiscard]] StateTransitionFunc end_state_transition; +[[nodiscard]] StateTransitionFunc final_state_cleanup; + +// Helper function +void append_incomplete_quantifier_structure(TranslatorState& state, string& wildcard_str); +[[nodiscard]] auto matching_upper_lower_case_char_pair(char ch0, char ch1) -> bool; + +// Main API +auto regex_to_wildcard(string_view regex_str) -> BOOST_OUTCOME_V2_NAMESPACE::std_result { + RegexToWildcardTranslatorConfig const default_config{}; + return regex_to_wildcard(regex_str, default_config); +} + +auto regex_to_wildcard(string_view regex_str, RegexToWildcardTranslatorConfig const& config) + -> BOOST_OUTCOME_V2_NAMESPACE::std_result { + if (regex_str.empty()) { + return string(); + } + + // Initialize translation state, scan position, and return string + TranslatorState state{config, regex_str}; + string_view::const_iterator it = regex_str.cbegin(); + string wildcard_str; + + // If there is no starting anchor character, append multichar wildcard prefix + if (cRegexStartAnchor == *it) { + if (config.allow_anchors()) { + ++it; + } else { 
+ return ErrorCode::Caret; + } + } else if (config.add_prefix_suffix_wildcards()) { + wildcard_str += cZeroOrMoreCharsWildcard; + } + + error_code ec{}; + while (it != regex_str.end()) { + switch (state.get_state()) { + case TranslatorState::RegexPatternState::NORMAL: + ec = normal_state_transition(state, it, wildcard_str); + break; + case TranslatorState::RegexPatternState::DOT: + ec = dot_state_transition(state, it, wildcard_str); + break; + case TranslatorState::RegexPatternState::ESCAPED: + ec = escaped_state_transition(state, it, wildcard_str); + break; + case TranslatorState::RegexPatternState::GROUP: + ec = group_state_transition(state, it, wildcard_str); + break; + case TranslatorState::RegexPatternState::GROUPESCAPED: + ec = group_escaped_state_transition(state, it, wildcard_str); + break; + case TranslatorState::RegexPatternState::CHARSET: + ec = charset_state_transition(state, it, wildcard_str); + break; + case TranslatorState::RegexPatternState::CHARSETESCAPED: + ec = charset_escaped_state_transition(state, it, wildcard_str); + break; + case TranslatorState::RegexPatternState::QUANTIFIER: + ec = quantifier_state_transition(state, it, wildcard_str); + break; + case TranslatorState::RegexPatternState::END: + ec = end_state_transition(state, it, wildcard_str); + break; + default: + ec = ErrorCode::IllegalState; + break; + } + + if (ec) { + return ec; + } + ++it; + } + + // Do the final state check and clean up + ec = final_state_cleanup(state, it, wildcard_str); + if (ec) { + return ec; + } + + return wildcard_str; +} + +auto normal_state_transition( + TranslatorState& state, + string_view::const_iterator& it, + string& wildcard_str +) -> error_code { + char const ch = *it; + auto const& config = state.get_config(); + switch (ch) { + case '.': + state.set_next_state(TranslatorState::RegexPatternState::DOT); + break; + case cEscapeChar: + state.set_next_state(TranslatorState::RegexPatternState::ESCAPED); + break; + case '(': + 
state.inc_nested_group_count(); + state.mark_iterator(it + 1); // Mark the beginning of group expression + state.set_next_state(TranslatorState::RegexPatternState::GROUP); + break; + case '[': + state.mark_iterator(it + 1); // Mark the beginning of charset expression + state.set_next_state(TranslatorState::RegexPatternState::CHARSET); + break; + case '{': + state.reset_quantifiers(); + state.set_next_state(TranslatorState::RegexPatternState::QUANTIFIER); + break; + case cRegexEndAnchor: + if (!config.allow_anchors()) { + return ErrorCode::Dollar; + } + state.set_next_state(TranslatorState::RegexPatternState::END); + break; + case '*': + return ErrorCode::Star; + case '+': + return ErrorCode::Plus; + case '?': + return ErrorCode::Question; + case '|': + return ErrorCode::Pipe; + case cRegexStartAnchor: + return ErrorCode::Caret; + case ')': + return ErrorCode::UnmatchedParenthesis; + default: + wildcard_str += ch; + state.set_preceding_token(ch); + break; + } + return ErrorCode::Success; +} + +auto dot_state_transition( + TranslatorState& state, + string_view::const_iterator& it, + string& wildcard_str +) -> error_code { + switch (*it) { + case '*': + // .* gets translated to * + wildcard_str += cZeroOrMoreCharsWildcard; + state.invalidate_preceding_token(); + break; + case '+': + // .+ gets translated to ?* + wildcard_str = wildcard_str + cSingleCharWildcard + cZeroOrMoreCharsWildcard; + state.invalidate_preceding_token(); + break; + default: + // . gets translated to ? + wildcard_str += cSingleCharWildcard; + state.set_preceding_token(cSingleCharWildcard); + // Backtrack the scan by one position to handle the current char in the next iteration. 
+ --it; + break; + } + state.set_next_state(TranslatorState::RegexPatternState::NORMAL); + return ErrorCode::Success; +} + +auto escaped_state_transition( + TranslatorState& state, + string_view::const_iterator& it, + string& wildcard_str +) -> error_code { + char const ch = *it; + if (!cRegexEscapeSeqAcceptedMetaChars.at(ch)) { + return ErrorCode::DisallowedEscapeSequence; + } + if (cRegexEscapeSeqWildcardOnlyMetaChars.at(ch)) { + // Need to keep the backslash for characters that are special in the wildcard syntax too + string const escape_seq = string{cEscapeChar} + ch; + wildcard_str += escape_seq; + state.set_preceding_token(escape_seq); + } else { + wildcard_str += ch; + state.set_preceding_token(ch); + } + state.set_next_state(TranslatorState::RegexPatternState::NORMAL); + return ErrorCode::Success; +} + +auto group_state_transition( + TranslatorState& state, + string_view::const_iterator& it, + string& wildcard_str +) -> error_code { + char const ch = *it; + if (cEscapeChar == ch) { + state.set_next_state(TranslatorState::RegexPatternState::GROUPESCAPED); + return ErrorCode::Success; + } + // TODO: make the group unrolling iterative + if ('(' == ch) { + state.inc_nested_group_count(); + return ErrorCode::Success; + } + if (')' != ch) { + return ErrorCode::Success; + } + auto num_nested_group = state.dec_nested_group_count(); + if (num_nested_group.has_error()) { + return num_nested_group.error(); + } + if (num_nested_group.value() > 0) { + // Still within nested group + return ErrorCode::Success; + } + + // End of group: translate the captured group expression. + // capture group should not enable anchors or prefix/suffix wildcards. 
+ string const captured_group(state.get_marked_iterator(), it); + auto config{state.get_config()}; + config.set_allow_anchors(false); + config.set_add_prefix_suffix_wildcards(false); + + // Perform translation + auto translated_group = regex_to_wildcard(captured_group, config); + if (translated_group.has_error()) { + return translated_group.error(); + } + + wildcard_str += translated_group.value(); + state.set_preceding_token(translated_group.value()); + state.set_next_state(TranslatorState::RegexPatternState::NORMAL); + return ErrorCode::Success; +} + +auto group_escaped_state_transition( + TranslatorState& state, + string_view::const_iterator& /*it*/, + string& /*wildcard_str*/ +) -> error_code { + // Defer the handling of escape sequences to entire capture group translation. + state.set_next_state(TranslatorState::RegexPatternState::GROUP); + return ErrorCode::Success; +} + +auto charset_state_transition( + TranslatorState& state, + string_view::const_iterator& it, + string& wildcard_str +) -> error_code { + char const ch = *it; + string_view::const_iterator const& charset_start = state.get_marked_iterator(); + size_t const charset_len = it - charset_start; + if (cEscapeChar == ch) { + state.set_next_state(TranslatorState::RegexPatternState::CHARSETESCAPED); + return ErrorCode::Success; + } + if (charset_len > 2) { + // Short circuit: the currently accepted charset is at most 2-char long. + return ErrorCode::UnsupportedCharsets; + } + if (']' != ch) { + return ErrorCode::Success; + } + if (0 == charset_len) { + // Empty charset + return ErrorCode::UnsupportedCharsets; + } + + // End of charset: perform analysis on accepted charset patterns. 
+ char const ch0 = *charset_start; + char const ch1 = *(charset_start + 1); + auto config{state.get_config()}; + char parsed_char{}; + + if (1 == charset_len) { + if (cCharsetNegate == ch0 || cEscapeChar == ch0) { + return ErrorCode::UnsupportedCharsets; + } + parsed_char = ch0; + } else { // 2 == charset_len + if (cEscapeChar == ch0 && cRegexCharsetEscapeSeqMetaChars.at(ch1)) { + // 2-char escape sequence + parsed_char = ch1; + } else if (config.case_insensitive_wildcard() + && matching_upper_lower_case_char_pair(ch0, ch1)) + { + // case-insensitive patterns like [aA] [Bb] etc. + parsed_char = ch0 > ch1 ? ch0 : ch1; // Get the lower case char + } else { + return ErrorCode::UnsupportedCharsets; + } + } + + // Add the parsed character to the string + if (cRegexEscapeSeqWildcardOnlyMetaChars.at(parsed_char)) { + auto escaped_char = string{cEscapeChar} + parsed_char; + wildcard_str += escaped_char; + state.set_preceding_token(escaped_char); + } else { + wildcard_str += parsed_char; + state.set_preceding_token(parsed_char); + } + state.set_next_state(TranslatorState::RegexPatternState::NORMAL); + return ErrorCode::Success; +} + +auto matching_upper_lower_case_char_pair(char ch0, char ch1) -> bool { + int const upper_lower_case_ascii_offset = 'a' - 'A'; + return (is_alphabet(ch0) && is_alphabet(ch1) + && ((ch0 - ch1 == upper_lower_case_ascii_offset) + || (ch1 - ch0 == upper_lower_case_ascii_offset))); +} + +auto charset_escaped_state_transition( + TranslatorState& state, + string_view::const_iterator& /*it*/, + string& /*wildcard_str*/ +) -> error_code { + // Defer the handling of escape sequences to entire character set analysis.. 
+ state.set_next_state(TranslatorState::RegexPatternState::CHARSET); + return ErrorCode::Success; +} + +auto quantifier_state_transition( + TranslatorState& state, + string_view::const_iterator& it, + string& wildcard_str +) -> error_code { + char const ch = *it; + if ('-' == ch && state.quantifier_number_start()) { + // Disallow negative quantifiers + return ErrorCode::UnsupportedQuantifier; + } + if (',' == ch) { + // Expecting a pair of quantifiers + state.switch_to_second_quantifier(); + } else if (is_decimal_digit(ch)) { + // Is a regular decimal digit + state.add_to_quantifier(ch); + } else if ('}' != ch) { + // Invalid quantifier syntax. In such a case, the special meaning of `{` is suppressed. + // So far we've only seen opening bracket/digits/comma, so append directly. + append_incomplete_quantifier_structure(state, wildcard_str); + // Backtrack the scan by one position to handle the current char in the next iteration. + --it; + state.set_next_state(TranslatorState::RegexPatternState::NORMAL); + } else { + // Quantifier expression complete. Perform repetition + auto quantifier = state.get_quantifier(); + if (quantifier.has_error()) { + return quantifier.error(); + } + auto prev_token = state.get_preceding_token(); + if (prev_token.has_error()) { + return prev_token.error(); + } + + size_t const q_val = quantifier.value(); + string const token = prev_token.value(); + if (0 == q_val) { + // Zero repetition removes the token from the string + wildcard_str.erase(wildcard_str.length() - token.length()); + } else { + // Repeat the token for n-1 times + for (size_t i{1}; i < q_val; ++i) { + wildcard_str += token; + } + } + // Compound repetition is not allowed. 
+ state.invalidate_preceding_token(); + state.set_next_state(TranslatorState::RegexPatternState::NORMAL); + } + return ErrorCode::Success; +} + +auto end_state_transition( + TranslatorState& /*state*/, + string_view::const_iterator& it, + string& /*wildcard_str*/ +) -> error_code { + if (cRegexEndAnchor != *it) { + return ErrorCode::Dollar; + } + return ErrorCode::Success; +} + +auto final_state_cleanup( + TranslatorState& state, + string_view::const_iterator& /*it*/, + string& wildcard_str +) -> error_code { + switch (state.get_state()) { + case TranslatorState::RegexPatternState::DOT: + // The last character is a single `.`, without the possibility of becoming a + // multichar wildcard + wildcard_str += cSingleCharWildcard; + break; + case TranslatorState::RegexPatternState::ESCAPED: + return ErrorCode::DisallowedEscapeSequence; + case TranslatorState::RegexPatternState::GROUP: + case TranslatorState::RegexPatternState::GROUPESCAPED: + return ErrorCode::UnmatchedParenthesis; + case TranslatorState::RegexPatternState::CHARSET: + return ErrorCode::IncompleteCharsetStructure; + case TranslatorState::RegexPatternState::QUANTIFIER: + append_incomplete_quantifier_structure(state, wildcard_str); + break; + default: + break; + } + + auto const& config = state.get_config(); + if (TranslatorState::RegexPatternState::END != state.get_state() + && config.add_prefix_suffix_wildcards()) + { + wildcard_str += cZeroOrMoreCharsWildcard; + } + return ErrorCode::Success; +} + +void append_incomplete_quantifier_structure(TranslatorState& state, string& wildcard_str) { + // Invalid quantifier syntax. So far we've only seen digits/comma so append directly. 
+ string const invalid_quantifier_str = string{'{'} + state.get_quantifier_as_str(); + wildcard_str += invalid_quantifier_str; + state.set_preceding_token(invalid_quantifier_str.back()); +} + +} // namespace clp::regex_utils diff --git a/components/core/tests/test-regex_utils.cpp b/components/core/tests/test-regex_utils.cpp new file mode 100644 index 000000000..5e02c2fb0 --- /dev/null +++ b/components/core/tests/test-regex_utils.cpp @@ -0,0 +1,297 @@ +#include +#include +#include + +#include + +using clp::regex_utils::regex_has_end_anchor; +using clp::regex_utils::regex_has_start_anchor; +using clp::regex_utils::regex_to_wildcard; +using clp::regex_utils::regex_trim_line_anchors; + +TEST_CASE("regex_to_wildcard", "[regex_utils][regex_to_wildcard]") { + // Test empty string + REQUIRE(regex_to_wildcard("").value().empty()); + + // Test simple wildcard translations + REQUIRE((regex_to_wildcard("^xyz$").value() == "xyz")); + REQUIRE((regex_to_wildcard("xyz").value() == "xyz")); + REQUIRE((regex_to_wildcard(". xyz .* zyx .").value() == "? xyz * zyx ?")); + REQUIRE((regex_to_wildcard(". xyz .+ zyx .*").value() == "? xyz ?* zyx *")); + + // Test unescaped meta characters + REQUIRE((regex_to_wildcard(".? xyz .* zyx .").error() == clp::regex_utils::ErrorCode::Question) + ); + REQUIRE((regex_to_wildcard(". xyz .** zyx .").error() == clp::regex_utils::ErrorCode::Star)); + REQUIRE((regex_to_wildcard(". xyz .*+ zyx .").error() == clp::regex_utils::ErrorCode::Plus)); + REQUIRE((regex_to_wildcard(". xyz |.* zyx .").error() == clp::regex_utils::ErrorCode::Pipe)); + REQUIRE((regex_to_wildcard(". 
xyz ^.* zyx .").error() == clp::regex_utils::ErrorCode::Caret)); + + // Test properly escaped meta characters + REQUIRE( + (regex_to_wildcard("\\^\\$\\.\\*\\{\\}\\[\\]\\(\\)\\+\\|\\?\\<\\>\\-\\_\\/\\=\\!\\\\") + .value() + == "^$.\\*{}[]()+|\\?<>-_/=!\\\\") + ); + REQUIRE( + (regex_to_wildcard("abc\\Qdefghi\\Ejkl").error() + == clp::regex_utils::ErrorCode::DisallowedEscapeSequence) + ); + + // Test quantifiers + REQUIRE((regex_to_wildcard("abc{3}").value() == "abccc")); + REQUIRE((regex_to_wildcard("abc{4}").value() == "abcccc")); + REQUIRE((regex_to_wildcard("abc{0}").value() == "ab")); + REQUIRE((regex_to_wildcard("abc.{4}").value() == "abc????")); + REQUIRE((regex_to_wildcard("abc\\[{4}").value() == "abc[[[[")); + REQUIRE((regex_to_wildcard("abc\\^{4}").value() == "abc^^^^")); + REQUIRE((regex_to_wildcard("abc\\*{4}").value() == "abc\\*\\*\\*\\*")); + REQUIRE((regex_to_wildcard("abc\\?{4}").value() == "abc\\?\\?\\?\\?")); + REQUIRE((regex_to_wildcard("abc{123").value() == "abc{123")); + REQUIRE((regex_to_wildcard("abc{123,456").value() == "abc{123,456")); + REQUIRE((regex_to_wildcard("abc{00123\\*").value() == "abc{00123\\*")); + REQUIRE((regex_to_wildcard("abc{3,4{{{{3}").value() == "abc{3,4{{{{{")); + REQUIRE((regex_to_wildcard("abc{3,4{3,4{3,{3}").value() == "abc{3,4{3,4{3,,,")); + REQUIRE((regex_to_wildcard("abc{3,4{3,4{3,4{3}").value() == "abc{3,4{3,4{3,444")); + REQUIRE((regex_to_wildcard("abc{3,4{3,4{3,4.*").value() == "abc{3,4{3,4{3,4*")); + REQUIRE((regex_to_wildcard("abc{3,4{3,4{3,4\\[a-z]").value() == "abc{3,4{3,4{3,4[a-z]")); + REQUIRE((regex_to_wildcard("abc{3,4{3,4{3,4\\*{4}").value() == "abc{3,4{3,4{3,4\\*\\*\\*\\*")); + + REQUIRE( + (regex_to_wildcard("abc{-3}").error() + == clp::regex_utils::ErrorCode::UnsupportedQuantifier) + ); + REQUIRE( + (regex_to_wildcard("abc{3,4}").error() + == clp::regex_utils::ErrorCode::UnsupportedQuantifier) + ); + + REQUIRE(( + regex_to_wildcard("{3}abc").error() == clp::regex_utils::ErrorCode::TokenUnquantifiable + 
)); + REQUIRE( + (regex_to_wildcard("abc{3}{3}").error() + == clp::regex_utils::ErrorCode::TokenUnquantifiable) + ); + REQUIRE( + (regex_to_wildcard("abc.*{3}").error() + == clp::regex_utils::ErrorCode::TokenUnquantifiable) + ); + REQUIRE( + (regex_to_wildcard("abc.+{3}").error() + == clp::regex_utils::ErrorCode::TokenUnquantifiable) + ); + + // Test grouping and quantifiers + REQUIRE((regex_to_wildcard("(xyz)").value() == "xyz")); + REQUIRE((regex_to_wildcard("abc (xyz) def").value() == "abc xyz def")); + REQUIRE((regex_to_wildcard("abc () def").value() == "abc def")); + REQUIRE( + (regex_to_wildcard("abc (. xyz .+ zyx .*){2} def").value() + == "abc ? xyz ?* zyx *? xyz ?* zyx * def") + ); + REQUIRE( + (regex_to_wildcard("abc (.{3} xyz .+ zyx .*){2} def").value() + == "abc ??? xyz ?* zyx *??? xyz ?* zyx * def") + ); + REQUIRE((regex_to_wildcard("abc (\\)){2} def").value() == "abc )) def")); + REQUIRE((regex_to_wildcard("abc (\\)\\*){2} def").value() == "abc )\\*)\\* def")); + REQUIRE(( + regex_to_wildcard("abc (x(\\*){3}z){2} def").value() == "abc x\\*\\*\\*zx\\*\\*\\*z def" + )); + + REQUIRE( + (regex_to_wildcard("abc (. 
xyz .+ zyx .*{2} def").error() + == clp::regex_utils::ErrorCode::UnmatchedParenthesis) + ); + REQUIRE( + (regex_to_wildcard("abc (x(\\*{3}z){2} def").error() + == clp::regex_utils::ErrorCode::UnmatchedParenthesis) + ); + REQUIRE( + (regex_to_wildcard("abc (x(\\*){3}z{2} def").error() + == clp::regex_utils::ErrorCode::UnmatchedParenthesis) + ); + REQUIRE( + (regex_to_wildcard("abc x(\\*){3}z){2} def").error() + == clp::regex_utils::ErrorCode::UnmatchedParenthesis) + ); + REQUIRE( + (regex_to_wildcard("abc (x\\*){3}z){2} def").error() + == clp::regex_utils::ErrorCode::UnmatchedParenthesis) + ); + REQUIRE(( + regex_to_wildcard("abc (abc | def){2} def").error() == clp::regex_utils::ErrorCode::Pipe + )); + REQUIRE( + (regex_to_wildcard("abc (* xyz .+ zyx .*){2} def").error() + == clp::regex_utils::ErrorCode::Star) + ); + REQUIRE( + (regex_to_wildcard("abc (+ xyz .+ zyx .*){2} def").error() + == clp::regex_utils::ErrorCode::Plus) + ); + REQUIRE( + (regex_to_wildcard("abc (.{3}{3} xyz .+ zyx .*){2} def").error() + == clp::regex_utils::ErrorCode::TokenUnquantifiable) + ); + REQUIRE( + (regex_to_wildcard("abc (. 
xyz .+{3} zyx .*){2} def").error() + == clp::regex_utils::ErrorCode::TokenUnquantifiable) + ); + + // Test charset and quantifiers + REQUIRE((regex_to_wildcard("x[y]z").value() == "xyz")); + REQUIRE((regex_to_wildcard("x[y]{2}z").value() == "xyyz")); + REQUIRE((regex_to_wildcard("x[+]{2}z").value() == "x++z")); + REQUIRE((regex_to_wildcard("x[-]{2}z").value() == "x--z")); + REQUIRE((regex_to_wildcard("x[|]{2}z").value() == "x||z")); + REQUIRE((regex_to_wildcard("x[\\-]{2}z").value() == "x--z")); + REQUIRE((regex_to_wildcard("x[\\^]{2}z").value() == "x^^z")); + REQUIRE((regex_to_wildcard("x[\\]]{2}z").value() == "x]]z")); + REQUIRE((regex_to_wildcard("x[*]{2}z").value() == "x\\*\\*z")); + REQUIRE((regex_to_wildcard("x[?]{2}z").value() == "x\\?\\?z")); + REQUIRE((regex_to_wildcard("x[\\\\]{2}z").value() == "x\\\\\\\\z")); + + REQUIRE((regex_to_wildcard("abc (x[*]{2}z){2} def").value() == "abc x\\*\\*zx\\*\\*z def")); + REQUIRE((regex_to_wildcard("abc (x[\\]]{2}z){2} def").value() == "abc x]]zx]]z def")); + + REQUIRE( + (regex_to_wildcard("x[aA").error() + == clp::regex_utils::ErrorCode::IncompleteCharsetStructure) + ); + REQUIRE(( + regex_to_wildcard("x[]{2}z").error() == clp::regex_utils::ErrorCode::UnsupportedCharsets + )); + REQUIRE( + (regex_to_wildcard("x[^]{2}z").error() + == clp::regex_utils::ErrorCode::UnsupportedCharsets) + ); + REQUIRE( + (regex_to_wildcard("x[\\]{2}z").error() + == clp::regex_utils::ErrorCode::UnsupportedCharsets) + ); + + // Need to set case-insensitive wildcard config for the following to work + REQUIRE((regex_to_wildcard("[aA]").error() == clp::regex_utils::ErrorCode::UnsupportedCharsets) + ); + REQUIRE((regex_to_wildcard("[Aa]").error() == clp::regex_utils::ErrorCode::UnsupportedCharsets) + ); + REQUIRE( + (regex_to_wildcard("[Ee][Xx][Cc][Ee][Pp][Tt][Ii][Oo][Nn]").error() + == clp::regex_utils::ErrorCode::UnsupportedCharsets) + ); + REQUIRE( + (regex_to_wildcard("[eE][Xx][cC][eE][pP][Tt][iI][Oo]{2}[Nn]").error() + == 
clp::regex_utils::ErrorCode::UnsupportedCharsets) + ); +} + +TEST_CASE( + "regex_to_wildcard_case_insensitive_wildcard", + "[regex_utils][regex_to_wildcard][case_insensitive_wildcard]" +) { + clp::regex_utils::RegexToWildcardTranslatorConfig config; + config.set_case_insensitive_wildcard(true); + + REQUIRE((regex_to_wildcard("[aA]", config).value() == "a")); + REQUIRE((regex_to_wildcard("[Aa]", config).value() == "a")); + REQUIRE((regex_to_wildcard("[Aa][pP]{2}[Ll][eE]", config).value() == "apple")); + REQUIRE(( + regex_to_wildcard("[Ee][Xx][Cc][Ee][Pp][Tt][Ii][Oo][Nn]", config).value() == "exception" + )); + REQUIRE( + (regex_to_wildcard("[eE][Xx][cC][eE][pP][Tt][iI][Oo]{2}[Nn]", config).value() + == "exceptioon") + ); + + REQUIRE( + (regex_to_wildcard("[eE][Xx][cC][eE][pP][Tk][iI][Oo][Nn]", config).error() + == clp::regex_utils::ErrorCode::UnsupportedCharsets) + ); + + // The other test cases should not be affected + REQUIRE((regex_to_wildcard("x[y]z", config).value() == "xyz")); + REQUIRE((regex_to_wildcard("x[y]{2}z", config).value() == "xyyz")); + REQUIRE((regex_to_wildcard("x[+]{2}z", config).value() == "x++z")); + REQUIRE((regex_to_wildcard("x[-]{2}z", config).value() == "x--z")); + REQUIRE((regex_to_wildcard("x[|]{2}z", config).value() == "x||z")); + REQUIRE((regex_to_wildcard("x[\\-]{2}z", config).value() == "x--z")); + REQUIRE((regex_to_wildcard("x[\\^]{2}z", config).value() == "x^^z")); + REQUIRE((regex_to_wildcard("x[\\]]{2}z", config).value() == "x]]z")); + REQUIRE((regex_to_wildcard("x[*]{2}z", config).value() == "x\\*\\*z")); + REQUIRE((regex_to_wildcard("x[?]{2}z", config).value() == "x\\?\\?z")); + REQUIRE((regex_to_wildcard("x[\\\\]{2}z", config).value() == "x\\\\\\\\z")); + + REQUIRE(( + regex_to_wildcard("abc (x[*]{2}z){2} def", config).value() == "abc x\\*\\*zx\\*\\*z def" + )); + REQUIRE((regex_to_wildcard("abc (x[\\]]{2}z){2} def", config).value() == "abc x]]zx]]z def")); + + REQUIRE( + (regex_to_wildcard("x[]{2}z", config).error() + == 
clp::regex_utils::ErrorCode::UnsupportedCharsets) + ); + REQUIRE( + (regex_to_wildcard("x[^]{2}z", config).error() + == clp::regex_utils::ErrorCode::UnsupportedCharsets) + ); + REQUIRE( + (regex_to_wildcard("x[\\]{2}z", config).error() + == clp::regex_utils::ErrorCode::UnsupportedCharsets) + ); +} + +TEST_CASE("regex_to_wildcard_anchor_config", "[regex_utils][regex_to_wildcard][anchor_config]") { + // Test anchors and prefix/suffix wildcards + clp::regex_utils::RegexToWildcardTranslatorConfig config; + config.set_add_prefix_suffix_wildcards(true); + REQUIRE(((regex_to_wildcard("^", config).value() == "*"))); + REQUIRE((regex_to_wildcard("$", config).value() == "*")); + REQUIRE((regex_to_wildcard("^xyz$", config).value() == "xyz")); + REQUIRE((regex_to_wildcard("xyz", config).value() == "*xyz*")); + + // Test in groups + REQUIRE((regex_to_wildcard("xyz(. xyz .* zyx .)zyx", config).value() == "*xyz? xyz * zyx ?zyx*") + ); + REQUIRE( + (regex_to_wildcard("xyz(^. xyz .* zyx .)zyx", config).error() + == clp::regex_utils::ErrorCode::Caret) + ); + REQUIRE( + (regex_to_wildcard("xyz(. 
xyz .* zyx .$)zyx", config).error() + == clp::regex_utils::ErrorCode::Dollar) + ); +} + +TEST_CASE("regex_trim_line_anchors", "[regex_utils][regex_trim_line_anchors]") { + REQUIRE(regex_trim_line_anchors("").empty()); + REQUIRE((regex_trim_line_anchors("^^^hello$$$") == "^hello$")); + REQUIRE((regex_trim_line_anchors("^^\\^hello$$$") == "^\\^hello$")); + REQUIRE((regex_trim_line_anchors("^^^hello\\$$$") == "^hello\\$$")); + REQUIRE((regex_trim_line_anchors("^^\\^hello\\$$$") == "^\\^hello\\$$")); + REQUIRE((regex_trim_line_anchors("^^^hello\\\\\\\\\\\\\\$$$") == "^hello\\\\\\\\\\\\\\$$")); + REQUIRE((regex_trim_line_anchors("^^^\\\\goodbye\\\\\\\\$$$") == "^\\\\goodbye\\\\\\\\$")); +} + +TEST_CASE("regex_has_start_anchor", "[regex_utils][regex_has_start_anchor]") { + REQUIRE_FALSE(regex_has_start_anchor("")); + REQUIRE(regex_has_start_anchor("^hello$")); + REQUIRE_FALSE(regex_has_start_anchor("\\^hello$")); + REQUIRE(regex_has_start_anchor("^hello\\$")); + REQUIRE_FALSE(regex_has_start_anchor("\\^hello\\$")); + REQUIRE(regex_has_start_anchor("^hello\\\\\\\\\\\\\\$")); + REQUIRE(regex_has_start_anchor("^\\\\goodbye\\\\\\\\\\\\$")); +} + +TEST_CASE("regex_has_end_anchor", "[regex_utils][regex_has_end_anchor]") { + REQUIRE_FALSE(regex_has_end_anchor("")); + REQUIRE(regex_has_end_anchor("^hello$")); + REQUIRE(regex_has_end_anchor("\\^hello$")); + REQUIRE_FALSE(regex_has_end_anchor("^hello\\$")); + REQUIRE_FALSE(regex_has_end_anchor("\\^hello\\$")); + REQUIRE_FALSE(regex_has_end_anchor("^hello\\\\\\\\\\\\\\$")); + REQUIRE(regex_has_end_anchor("^\\\\goodbye\\\\\\\\\\\\$")); + REQUIRE(regex_has_end_anchor("\\\\\\\\\\\\$")); + REQUIRE_FALSE(regex_has_end_anchor("\\\\\\\\\\\\\\$")); +}