diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index e3d73843a..57ae8cda1 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -209,8 +209,8 @@ set(sqlite_DYNAMIC_LIBS "dl;m;pthread") include(cmake/Modules/FindLibraryDependencies.cmake) FindDynamicLibraryDependencies(sqlite "${sqlite_DYNAMIC_LIBS}") -add_subdirectory(src/clp/string_utils) add_subdirectory(src/clp/regex_utils) +add_subdirectory(src/clp/string_utils) add_subdirectory(src/clp/clg) add_subdirectory(src/clp/clo) @@ -305,11 +305,11 @@ set(SOURCE_FILES_unitTest src/clp/ffi/ir_stream/decoding_methods.inc src/clp/ffi/ir_stream/encoding_methods.cpp src/clp/ffi/ir_stream/encoding_methods.hpp + src/clp/ffi/ir_stream/protocol_constants.hpp src/clp/ffi/ir_stream/Serializer.cpp src/clp/ffi/ir_stream/Serializer.hpp src/clp/ffi/ir_stream/utils.cpp src/clp/ffi/ir_stream/utils.hpp - src/clp/ffi/ir_stream/protocol_constants.hpp src/clp/ffi/SchemaTree.cpp src/clp/ffi/SchemaTree.hpp src/clp/ffi/SchemaTreeNode.hpp @@ -436,10 +436,10 @@ set(SOURCE_FILES_unitTest src/clp/StringReader.hpp src/clp/Thread.cpp src/clp/Thread.hpp + src/clp/time_types.hpp src/clp/TimestampPattern.cpp src/clp/TimestampPattern.hpp src/clp/TraceableException.hpp - src/clp/time_types.hpp src/clp/type_utils.hpp src/clp/utf8_utils.cpp src/clp/utf8_utils.hpp @@ -471,12 +471,12 @@ set(SOURCE_FILES_unitTest tests/test-NetworkReader.cpp tests/test-ParserWithUserSchema.cpp tests/test-query_methods.cpp + tests/test-regex_utils.cpp tests/test-Segment.cpp tests/test-SQLiteDB.cpp tests/test-Stopwatch.cpp tests/test-StreamingCompression.cpp tests/test-string_utils.cpp - tests/test-regex_utils.cpp tests/test-TimestampPattern.cpp tests/test-utf8_utils.cpp tests/test-Utils.cpp @@ -499,8 +499,8 @@ target_link_libraries(unitTest spdlog::spdlog ${sqlite_LIBRARY_DEPENDENCIES} ${STD_FS_LIBS} - clp::string_utils clp::regex_utils + clp::string_utils yaml-cpp::yaml-cpp ZStd::ZStd ) diff --git 
a/components/core/src/clp/regex_utils/CMakeLists.txt b/components/core/src/clp/regex_utils/CMakeLists.txt index c5f6183de..c5d54dde0 100644 --- a/components/core/src/clp/regex_utils/CMakeLists.txt +++ b/components/core/src/clp/regex_utils/CMakeLists.txt @@ -1,21 +1,20 @@ set( REGEX_UTILS_HEADER_LIST - "ErrorCode.hpp" - "RegexToWildcardTranslatorConfig.hpp" "constants.hpp" + "ErrorCode.hpp" "regex_translation_utils.hpp" + "RegexToWildcardTranslatorConfig.hpp" ) add_library( regex_utils - regex_translation_utils.cpp ErrorCode.cpp + regex_translation_utils.cpp ${REGEX_UTILS_HEADER_LIST} ) add_library(clp::regex_utils ALIAS regex_utils) target_include_directories(regex_utils - PUBLIC - ../ PRIVATE + ../ "${PROJECT_SOURCE_DIR}/submodules" ) target_compile_features(regex_utils PRIVATE cxx_std_20) diff --git a/components/core/src/clp/regex_utils/ErrorCode.cpp b/components/core/src/clp/regex_utils/ErrorCode.cpp index f651478b9..d899ae25b 100644 --- a/components/core/src/clp/regex_utils/ErrorCode.cpp +++ b/components/core/src/clp/regex_utils/ErrorCode.cpp @@ -10,23 +10,22 @@ using std::string; using std::string_view; namespace clp::regex_utils { - +namespace { /** * Class for giving the error codes more detailed string descriptions. - * This class does not need to be seen outside the std error code wrapper implementation. */ class ErrorCodeCategory : public error_category { public: /** * @return The class of errors. */ - [[nodiscard]] char const* name() const noexcept override; + [[nodiscard]] auto name() const noexcept -> char const* override; /** * @param The error code encoded in int. * @return The descriptive message for the error. 
*/ - [[nodiscard]] string message(int ev) const override; + [[nodiscard]] auto message(int ev) const -> string override; }; auto ErrorCodeCategory::name() const noexcept -> char const* { @@ -69,10 +68,10 @@ auto ErrorCodeCategory::message(int ev) const -> string { } } -ErrorCodeCategory const cTheErrorCodeCategory{}; +ErrorCodeCategory const cErrorCodeCategoryInstance; +} // namespace auto make_error_code(ErrorCode e) -> error_code { - return {static_cast(e), cTheErrorCodeCategory}; + return {static_cast(e), cErrorCodeCategoryInstance}; } - } // namespace clp::regex_utils diff --git a/components/core/src/clp/regex_utils/ErrorCode.hpp b/components/core/src/clp/regex_utils/ErrorCode.hpp index 4fa9204fc..acb1dcc37 100644 --- a/components/core/src/clp/regex_utils/ErrorCode.hpp +++ b/components/core/src/clp/regex_utils/ErrorCode.hpp @@ -6,7 +6,6 @@ #include namespace clp::regex_utils { - /** * Enum class for propagating and handling various regex utility errors. * More detailed descriptions can be found in ErrorCode.cpp. @@ -35,7 +34,6 @@ enum class ErrorCode : uint8_t { * @return The corresponding std::error_code type variable. */ [[nodiscard]] auto make_error_code(ErrorCode ec) -> std::error_code; - } // namespace clp::regex_utils namespace std { diff --git a/components/core/src/clp/regex_utils/README.md b/components/core/src/clp/regex_utils/README.md new file mode 100644 index 000000000..3e4777c52 --- /dev/null +++ b/components/core/src/clp/regex_utils/README.md @@ -0,0 +1,46 @@ +# Regex_utils + +This library contains useful utilities to handle all regex related tasks. + +## Regex to Wildcard Translator + +### Goal + +Performs a best-effort translation to turn a regex string to an equivalent wildcard string. 
+ +CLP currently only recognizes three meta-characters in the wildcard syntax: + +* `?` Matches any single character +* `*` Matches zero or more characters +* `\` Suppresses the special meaning of meta characters (including itself) + +If the regex query can actually be expressed as a wildcard query only deploying the three +metacharacters above, CLP should use the wildcard version. + +### Includes + +* To use the translator: + +```shell +#include <regex_utils/regex_translation_utils.hpp> +``` + +* To add custom configuration to the translator: + +```shell +#include <regex_utils/RegexToWildcardTranslatorConfig.hpp> +``` + +### Functionalities + +* Wildcards + - Turn `.` into `?` + - Turn `.*` into `*` + - Turn `.+` into `?*` + +### Custom configuration + +* `add_prefix_suffix_wildcards`: in the absence of regex anchors, add prefix or suffix wildcards so +the query becomes a substring query. + - E.g. `info.*system` gets translated into `*info*system*` which makes the original query a + substring query. diff --git a/components/core/src/clp/regex_utils/RegexToWildcardTranslatorConfig.hpp b/components/core/src/clp/regex_utils/RegexToWildcardTranslatorConfig.hpp index 98d88824b..e53963c2e 100644 --- a/components/core/src/clp/regex_utils/RegexToWildcardTranslatorConfig.hpp +++ b/components/core/src/clp/regex_utils/RegexToWildcardTranslatorConfig.hpp @@ -2,10 +2,15 @@ #define CLP_REGEX_UTILS_REGEXTOWILDCARDTRANSLATORCONFIG_HPP namespace clp::regex_utils { - +/** + * Allows users to customize and fine tune how to translate a regex string to wildcard. + * + * This class won't affect the core logic and state transition mechanics of the regex to wildcard + * translator, but it can make the translator more versatile. For detailed descriptions of how each + * option should be used, see the getter function docstrings. 
+ */ class RegexToWildcardTranslatorConfig { public: - // Constructors RegexToWildcardTranslatorConfig( bool case_insensitive_wildcard, bool add_prefix_suffix_wildcards @@ -13,23 +18,19 @@ class RegexToWildcardTranslatorConfig { : m_case_insensitive_wildcard{case_insensitive_wildcard}, m_add_prefix_suffix_wildcards{add_prefix_suffix_wildcards} {}; - // Getters - /** - * @return True if the final translated wildcard string will be fed into - * a case-insensitive wildcard analyzer. In such cases, we can - * safely translate charset patterns such as [aA] [Bb] into singular - * lowercase characters a, b. + * @return True if the final translated wildcard string will be fed into a case-insensitive + * wildcard analyzer. In such cases, we can safely translate charset patterns such as [aA] [Bb] + * into singular lowercase characters a, b. */ [[nodiscard]] auto case_insensitive_wildcard() const -> bool { return m_case_insensitive_wildcard; } /** - * @return True if in the absense of starting or ending anchors in the - * regex string, we append prefix or suffix zero or more characters - * wildcards. In other words, this config is true if the search - * is a substring search, and false if the search is an exact search. + * @return True if in the absence of starting or ending anchors in the regex string, we append + * prefix or suffix zero or more characters wildcards. In other words, this config is true if + * the search is a substring search, and false if the search is an exact search. 
*/ [[nodiscard]] auto add_prefix_suffix_wildcards() const -> bool { return m_add_prefix_suffix_wildcards; @@ -40,7 +41,6 @@ class RegexToWildcardTranslatorConfig { bool m_case_insensitive_wildcard; bool m_add_prefix_suffix_wildcards; }; - } // namespace clp::regex_utils #endif // CLP_REGEX_UTILS_REGEXTOWILDCARDTRANSLATORCONFIG_HPP diff --git a/components/core/src/clp/regex_utils/constants.hpp b/components/core/src/clp/regex_utils/constants.hpp index 603b6682e..879e7641d 100644 --- a/components/core/src/clp/regex_utils/constants.hpp +++ b/components/core/src/clp/regex_utils/constants.hpp @@ -2,7 +2,6 @@ #define CLP_REGEX_UTILS_CONSTANTS_HPP namespace clp::regex_utils { - // Wildcard meta characters constexpr char cZeroOrMoreCharsWildcard{'*'}; constexpr char cSingleCharWildcard{'?'}; @@ -15,7 +14,6 @@ constexpr char cRegexStartAnchor{'^'}; constexpr char cRegexEndAnchor{'$'}; constexpr char cEscapeChar{'\\'}; constexpr char cCharsetNegate{'^'}; - } // namespace clp::regex_utils #endif // CLP_REGEX_UTILS_CONSTANTS_HPP diff --git a/components/core/src/clp/regex_utils/regex_translation_utils.cpp b/components/core/src/clp/regex_utils/regex_translation_utils.cpp index 88227ad5b..372629639 100644 --- a/components/core/src/clp/regex_utils/regex_translation_utils.cpp +++ b/components/core/src/clp/regex_utils/regex_translation_utils.cpp @@ -5,54 +5,61 @@ #include #include -#include +#include #include "regex_utils/constants.hpp" #include "regex_utils/ErrorCode.hpp" #include "regex_utils/RegexToWildcardTranslatorConfig.hpp" -using clp::regex_utils::cRegexEndAnchor; -using clp::regex_utils::cRegexOneOrMore; -using clp::regex_utils::cRegexStartAnchor; -using clp::regex_utils::cRegexZeroOrMore; -using clp::regex_utils::cRegexZeroOrOne; -using clp::regex_utils::cSingleCharWildcard; -using clp::regex_utils::cZeroOrMoreCharsWildcard; -using clp::regex_utils::ErrorCode; -using clp::regex_utils::RegexToWildcardTranslatorConfig; using std::error_code; using std::string; using 
std::string_view; +namespace clp::regex_utils { namespace { -// Internal utility class and function declarations -/** - * Class for storing regex translation analysis states, capture group, quantifier information, etc. - */ +// Class for storing regex translation analysis states, capture group, quantifier information, etc. class TranslatorState; -// State transition functions common signature -using StateTransitionFunc - = auto(TranslatorState&, - string_view::const_iterator&, - string&, - RegexToWildcardTranslatorConfig const&) -> error_code; - -// State transition functions whose names correspond to the current analysis state. -[[nodiscard]] StateTransitionFunc normal_state_transition; -[[nodiscard]] StateTransitionFunc dot_state_transition; -[[nodiscard]] StateTransitionFunc end_state_transition; -[[nodiscard]] StateTransitionFunc final_state_cleanup; +/** + * Functions that handle current-state-specific tasks before transitioning to the next state. + * + * @param[in, out] state The object that stores translator's internal information. The primary + * state member variable is always updated if a transition occurs. Even if there's no state + * transition, other analysis info may be updated. + * @param[in, out] it The iterator that represents the current regex string scan position. May be + * updated to advance or backtrack the scan position. + * @param[out] wildcard_str The translated wildcard string. May or may not be updated. + * @param[in] config The translator config. + * @return 0 (clp::regex_utils::ErrorCode::Success) upon successful operation. Otherwise, return + * related error code. 
+ */ +using StateTransitionFuncSig + = auto(TranslatorState& state, + string_view::const_iterator& it, + string& wildcard_str, + RegexToWildcardTranslatorConfig const& config) -> error_code; +[[nodiscard]] StateTransitionFuncSig normal_state_transition; +[[nodiscard]] StateTransitionFuncSig dot_state_transition; +[[nodiscard]] StateTransitionFuncSig end_state_transition; +[[nodiscard]] StateTransitionFuncSig final_state_cleanup; class TranslatorState { public: - // Regex translation pattern analysis states. + /** + * States for which we apply specific rules to translate encountered regex patterns. + * + * This list may be expanded as the translator supports translating more regex patterns. + * <ul>
+ * <li>NORMAL: The initial state, where characters have no special meanings and are treated
+ * literally.</li>
+ * <li>DOT: Encountered a period `.`. Expecting wildcard expression.</li>
+ * <li>END: Encountered a dollar sign `$`, meaning the regex string has reached the end
+ * anchor.</li>
+ * </ul>
+ */ enum class RegexPatternState : uint8_t { - // The initial state, where characters have no special meanings and are treated literally. NORMAL = 0, - // Encountered a period `.`. Expecting wildcard expression. DOT, - // Encountered a dollar sign `$`, meaning the regex string has reached the end anchor. END, }; @@ -66,27 +73,23 @@ class TranslatorState { auto set_next_state(RegexPatternState const& state) -> void { m_state = state; } private: - // Variables + // Members RegexPatternState m_state{RegexPatternState::NORMAL}; }; } // namespace -namespace clp::regex_utils { -// Main API -auto regex_to_wildcard(string_view regex_str) -> BOOST_OUTCOME_V2_NAMESPACE::std_result { - RegexToWildcardTranslatorConfig const default_config{false, false}; - return regex_to_wildcard(regex_str, default_config); +auto regex_to_wildcard(string_view regex_str) -> OUTCOME_V2_NAMESPACE::std_result { + return regex_to_wildcard(regex_str, {false, false}); } auto regex_to_wildcard(string_view regex_str, RegexToWildcardTranslatorConfig const& config) - -> BOOST_OUTCOME_V2_NAMESPACE::std_result { + -> OUTCOME_V2_NAMESPACE::std_result { if (regex_str.empty()) { - return string(); + return string{}; } - // Initialize translation state, scan position, and return string TranslatorState state; - string_view::const_iterator it = regex_str.cbegin(); + string_view::const_iterator it{regex_str.cbegin()}; string wildcard_str; // If there is no starting anchor character, append multichar wildcard prefix @@ -97,8 +100,7 @@ auto regex_to_wildcard(string_view regex_str, RegexToWildcardTranslatorConfig co } error_code ec{}; - while (it != regex_str.end()) { - // Main state transition table + while (it != regex_str.cend()) { switch (state.get_state()) { case TranslatorState::RegexPatternState::NORMAL: ec = normal_state_transition(state, it, wildcard_str, config); @@ -119,24 +121,27 @@ auto regex_to_wildcard(string_view regex_str, RegexToWildcardTranslatorConfig co ++it; } - // Do the final state check and 
clean up ec = final_state_cleanup(state, it, wildcard_str, config); if (ec) { return ec; } return wildcard_str; } -} // namespace clp::regex_utils namespace { - +/** + * Treats each character literally and directly appends it to the wildcard string, unless it is a + * meta-character. + * + * Each meta-character either triggers a state transition, or makes the regex string untranslatable. + */ auto normal_state_transition( TranslatorState& state, string_view::const_iterator& it, string& wildcard_str, - RegexToWildcardTranslatorConfig const& /*config*/ + [[maybe_unused]] RegexToWildcardTranslatorConfig const& config ) -> error_code { - char const ch = *it; + auto const ch{*it}; switch (ch) { case '.': state.set_next_state(TranslatorState::RegexPatternState::DOT); @@ -163,23 +168,30 @@ auto normal_state_transition( return ErrorCode::Success; } +/** + * Attempt to translate regex wildcard patterns that start with `.` character. + * + * Performs the following translation if possible: + * <ul>
+ * <li>`.*` gets translated into `*`</li>
+ * <li>`.+` gets translated into `?*`</li>
+ * <li>`.` gets translated into `?`</li>
+ * </ul>
+ */ auto dot_state_transition( TranslatorState& state, string_view::const_iterator& it, string& wildcard_str, - RegexToWildcardTranslatorConfig const& /*config*/ + [[maybe_unused]] RegexToWildcardTranslatorConfig const& config ) -> error_code { switch (*it) { case cZeroOrMoreCharsWildcard: - // .* gets translated to * wildcard_str += cZeroOrMoreCharsWildcard; break; case cRegexOneOrMore: - // .+ gets translated to ?* wildcard_str = wildcard_str + cSingleCharWildcard + cZeroOrMoreCharsWildcard; break; default: - // . gets translated to ? wildcard_str += cSingleCharWildcard; // Backtrack the scan by one position to handle the current char in the next iteration. --it; @@ -189,11 +201,12 @@ auto dot_state_transition( return ErrorCode::Success; } +// Once we've seen the end anchor, we should not expect any other character to appear. auto end_state_transition( - TranslatorState& /*state*/, + [[maybe_unused]] TranslatorState& state, string_view::const_iterator& it, - string& /*wildcard_str*/, - RegexToWildcardTranslatorConfig const& /*config*/ + [[maybe_unused]] string& wildcard_str, + [[maybe_unused]] RegexToWildcardTranslatorConfig const& config ) -> error_code { if (cRegexEndAnchor != *it) { return ErrorCode::Dollar; @@ -201,9 +214,13 @@ auto end_state_transition( return ErrorCode::Success; } +/** + * States other than the NORMAL state may require special handling after the whole regex string has + * been scanned and processed. 
+ */ auto final_state_cleanup( TranslatorState& state, - string_view::const_iterator& /*it*/, + [[maybe_unused]] string_view::const_iterator& it, string& wildcard_str, RegexToWildcardTranslatorConfig const& config ) -> error_code { @@ -225,3 +242,4 @@ auto final_state_cleanup( return ErrorCode::Success; } } // namespace +} // namespace clp::regex_utils diff --git a/components/core/src/clp/regex_utils/regex_translation_utils.hpp b/components/core/src/clp/regex_utils/regex_translation_utils.hpp index e124b933e..8ca703403 100644 --- a/components/core/src/clp/regex_utils/regex_translation_utils.hpp +++ b/components/core/src/clp/regex_utils/regex_translation_utils.hpp @@ -4,26 +4,24 @@ #include #include -#include +#include #include "regex_utils/RegexToWildcardTranslatorConfig.hpp" namespace clp::regex_utils { /** - * Call the regex to wildcard translation function with a default configuration that has all the - * options as false. For more details on the config options, see - * RegexToWildcardTranslatorConfig.hpp. + * Translate a given regex string to wildcard with the default configuration that has all the + * options set to false. * * @param regex_str The regex string to be translated. * @return The translated wildcard string. */ [[nodiscard]] auto regex_to_wildcard(std::string_view regex_str -) -> BOOST_OUTCOME_V2_NAMESPACE::std_result; +) -> OUTCOME_V2_NAMESPACE::std_result; /** - * Translated a given regex string to wildcard with a custom configuration. For more details on the - * config options, see RegexToWildcardTranslatorConfig.hpp. + * Translate a given regex string to wildcard with a custom configuration. * * @param regex_str The regex string to be translated. * @return The translated wildcard string. 
@@ -31,7 +29,7 @@ namespace clp::regex_utils { [[nodiscard]] auto regex_to_wildcard( std::string_view regex_str, RegexToWildcardTranslatorConfig const& config -) -> BOOST_OUTCOME_V2_NAMESPACE::std_result; +) -> OUTCOME_V2_NAMESPACE::std_result; } // namespace clp::regex_utils diff --git a/components/core/tests/test-regex_utils.cpp b/components/core/tests/test-regex_utils.cpp index 6e7a09911..1f0f954a8 100644 --- a/components/core/tests/test-regex_utils.cpp +++ b/components/core/tests/test-regex_utils.cpp @@ -1,9 +1,12 @@ #include #include +#include #include +using clp::regex_utils::ErrorCode; using clp::regex_utils::regex_to_wildcard; +using clp::regex_utils::RegexToWildcardTranslatorConfig; TEST_CASE("regex_to_wildcard", "[regex_utils][regex_to_wildcard]") { // Test empty string @@ -15,10 +18,21 @@ TEST_CASE("regex_to_wildcard", "[regex_utils][regex_to_wildcard]") { REQUIRE((regex_to_wildcard(". xyz .+ zyx .*").value() == "? xyz ?* zyx *")); // Test unescaped meta characters - REQUIRE((regex_to_wildcard(".? xyz .* zyx .").error() == clp::regex_utils::ErrorCode::Question) - ); - REQUIRE((regex_to_wildcard(". xyz .** zyx .").error() == clp::regex_utils::ErrorCode::Star)); - REQUIRE((regex_to_wildcard(". xyz .*+ zyx .").error() == clp::regex_utils::ErrorCode::Plus)); - REQUIRE((regex_to_wildcard(". xyz |.* zyx .").error() == clp::regex_utils::ErrorCode::Pipe)); - REQUIRE((regex_to_wildcard(". xyz ^.* zyx .").error() == clp::regex_utils::ErrorCode::Caret)); + REQUIRE((regex_to_wildcard(".? xyz .* zyx .").error() == ErrorCode::Question)); + REQUIRE((regex_to_wildcard(". xyz .** zyx .").error() == ErrorCode::Star)); + REQUIRE((regex_to_wildcard(". xyz .*+ zyx .").error() == ErrorCode::Plus)); + REQUIRE((regex_to_wildcard(". xyz |.* zyx .").error() == ErrorCode::Pipe)); + REQUIRE((regex_to_wildcard(". 
xyz ^.* zyx .").error() == ErrorCode::Caret)); +} + +// Test anchors and prefix/suffix wildcards +TEST_CASE("regex_to_wildcard_anchor_config", "[regex_utils][regex_to_wildcard][anchor_config]") { + RegexToWildcardTranslatorConfig const config{false, true}; + REQUIRE(((regex_to_wildcard("^", config).value() == "*"))); + REQUIRE((regex_to_wildcard("$", config).value() == "*")); + REQUIRE((regex_to_wildcard("^xyz$", config).value() == "xyz")); + REQUIRE((regex_to_wildcard("xyz", config).value() == "*xyz*")); + REQUIRE((regex_to_wildcard("xyz$$", config).value() == "*xyz")); + + REQUIRE((regex_to_wildcard("xyz$zyx$", config).error() == ErrorCode::Dollar)); }