diff --git a/components/core/src/clp/string_utils/string_utils.cpp b/components/core/src/clp/string_utils/string_utils.cpp index c68865bf9..ff25da6de 100644 --- a/components/core/src/clp/string_utils/string_utils.cpp +++ b/components/core/src/clp/string_utils/string_utils.cpp @@ -1,72 +1,84 @@ #include "string_utils/string_utils.hpp" #include -#include +#include #include using std::string; using std::string_view; +namespace clp::string_utils { namespace { /** - * Helper for ``wildcard_match_unsafe_case_sensitive`` to advance the pointer in - * tame to the next character which matches wild. This method should be inlined - * for performance. - * @param tame_current - * @param tame_bookmark - * @param tame_end - * @param wild_current - * @param wild_bookmark - * @return true on success, false if wild cannot match tame + * Helper for `wildcard_match_unsafe_case_sensitive` to advance `tame`'s iterator to the next + * character that matches the current character in `wild`, and to bookmark this character. If the + * current character in `wild` is escaped, `wild`'s iterator will also be advanced. + * + * NOTE: + * - This method expects that `tame_it` < `tame_end_it` + * - This method should be inlined for performance. + * + * @param tame_end_it + * @param tame_it Returns `tame`'s updated iterator. + * @param tame_bookmark_it Returns `tame`'s updated bookmark. + * @param wild_it Returns `wild`'s updated iterator. + * @return Whether `tame` might be able to match `wild`. 
*/ inline bool advance_tame_to_next_match( - char const*& tame_current, - char const*& tame_bookmark, - char const* tame_end, - char const*& wild_current + string_view::const_iterator tame_end_it, + string_view::const_iterator& tame_it, + string_view::const_iterator& tame_bookmark_it, + string_view::const_iterator& wild_it ); +/** + * Helper for `wildcard_match_unsafe_case_sensitive` to determine if the given iterator points to + * the end of `wild`, or the second-last character of `wild` if `wild` ends with a '*'. + * @param wild_it + * @param wild_end_it + * @return Whether the match has reached the end of `tame` and `wild`. + */ +bool is_end_of_wild(string_view::const_iterator wild_it, string_view::const_iterator wild_end_it); + inline bool advance_tame_to_next_match( - char const*& tame_current, - char const*& tame_bookmark, - char const* tame_end, - char const*& wild_current + string_view::const_iterator tame_end_it, + string_view::const_iterator& tame_it, + string_view::const_iterator& tame_bookmark_it, + string_view::const_iterator& wild_it ) { - auto w = *wild_current; + auto w = *wild_it; if ('?' != w) { - // No need to check for '*' since the caller ensures wild doesn't - // contain consecutive '*' + // No need to check for '*' since the caller ensures `wild` doesn't contain consecutive '*' // Handle escaped characters if ('\\' == w) { - ++wild_current; - // This is safe without a bounds check since this the caller ensures - // there are no dangling escape characters - w = *wild_current; + // Safe without a bounds check + ++wild_it; + w = *wild_it; } - // Advance tame_current until it matches wild_current + // Advance `tame_it` until it matches `w` while (true) { - if (tame_end == tame_current) { - // Wild group is longer than last group in tame, so can't match - // e.g.
"*abc" doesn't match "zab" - return false; - } - auto t = *tame_current; - if (t == w) { + if (*tame_it == w) { break; } - ++tame_current; + ++tame_it; + if (tame_end_it == tame_it) { + return false; + } } } - tame_bookmark = tame_current; + tame_bookmark_it = tame_it; return true; } + +bool is_end_of_wild(string_view::const_iterator wild_it, string_view::const_iterator wild_end_it) { + return (wild_end_it == wild_it) || (wild_end_it == wild_it + 1 && '*' == *wild_it); +} } // namespace -namespace clp::string_utils { size_t find_first_of( string const& haystack, char const* needles, @@ -182,114 +194,159 @@ bool wildcard_match_unsafe(string_view tame, string_view wild, bool case_sensiti /** * The algorithm basically works as follows: - * Given a wild string "*abc*def*ghi*", it can be broken into groups of - * characters delimited by one or more '*' characters. The goal of the algorithm - * is then to determine whether the tame string contains each of those groups in - * the same order. + * Given a wildcard string (a.k.a. "wild") like "abc*def*ghi*", it can be broken into groups of + * characters delimited by one or more '*' characters. The goal of the algorithm is then to + * determine whether the "tame" string contains each of those groups in the same order. * - * Thus, the algorithm: - * 1. searches for the start of one of these groups in wild, - * 2. searches for a group in tame starting with the same character, and then - * 3. checks if the two match. If not, the search repeats with the next group in - * tame. + * Matching a group in `wild` against `tame` requires iteratively matching each character in `tame` + * against each character in the group, with the exception of the '?' wildcard and escaped + * characters ('*', '?', or '\'). When a mismatch occurs, there are two possibilities: + * + * 1. The mismatch occurs before the first '*' in `wild`, meaning that the entire wildcard match + * fails. + * 2. The mismatch occurs after a '*' in `wild`. 
This case requires additional handling explained + * below. + * + * Consider `tame` = "ccd", `wild` = "*cd". When we start matching `tame` against the first group + * in `wild`, the first 'c' will match, but the second 'c' won't match 'd'. In this case, we should + * restart the matching process from the second 'c'. + * + * To generalize this, we need to maintain bookmarks for both `tame` and `wild`. Whenever we have a + * mismatch, we should reset `wild` to its bookmark and `tame` to its bookmark + 1, and then try + * the match again. If we get to the end of `tame` without getting to the end of the group in + * `wild`, the entire wildcard match fails. + * + * NOTE: + * - This method is on the critical path for searches in clg/clp-s/glt, so any modifications must be + * benchmarked to ensure performance is not significantly affected. + * - Since the caller guarantees that there are no consecutive '*', we don't need to handle the + * case where a group in `wild` is empty. + * - Since the caller guarantees that every '\' is followed by a character, we can advance past + * '\' without doing a subsequent bounds check. + * - The second part of this method could be rewritten in the following form: + * + * ``` + * while(true) { + * if (false == advance_tame_to_next_match(...)) return false; + * + * while (true) { + * // Advance iterators + * // If we reached the end of `tame` before the end of `wild`, break + * // If we see a '*' in `wild`, break + * // If we see a mismatch, break + * } + * } + * ``` + * + * However, this form is ~2% slower.
*/ bool wildcard_match_unsafe_case_sensitive(string_view tame, string_view wild) { - auto const tame_length = tame.length(); - auto const wild_length = wild.length(); - char const* tame_current = tame.data(); - char const* wild_current = wild.data(); - char const* tame_bookmark = nullptr; - char const* wild_bookmark = nullptr; - char const* tame_end = tame_current + tame_length; - char const* wild_end = wild_current + wild_length; - - // Handle wild or tame being empty - if (0 == wild_length) { - return 0 == tame_length; - } else { - if (0 == tame_length) { - return "*" == wild; + // Handle `tame` or `wild` being empty + if (wild.empty()) { + return tame.empty(); + } + if (tame.empty()) { + return "*" == wild; + } + + auto tame_it = tame.cbegin(); + auto wild_it = wild.cbegin(); + auto const tame_end_it = tame.cend(); + auto const wild_end_it = wild.cend(); + string_view::const_iterator tame_bookmark_it{}; + string_view::const_iterator wild_bookmark_it{}; + + // Match `tame` against `wild` until we reach the first '*' in `wild` or they no longer + // match + while (true) { + auto w = *wild_it; + if ('*' == w) { + break; + } + if ('?' != w) { + // Handle escaped characters + if ('\\' == w) { + // Safe without a bounds check + ++wild_it; + w = *wild_it; + } + + // Handle a mismatch + if (w != *tame_it) { + return false; + } + } + + ++tame_it; + ++wild_it; + + // Handle boundary conditions + // NOTE: The bodies of these if-blocks depend on the order of these conditions.
+ if (tame_end_it == tame_it) { + return is_end_of_wild(wild_it, wild_end_it); + } + if (wild_end_it == wild_it) { + return false; } } - char w; - char t; - bool is_escaped = false; + // Find a match in `tame` for every group of characters between '*' in `wild` while (true) { - w = *wild_current; + auto w = *wild_it; if ('*' == w) { - ++wild_current; - if (wild_end == wild_current) { - // Trailing '*' means everything remaining in tame will match + ++wild_it; + if (wild_end_it == wild_it) { + // `wild` ending with '*' means that it'll match the rest of `tame` return true; } - // Set wild and tame bookmarks - wild_bookmark = wild_current; + // Set `tame` and `wild` bookmarks + wild_bookmark_it = wild_it; if (false - == advance_tame_to_next_match(tame_current, tame_bookmark, tame_end, wild_current)) + == advance_tame_to_next_match(tame_end_it, tame_it, tame_bookmark_it, wild_it)) { return false; } - } else { + } else if ('?' != w) { // Handle escaped characters if ('\\' == w) { - is_escaped = true; - ++wild_current; - // This is safe without a bounds check since this the caller - // ensures there are no dangling escape characters - w = *wild_current; + // Safe without a bounds check + ++wild_it; + w = *wild_it; } // Handle a mismatch - t = *tame_current; - if (!((false == is_escaped && '?' 
== w) || t == w)) { - if (nullptr == wild_bookmark) { - // No bookmark to return to + if (w != *tame_it) { + // Reset to bookmarks + tame_it = tame_bookmark_it + 1; + if (tame_end_it == tame_it) { return false; } - - wild_current = wild_bookmark; - tame_current = tame_bookmark + 1; + wild_it = wild_bookmark_it; if (false - == advance_tame_to_next_match( - tame_current, - tame_bookmark, - tame_end, - wild_current - )) + == advance_tame_to_next_match(tame_end_it, tame_it, tame_bookmark_it, wild_it)) { return false; } } } - ++tame_current; - ++wild_current; + ++tame_it; + ++wild_it; - // Handle reaching the end of tame or wild - if (tame_end == tame_current) { - return (wild_end == wild_current - || ('*' == *wild_current && (wild_current + 1) == wild_end)); - } else { - if (wild_end == wild_current) { - if (nullptr == wild_bookmark) { - // No bookmark to return to - return false; - } else { - wild_current = wild_bookmark; - tame_current = tame_bookmark + 1; - if (false - == advance_tame_to_next_match( - tame_current, - tame_bookmark, - tame_end, - wild_current - )) - { - return false; - } - } + // Handle boundary conditions + // NOTE: The bodies of these if-blocks depend on the order of these conditions. 
+ if (tame_end_it == tame_it) { + return is_end_of_wild(wild_it, wild_end_it); + } + if (wild_end_it == wild_it) { + // Reset to bookmarks + tame_it = tame_bookmark_it + 1; + wild_it = wild_bookmark_it; + if (false + == advance_tame_to_next_match(tame_end_it, tame_it, tame_bookmark_it, wild_it)) + { + return false; } } } diff --git a/components/core/tests/test-string_utils.cpp b/components/core/tests/test-string_utils.cpp index 747ebe000..091e385b1 100644 --- a/components/core/tests/test-string_utils.cpp +++ b/components/core/tests/test-string_utils.cpp @@ -1,21 +1,174 @@ -#include +#include +#include +#include +#include +#include +#include +#include -#include -#include #include #include +#include "FileReader.hpp" +#include "spdlog_with_specializations.hpp" + using clp::string_utils::clean_up_wildcard_search_string; using clp::string_utils::convert_string_to_int; using clp::string_utils::wildcard_match_unsafe; using clp::string_utils::wildcard_match_unsafe_case_sensitive; -using std::chrono::duration; using std::chrono::high_resolution_clock; -using std::cout; -using std::endl; +using std::span; using std::string; +using std::string_view; using std::vector; +namespace { +/** + * All possible alphabets that could appear in a wildcard string. Note that the alphabets are + * conceptual (e.g. EscapedAsterisk) rather than concrete (e.g. "\\*"). + */ +enum class WildcardStringAlphabet : uint8_t { + Empty = 0, + AnyChar, + Asterisk, + QuestionMark, + EscapedAsterisk, + EscapedQuestionMark, + EscapedBackslash, +}; + +/** + * Recursively generates strings that will match the given wildcard string and tests that they + * match. + * @param chosen_alphabets + * @param wild + * @param tame Returns the string generated so far. + */ +void generate_and_test_tame_str( + span chosen_alphabets, + string_view wild, + string& tame +); + +/** + * Recursively generates and tests a wildcard string using the given template. Testing requires + * generating one or more matching strings. 
+ * @param template_wildcard_str + * @param chosen_alphabets Returns the alphabets chosen so far. + * @param wild Returns the string generated so far. + */ +void generate_and_test_wildcard_str( + span> template_wildcard_str, + vector& chosen_alphabets, + string& wild +); + +// NOLINTNEXTLINE(misc-no-recursion) +void generate_and_test_tame_str( + span chosen_alphabets, + string_view wild, + string& tame +) { + // Base case + if (chosen_alphabets.empty()) { + INFO("tame: \"" << tame << "\", wild: \"" << wild << "\""); + REQUIRE(wildcard_match_unsafe_case_sensitive(tame, wild)); + return; + } + + auto const tame_size_before_modification = tame.size(); + auto alphabet = chosen_alphabets.front(); + auto const next_chosen_alphabets = chosen_alphabets.subspan(1); + switch (alphabet) { + case WildcardStringAlphabet::Empty: + generate_and_test_tame_str(next_chosen_alphabets, wild, tame); + return; + case WildcardStringAlphabet::AnyChar: + tame += 'a'; + generate_and_test_tame_str(next_chosen_alphabets, wild, tame); + break; + case WildcardStringAlphabet::Asterisk: + // Generate "", "a", and "ab" + for (size_t i = 0; i < 3; ++i) { + generate_and_test_tame_str(next_chosen_alphabets, wild, tame); + + tame += static_cast('a' + i); + } + break; + case WildcardStringAlphabet::QuestionMark: + tame += 'a'; + generate_and_test_tame_str(next_chosen_alphabets, wild, tame); + break; + case WildcardStringAlphabet::EscapedAsterisk: + tame += '*'; + generate_and_test_tame_str(next_chosen_alphabets, wild, tame); + break; + case WildcardStringAlphabet::EscapedQuestionMark: + tame += '?'; + generate_and_test_tame_str(next_chosen_alphabets, wild, tame); + break; + case WildcardStringAlphabet::EscapedBackslash: + tame += '\\'; + generate_and_test_tame_str(next_chosen_alphabets, wild, tame); + break; + default: + REQUIRE(false); + } + + tame.resize(tame_size_before_modification); +} + +// NOLINTNEXTLINE(misc-no-recursion) +void generate_and_test_wildcard_str( + span> template_wildcard_str, + 
vector& chosen_alphabets, + string& wild +) { + // Base case + if (template_wildcard_str.empty()) { + string tame; + generate_and_test_tame_str(chosen_alphabets, wild, tame); + return; + } + + auto const wild_size_before_modification = wild.size(); + + auto const& test_alphabet = template_wildcard_str.front(); + for (auto alphabet : test_alphabet) { + switch (alphabet) { + case WildcardStringAlphabet::Empty: + break; + case WildcardStringAlphabet::AnyChar: + wild += 'a'; + break; + case WildcardStringAlphabet::Asterisk: + wild += '*'; + break; + case WildcardStringAlphabet::QuestionMark: + wild += '?'; + break; + case WildcardStringAlphabet::EscapedAsterisk: + wild += "\\*"; + break; + case WildcardStringAlphabet::EscapedQuestionMark: + wild += "\\?"; + break; + case WildcardStringAlphabet::EscapedBackslash: + wild += "\\\\"; + break; + default: + REQUIRE(false); + } + + chosen_alphabets.push_back(alphabet); + generate_and_test_wildcard_str(template_wildcard_str.subspan(1), chosen_alphabets, wild); + chosen_alphabets.pop_back(); + + wild.resize(wild_size_before_modification); + } +} +} // namespace + TEST_CASE("to_lower", "[to_lower]") { string str = "test123TEST"; clp::string_utils::to_lower(str); @@ -54,464 +207,130 @@ TEST_CASE("clean_up_wildcard_search_string", "[clean_up_wildcard_search_string]" REQUIRE(clean_up_wildcard_search_string(str) == "abc"); } -SCENARIO("Test case sensitive wild card match in all possible ways", "[wildcard]") { - std::string tameString, wildString; - - WHEN("Match is expected if wild card character is \"*\"") { - GIVEN("Single wild with no suffix char") { - tameString = "abcd", wildString = "a*"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Single wild with no prefix char") { - tameString = "abcd", wildString = "*d"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Single wild on both side & has 1st char as literal") { - tameString = 
"abcd", wildString = "*a*"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Single wild on both side & has middle char as literal") { - tameString = "abcd", wildString = "*b*"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Single wild on both side & has last char as literal") { - tameString = "abcd", wildString = "*d*"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Single wild only") { - tameString = "abcd", wildString = "*"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - } - - WHEN("Match is expected if Wild card character is \"?\"") { - GIVEN("Single wild in the middle") { - tameString = "abcd", wildString = "a?cd"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Single wild in the beginning") { - tameString = "abcd", wildString = "?bcd"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Single wild at the end") { - tameString = "abcd", wildString = "abc?"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Multiple wild in the middle") { - tameString = "abcd", wildString = "a??d"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Multiple wild in the beginning") { - tameString = "abcd", wildString = "??cd"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Multiple wild in the end") { - tameString = "abcd", wildString = "ab??"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Single wild in the beginning and end") { - tameString = "abcd", wildString = "?bc?"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Multiple wild anywhere") { - tameString = "abcdef", 
wildString = "a?c?ef"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("All wild") { - tameString = "abcd", wildString = "????"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - } - - WHEN("Match is expected if wild card character has both \"*\" and \"?\"") { - GIVEN("Wild begins with \"*?\" pattern with 0 matched char for \"*\"") { - tameString = "abcd", wildString = "*?bcd"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Wild begins with \"?*\" pattern with 0 matched char for \"*\"") { - tameString = "abcd", wildString = "?*bcd"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Wild begins with \"*?\" pattern with 1 matched char for \"*\"") { - tameString = "abcd", wildString = "*?cd"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Wild begins with \"?*\" pattern with 1 matched char for \"*\"") { - tameString = "abcd", wildString = "*?cd"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Wild ends with \"*?\" pattern with 0 matched char for \"*\"") { - tameString = "abcd", wildString = "abc*?"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Wild ends with \"?*\" pattern with 0 matched char for \"*\"") { - tameString = "abcd", wildString = "abc*?"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Wild ends with \"*?\" pattern with 1 matched char for \"*\"") { - tameString = "abcd", wildString = "ab*?"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Wild ends with \"?*\" pattern with 1 matched char for \"*\"") { - tameString = "abcd", wildString = "ab?*"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - 
GIVEN("Wild begins with exactly \"*?\" pattern") { - tameString = "abcd", wildString = "*?"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); +TEST_CASE("wildcard_match_unsafe_case_sensitive", "[wildcard]") { + // We want to test all varieties of wildcard strings and strings that can be matched by them. + // We do this by using a kind of template wildcard string---where each character has a set of + // possibilities---to generate this variety. For each wildcard string, we also generate one or + // more strings that can be matched by the wildcard string. + + // The template below is meant to test 1-2 groups of WildcardStringAlphabets separated by '*'. + // The groups allow contiguous repeats of all possible alphabets except '*' since + // `wildcard_match_unsafe_case_sensitive` doesn't accept such wildcard strings. Each alphabet in + // the template may be empty except at least one in each group (so we don't unintentionally + // create two contiguous '*'). Overall, this should cover all matching cases. 
+ vector const nullable_asterisk_template{ + WildcardStringAlphabet::Empty, + WildcardStringAlphabet::Asterisk, + }; + vector const nullable_non_asterisk_template{ + WildcardStringAlphabet::Empty, + WildcardStringAlphabet::QuestionMark, + WildcardStringAlphabet::EscapedAsterisk, + WildcardStringAlphabet::EscapedQuestionMark, + WildcardStringAlphabet::EscapedBackslash, + WildcardStringAlphabet::AnyChar, + }; + vector const non_asterisk_template{ + WildcardStringAlphabet::QuestionMark, + WildcardStringAlphabet::EscapedAsterisk, + WildcardStringAlphabet::EscapedQuestionMark, + WildcardStringAlphabet::EscapedBackslash, + WildcardStringAlphabet::AnyChar, + }; + vector> template_wildcard_str; + for (size_t i = 0; i < 2; ++i) { + if (0 == i) { + template_wildcard_str.emplace_back(nullable_asterisk_template); + template_wildcard_str.emplace_back(nullable_non_asterisk_template); + template_wildcard_str.emplace_back(non_asterisk_template); + template_wildcard_str.emplace_back(nullable_non_asterisk_template); + template_wildcard_str.push_back(nullable_asterisk_template); + } else { + // Insert "*" before the last asterisk + // NOTE: We insert in reverse; `insert` invalidates `insert_it`, so we reassign it + auto insert_it = template_wildcard_str.end() - 1; + insert_it = template_wildcard_str.insert(insert_it, nullable_non_asterisk_template); + insert_it = template_wildcard_str.insert(insert_it, non_asterisk_template); + insert_it = template_wildcard_str.insert(insert_it, nullable_non_asterisk_template); + template_wildcard_str.insert( + insert_it, + { + WildcardStringAlphabet::Asterisk, + } + ); } - GIVEN("Wild begins with exactly \"?*\" pattern") { - tameString = "abcd", wildString = "?*"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } + vector chosen_alphabets; + string wild; + generate_and_test_wildcard_str(template_wildcard_str, chosen_alphabets, wild); } - WHEN("Match unexpected containing wild card character(s)") { - GIVEN("Missing literal character
w/ \"*\"") { - tameString = "abcd", wildString = "ac*"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == false); - } - - GIVEN("More literals in wild than tame w/ \"*\" in the middle") { - tameString = "abcd", wildString = "abc*de"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == false); - } - - GIVEN("MISSING matching literals in the beginning with \"*\" in the middle") { - tameString = "abcd", wildString = "b**d"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == false); - } - - GIVEN("MISSING matching literals in the end with \"*\" in the middle") { - tameString = "abcd", wildString = "a**c"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == false); - } - - GIVEN("MISSING matching literals in the beginning with both \"*\" and \"?\" in the middle" - ) { - tameString = "abcd", wildString = "b*?d"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == false); - } - - GIVEN("MISSING matching literals in the beginning with \"?\" at the beginning") { - tameString = "abcd", wildString = "?cd"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == false); - } - - GIVEN("MISSING matching literals in the end with both \"?\" at the end") { - tameString = "abcd", wildString = "ab?"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == false); - } + // We test non-matching cases using a tame string that matches a diverse wildcard string as + // follows. We test that every substring (anchored at index 0) of tame doesn't match the + // complete wildcard string. + constexpr string_view tame{"abcdef?*?ghixyz"}; + constexpr string_view wild{R"(*a?c*\?\*\?*x?z*)"}; + // Sanity-check that they match. 
+ REQUIRE(wildcard_match_unsafe_case_sensitive(tame, wild)); + auto tame_begin_it = tame.cbegin(); + for (auto it = tame.cend() - 1; tame_begin_it != it; --it) { + auto const tame_substr = string_view{tame_begin_it, it}; + INFO("tame: \"" << tame_substr << "\", wild: \"" << wild << "\""); + REQUIRE((false == wildcard_match_unsafe_case_sensitive(tame_substr, wild))); } +} - WHEN("Match is expected when escape character(s) are used") { - GIVEN("Escaping \"*\"") { - tameString = "a*cd", wildString = "a\\*cd"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Escaping \"?\"") { - tameString = "a?cd", wildString = "a\\?cd"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Escaping \"*\" and \"?\"") { - tameString = "a?c*e", wildString = "a\\?c\\*e"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Escaping \"\\\"") { - tameString = "a\\cd", wildString = "a\\\\cd"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Escaping \"?\" when fast forwarding") { - tameString = "abc?e", wildString = "a*\\?e"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Escaping \"*\" when fast forwarding") { - tameString = "abc*e", wildString = "a*\\*e"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Escaping \"\\\" when fast forwarding") { - tameString = "abc\\e", wildString = "a*\\\\e"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } +TEST_CASE("wildcard_match_unsafe", "[wildcard]") { + constexpr string_view tame{"0!2#4%6&8(aBcDeFgHiJkLmNoPqRsTuVwXyZ"}; + string wild; - GIVEN("Escaping \"?\" when rewinding") { - tameString = "\\ab\\ab\\c?ef", wildString = "*ab\\\\c\\?*"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } + wild = 
"0!2#4%6&8(AbCdEfGhIjKlMnOpQrStUvWxYz"; + REQUIRE(wildcard_match_unsafe(tame, wild, false)); + REQUIRE((false == wildcard_match_unsafe(tame, wild, true))); - GIVEN("Escaping \"*\" when rewinding") { - tameString = "\\ab\\ab\\c*ef", wildString = "*ab\\\\c\\**"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } + wild = "0?2?4?6?8?A?C?E?G?I?K?M?O?Q?S?U?W?Y?"; + REQUIRE(wildcard_match_unsafe(tame, wild, false)); + REQUIRE((false == wildcard_match_unsafe(tame, wild, true))); - GIVEN("Escaping \"\\\" when rewinding") { - tameString = "\\ab\\ab\\c\\ef", wildString = "*ab\\\\c\\\\*"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } + wild = "?!?#?%?&?(?b?d?f?h?j?l?n?p?r?t?v?x?z"; + REQUIRE(wildcard_match_unsafe(tame, wild, false)); + REQUIRE((false == wildcard_match_unsafe(tame, wild, true))); - GIVEN("Silently ignore unsupported escape sequence \\a") { - tameString = "ab?d", wildString = "\\ab?d"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - } + wild = "*?b?d?f?h?j?l?n?p?r?t?v?x?z*"; + REQUIRE(wildcard_match_unsafe(tame, wild, false)); + REQUIRE((false == wildcard_match_unsafe(tame, wild, true))); - WHEN("Case wild card match is case insensitive") { - // This test is meant to supplement the case sensitive wild card implementation - // The case insensitive implementation is exactly the same as case sensitive except it - // automatically adjust the inputs to lower case when needed before doing comparison. 
It is - // rarely used due to lower performance - bool isCaseSensitive = false; - GIVEN("All lower case tame and all upper case wild") { - tameString = "abcde", wildString = "A?C*"; - REQUIRE(wildcard_match_unsafe(tameString, wildString, isCaseSensitive) == true); - } + wild = "*?A?C?E?G?I?K?M?O?Q?S?U?W?Y?*"; + REQUIRE(wildcard_match_unsafe(tame, wild, false)); + REQUIRE((false == wildcard_match_unsafe(tame, wild, true))); +} - GIVEN("All lower case tame and mixed lower and upper case wild") { - tameString = "abcde", wildString = "A?c*"; - REQUIRE(wildcard_match_unsafe(tameString, wildString, isCaseSensitive) == true); +SCENARIO("wildcard_match_unsafe_case_sensitive performance", "[wildcard performance]") { + auto const tests_dir = std::filesystem::path{__FILE__}.parent_path(); + auto const log_file_path = tests_dir / "test_network_reader_src" / "random.log"; - tameString = "abcde", wildString = "A?c*"; - REQUIRE(wildcard_match_unsafe(tameString, wildString, isCaseSensitive) == true); - } + clp::FileReader file_reader; + file_reader.open(log_file_path.string()); + string line; + vector lines; + while (file_reader.read_to_delimiter('\n', false, false, line)) { + lines.push_back(line); } + file_reader.close(); - WHEN("Tested with a bunch of additional test cases found online") { - bool allPassed = true; - - GIVEN("Case with repeating character sequences") { - allPassed &= wildcard_match_unsafe_case_sensitive("abcccd", "*ccd"); - allPassed &= wildcard_match_unsafe_case_sensitive("mississipissippi", "*issip*ss*"); - allPassed - &= !wildcard_match_unsafe_case_sensitive("xxxx*zzzzzzzzy*f", "xxxx*zzy*fffff"); - allPassed &= wildcard_match_unsafe_case_sensitive("xxxx*zzzzzzzzy*f", "xxx*zzy*f"); - allPassed &= !wildcard_match_unsafe_case_sensitive("xxxxzzzzzzzzyf", "xxxx*zzy*fffff"); - allPassed &= wildcard_match_unsafe_case_sensitive("xxxxzzzzzzzzyf", "xxxx*zzy*f"); - allPassed &= wildcard_match_unsafe_case_sensitive("xyxyxyzyxyz", "xy*z*xyz"); - allPassed &= 
wildcard_match_unsafe_case_sensitive("mississippi", "*sip*"); - allPassed &= wildcard_match_unsafe_case_sensitive("xyxyxyxyz", "xy*xyz"); - allPassed &= wildcard_match_unsafe_case_sensitive("mississippi", "mi*sip*"); - allPassed &= wildcard_match_unsafe_case_sensitive("ababac", "*abac*"); - allPassed &= wildcard_match_unsafe_case_sensitive("ababac", "*abac*"); - allPassed &= wildcard_match_unsafe_case_sensitive("aaazz", "a*zz*"); - allPassed &= !wildcard_match_unsafe_case_sensitive("a12b12", "*12*23"); - allPassed &= !wildcard_match_unsafe_case_sensitive("a12b12", "a12b"); - allPassed &= wildcard_match_unsafe_case_sensitive("a12b12", "*12*12*"); - REQUIRE(allPassed == true); - } - - GIVEN("Additional cases where the '*' char appears in the tame string") { - allPassed &= wildcard_match_unsafe_case_sensitive("*", "*"); - allPassed &= wildcard_match_unsafe_case_sensitive("a*abab", "a*b"); - allPassed &= wildcard_match_unsafe_case_sensitive("a*r", "a*"); - allPassed &= !wildcard_match_unsafe_case_sensitive("a*ar", "a*aar"); - REQUIRE(allPassed == true); - } - - GIVEN("More double wildcard scenarios") { - allPassed &= wildcard_match_unsafe_case_sensitive("XYXYXYZYXYz", "XY*Z*XYz"); - allPassed &= wildcard_match_unsafe_case_sensitive("missisSIPpi", "*SIP*"); - allPassed &= wildcard_match_unsafe_case_sensitive("mississipPI", "*issip*PI"); - allPassed &= wildcard_match_unsafe_case_sensitive("xyxyxyxyz", "xy*xyz"); - allPassed &= wildcard_match_unsafe_case_sensitive("miSsissippi", "mi*sip*"); - allPassed &= !wildcard_match_unsafe_case_sensitive("miSsissippi", "mi*Sip*"); - allPassed &= wildcard_match_unsafe_case_sensitive("abAbac", "*Abac*"); - allPassed &= wildcard_match_unsafe_case_sensitive("abAbac", "*Abac*"); - allPassed &= wildcard_match_unsafe_case_sensitive("aAazz", "a*zz*"); - allPassed &= !wildcard_match_unsafe_case_sensitive("A12b12", "*12*23"); - allPassed &= wildcard_match_unsafe_case_sensitive("a12B12", "*12*12*"); - allPassed &= 
wildcard_match_unsafe_case_sensitive("oWn", "*oWn*"); - REQUIRE(allPassed == true); - } - - GIVEN("Completely tame (no wildcards) cases") { - allPassed &= wildcard_match_unsafe_case_sensitive("bLah", "bLah"); - allPassed &= !wildcard_match_unsafe_case_sensitive("bLah", "bLaH"); - REQUIRE(allPassed == true); - } - - GIVEN("Simple mixed wildcard tests suggested by IBMer Marlin Deckert") { - allPassed &= wildcard_match_unsafe_case_sensitive("a", "*?"); - allPassed &= wildcard_match_unsafe_case_sensitive("ab", "*?"); - allPassed &= wildcard_match_unsafe_case_sensitive("abc", "*?"); - REQUIRE(allPassed == true); - } - - GIVEN("More mixed wildcard tests including coverage for false positives") { - allPassed &= !wildcard_match_unsafe_case_sensitive("a", "??"); - allPassed &= wildcard_match_unsafe_case_sensitive("ab", "?*?"); - allPassed &= wildcard_match_unsafe_case_sensitive("ab", "*?*?*"); - allPassed &= wildcard_match_unsafe_case_sensitive("abcd", "?b*??"); - allPassed &= !wildcard_match_unsafe_case_sensitive("abcd", "?a*??"); - allPassed &= wildcard_match_unsafe_case_sensitive("abcde", "?*b*?*d*?"); - REQUIRE(allPassed == true); - } - - GIVEN("Single-character-match cases") { - allPassed &= wildcard_match_unsafe_case_sensitive("bLah", "bL?h"); - allPassed &= !wildcard_match_unsafe_case_sensitive("bLaaa", "bLa?"); - allPassed &= wildcard_match_unsafe_case_sensitive("bLah", "bLa?"); - allPassed &= !wildcard_match_unsafe_case_sensitive("bLaH", "?Lah"); - allPassed &= wildcard_match_unsafe_case_sensitive("bLaH", "?LaH"); - REQUIRE(allPassed == true); - } - - GIVEN("Many-wildcard scenarios") { - allPassed &= wildcard_match_unsafe_case_sensitive( - "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" - "aaaaaaaaaaaab", - "a*a*a*a*a*a*aa*aaa*a*a*b" - ); - allPassed &= wildcard_match_unsafe_case_sensitive( - "abababababababababababababababababababaacacacacacacacadaeafagahaiajakalaaaaaaa" - "aaaaaaaaaaffafagaagggagaaaaaaaab", - 
"*a*b*ba*ca*a*aa*aaa*fa*ga*b*" - ); - allPassed &= !wildcard_match_unsafe_case_sensitive( - "abababababababababababababababababababaacacacacacacacadaeafagahaiajakalaaaaaaa" - "aaaaaaaaaaffafagaagggagaaaaaaaab", - "*a*b*ba*ca*a*x*aaa*fa*ga*b*" - ); - allPassed &= !wildcard_match_unsafe_case_sensitive( - "abababababababababababababababababababaacacacacacacacadaeafagahaiajakalaaaaaaa" - "aaaaaaaaaaffafagaagggagaaaaaaaab", - "*a*b*ba*ca*aaaa*fa*ga*gggg*b*" - ); - allPassed &= wildcard_match_unsafe_case_sensitive( - "abababababababababababababababababababaacacacacacacacadaeafagahaiajakalaaaaaaa" - "aaaaaaaaaaffafagaagggagaaaaaaaab", - "*a*b*ba*ca*aaaa*fa*ga*ggg*b*" - ); - allPassed &= wildcard_match_unsafe_case_sensitive("aaabbaabbaab", "*aabbaa*a*"); - allPassed &= wildcard_match_unsafe_case_sensitive( - "a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*", - "a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*" - ); - allPassed &= wildcard_match_unsafe_case_sensitive( - "aaaaaaaaaaaaaaaaa", - "*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*" - ); - allPassed &= !wildcard_match_unsafe_case_sensitive( - "aaaaaaaaaaaaaaaa", - "*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*" - ); - allPassed &= !wildcard_match_unsafe_case_sensitive( - "abc*abcd*abcde*abcdef*abcdefg*abcdefgh*abcdefghi*abcdefghij*abcdefghijk*" - "abcdefghijkl*abcdefghijklm*abcdefghijklmn", - "abc*abc*abc*abc*abc*abc*abc*abc*abc*abc*abc*abc*abc*abc*abc*abc*abc*" - ); - allPassed &= wildcard_match_unsafe_case_sensitive( - "abc*abcd*abcde*abcdef*abcdefg*abcdefgh*abcdefghi*abcdefghij*abcdefghijk*" - "abcdefghijkl*abcdefghijklm*abcdefghijklmn", - "abc*abc*abc*abc*abc*abc*abc*abc*abc*abc*abc*abc*" - ); - allPassed &= !wildcard_match_unsafe_case_sensitive( - "abc*abcd*abcd*abc*abcd", - "abc*abc*abc*abc*abc" - ); - allPassed &= wildcard_match_unsafe_case_sensitive( - "abc*abcd*abcd*abc*abcd*abcd*abc*abcd*abc*abc*abcd", - "abc*abc*abc*abc*abc*abc*abc*abc*abc*abc*abcd" - ); - REQUIRE(allPassed == true); - } - - GIVEN("A case-insensitive algorithm test") { - bool 
isCaseSensitive = false; - allPassed &= wildcard_match_unsafe("mississippi", "*issip*PI", isCaseSensitive); - REQUIRE(allPassed == true); + auto const begin_timestamp = high_resolution_clock::now(); + for (size_t i = 0; i < 10'000; ++i) { + for (auto const& tame : lines) { + wildcard_match_unsafe_case_sensitive(tame, "*to*blk_1073742594_1770*"); } } -} + auto const end_timestamp = high_resolution_clock::now(); -SCENARIO("Test wild card performance", "[wildcard performance]") { - // This test is to ensure there is no performance regression - // We use our current implementation vs the next best implementation as a reference - // If performance becomes slower than our next best implementation, then it is considered a fail - - high_resolution_clock::time_point t1, t2; - string tameStr, wildStr; - - int const nReps = 1'000'000; - int testReps; - bool allPassed_currentImplementation = true; - bool allPassed_nextBestImplementation = true; - - /*********************************************************************************************** - * Inputs Begin - **********************************************************************************************/ - vector tameVec, wildVec; - - // Typical apache log - tameVec.push_back("64.242.88.10 - - [07/Mar/2004:16:06:51 -0800] \"GET " - "/twiki/bin/rdiff/TWiki/NewUserTemplate?rev1=1" - ".3&rev2=1.2 HTTP/1.1\" 200 4523"); - wildVec.push_back("*64.242.88.10*Mar/2004*GET*200*"); - - /*********************************************************************************************** - * Inputs End - **********************************************************************************************/ - - // Profile current implementation - testReps = nReps; - t1 = high_resolution_clock::now(); - while (testReps--) { - BOOST_FOREACH (boost::tie(tameStr, wildStr), boost::combine(tameVec, wildVec)) { - allPassed_currentImplementation - &= wildcard_match_unsafe_case_sensitive(tameStr, wildStr); - } - } - t2 = high_resolution_clock::now(); - 
duration timeSpan_currentImplementation = t2 - t1; - - // Profile next best implementation - testReps = nReps; - t1 = high_resolution_clock::now(); - while (testReps--) { - // Replace this part with slow implementation - BOOST_FOREACH (boost::tie(tameStr, wildStr), boost::combine(tameVec, wildVec)) { - allPassed_currentImplementation - &= wildcard_match_unsafe_case_sensitive(tameStr, wildStr); - } - } - t2 = high_resolution_clock::now(); - duration timeSpan_nextBestImplementation = t2 - t1; - REQUIRE(allPassed_currentImplementation == true); - - if (allPassed_currentImplementation) { - cout << "Passed performance test in " << (timeSpan_currentImplementation.count() * 1000) - << " milliseconds." << endl; - } else { - cout << "Failed performance test in " << (timeSpan_currentImplementation.count() * 1000) - << " milliseconds." << endl; - } + SPDLOG_INFO( + "wildcard_match_unsafe_case_sensitive performance test took {} milliseconds.", + std::chrono::duration_cast(end_timestamp - begin_timestamp) + .count() + ); } TEST_CASE("convert_string_to_int", "[convert_string_to_int]") {