Remove duplicate string utils functions in clp-s and use clp::string_utils
Bill-hbrhbr · Dec 9, 2024 · f4f7335 · f4f7335
1 parent 4071667
commit f4f7335
Show file tree

Hide file tree

Showing 11 changed files with 56 additions and 446 deletions.
diff --git a/components/core/src/clp_s/DictionaryReader.hpp b/components/core/src/clp_s/DictionaryReader.hpp
@@ -6,11 +6,13 @@
 #include <unordered_set>
 
 #include <boost/algorithm/string/case_conv.hpp>
+#include <clp/string_utils/string_utils.hpp>
 
 #include "DictionaryEntry.hpp"
-#include "Utils.hpp"
 
 namespace clp_s {
+using clp::string_utils::wildcard_match_unsafe;
+
 template <typename DictionaryIdType, typename EntryType>
 class DictionaryReader {
 public:
@@ -200,7 +202,7 @@ void DictionaryReader<DictionaryIdType, EntryType>::get_entries_matching_wildcar
         std::unordered_set<EntryType const*>& entries
 ) const {
     for (auto const& entry : m_entries) {
-        if (StringUtils::wildcard_match_unsafe(entry.get_value(), wildcard_string, !ignore_case)) {
+        if (wildcard_match_unsafe(entry.get_value(), wildcard_string, !ignore_case)) {
             entries.insert(&entry);
         }
     }

diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp
@@ -3,6 +3,7 @@
 #include <iostream>
 #include <stack>
 
+#include <clp/string_utils/string_utils.hpp>
 #include <simdjson.h>
 #include <spdlog/spdlog.h>
 
@@ -23,7 +24,7 @@ JsonParser::JsonParser(JsonParserOption const& option)
 
     if (false == m_timestamp_key.empty()) {
         if (false
-            == clp_s::StringUtils::tokenize_column_descriptor(m_timestamp_key, m_timestamp_column))
+            == clp::string_utils::tokenize_column_descriptor(m_timestamp_key, m_timestamp_column))
         {
             SPDLOG_ERROR("Can not parse invalid timestamp key: \"{}\"", m_timestamp_key);
             throw OperationFailed(ErrorCodeBadParam, __FILENAME__, __LINE__);

diff --git a/components/core/src/clp_s/TimestampDictionaryReader.cpp b/components/core/src/clp_s/TimestampDictionaryReader.cpp
@@ -1,9 +1,8 @@
 #include "TimestampDictionaryReader.hpp"
 
+#include <clp/string_utils/string_utils.hpp>
 #include <unordered_set>
 
-#include "Utils.hpp"
-
 namespace clp_s {
 void TimestampDictionaryReader::open(std::string const& dictionary_path) {
     if (m_is_open) {
@@ -44,7 +43,7 @@ void TimestampDictionaryReader::read_new_entries() {
         TimestampEntry entry;
         std::vector<std::string> tokens;
         entry.try_read_from_file(m_dictionary_decompressor);
-        if (false == StringUtils::tokenize_column_descriptor(entry.get_key_name(), tokens)) {
+        if (false == clp::string_utils::tokenize_column_descriptor(entry.get_key_name(), tokens)) {
             throw OperationFailed(ErrorCodeCorrupt, __FILENAME__, __LINE__);
         }
         m_entries.emplace_back(std::move(entry));

diff --git a/components/core/src/clp_s/Utils.cpp b/components/core/src/clp_s/Utils.cpp
@@ -1,12 +1,16 @@
 #include "Utils.hpp"
 
 #include <boost/filesystem.hpp>
+#include <clp/string_utils/string_utils.hpp>
 #include <spdlog/spdlog.h>
 
+namespace clp_s {
 using std::string;
 using std::string_view;
+using clp::string_utils::is_alphabet;
+using clp::string_utils::is_decimal_digit;
+using clp::string_utils::is_delim;
 
-namespace clp_s {
 bool FileUtils::find_all_files(std::string const& path, std::vector<std::string>& file_paths) {
     try {
         if (false == boost::filesystem::is_directory(path)) {
@@ -110,68 +114,6 @@ bool StringUtils::get_bounds_of_next_var(string const& msg, size_t& begin_pos, s
     return (msg_length != begin_pos);
 }
 
-size_t StringUtils::find_first_of(
-        string const& haystack,
-        char const* needles,
-        size_t search_start_pos,
-        size_t& needle_ix
-) {
-    size_t haystack_length = haystack.length();
-    size_t needles_length = strlen(needles);
-    for (size_t i = search_start_pos; i < haystack_length; ++i) {
-        for (needle_ix = 0; needle_ix < needles_length; ++needle_ix) {
-            if (haystack[i] == needles[needle_ix]) {
-                return i;
-            }
-        }
-    }
-
-    return string::npos;
-}
-
-string StringUtils::replace_characters(
-        char const* characters_to_escape,
-        char const* replacement_characters,
-        string const& value,
-        bool escape
-) {
-    string new_value;
-    size_t search_start_pos = 0;
-    while (true) {
-        size_t replace_char_ix;
-        size_t char_to_replace_pos
-                = find_first_of(value, characters_to_escape, search_start_pos, replace_char_ix);
-        if (string::npos == char_to_replace_pos) {
-            new_value.append(value, search_start_pos, string::npos);
-            break;
-        } else {
-            new_value.append(value, search_start_pos, char_to_replace_pos - search_start_pos);
-            if (escape) {
-                new_value += "\\";
-            }
-            new_value += replacement_characters[replace_char_ix];
-            search_start_pos = char_to_replace_pos + 1;
-        }
-    }
-    return new_value;
-}
-
-void StringUtils::to_lower(string& str) {
-    std::transform(str.cbegin(), str.cend(), str.begin(), [](unsigned char c) {
-        return std::tolower(c);
-    });
-}
-
-bool StringUtils::is_wildcard(char c) {
-    static constexpr char cWildcards[] = "?*";
-    for (size_t i = 0; i < strlen(cWildcards); ++i) {
-        if (cWildcards[i] == c) {
-            return true;
-        }
-    }
-    return false;
-}
-
 bool StringUtils::has_unescaped_wildcards(std::string const& str) {
     for (size_t i = 0; i < str.size(); ++i) {
         if ('*' == str[i] || '?' == str[i]) {
@@ -184,42 +126,6 @@ bool StringUtils::has_unescaped_wildcards(std::string const& str) {
     return false;
 }
 
-string StringUtils::clean_up_wildcard_search_string(string_view str) {
-    string cleaned_str;
-
-    bool is_escaped = false;
-    auto str_end = str.cend();
-    for (auto current = str.cbegin(); current != str_end;) {
-        auto c = *current;
-        if (is_escaped) {
-            is_escaped = false;
-
-            if (is_wildcard(c) || '\\' == c) {
-                // Keep escaping if c is a wildcard character or an escape character
-                cleaned_str += '\\';
-            }
-            cleaned_str += c;
-            ++current;
-        } else if ('*' == c) {
-            cleaned_str += c;
-
-            // Skip over all '*' to find the next non-'*'
-            do {
-                ++current;
-            } while (current != str_end && '*' == *current);
-        } else {
-            if ('\\' == c) {
-                is_escaped = true;
-            } else {
-                cleaned_str += c;
-            }
-            ++current;
-        }
-    }
-
-    return cleaned_str;
-}
-
 bool StringUtils::advance_tame_to_next_match(
         char const*& tame_current,
         char const*& tame_bookmark,
@@ -261,154 +167,6 @@ bool StringUtils::advance_tame_to_next_match(
     return true;
 }
 
-bool StringUtils::wildcard_match_unsafe(
-        string_view tame,
-        string_view wild,
-        bool case_sensitive_match
-) {
-    if (case_sensitive_match) {
-        return wildcard_match_unsafe_case_sensitive(tame, wild);
-    } else {
-        // We convert to lowercase (rather than uppercase) anticipating that
-        // callers use lowercase more frequently, so little will need to change.
-        string lowercase_tame(tame);
-        to_lower(lowercase_tame);
-        string lowercase_wild(wild);
-        to_lower(lowercase_wild);
-        return wildcard_match_unsafe_case_sensitive(lowercase_tame, lowercase_wild);
-    }
-}
-
-/**
- * The algorithm basically works as follows:
- * Given a wild string "*abc*def*ghi*", it can be broken into groups of
- * characters delimited by one or more '*' characters. The goal of the
- * algorithm is then to determine whether the tame string contains each of
- * those groups in the same order.
- *
- * Thus, the algorithm:
- * 1. searches for the start of one of these groups in wild,
- * 2. searches for a group in tame starting with the same character, and then
- * 3. checks if the two match. If not, the search repeats with the next group in
- *    tame.
- */
-bool StringUtils::wildcard_match_unsafe_case_sensitive(string_view tame, string_view wild) {
-    auto const tame_length = tame.length();
-    auto const wild_length = wild.length();
-    char const* tame_current = tame.data();
-    char const* wild_current = wild.data();
-    char const* tame_bookmark = nullptr;
-    char const* wild_bookmark = nullptr;
-    char const* tame_end = tame_current + tame_length;
-    char const* wild_end = wild_current + wild_length;
-
-    // Handle wild or tame being empty
-    if (0 == wild_length) {
-        return 0 == tame_length;
-    } else {
-        if (0 == tame_length) {
-            return "*" == wild;
-        }
-    }
-
-    char w;
-    char t;
-    bool is_escaped = false;
-    while (true) {
-        w = *wild_current;
-        if ('*' == w) {
-            ++wild_current;
-            if (wild_end == wild_current) {
-                // Trailing '*' means everything remaining in tame will match
-                return true;
-            }
-
-            // Set wild and tame bookmarks
-            wild_bookmark = wild_current;
-            if (!advance_tame_to_next_match(
-                        tame_current,
-                        tame_bookmark,
-                        tame_end,
-                        wild_current,
-                        wild_bookmark
-                ))
-            {
-                return false;
-            }
-        } else {
-            // Handle escaped characters
-            if ('\\' == w) {
-                is_escaped = true;
-                ++wild_current;
-                // This is safe without a bounds check since this the caller
-                // ensures there are no dangling escape characters
-                w = *wild_current;
-            }
-
-            // Handle a mismatch
-            t = *tame_current;
-            if (false == ((false == is_escaped && '?' == w) || t == w)) {
-                if (nullptr == wild_bookmark) {
-                    // No bookmark to return to
-                    return false;
-                }
-
-                wild_current = wild_bookmark;
-                tame_current = tame_bookmark + 1;
-                if (!advance_tame_to_next_match(
-                            tame_current,
-                            tame_bookmark,
-                            tame_end,
-                            wild_current,
-                            wild_bookmark
-                    ))
-                {
-                    return false;
-                }
-            }
-        }
-
-        ++tame_current;
-        ++wild_current;
-
-        // Handle reaching the end of tame or wild
-        if (tame_end == tame_current) {
-            return (wild_end == wild_current
-                    || ('*' == *wild_current && (wild_current + 1) == wild_end));
-        } else {
-            if (wild_end == wild_current) {
-                if (nullptr == wild_bookmark) {
-                    // No bookmark to return to
-                    return false;
-                } else {
-                    wild_current = wild_bookmark;
-                    tame_current = tame_bookmark + 1;
-                    if (!advance_tame_to_next_match(
-                                tame_current,
-                                tame_bookmark,
-                                tame_end,
-                                wild_current,
-                                wild_bookmark
-                        ))
-                    {
-                        return false;
-                    }
-                }
-            }
-        }
-    }
-}
-
-bool StringUtils::convert_string_to_int64(std::string_view raw, int64_t& converted) {
-    auto raw_end = raw.cend();
-    auto result = std::from_chars(raw.cbegin(), raw_end, converted);
-    if (raw_end != result.ptr) {
-        return false;
-    } else {
-        return result.ec == std::errc();
-    }
-}
-
 bool StringUtils::convert_string_to_double(std::string const& raw, double& converted) {
     if (raw.empty()) {
         // Can't convert an empty string
@@ -426,35 +184,4 @@ bool StringUtils::convert_string_to_double(std::string const& raw, double& conve
     converted = raw_as_double;
     return true;
 }
-
-bool StringUtils::tokenize_column_descriptor(
-        std::string const& descriptor,
-        std::vector<std::string>& tokens
-) {
-    // TODO: add support for unicode sequences e.g. \u263A
-    std::string cur_tok;
-    for (size_t cur = 0; cur < descriptor.size(); ++cur) {
-        if ('\\' == descriptor[cur]) {
-            ++cur;
-            if (cur >= descriptor.size()) {
-                return false;
-            }
-        } else if ('.' == descriptor[cur]) {
-            if (cur_tok.empty()) {
-                return false;
-            }
-            tokens.push_back(cur_tok);
-            cur_tok.clear();
-            continue;
-        }
-        cur_tok.push_back(descriptor[cur]);
-    }
-
-    if (cur_tok.empty()) {
-        return false;
-    }
-
-    tokens.push_back(cur_tok);
-    return true;
-}
 }  // namespace clp_s