Skip to content

Commit

Permalink
Remove duplicate string utils functions in clp-s and use clp::string_utils
Browse files Browse the repository at this point in the history
  • Loading branch information
Bingran Hu committed Dec 9, 2024
1 parent 4071667 commit f4f7335
Show file tree
Hide file tree
Showing 11 changed files with 56 additions and 446 deletions.
6 changes: 4 additions & 2 deletions components/core/src/clp_s/DictionaryReader.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,13 @@
#include <unordered_set>

#include <boost/algorithm/string/case_conv.hpp>
#include <clp/string_utils/string_utils.hpp>

#include "DictionaryEntry.hpp"
#include "Utils.hpp"

namespace clp_s {
using clp::string_utils::wildcard_match_unsafe;

template <typename DictionaryIdType, typename EntryType>
class DictionaryReader {
public:
Expand Down Expand Up @@ -200,7 +202,7 @@ void DictionaryReader<DictionaryIdType, EntryType>::get_entries_matching_wildcar
std::unordered_set<EntryType const*>& entries
) const {
for (auto const& entry : m_entries) {
if (StringUtils::wildcard_match_unsafe(entry.get_value(), wildcard_string, !ignore_case)) {
if (wildcard_match_unsafe(entry.get_value(), wildcard_string, !ignore_case)) {
entries.insert(&entry);
}
}
Expand Down
3 changes: 2 additions & 1 deletion components/core/src/clp_s/JsonParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <iostream>
#include <stack>

#include <clp/string_utils/string_utils.hpp>
#include <simdjson.h>
#include <spdlog/spdlog.h>

Expand All @@ -23,7 +24,7 @@ JsonParser::JsonParser(JsonParserOption const& option)

if (false == m_timestamp_key.empty()) {
if (false
== clp_s::StringUtils::tokenize_column_descriptor(m_timestamp_key, m_timestamp_column))
== clp::string_utils::tokenize_column_descriptor(m_timestamp_key, m_timestamp_column))
{
SPDLOG_ERROR("Can not parse invalid timestamp key: \"{}\"", m_timestamp_key);
throw OperationFailed(ErrorCodeBadParam, __FILENAME__, __LINE__);
Expand Down
5 changes: 2 additions & 3 deletions components/core/src/clp_s/TimestampDictionaryReader.cpp
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
#include "TimestampDictionaryReader.hpp"

#include <clp/string_utils/string_utils.hpp>
#include <unordered_set>

#include "Utils.hpp"

namespace clp_s {
void TimestampDictionaryReader::open(std::string const& dictionary_path) {
if (m_is_open) {
Expand Down Expand Up @@ -44,7 +43,7 @@ void TimestampDictionaryReader::read_new_entries() {
TimestampEntry entry;
std::vector<std::string> tokens;
entry.try_read_from_file(m_dictionary_decompressor);
if (false == StringUtils::tokenize_column_descriptor(entry.get_key_name(), tokens)) {
if (false == clp::string_utils::tokenize_column_descriptor(entry.get_key_name(), tokens)) {
throw OperationFailed(ErrorCodeCorrupt, __FILENAME__, __LINE__);
}
m_entries.emplace_back(std::move(entry));
Expand Down
283 changes: 5 additions & 278 deletions components/core/src/clp_s/Utils.cpp
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
#include "Utils.hpp"

#include <boost/filesystem.hpp>
#include <clp/string_utils/string_utils.hpp>
#include <spdlog/spdlog.h>

namespace clp_s {
using std::string;
using std::string_view;
using clp::string_utils::is_alphabet;
using clp::string_utils::is_decimal_digit;
using clp::string_utils::is_delim;

namespace clp_s {
bool FileUtils::find_all_files(std::string const& path, std::vector<std::string>& file_paths) {
try {
if (false == boost::filesystem::is_directory(path)) {
Expand Down Expand Up @@ -110,68 +114,6 @@ bool StringUtils::get_bounds_of_next_var(string const& msg, size_t& begin_pos, s
return (msg_length != begin_pos);
}

/**
 * Scans haystack from search_start_pos for the first character that equals any
 * character in needles.
 * @param haystack String to search
 * @param needles Null-terminated set of characters to look for
 * @param search_start_pos Index in haystack at which the scan begins
 * @param needle_ix On a hit, the index (within needles) of the character that
 * matched. It is only written when at least one haystack position is examined.
 * @return Index of the first matching character in haystack, or string::npos
 * if no character from needles is found
 */
size_t StringUtils::find_first_of(
        string const& haystack,
        char const* needles,
        size_t search_start_pos,
        size_t& needle_ix
) {
    auto const haystack_size = haystack.length();
    auto const num_needles = strlen(needles);

    size_t pos = search_start_pos;
    while (pos < haystack_size) {
        auto const candidate = haystack[pos];
        // Needles are tried in order, so the earliest needle wins on ties
        needle_ix = 0;
        while (needle_ix < num_needles) {
            if (needles[needle_ix] == candidate) {
                return pos;
            }
            ++needle_ix;
        }
        ++pos;
    }

    return string::npos;
}

/**
 * Builds a copy of value in which every character found in
 * characters_to_escape is substituted with the character at the same index in
 * replacement_characters.
 * @param characters_to_escape Null-terminated set of characters to replace
 * @param replacement_characters Replacements, indexed in parallel with
 * characters_to_escape
 * @param value The input string
 * @param escape Whether to emit a backslash in front of each replacement
 * @return The string with all replacements applied
 */
string StringUtils::replace_characters(
        char const* characters_to_escape,
        char const* replacement_characters,
        string const& value,
        bool escape
) {
    string result;
    size_t copy_from = 0;
    for (;;) {
        size_t matched_ix;
        auto const hit_pos = find_first_of(value, characters_to_escape, copy_from, matched_ix);
        if (string::npos == hit_pos) {
            break;
        }

        // Copy the untouched run that precedes the hit, then the substitute
        result.append(value, copy_from, hit_pos - copy_from);
        if (escape) {
            result += "\\";
        }
        result += replacement_characters[matched_ix];
        copy_from = hit_pos + 1;
    }

    // Nothing left to replace; copy the remaining tail verbatim
    result.append(value, copy_from, string::npos);
    return result;
}

/**
 * Lowercases str in place, one character at a time, using std::tolower.
 * @param str The string to convert
 */
void StringUtils::to_lower(string& str) {
    for (auto& ch : str) {
        // Go through unsigned char so std::tolower never receives a negative value
        ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
    }
}

/**
 * @param c
 * @return Whether c is one of the wildcard characters, '?' or '*'
 */
bool StringUtils::is_wildcard(char c) {
    return '?' == c || '*' == c;
}

bool StringUtils::has_unescaped_wildcards(std::string const& str) {
for (size_t i = 0; i < str.size(); ++i) {
if ('*' == str[i] || '?' == str[i]) {
Expand All @@ -184,42 +126,6 @@ bool StringUtils::has_unescaped_wildcards(std::string const& str) {
return false;
}

/**
 * Normalizes a wildcard search string:
 * - runs of consecutive unescaped '*' are collapsed into a single '*';
 * - a backslash is kept only when it escapes a wildcard or another backslash;
 *   any other escaped character is emitted without the backslash;
 * - a trailing backslash with nothing after it is dropped.
 * @param str The raw wildcard search string
 * @return The cleaned-up string
 */
string StringUtils::clean_up_wildcard_search_string(string_view str) {
    string result;
    bool pending_escape = false;
    size_t const length = str.length();

    size_t pos = 0;
    while (pos < length) {
        char const ch = str[pos];

        if (pending_escape) {
            pending_escape = false;
            // Only wildcards and backslashes need to stay escaped
            if (is_wildcard(ch) || '\\' == ch) {
                result += '\\';
            }
            result += ch;
            ++pos;
            continue;
        }

        if ('*' == ch) {
            result += ch;
            // Swallow the rest of this run of '*'
            ++pos;
            while (pos < length && '*' == str[pos]) {
                ++pos;
            }
            continue;
        }

        if ('\\' == ch) {
            // Defer the decision until the escaped character is seen
            pending_escape = true;
        } else {
            result += ch;
        }
        ++pos;
    }

    return result;
}

bool StringUtils::advance_tame_to_next_match(
char const*& tame_current,
char const*& tame_bookmark,
Expand Down Expand Up @@ -261,154 +167,6 @@ bool StringUtils::advance_tame_to_next_match(
return true;
}

/**
 * Checks whether tame matches the wildcard string wild, optionally ignoring
 * case.
 * @param tame The string to test
 * @param wild The wildcard string
 * @param case_sensitive_match Whether the comparison is case sensitive
 * @return Result of wildcard_match_unsafe_case_sensitive on the (possibly
 * lowercased) inputs
 */
bool StringUtils::wildcard_match_unsafe(
        string_view tame,
        string_view wild,
        bool case_sensitive_match
) {
    if (false == case_sensitive_match) {
        // Case-insensitive matching is done by lowercasing copies of both
        // inputs (lowercase rather than uppercase, anticipating that callers
        // mostly pass lowercase so little needs to change).
        string tame_lowered(tame);
        string wild_lowered(wild);
        to_lower(tame_lowered);
        to_lower(wild_lowered);
        return wildcard_match_unsafe_case_sensitive(tame_lowered, wild_lowered);
    }

    return wildcard_match_unsafe_case_sensitive(tame, wild);
}

/**
 * Checks whether tame matches the wildcard string wild, case-sensitively.
 * In wild, '*' matches any run of characters, an unescaped '?' matches any
 * single character, and a backslash makes the following character literal.
 *
 * The algorithm basically works as follows:
 * Given a wild string "*abc*def*ghi*", it can be broken into groups of
 * characters delimited by one or more '*' characters. The goal of the
 * algorithm is then to determine whether the tame string contains each of
 * those groups in the same order.
 *
 * Thus, the algorithm:
 * 1. searches for the start of one of these groups in wild,
 * 2. searches for a group in tame starting with the same character, and then
 * 3. checks if the two match. If not, the search repeats with the next group in
 *    tame.
 *
 * NOTE: This method is "unsafe" in that the caller must ensure wild contains
 * no dangling escape character (a backslash with nothing after it); the
 * character after a backslash is read without a bounds check.
 *
 * @param tame The string to test
 * @param wild The wildcard string
 * @return Whether tame matches wild
 */
bool StringUtils::wildcard_match_unsafe_case_sensitive(string_view tame, string_view wild) {
    auto const tame_length = tame.length();
    auto const wild_length = wild.length();
    char const* tame_current = tame.data();
    char const* wild_current = wild.data();
    char const* tame_bookmark = nullptr;
    char const* wild_bookmark = nullptr;
    char const* tame_end = tame_current + tame_length;
    char const* wild_end = wild_current + wild_length;

    // Handle wild or tame being empty
    if (0 == wild_length) {
        return 0 == tame_length;
    }
    if (0 == tame_length) {
        return "*" == wild;
    }

    while (true) {
        char w = *wild_current;
        if ('*' == w) {
            ++wild_current;
            if (wild_end == wild_current) {
                // Trailing '*' means everything remaining in tame will match
                return true;
            }

            // Set wild and tame bookmarks
            wild_bookmark = wild_current;
            if (false
                == advance_tame_to_next_match(
                        tame_current,
                        tame_bookmark,
                        tame_end,
                        wild_current,
                        wild_bookmark
                ))
            {
                return false;
            }
        } else {
            // Whether w is escaped applies only to the character compared in
            // this iteration, so the flag must start out false every time.
            // Otherwise a single escape sequence would turn every later '?'
            // in wild into a literal.
            bool is_escaped = false;

            // Handle escaped characters
            if ('\\' == w) {
                is_escaped = true;
                ++wild_current;
                // This is safe without a bounds check since the caller
                // ensures there are no dangling escape characters
                w = *wild_current;
            }

            // Handle a mismatch
            char const t = *tame_current;
            if (false == ((false == is_escaped && '?' == w) || t == w)) {
                if (nullptr == wild_bookmark) {
                    // No bookmark to return to
                    return false;
                }

                // Retry the current group one character further into tame
                wild_current = wild_bookmark;
                tame_current = tame_bookmark + 1;
                if (false
                    == advance_tame_to_next_match(
                            tame_current,
                            tame_bookmark,
                            tame_end,
                            wild_current,
                            wild_bookmark
                    ))
                {
                    return false;
                }
            }
        }

        ++tame_current;
        ++wild_current;

        // Handle reaching the end of tame or wild
        if (tame_end == tame_current) {
            // Tame is exhausted: wild must be exhausted too, or have exactly
            // one trailing '*' left
            return (wild_end == wild_current
                    || ('*' == *wild_current && (wild_current + 1) == wild_end));
        }
        if (wild_end == wild_current) {
            // Wild is exhausted but tame is not
            if (nullptr == wild_bookmark) {
                // No bookmark to return to
                return false;
            }

            wild_current = wild_bookmark;
            tame_current = tame_bookmark + 1;
            if (false
                == advance_tame_to_next_match(
                        tame_current,
                        tame_bookmark,
                        tame_end,
                        wild_current,
                        wild_bookmark
                ))
            {
                return false;
            }
        }
    }
}

/**
 * Converts the entirety of raw into a 64-bit signed integer.
 * @param raw The text to convert
 * @param converted Receives the parsed value on success
 * @return true if all of raw was consumed and parsed without error; false if
 * raw is empty, contains trailing or non-numeric characters, or the value is
 * out of range for int64_t
 */
bool StringUtils::convert_string_to_int64(std::string_view raw, int64_t& converted) {
    // std::from_chars takes char const* bounds. string_view iterators are not
    // guaranteed to be raw pointers, so derive the range from data()/size().
    char const* const raw_begin = raw.data();
    char const* const raw_end = raw_begin + raw.size();

    auto const result = std::from_chars(raw_begin, raw_end, converted);

    // Reject partial parses (e.g. "12abc") as well as conversion errors
    return raw_end == result.ptr && std::errc() == result.ec;
}

bool StringUtils::convert_string_to_double(std::string const& raw, double& converted) {
if (raw.empty()) {
// Can't convert an empty string
Expand All @@ -426,35 +184,4 @@ bool StringUtils::convert_string_to_double(std::string const& raw, double& conve
converted = raw_as_double;
return true;
}

/**
 * Splits a column descriptor into tokens on unescaped '.' characters. A
 * backslash causes the character after it to be taken literally (including
 * '.' and '\\'); the backslash itself is not kept.
 * @param descriptor The descriptor to split
 * @param tokens Receives the tokens, appended in order. Tokens found before a
 * failure is detected remain appended.
 * @return false if any token is empty or the descriptor ends with a dangling
 * backslash, true otherwise
 */
bool StringUtils::tokenize_column_descriptor(
        std::string const& descriptor,
        std::vector<std::string>& tokens
) {
    // TODO: add support for unicode sequences e.g. \u263A
    std::string pending;
    auto const length = descriptor.size();

    size_t pos = 0;
    while (pos < length) {
        char ch = descriptor[pos];

        if ('.' == ch) {
            // Unescaped separator: close the current token
            if (pending.empty()) {
                return false;
            }
            tokens.push_back(pending);
            pending.clear();
            ++pos;
            continue;
        }

        if ('\\' == ch) {
            // Take the escaped character literally
            ++pos;
            if (pos >= length) {
                return false;
            }
            ch = descriptor[pos];
        }

        pending.push_back(ch);
        ++pos;
    }

    if (pending.empty()) {
        return false;
    }
    tokens.push_back(pending);
    return true;
}
} // namespace clp_s
Loading

0 comments on commit f4f7335

Please sign in to comment.