Skip to content

Commit

Permalink
clp-core: Replace calls to incomplete UTF-8 validation function with …
Browse files Browse the repository at this point in the history
…new and complete implementation. (y-scope#477)
  • Loading branch information
LinZhihao-723 authored Jul 11, 2024
1 parent 6e21665 commit 437607a
Show file tree
Hide file tree
Showing 4 changed files with 7 additions and 46 deletions.
2 changes: 2 additions & 0 deletions components/core/src/clp/clp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,8 @@ set(
../TimestampPattern.hpp
../TraceableException.hpp
../type_utils.hpp
../utf8_utils.cpp
../utf8_utils.hpp
../Utils.cpp
../Utils.hpp
../VariableDictionaryEntry.cpp
Expand Down
9 changes: 5 additions & 4 deletions components/core/src/clp/clp/FileCompressor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "../LogSurgeonReader.hpp"
#include "../Profiler.hpp"
#include "../streaming_archive/writer/utils.hpp"
#include "../utf8_utils.hpp"
#include "utils.hpp"

using clp::ir::eight_byte_encoded_variable_t;
Expand Down Expand Up @@ -145,8 +146,8 @@ bool FileCompressor::compress_file(
size_t peek_size{0};
m_file_reader.peek_buffered_data(utf8_validation_buf, peek_size);
bool succeeded = true;
auto utf8_validation_buf_len = std::min(peek_size, cUtfMaxValidationLen);
if (is_utf8_sequence(utf8_validation_buf_len, utf8_validation_buf)) {
auto const utf8_validation_buf_len = std::min(peek_size, cUtfMaxValidationLen);
if (is_utf8_encoded({utf8_validation_buf, utf8_validation_buf_len})) {
if (use_heuristic) {
parse_and_encode_with_heuristic(
target_data_size_of_dicts,
Expand Down Expand Up @@ -359,8 +360,8 @@ bool FileCompressor::try_compressing_as_archive(
size_t peek_size{0};
m_libarchive_file_reader.peek_buffered_data(utf8_validation_buf, peek_size);
string file_path{m_libarchive_reader.get_path()};
auto utf8_validation_buf_len = std::min(peek_size, cUtfMaxValidationLen);
if (is_utf8_sequence(utf8_validation_buf_len, utf8_validation_buf)) {
auto const utf8_validation_buf_len = std::min(peek_size, cUtfMaxValidationLen);
if (is_utf8_encoded({utf8_validation_buf, utf8_validation_buf_len})) {
auto boost_path_for_compression = parent_boost_path / file_path;
if (use_heuristic) {
parse_and_encode_with_heuristic(
Expand Down
34 changes: 0 additions & 34 deletions components/core/src/clp/clp/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,40 +86,6 @@ bool find_all_files_and_empty_directories(
return true;
}

/**
 * Checks whether the first `sequence_length` bytes of `sequence` are well-formed UTF-8.
 *
 * In addition to checking the lead-byte/continuation-byte structure, this rejects byte sequences
 * that are structurally plausible but not well-formed UTF-8:
 * - overlong encodings (lead bytes 0xC0/0xC1, 0xE0 followed by 0x80-0x9F, 0xF0 followed by
 *   0x80-0x8F),
 * - UTF-16 surrogate code points U+D800-U+DFFF (0xED followed by 0xA0-0xBF),
 * - code points above U+10FFFF (0xF4 followed by 0x90-0xBF, and lead bytes 0xF5-0xFF).
 *
 * A multi-byte character that is cut off by the end of the sequence is NOT treated as an error:
 * every byte that is present must be valid, but missing trailing bytes are tolerated. This keeps
 * the function usable on a length-capped prefix of a larger buffer, where the cut may fall in the
 * middle of a character.
 *
 * @param sequence_length Number of bytes of `sequence` to examine; 0 yields true
 * @param sequence Pointer to the bytes to validate; only indices [0, sequence_length) are read
 * @return true if no ill-formed byte was found, false otherwise
 */
bool is_utf8_sequence(size_t sequence_length, char const* sequence) {
    constexpr unsigned char cDefaultContinuationMin = 0x80;
    constexpr unsigned char cDefaultContinuationMax = 0xBF;

    size_t num_continuation_bytes_to_read = 0;
    // Inclusive range allowed for the next continuation byte. Only the first continuation byte of
    // a character can have a narrower range than 0x80-0xBF; it's reset after that byte is read.
    unsigned char continuation_min = cDefaultContinuationMin;
    unsigned char continuation_max = cDefaultContinuationMax;

    for (size_t i = 0; i < sequence_length; ++i) {
        // Work on an unsigned value so comparisons don't depend on whether `char` is signed
        auto const byte = static_cast<unsigned char>(sequence[i]);

        if (num_continuation_bytes_to_read > 0) {
            if (byte < continuation_min || byte > continuation_max) {
                return false;
            }
            continuation_min = cDefaultContinuationMin;
            continuation_max = cDefaultContinuationMax;
            --num_continuation_bytes_to_read;
            continue;
        }

        if (byte < 0x80) {
            // ASCII
            continue;
        }

        if (0xC2 <= byte && byte <= 0xDF) {
            // 2-byte character; 0xC0 and 0xC1 are excluded since they can only encode overlong
            // forms of U+0000-U+007F
            num_continuation_bytes_to_read = 1;
        } else if (0xE0 <= byte && byte <= 0xEF) {
            // 3-byte character
            num_continuation_bytes_to_read = 2;
            if (0xE0 == byte) {
                // Exclude overlong encodings of code points below U+0800
                continuation_min = 0xA0;
            } else if (0xED == byte) {
                // Exclude UTF-16 surrogates U+D800-U+DFFF
                continuation_max = 0x9F;
            }
        } else if (0xF0 <= byte && byte <= 0xF4) {
            // 4-byte character
            num_continuation_bytes_to_read = 3;
            if (0xF0 == byte) {
                // Exclude overlong encodings of code points below U+10000
                continuation_min = 0x90;
            } else if (0xF4 == byte) {
                // Exclude code points above U+10FFFF
                continuation_max = 0x8F;
            }
        } else {
            // Stray continuation byte (0x80-0xBF) or a byte that can never start a character
            // (0xC0, 0xC1, 0xF5-0xFF)
            return false;
        }
    }

    // NOTE: A character truncated by the end of the sequence is deliberately accepted (see the
    // function's documentation).
    return true;
}

bool read_input_paths(string const& list_path, vector<string>& paths) {
ErrorCode error_code = read_list_of_paths(list_path, paths);
if (ErrorCode_Success != error_code) {
Expand Down
8 changes: 0 additions & 8 deletions components/core/src/clp/clp/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,6 @@ bool find_all_files_and_empty_directories(
std::vector<std::string>& empty_directory_paths
);

/**
 * Checks if the given sequence is valid UTF-8
 *
 * NOTE: Only the first `sequence_length` bytes are examined, so `sequence` doesn't need to be
 * null-terminated. A multi-byte character that is cut off by the end of the sequence is not
 * treated as an error, and an empty sequence is considered valid.
 * @param sequence_length Number of bytes in `sequence` to validate
 * @param sequence Pointer to the first byte of the sequence to validate
 * @return true if valid, false otherwise
 */
bool is_utf8_sequence(size_t sequence_length, char const* sequence);

/**
* Reads a list of input paths
* @param list_path
Expand Down

0 comments on commit 437607a

Please sign in to comment.