From 249816b5cb0412ce40ac79cb57f5bfbbd6900c3e Mon Sep 17 00:00:00 2001 From: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> Date: Thu, 27 Jun 2024 00:40:34 -0400 Subject: [PATCH] Add support for validating and escaping UTF-8 strings. (#453) Co-authored-by: kirkrodrigues <2454684+kirkrodrigues@users.noreply.github.com> --- components/core/CMakeLists.txt | 5 + components/core/src/clp/ffi/utils.cpp | 89 +++++++++ components/core/src/clp/ffi/utils.hpp | 31 ++++ components/core/src/clp/utf8_utils.cpp | 55 ++++++ components/core/src/clp/utf8_utils.hpp | 144 +++++++++++++++ components/core/tests/test-utf8_utils.cpp | 209 ++++++++++++++++++++++ 6 files changed, 533 insertions(+) create mode 100644 components/core/src/clp/ffi/utils.cpp create mode 100644 components/core/src/clp/ffi/utils.hpp create mode 100644 components/core/src/clp/utf8_utils.cpp create mode 100644 components/core/src/clp/utf8_utils.hpp create mode 100644 components/core/tests/test-utf8_utils.cpp diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index e3b62047a..7cba49acb 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -326,6 +326,8 @@ set(SOURCE_FILES_unitTest src/clp/ffi/search/Subquery.hpp src/clp/ffi/search/WildcardToken.cpp src/clp/ffi/search/WildcardToken.hpp + src/clp/ffi/utils.cpp + src/clp/ffi/utils.hpp src/clp/FileDescriptor.cpp src/clp/FileDescriptor.hpp src/clp/FileReader.cpp @@ -438,6 +440,8 @@ set(SOURCE_FILES_unitTest src/clp/TraceableException.hpp src/clp/time_types.hpp src/clp/type_utils.hpp + src/clp/utf8_utils.cpp + src/clp/utf8_utils.hpp src/clp/Utils.cpp src/clp/Utils.hpp src/clp/VariableDictionaryEntry.cpp @@ -472,6 +476,7 @@ set(SOURCE_FILES_unitTest tests/test-StreamingCompression.cpp tests/test-string_utils.cpp tests/test-TimestampPattern.cpp + tests/test-utf8_utils.cpp tests/test-Utils.cpp ) add_executable(unitTest ${SOURCE_FILES_unitTest} ${SOURCE_FILES_clp_s_unitTest}) diff --git 
a/components/core/src/clp/ffi/utils.cpp b/components/core/src/clp/ffi/utils.cpp new file mode 100644 index 000000000..c85c47701 --- /dev/null +++ b/components/core/src/clp/ffi/utils.cpp @@ -0,0 +1,89 @@ +#include "utils.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../utf8_utils.hpp" + +using std::string; +using std::string_view; + +namespace clp::ffi { +auto validate_and_escape_utf8_string(string_view raw) -> std::optional { + std::optional ret_val; + auto& escaped{ret_val.emplace()}; + escaped.reserve(raw.size() + (raw.size() / 2)); + if (false == validate_and_append_escaped_utf8_string(raw, escaped)) { + return std::nullopt; + } + return ret_val; +} + +auto validate_and_append_escaped_utf8_string(std::string_view src, std::string& dst) -> bool { + string_view::const_iterator next_char_to_copy_it{src.cbegin()}; + + auto escape_handler = [&](string_view::const_iterator it) -> void { + // Allocate 6 + 1 size buffer to format control characters as "\u00bb", with the last byte + // used by `snprintf` to append '\0' + constexpr size_t cControlCharacterBufSize{7}; + std::array buf{}; + std::string_view escaped_char; + bool escape_required{true}; + switch (*it) { + case '\b': + escaped_char = "\\b"; + break; + case '\t': + escaped_char = "\\t"; + break; + case '\n': + escaped_char = "\\n"; + break; + case '\f': + escaped_char = "\\f"; + break; + case '\r': + escaped_char = "\\r"; + break; + case '\\': + escaped_char = "\\\\"; + break; + case '"': + escaped_char = "\\\""; + break; + default: { + constexpr uint8_t cLargestControlCharacter{0x1F}; + auto const byte{static_cast(*it)}; + if (cLargestControlCharacter >= byte) { + std::ignore = snprintf(buf.data(), buf.size(), "\\u00%02x", byte); + escaped_char = {buf.data(), buf.size() - 1}; + } else { + escape_required = false; + } + break; + } + } + if (escape_required) { + dst.append(next_char_to_copy_it, it); + dst += escaped_char; + next_char_to_copy_it = it + 1; + } + 
}; + + if (false == validate_utf8_string(src, escape_handler)) { + return false; + } + + if (src.cend() != next_char_to_copy_it) { + dst.append(next_char_to_copy_it, src.cend()); + } + + return true; +} +} // namespace clp::ffi diff --git a/components/core/src/clp/ffi/utils.hpp b/components/core/src/clp/ffi/utils.hpp new file mode 100644 index 000000000..26823da9c --- /dev/null +++ b/components/core/src/clp/ffi/utils.hpp @@ -0,0 +1,31 @@ +#ifndef CLP_FFI_UTILS_HPP +#define CLP_FFI_UTILS_HPP + +#include +#include +#include + +namespace clp::ffi { +/** + * Validates whether the given string is UTF-8 encoded, and escapes any characters to make the + * string compatible with the JSON specification. + * @param raw The raw string to escape. + * @return The escaped string on success. + * @return std::nullopt if the string contains any non-UTF-8-encoded byte sequences. + */ +[[nodiscard]] auto validate_and_escape_utf8_string(std::string_view raw +) -> std::optional; + +/** + * Validates whether `src` is UTF-8 encoded, and appends `src` to `dst` while escaping any + * characters to make the appended string compatible with the JSON specification. + * @param src The string to validate and escape. + * @param dst Returns `dst` with an escaped version of `src` appended. + * @return Whether `src` is a valid UTF-8-encoded string. NOTE: Even if `src` is not UTF-8 encoded, + * `dst` may be modified. 
+ */ +[[nodiscard]] auto +validate_and_append_escaped_utf8_string(std::string_view src, std::string& dst) -> bool; +} // namespace clp::ffi + +#endif // CLP_FFI_UTILS_HPP diff --git a/components/core/src/clp/utf8_utils.cpp b/components/core/src/clp/utf8_utils.cpp new file mode 100644 index 000000000..06fafd659 --- /dev/null +++ b/components/core/src/clp/utf8_utils.cpp @@ -0,0 +1,55 @@ +#include "utf8_utils.hpp" + +#include +#include +#include + +namespace clp { +auto is_utf8_encoded(std::string_view str) -> bool { + auto escape_handler = []([[maybe_unused]] std::string_view::const_iterator it) -> void {}; + return validate_utf8_string(str, escape_handler); +} + +namespace utf8_utils_internal { +auto parse_and_validate_lead_byte( + uint8_t byte, + size_t& num_continuation_bytes, + uint32_t& code_point, + uint32_t& code_point_lower_bound, + uint32_t& code_point_upper_bound +) -> bool { + if ((byte & cFourByteUtf8CharHeaderMask) == cFourByteUtf8CharHeader) { + num_continuation_bytes = 3; + code_point = (~cFourByteUtf8CharHeaderMask & byte); + code_point_lower_bound = cFourByteUtf8CharCodePointLowerBound; + code_point_upper_bound = cFourByteUtf8CharCodePointUpperBound; + } else if ((byte & cThreeByteUtf8CharHeaderMask) == cThreeByteUtf8CharHeader) { + num_continuation_bytes = 2; + code_point = (~cThreeByteUtf8CharHeaderMask & byte); + code_point_lower_bound = cThreeByteUtf8CharCodePointLowerBound; + code_point_upper_bound = cThreeByteUtf8CharCodePointUpperBound; + } else if ((byte & cTwoByteUtf8CharHeaderMask) == cTwoByteUtf8CharHeader) { + num_continuation_bytes = 1; + code_point = (~cTwoByteUtf8CharHeaderMask & byte); + code_point_lower_bound = cTwoByteUtf8CharCodePointLowerBound; + code_point_upper_bound = cTwoByteUtf8CharCodePointUpperBound; + } else { + return false; + } + return true; +} + +auto is_ascii_char(uint8_t byte) -> bool { + return cOneByteUtf8CharCodePointUpperBound >= byte; +} + +auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool { + return 
(byte & cUtf8ContinuationByteMask) == cUtf8ContinuationByteHeader; +} + +auto parse_continuation_byte(uint32_t code_point, uint8_t continuation_byte) -> uint32_t { + return (code_point << cUtf8NumContinuationByteCodePointBits) + + (continuation_byte & cUtf8ContinuationByteCodePointMask); +} +} // namespace utf8_utils_internal +} // namespace clp diff --git a/components/core/src/clp/utf8_utils.hpp b/components/core/src/clp/utf8_utils.hpp new file mode 100644 index 000000000..fe9569b00 --- /dev/null +++ b/components/core/src/clp/utf8_utils.hpp @@ -0,0 +1,144 @@ +#ifndef CLP_UTF8_UTILS_HPP +#define CLP_UTF8_UTILS_HPP + +#include +#include +#include + +namespace clp { +// Constants +// Lead byte signature +constexpr uint8_t cTwoByteUtf8CharHeaderMask{0xE0}; // 0b111x_xxxx +constexpr uint8_t cTwoByteUtf8CharHeader{0xC0}; // 0b110x_xxxx +constexpr uint8_t cThreeByteUtf8CharHeaderMask{0xF0}; // 0b1111_xxxx +constexpr uint8_t cThreeByteUtf8CharHeader{0xE0}; // 0b1110_xxxx +constexpr uint8_t cFourByteUtf8CharHeaderMask{0xF8}; // 0b1111_1xxx +constexpr uint8_t cFourByteUtf8CharHeader{0xF0}; // 0b1111_0xxx + +// Code point ranges (inclusive) +constexpr uint32_t cOneByteUtf8CharCodePointLowerBound{0}; +constexpr uint32_t cOneByteUtf8CharCodePointUpperBound{0x7F}; +constexpr uint32_t cTwoByteUtf8CharCodePointLowerBound{0x80}; +constexpr uint32_t cTwoByteUtf8CharCodePointUpperBound{0x7FF}; +constexpr uint32_t cThreeByteUtf8CharCodePointLowerBound{0x800}; +constexpr uint32_t cThreeByteUtf8CharCodePointUpperBound{0xFFFF}; +constexpr uint32_t cFourByteUtf8CharCodePointLowerBound{0x1'0000}; +constexpr uint32_t cFourByteUtf8CharCodePointUpperBound{0x10'FFFF}; + +// Continuation byte +constexpr uint32_t cUtf8ContinuationByteMask{0xC0}; +constexpr uint32_t cUtf8ContinuationByteHeader{0x80}; +constexpr uint32_t cUtf8ContinuationByteCodePointMask{0x3F}; +constexpr uint8_t cUtf8NumContinuationByteCodePointBits{6}; + +/** + * Validates whether the given string is UTF-8 encoded, optionally 
escaping ASCII characters using + * the given handler. + * @tparam EscapeHandler Method to optionally escape any ASCII character in the string. + * @param src + * @param escape_handler + * @return Whether the input is a valid UTF-8 encoded string. + */ +template +requires std::is_invocable_v +[[nodiscard]] auto validate_utf8_string(std::string_view src, EscapeHandler escape_handler) -> bool; + +/** + * @param str + * @return Whether the input is a valid UTF-8 encoded string. + */ +[[nodiscard]] auto is_utf8_encoded(std::string_view str) -> bool; + +namespace utf8_utils_internal { +/** + * Validates whether the given byte is a valid lead byte for a multi-byte UTF-8 character, parses + * the byte, and returns the parsed properties as well as associated properties. + * @param byte Byte to validate. + * @param num_continuation_bytes Returns the number of continuation bytes expected. + * @param code_point Returns the code point bits parsed from the lead byte. + * @param code_point_lower_bound Returns the lower bound of the code point range for the UTF-8 + * character. + * @param code_point_upper_bound Returns the upper bound of the code point range for the UTF-8 + * character. + * @return Whether the input byte is a valid lead byte for a multi-byte UTF-8 character. + */ +[[nodiscard]] auto parse_and_validate_lead_byte( + uint8_t byte, + size_t& num_continuation_bytes, + uint32_t& code_point, + uint32_t& code_point_lower_bound, + uint32_t& code_point_upper_bound +) -> bool; + +/** + * @param byte + * @return Whether the given byte is a valid ASCII character. + */ +[[nodiscard]] auto is_ascii_char(uint8_t byte) -> bool; + +/** + * @param byte + * @return Whether the input byte is a valid UTF-8 continuation byte. + */ +[[nodiscard]] auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool; + +/** + * Parses the code-point bits from the given continuation byte and combines them with the given + * code point. 
+ * @param code_point + * @param continuation_byte + * @return The updated code point. + */ +[[nodiscard]] auto +parse_continuation_byte(uint32_t code_point, uint8_t continuation_byte) -> uint32_t; +} // namespace utf8_utils_internal + +template +requires std::is_invocable_v +auto validate_utf8_string(std::string_view src, EscapeHandler escape_handler) -> bool { + size_t num_continuation_bytes_to_validate{0}; + uint32_t code_point{}; + uint32_t code_point_lower_bound{}; + uint32_t code_point_upper_bound{}; + + // NOLINTNEXTLINE(readability-qualified-auto) + for (auto it{src.cbegin()}; it != src.cend(); ++it) { + auto const byte{static_cast(*it)}; + if (0 == num_continuation_bytes_to_validate) { + if (utf8_utils_internal::is_ascii_char(byte)) { + escape_handler(it); + } else if (false + == utf8_utils_internal::parse_and_validate_lead_byte( + byte, + num_continuation_bytes_to_validate, + code_point, + code_point_lower_bound, + code_point_upper_bound + )) + { + return false; + } + } else { + if (false == utf8_utils_internal::is_valid_utf8_continuation_byte(byte)) { + return false; + } + code_point = utf8_utils_internal::parse_continuation_byte(code_point, byte); + --num_continuation_bytes_to_validate; + if (0 == num_continuation_bytes_to_validate + && (code_point < code_point_lower_bound || code_point_upper_bound < code_point)) + { + return false; + } + } + } + + if (0 != num_continuation_bytes_to_validate) { + // Incomplete UTF-8 character + return false; + } + + return true; +} +} // namespace clp + +#endif // CLP_UTF8_UTILS_HPP diff --git a/components/core/tests/test-utf8_utils.cpp b/components/core/tests/test-utf8_utils.cpp new file mode 100644 index 000000000..77324eaf9 --- /dev/null +++ b/components/core/tests/test-utf8_utils.cpp @@ -0,0 +1,209 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "../src/clp/ffi/utils.hpp" +#include "../src/clp/utf8_utils.hpp" + +using 
clp::ffi::validate_and_escape_utf8_string; +using clp::is_utf8_encoded; + +namespace { +/** + * @param raw + * @return The input string after escaping any characters that are invalid in JSON strings. + */ +[[nodiscard]] auto get_expected_escaped_string(std::string_view raw) -> std::string; + +/** + * Generates a UTF-8 encoded byte sequence with the given code point and number of continuation + * bytes. The range of the code point is not validated, which means the generated byte sequence can + * be invalid (overlong or exceeding the valid range of UTF-8 code points). + * @param code_point + * @param num_continuation_bytes + * @return The encoded UTF-8 byte sequence. + */ +[[nodiscard]] auto +generate_utf8_byte_sequence(uint32_t code_point, size_t num_continuation_bytes) -> std::string; + +auto get_expected_escaped_string(std::string_view raw) -> std::string { + nlohmann::json const json_str = raw; // Don't use '{}' initializer + auto const dumped_str{json_str.dump()}; + // Strip the quotes that nlohmann::json adds + return {dumped_str.begin() + 1, dumped_str.end() - 1}; +} + +auto generate_utf8_byte_sequence(uint32_t code_point, size_t num_continuation_bytes) + -> std::string { + REQUIRE((1 <= num_continuation_bytes && num_continuation_bytes <= 3)); + std::vector encoded_bytes; + while (encoded_bytes.size() < num_continuation_bytes) { + auto const least_significant_byte{static_cast(code_point)}; + encoded_bytes.push_back(static_cast( + (least_significant_byte & ~clp::cUtf8ContinuationByteMask) + | clp::cUtf8ContinuationByteHeader + )); + code_point >>= clp::cUtf8NumContinuationByteCodePointBits; + } + + uint8_t lead_byte_code_point_mask{}; + uint8_t lead_byte_header{}; + if (1 == num_continuation_bytes) { + lead_byte_code_point_mask = static_cast(~clp::cTwoByteUtf8CharHeaderMask); + lead_byte_header = clp::cTwoByteUtf8CharHeader; + } else if (2 == num_continuation_bytes) { + lead_byte_code_point_mask = static_cast(~clp::cThreeByteUtf8CharHeaderMask); + 
lead_byte_header = clp::cThreeByteUtf8CharHeader; + } else { // 3 == num_continuation_bytes + lead_byte_code_point_mask = static_cast(~clp::cFourByteUtf8CharHeaderMask); + lead_byte_header = clp::cFourByteUtf8CharHeader; + } + encoded_bytes.push_back(static_cast( + (static_cast(code_point) & lead_byte_code_point_mask) | lead_byte_header + )); + + return {encoded_bytes.rbegin(), encoded_bytes.rend()}; +} +} // namespace + +TEST_CASE("escape_utf8_string_basic", "[utf8_utils]") { + std::string test_str; + std::optional actual; + + // Test empty string + actual = validate_and_escape_utf8_string(test_str); + REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str))); + + // Test string that has nothing to escape + test_str = "This string has nothing to escape :)"; + actual = validate_and_escape_utf8_string(test_str); + REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str))); + + // Test string with all single byte UTF-8 characters, including those we escape. 
+ test_str.clear(); + for (uint8_t i{0}; i <= static_cast(INT8_MAX); ++i) { + test_str.push_back(static_cast(i)); + } + // Shuffle characters randomly + // NOLINTNEXTLINE(cert-msc32-c, cert-msc51-cpp) + std::shuffle(test_str.begin(), test_str.end(), std::default_random_engine{}); + actual = validate_and_escape_utf8_string(test_str); + REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str))); + + // Test valid UTF-8 chars with continuation bytes + std::vector const valid_utf8{ + "\n", + "\xF0\xA0\x80\x8F", // https://en.wiktionary.org/wiki/%F0%A0%80%8F + "a", + "\xE4\xB8\xAD", // https://en.wiktionary.org/wiki/%E4%B8%AD + "\x1F", + "\xC2\xA2", // ¢ + "\\" + }; + test_str.clear(); + for (auto const& str : valid_utf8) { + test_str.append(str); + } + actual = validate_and_escape_utf8_string(test_str); + REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str))); +} + +TEST_CASE("escape_utf8_string_with_invalid_continuation", "[utf8_utils]") { + std::string test_str; + + auto const valid_utf8_byte_sequence = GENERATE( + generate_utf8_byte_sequence(0x80, 1), + generate_utf8_byte_sequence(0x800, 2), + generate_utf8_byte_sequence(0x1'0000, 3) + ); + + // Test incomplete continuation bytes + auto const begin_it{valid_utf8_byte_sequence.cbegin()}; + std::string const valid{"Valid"}; + for (auto end_it{valid_utf8_byte_sequence.cend() - 1}; + valid_utf8_byte_sequence.cbegin() != end_it; + --end_it) + { + std::string const incomplete_byte_sequence{begin_it, end_it}; + + test_str = valid + incomplete_byte_sequence; + REQUIRE((false == is_utf8_encoded(test_str))); + REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); + + test_str = incomplete_byte_sequence + valid; + REQUIRE((false == is_utf8_encoded(test_str))); + REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); + } + + // Test invalid lead byte + test_str = valid_utf8_byte_sequence; + constexpr char 
cInvalidLeadByte{'\xFF'}; + test_str.front() = cInvalidLeadByte; + REQUIRE((false == is_utf8_encoded(test_str))); + REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); + + // Test invalid continuation bytes + for (size_t idx{1}; idx < valid_utf8_byte_sequence.size(); ++idx) { + test_str = valid_utf8_byte_sequence; + constexpr uint8_t cInvalidContinuationByteMask{0x40}; + test_str.at(idx) |= cInvalidContinuationByteMask; + REQUIRE((false == is_utf8_encoded(test_str))); + REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); + } +} + +TEST_CASE("validate_utf8_code_point_ranges", "[utf8_utils]") { + // Test 1 byte encoding code point range + for (auto code_point{clp::cOneByteUtf8CharCodePointLowerBound}; + code_point <= clp::cOneByteUtf8CharCodePointUpperBound; + ++code_point) + { + REQUIRE(is_utf8_encoded(std::string{static_cast(code_point)})); + REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 1)))); + REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 2)))); + REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 3)))); + } + + // Test 2 byte encoding code point range + for (auto code_point{clp::cTwoByteUtf8CharCodePointLowerBound}; + code_point <= clp::cTwoByteUtf8CharCodePointUpperBound; + ++code_point) + { + REQUIRE(is_utf8_encoded(generate_utf8_byte_sequence(code_point, 1))); + REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 2)))); + REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 3)))); + } + + // Test 3 byte encoding code point range + for (auto code_point{clp::cThreeByteUtf8CharCodePointLowerBound}; + code_point <= clp::cThreeByteUtf8CharCodePointUpperBound; + ++code_point) + { + REQUIRE(is_utf8_encoded(generate_utf8_byte_sequence(code_point, 2))); + REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 3)))); + } + + // Test 4 byte encoding code point range + for (auto 
code_point{clp::cFourByteUtf8CharCodePointLowerBound}; + code_point <= clp::cFourByteUtf8CharCodePointUpperBound; + ++code_point) + { + REQUIRE(is_utf8_encoded(generate_utf8_byte_sequence(code_point, 3))); + } + + // Test 4 byte encoding code point out of range + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) + for (auto code_point{clp::cFourByteUtf8CharCodePointUpperBound + 1}; code_point <= 0x1F'FFFF; + ++code_point) + { + REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 3)))); + } +}