From 249816b5cb0412ce40ac79cb57f5bfbbd6900c3e Mon Sep 17 00:00:00 2001
From: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com>
Date: Thu, 27 Jun 2024 00:40:34 -0400
Subject: [PATCH] Add support for validating and escaping UTF-8 strings. (#453)

Co-authored-by: kirkrodrigues <2454684+kirkrodrigues@users.noreply.github.com>
---
 components/core/CMakeLists.txt            |   5 +
 components/core/src/clp/ffi/utils.cpp     |  89 +++++++++
 components/core/src/clp/ffi/utils.hpp     |  31 ++++
 components/core/src/clp/utf8_utils.cpp    |  55 ++++++
 components/core/src/clp/utf8_utils.hpp    | 144 +++++++++++++++
 components/core/tests/test-utf8_utils.cpp | 209 ++++++++++++++++++++++
 6 files changed, 533 insertions(+)
 create mode 100644 components/core/src/clp/ffi/utils.cpp
 create mode 100644 components/core/src/clp/ffi/utils.hpp
 create mode 100644 components/core/src/clp/utf8_utils.cpp
 create mode 100644 components/core/src/clp/utf8_utils.hpp
 create mode 100644 components/core/tests/test-utf8_utils.cpp
diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt
index e3b62047a..7cba49acb 100644
--- a/components/core/CMakeLists.txt
+++ b/components/core/CMakeLists.txt
@@ -326,6 +326,8 @@ set(SOURCE_FILES_unitTest
         src/clp/ffi/search/Subquery.hpp
         src/clp/ffi/search/WildcardToken.cpp
         src/clp/ffi/search/WildcardToken.hpp
+        src/clp/ffi/utils.cpp
+        src/clp/ffi/utils.hpp
         src/clp/FileDescriptor.cpp
         src/clp/FileDescriptor.hpp
         src/clp/FileReader.cpp
@@ -438,6 +440,8 @@ set(SOURCE_FILES_unitTest
         src/clp/TraceableException.hpp
         src/clp/time_types.hpp
         src/clp/type_utils.hpp
+        src/clp/utf8_utils.cpp
+        src/clp/utf8_utils.hpp
         src/clp/Utils.cpp
         src/clp/Utils.hpp
         src/clp/VariableDictionaryEntry.cpp
@@ -472,6 +476,7 @@ set(SOURCE_FILES_unitTest
         tests/test-StreamingCompression.cpp
         tests/test-string_utils.cpp
         tests/test-TimestampPattern.cpp
+        tests/test-utf8_utils.cpp
         tests/test-Utils.cpp
         )
 add_executable(unitTest ${SOURCE_FILES_unitTest} ${SOURCE_FILES_clp_s_unitTest})
diff --git a/components/core/src/clp/ffi/utils.cpp b/components/core/src/clp/ffi/utils.cpp
new file mode 100644
index 000000000..c85c47701
--- /dev/null
+++ b/components/core/src/clp/ffi/utils.cpp
@@ -0,0 +1,89 @@
+#include "utils.hpp"
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <tuple>
+
+#include "../utf8_utils.hpp"
+
+using std::string;
+using std::string_view;
+
+namespace clp::ffi {
+auto validate_and_escape_utf8_string(string_view raw) -> std::optional<string> {
+    std::optional<string> result;
+    // Pre-size the output to 1.5x the input length; escaping never shortens the string.
+    result.emplace().reserve(raw.size() + (raw.size() / 2));
+    if (validate_and_append_escaped_utf8_string(raw, *result)) {
+        return result;
+    }
+    return std::nullopt;
+}
+
+auto validate_and_append_escaped_utf8_string(std::string_view src, std::string& dst) -> bool {
+    string_view::const_iterator next_char_to_copy_it{src.cbegin()};  // Start of the pending run
+
+    auto escape_handler = [&](string_view::const_iterator it) -> void {
+        // Scratch buffer for formatting a control character as "\u00bb": 6 characters plus the
+        // terminating '\0' that `snprintf` appends
+        constexpr size_t cControlCharacterBufSize{7};
+        std::array<char, cControlCharacterBufSize> buf{};
+        std::string_view escaped_char;
+        bool escape_required{true};  // Cleared for characters that can be copied verbatim
+        switch (*it) {
+            case '\b':
+                escaped_char = "\\b";
+                break;
+            case '\t':
+                escaped_char = "\\t";
+                break;
+            case '\n':
+                escaped_char = "\\n";
+                break;
+            case '\f':
+                escaped_char = "\\f";
+                break;
+            case '\r':
+                escaped_char = "\\r";
+                break;
+            case '\\':
+                escaped_char = "\\\\";
+                break;
+            case '"':
+                escaped_char = "\\\"";
+                break;
+            default: {
+                constexpr uint8_t cLargestControlCharacter{0x1F};
+                auto const byte{static_cast<uint8_t>(*it)};
+                if (cLargestControlCharacter >= byte) {
+                    std::ignore = snprintf(buf.data(), buf.size(), "\\u00%02x", byte);
+                    escaped_char = {buf.data(), buf.size() - 1};  // Exclude the trailing '\0'
+                } else {
+                    escape_required = false;
+                }
+                break;
+            }
+        }
+        if (escape_required) {
+            dst.append(next_char_to_copy_it, it);  // Flush the unescaped run before `it`
+            dst += escaped_char;
+            next_char_to_copy_it = it + 1;  // Resume copying after the escaped character
+        }
+    };
+
+    if (false == validate_utf8_string(src, escape_handler)) {
+        return false;  // `dst` may already contain partially escaped output
+    }
+
+    if (src.cend() != next_char_to_copy_it) {
+        dst.append(next_char_to_copy_it, src.cend());  // Flush the unescaped tail
+    }
+
+    return true;
+}
+}  // namespace clp::ffi
diff --git a/components/core/src/clp/ffi/utils.hpp b/components/core/src/clp/ffi/utils.hpp
new file mode 100644
index 000000000..26823da9c
--- /dev/null
+++ b/components/core/src/clp/ffi/utils.hpp
@@ -0,0 +1,31 @@
+#ifndef CLP_FFI_UTILS_HPP
+#define CLP_FFI_UTILS_HPP
+
+#include <optional>
+#include <string>
+#include <string_view>
+
+namespace clp::ffi {
+/**
+ * Validates whether the given string is UTF-8 encoded, and escapes the characters that JSON
+ * doesn't allow unescaped inside a string (control characters <= 0x1F, `"`, and `\`).
+ * @param raw The raw string to validate and escape.
+ * @return The escaped string on success.
+ * @return std::nullopt if the string contains any non-UTF-8-encoded byte sequences.
+ */
+[[nodiscard]] auto validate_and_escape_utf8_string(std::string_view raw
+) -> std::optional<std::string>;
+
+/**
+ * Validates whether `src` is UTF-8 encoded, and appends `src` to `dst` while escaping the
+ * characters that JSON doesn't allow unescaped inside a string.
+ * @param src The string to validate and escape.
+ * @param dst Returns `dst` with an escaped version of `src` appended.
+ * @return Whether `src` is a valid UTF-8-encoded string. NOTE: On failure, `dst` may still have
+ * been modified, since escaped output is appended while `src` is being scanned.
+ */
+[[nodiscard]] auto
+validate_and_append_escaped_utf8_string(std::string_view src, std::string& dst) -> bool;
+}  // namespace clp::ffi
+
+#endif  // CLP_FFI_UTILS_HPP
diff --git a/components/core/src/clp/utf8_utils.cpp b/components/core/src/clp/utf8_utils.cpp
new file mode 100644
index 000000000..06fafd659
--- /dev/null
+++ b/components/core/src/clp/utf8_utils.cpp
@@ -0,0 +1,55 @@
+#include "utf8_utils.hpp"
+
+#include <cstddef>
+#include <cstdint>
+#include <string_view>
+
+namespace clp {
+auto is_utf8_encoded(std::string_view str) -> bool {
+    // Validation only: the handler invoked for each ASCII character intentionally does nothing.
+    return validate_utf8_string(str, [](std::string_view::const_iterator /*unused*/) -> void {});
+}
+
+namespace utf8_utils_internal {
+auto parse_and_validate_lead_byte(
+        uint8_t byte,
+        size_t& num_continuation_bytes,
+        uint32_t& code_point,
+        uint32_t& code_point_lower_bound,
+        uint32_t& code_point_upper_bound
+) -> bool {
+    if (cTwoByteUtf8CharHeader == (byte & cTwoByteUtf8CharHeaderMask)) {  // 0b110x_xxxx
+        num_continuation_bytes = 1;
+        code_point = (byte & ~cTwoByteUtf8CharHeaderMask);
+        code_point_lower_bound = cTwoByteUtf8CharCodePointLowerBound;
+        code_point_upper_bound = cTwoByteUtf8CharCodePointUpperBound;
+    } else if (cThreeByteUtf8CharHeader == (byte & cThreeByteUtf8CharHeaderMask)) {  // 0b1110_xxxx
+        num_continuation_bytes = 2;
+        code_point = (byte & ~cThreeByteUtf8CharHeaderMask);
+        code_point_lower_bound = cThreeByteUtf8CharCodePointLowerBound;
+        code_point_upper_bound = cThreeByteUtf8CharCodePointUpperBound;
+    } else if (cFourByteUtf8CharHeader == (byte & cFourByteUtf8CharHeaderMask)) {  // 0b1111_0xxx
+        num_continuation_bytes = 3;
+        code_point = (byte & ~cFourByteUtf8CharHeaderMask);
+        code_point_lower_bound = cFourByteUtf8CharCodePointLowerBound;
+        code_point_upper_bound = cFourByteUtf8CharCodePointUpperBound;
+    } else {
+        return false;
+    }
+    return true;
+}
+
+auto is_ascii_char(uint8_t byte) -> bool {
+    return byte <= cOneByteUtf8CharCodePointUpperBound;
+}
+
+auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool {
+    return cUtf8ContinuationByteHeader == (byte & cUtf8ContinuationByteMask);
+}
+
+auto parse_continuation_byte(uint32_t code_point, uint8_t continuation_byte) -> uint32_t {
+    auto const payload_bits{continuation_byte & cUtf8ContinuationByteCodePointMask};
+    return (code_point << cUtf8NumContinuationByteCodePointBits) + payload_bits;
+}
+}  // namespace utf8_utils_internal
+}  // namespace clp
diff --git a/components/core/src/clp/utf8_utils.hpp b/components/core/src/clp/utf8_utils.hpp
new file mode 100644
index 000000000..fe9569b00
--- /dev/null
+++ b/components/core/src/clp/utf8_utils.hpp
@@ -0,0 +1,144 @@
+#ifndef CLP_UTF8_UTILS_HPP
+#define CLP_UTF8_UTILS_HPP
+
+#include <cstddef>
+#include <cstdint>
+#include <string_view>
+
+namespace clp {
+// Constants
+// Lead byte signatures: a lead byte matches when `(byte & Mask) == Header`
+constexpr uint8_t cTwoByteUtf8CharHeaderMask{0xE0};  // 0b111x_xxxx
+constexpr uint8_t cTwoByteUtf8CharHeader{0xC0};  // 0b110x_xxxx
+constexpr uint8_t cThreeByteUtf8CharHeaderMask{0xF0};  // 0b1111_xxxx
+constexpr uint8_t cThreeByteUtf8CharHeader{0xE0};  // 0b1110_xxxx
+constexpr uint8_t cFourByteUtf8CharHeaderMask{0xF8};  // 0b1111_1xxx
+constexpr uint8_t cFourByteUtf8CharHeader{0xF0};  // 0b1111_0xxx
+
+// Valid code point ranges (inclusive) for each encoding length
+constexpr uint32_t cOneByteUtf8CharCodePointLowerBound{0};
+constexpr uint32_t cOneByteUtf8CharCodePointUpperBound{0x7F};
+constexpr uint32_t cTwoByteUtf8CharCodePointLowerBound{0x80};
+constexpr uint32_t cTwoByteUtf8CharCodePointUpperBound{0x7FF};
+constexpr uint32_t cThreeByteUtf8CharCodePointLowerBound{0x800};
+constexpr uint32_t cThreeByteUtf8CharCodePointUpperBound{0xFFFF};
+constexpr uint32_t cFourByteUtf8CharCodePointLowerBound{0x1'0000};
+constexpr uint32_t cFourByteUtf8CharCodePointUpperBound{0x10'FFFF};
+
+// Continuation byte (0b10xx_xxxx): signature plus the mask/width of its code point bits
+constexpr uint32_t cUtf8ContinuationByteMask{0xC0};
+constexpr uint32_t cUtf8ContinuationByteHeader{0x80};
+constexpr uint32_t cUtf8ContinuationByteCodePointMask{0x3F};
+constexpr uint8_t cUtf8NumContinuationByteCodePointBits{6};
+
+/**
+ * Validates whether the given string is UTF-8 encoded, invoking the given handler on every ASCII
+ * character so the caller can optionally escape it.
+ * @tparam EscapeHandler Callable taking a `std::string_view::const_iterator`.
+ * @param src The string to validate.
+ * @param escape_handler Called, in order, on each ASCII character seen before any invalid byte.
+ * @return Whether the input is a valid UTF-8 encoded string.
+ */
+template <typename EscapeHandler>
+requires std::is_invocable_v<EscapeHandler, std::string_view::const_iterator>
+[[nodiscard]] auto validate_utf8_string(std::string_view src, EscapeHandler escape_handler) -> bool;
+
+/**
+ * @param str
+ * @return Whether the input is a valid UTF-8 encoded string.
+ */
+[[nodiscard]] auto is_utf8_encoded(std::string_view str) -> bool;
+
+namespace utf8_utils_internal {
+/**
+ * Validates whether the given byte is a valid lead byte for a multi-byte UTF-8 character and, if
+ * so, returns the code point bits it carries along with the properties of its encoding length.
+ * @param byte Byte to validate.
+ * @param num_continuation_bytes Returns the number of continuation bytes expected.
+ * @param code_point Returns the code point bits parsed from the lead byte.
+ * @param code_point_lower_bound Returns the lower bound of the code point range for the UTF-8
+ * character.
+ * @param code_point_upper_bound Returns the upper bound of the code point range for the UTF-8
+ * character.
+ * @return Whether the input byte is a valid lead byte for a multi-byte UTF-8 character.
+ */
+[[nodiscard]] auto parse_and_validate_lead_byte(
+        uint8_t byte,
+        size_t& num_continuation_bytes,
+        uint32_t& code_point,
+        uint32_t& code_point_lower_bound,
+        uint32_t& code_point_upper_bound
+) -> bool;
+
+/**
+ * @param byte
+ * @return Whether the given byte is a valid ASCII character.
+ */
+[[nodiscard]] auto is_ascii_char(uint8_t byte) -> bool;
+
+/**
+ * @param byte
+ * @return Whether the input byte is a valid UTF-8 continuation byte.
+ */
+[[nodiscard]] auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool;
+
+/**
+ * Parses the code-point bits from the given continuation byte and combines them with the given
+ * code point.
+ * @param code_point
+ * @param continuation_byte
+ * @return The updated code point.
+ */
+[[nodiscard]] auto
+parse_continuation_byte(uint32_t code_point, uint8_t continuation_byte) -> uint32_t;
+}  // namespace utf8_utils_internal
+
+template <typename EscapeHandler>
+requires std::is_invocable_v<EscapeHandler, std::string_view::const_iterator>
+auto validate_utf8_string(std::string_view src, EscapeHandler escape_handler) -> bool {
+    size_t num_continuation_bytes_to_validate{0};  // Non-zero while inside a multi-byte char
+    uint32_t code_point{};  // Accumulates the code point of the multi-byte char being parsed
+    uint32_t code_point_lower_bound{};
+    uint32_t code_point_upper_bound{};
+
+    // NOLINTNEXTLINE(readability-qualified-auto)
+    for (auto it{src.cbegin()}; it != src.cend(); ++it) {
+        auto const byte{static_cast<uint8_t>(*it)};
+        if (0 == num_continuation_bytes_to_validate) {
+            if (utf8_utils_internal::is_ascii_char(byte)) {
+                escape_handler(it);  // ASCII needs no validation; let the caller escape it
+            } else if (false
+                       == utf8_utils_internal::parse_and_validate_lead_byte(
+                               byte,
+                               num_continuation_bytes_to_validate,
+                               code_point,
+                               code_point_lower_bound,
+                               code_point_upper_bound
+                       ))
+            {
+                return false;  // Neither ASCII nor a valid lead byte
+            }
+        } else {
+            if (false == utf8_utils_internal::is_valid_utf8_continuation_byte(byte)) {
+                return false;  // Expected a continuation byte
+            }
+            code_point = utf8_utils_internal::parse_continuation_byte(code_point, byte);
+            --num_continuation_bytes_to_validate;
+            if (0 == num_continuation_bytes_to_validate
+                && (code_point < code_point_lower_bound || code_point_upper_bound < code_point))
+            {  // NOTE(review): surrogates (U+D800-U+DFFF) pass this range check
+                return false;  // Overlong encoding or code point out of range
+            }
+        }
+    }
+
+    if (0 != num_continuation_bytes_to_validate) {
+        // Incomplete UTF-8 character
+        return false;
+    }
+
+    return true;
+}
+}  // namespace clp
+
+#endif  // CLP_UTF8_UTILS_HPP
diff --git a/components/core/tests/test-utf8_utils.cpp b/components/core/tests/test-utf8_utils.cpp
new file mode 100644
index 000000000..77324eaf9
--- /dev/null
+++ b/components/core/tests/test-utf8_utils.cpp
@@ -0,0 +1,209 @@
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <optional>
+#include <random>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include <Catch2/single_include/catch2/catch.hpp>
+#include <json/single_include/nlohmann/json.hpp>
+
+#include "../src/clp/ffi/utils.hpp"
+#include "../src/clp/utf8_utils.hpp"
+
+using clp::ffi::validate_and_escape_utf8_string;
+using clp::is_utf8_encoded;
+
+namespace {
+/**
+ * @param raw
+ * @return The input string after escaping any characters that are invalid in JSON strings.
+ */
+[[nodiscard]] auto get_expected_escaped_string(std::string_view raw) -> std::string;
+
+/**
+ * Generates a UTF-8 encoded byte sequence with the given code point and number of continuation
+ * bytes. The range of the code point is not validated, which means the generated byte sequence can
+ * be invalid (overlong or exceeding the valid range of UTF-8 code points).
+ * @param code_point
+ * @param num_continuation_bytes
+ * @return The encoded UTF-8 byte sequence.
+ */
+[[nodiscard]] auto
+generate_utf8_byte_sequence(uint32_t code_point, size_t num_continuation_bytes) -> std::string;
+
+auto get_expected_escaped_string(std::string_view raw) -> std::string {
+    // NOTE: Deliberately copy-initialized rather than brace-initialized
+    nlohmann::json const json_value = raw;
+    auto const serialized{json_value.dump()};
+    return serialized.substr(1, serialized.size() - 2);  // Drop the quotes nlohmann::json adds
+}
+
+auto generate_utf8_byte_sequence(uint32_t code_point, size_t num_continuation_bytes)
+        -> std::string {
+    REQUIRE((1 <= num_continuation_bytes && num_continuation_bytes <= 3));
+    std::vector<char> encoded_bytes;  // Filled last byte first; reversed when returning
+    while (encoded_bytes.size() < num_continuation_bytes) {
+        auto const least_significant_byte{static_cast<uint8_t>(code_point)};
+        encoded_bytes.push_back(static_cast<char>(
+                (least_significant_byte & ~clp::cUtf8ContinuationByteMask)
+                | clp::cUtf8ContinuationByteHeader
+        ));
+        code_point >>= clp::cUtf8NumContinuationByteCodePointBits;  // Consume the encoded bits
+    }
+
+    uint8_t lead_byte_code_point_mask{};
+    uint8_t lead_byte_header{};
+    if (1 == num_continuation_bytes) {
+        lead_byte_code_point_mask = static_cast<uint8_t>(~clp::cTwoByteUtf8CharHeaderMask);
+        lead_byte_header = clp::cTwoByteUtf8CharHeader;
+    } else if (2 == num_continuation_bytes) {
+        lead_byte_code_point_mask = static_cast<uint8_t>(~clp::cThreeByteUtf8CharHeaderMask);
+        lead_byte_header = clp::cThreeByteUtf8CharHeader;
+    } else {  // 3 == num_continuation_bytes
+        lead_byte_code_point_mask = static_cast<uint8_t>(~clp::cFourByteUtf8CharHeaderMask);
+        lead_byte_header = clp::cFourByteUtf8CharHeader;
+    }
+    encoded_bytes.push_back(static_cast<char>(  // Lead byte: remaining high bits plus header
+            (static_cast<uint8_t>(code_point) & lead_byte_code_point_mask) | lead_byte_header
+    ));
+
+    return {encoded_bytes.rbegin(), encoded_bytes.rend()};
+}
+}  // namespace
+
+TEST_CASE("escape_utf8_string_basic", "[utf8_utils]") {
+    std::string test_str;
+    std::optional<std::string> actual;
+
+    // Test empty string
+    actual = validate_and_escape_utf8_string(test_str);
+    REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str)));
+
+    // Test string that has nothing to escape
+    test_str = "This string has nothing to escape :)";
+    actual = validate_and_escape_utf8_string(test_str);
+    REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str)));
+
+    // Test a string containing every single-byte (ASCII) character, including those we escape.
+    test_str.clear();
+    for (uint8_t i{0}; i <= static_cast<uint8_t>(INT8_MAX); ++i) {
+        test_str.push_back(static_cast<char>(i));
+    }
+    // Shuffle with a default-constructed engine so the order is scrambled but repeatable
+    // NOLINTNEXTLINE(cert-msc32-c, cert-msc51-cpp)
+    std::shuffle(test_str.begin(), test_str.end(), std::default_random_engine{});
+    actual = validate_and_escape_utf8_string(test_str);
+    REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str)));
+
+    // Test a mix of escaped ASCII, plain ASCII, and 2-, 3-, and 4-byte UTF-8 characters
+    std::vector<std::string> const valid_utf8{
+            "\n",
+            "\xF0\xA0\x80\x8F",  // https://en.wiktionary.org/wiki/%F0%A0%80%8F
+            "a",
+            "\xE4\xB8\xAD",  // https://en.wiktionary.org/wiki/%E4%B8%AD
+            "\x1F",
+            "\xC2\xA2",  // ¢
+            "\\"
+    };
+    test_str.clear();
+    for (auto const& str : valid_utf8) {
+        test_str.append(str);
+    }
+    actual = validate_and_escape_utf8_string(test_str);
+    REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str)));
+}
+
+TEST_CASE("escape_utf8_string_with_invalid_continuation", "[utf8_utils]") {
+    std::string test_str;
+
+    auto const valid_utf8_byte_sequence = GENERATE(
+            generate_utf8_byte_sequence(0x80, 1),
+            generate_utf8_byte_sequence(0x800, 2),
+            generate_utf8_byte_sequence(0x1'0000, 3)
+    );
+
+    // Test truncated characters: drop trailing bytes until only the lead byte remains
+    auto const begin_it{valid_utf8_byte_sequence.cbegin()};
+    std::string const valid{"Valid"};
+    for (auto end_it{valid_utf8_byte_sequence.cend() - 1};
+         valid_utf8_byte_sequence.cbegin() != end_it;
+         --end_it)
+    {
+        std::string const incomplete_byte_sequence{begin_it, end_it};
+
+        test_str = valid + incomplete_byte_sequence;
+        REQUIRE((false == is_utf8_encoded(test_str)));
+        REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
+
+        test_str = incomplete_byte_sequence + valid;
+        REQUIRE((false == is_utf8_encoded(test_str)));
+        REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
+    }
+
+    // Test invalid lead byte (0xFF matches none of the lead-byte signatures)
+    test_str = valid_utf8_byte_sequence;
+    constexpr char cInvalidLeadByte{'\xFF'};
+    test_str.front() = cInvalidLeadByte;
+    REQUIRE((false == is_utf8_encoded(test_str)));
+    REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
+
+    // Test invalid continuation bytes: setting bit 6 turns 0b10xx_xxxx into 0b11xx_xxxx
+    for (size_t idx{1}; idx < valid_utf8_byte_sequence.size(); ++idx) {
+        test_str = valid_utf8_byte_sequence;
+        constexpr uint8_t cInvalidContinuationByteMask{0x40};
+        test_str.at(idx) |= cInvalidContinuationByteMask;
+        REQUIRE((false == is_utf8_encoded(test_str)));
+        REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
+    }
+}
+
+TEST_CASE("validate_utf8_code_point_ranges", "[utf8_utils]") {
+    // 1-byte range: valid as a single byte; any multi-byte encoding of it is overlong
+    for (auto code_point{clp::cOneByteUtf8CharCodePointLowerBound};
+         code_point <= clp::cOneByteUtf8CharCodePointUpperBound;
+         ++code_point)
+    {
+        REQUIRE(is_utf8_encoded(std::string{static_cast<char>(code_point)}));
+        REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 1))));
+        REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 2))));
+        REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 3))));
+    }
+
+    // 2-byte range: valid with 1 continuation byte; 3- and 4-byte encodings are overlong
+    for (auto code_point{clp::cTwoByteUtf8CharCodePointLowerBound};
+         code_point <= clp::cTwoByteUtf8CharCodePointUpperBound;
+         ++code_point)
+    {
+        REQUIRE(is_utf8_encoded(generate_utf8_byte_sequence(code_point, 1)));
+        REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 2))));
+        REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 3))));
+    }
+
+    // 3-byte range (surrogates included): valid with 2 continuation bytes; 4-byte is overlong
+    for (auto code_point{clp::cThreeByteUtf8CharCodePointLowerBound};
+         code_point <= clp::cThreeByteUtf8CharCodePointUpperBound;
+         ++code_point)
+    {
+        REQUIRE(is_utf8_encoded(generate_utf8_byte_sequence(code_point, 2)));
+        REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 3))));
+    }
+
+    // 4-byte range: valid with 3 continuation bytes
+    for (auto code_point{clp::cFourByteUtf8CharCodePointLowerBound};
+         code_point <= clp::cFourByteUtf8CharCodePointUpperBound;
+         ++code_point)
+    {
+        REQUIRE(is_utf8_encoded(generate_utf8_byte_sequence(code_point, 3)));
+    }
+
+    // Code points above U+10FFFF that still fit in a 4-byte encoding must be rejected
+    // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
+    for (auto code_point{clp::cFourByteUtf8CharCodePointUpperBound + 1}; code_point <= 0x1F'FFFF;
+         ++code_point)
+    {
+        REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 3))));
+    }
+}