diff --git a/CMakeLists.txt b/CMakeLists.txt index 438780b..deb52ba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -47,7 +47,7 @@ option(LIBUNICODE_BENCHMARK "libunicode: Enables building of benchmark for libun option(LIBUNICODE_TOOLS "libunicode: Builds CLI tools [default: ${MASTER_PROJECT}]" ${MASTER_PROJECT}) option(LIBUNICODE_BUILD_STATIC "libunicode: provide static library instead of dynamic [default: ${LIBUNICODE_BUILD_STATIC_DEFAULT}]" ${LIBUNICODE_BUILD_STATIC_DEFAULT}) option(LIBUNICODE_USE_INTRINSICS "libunicode: Use SIMD extenstion during text read [default: ON]" ON) -option(LIBUNICODE_USE_STD_SIMD "libunicode: Use std::simd as SIMD extenstion during text read (takes precedence over own intrinsics) [default: ON]" ${LIBUNICODE_USE_INTRINSICS}) +option(LIBUNICODE_USE_STD_SIMD "libunicode: Use std::simd as SIMD extenstion during text read (takes precedence over own intrinsics) [default: ON]" ON) option(LIBUNICODE_TABLEGEN_FASTBUILD "libunicode: Use fast table generation (takes more memory in final tables) [default: OFF]" OFF) set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "Enable testing of the benchmark library." 
FORCE) diff --git a/cmake/presets/common.json b/cmake/presets/common.json index f794d2b..2895c4c 100644 --- a/cmake/presets/common.json +++ b/cmake/presets/common.json @@ -1,7 +1,7 @@ { "version": 6, "configurePresets": [ - { "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug", "LIBUNICODE_TABLEGEN_FASTBUILD": "ON" } }, + { "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug", "LIBUNICODE_TABLEGEN_FASTBUILD": "ON", "LIBUNICODE_TRACE": "ON" } }, { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } }, { "name": "arch-native", "hidden": true, "cacheVariables": { "CMAKE_CXX_FLAGS": "-march=native" } }, { "name": "clang", "hidden": true, "cacheVariables": { "CMAKE_CXX_COMPILER": "clang++" } }, diff --git a/src/libunicode/CMakeLists.txt b/src/libunicode/CMakeLists.txt index 59dfcee..eca42b3 100644 --- a/src/libunicode/CMakeLists.txt +++ b/src/libunicode/CMakeLists.txt @@ -1,5 +1,7 @@ include(GNUInstallDirs) +option(LIBUNICODE_TRACE "Enable trace logging" OFF) + function(ExtractZipArchive ZIP_FILE OUTPUT_DIR) if(CMAKE_VERSION VERSION_LESS 3.18) # Use the older method for versions prior to CMake 3.18 @@ -102,7 +104,6 @@ add_library(unicode ${LIBUNICODE_LIB_MODE} codepoint_properties.cpp emoji_segmenter.cpp grapheme_segmenter.cpp - scan.cpp script_segmenter.cpp utf8.cpp width.cpp @@ -114,10 +115,10 @@ add_library(unicode ${LIBUNICODE_LIB_MODE} ) if(LIBUNICODE_USE_STD_SIMD) - target_compile_definitions(unicode PRIVATE LIBUNICODE_USE_STD_SIMD) + target_compile_definitions(unicode PUBLIC LIBUNICODE_USE_STD_SIMD) endif() if(LIBUNICODE_USE_INTRINSICS) - target_compile_definitions(unicode PRIVATE USE_INTRINSICS) + target_compile_definitions(unicode PUBLIC LIBUNICODE_USE_INTRINSICS) endif() set(public_headers @@ -125,11 +126,11 @@ set(public_headers codepoint_properties.h convert.h emoji_segmenter.h + grapheme_line_segmenter.h grapheme_segmenter.h intrinsics.h 
multistage_table_view.h run_segmenter.h - scan.h script_segmenter.h support.h utf8.h @@ -150,6 +151,10 @@ set_target_properties(unicode PROPERTIES SOVERSION "${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}" ) +if(LIBUNICODE_TRACE) + target_compile_definitions(unicode PUBLIC LIBUNICODE_TRACE) +endif() + add_library(unicode::unicode ALIAS unicode) add_library(unicode::core ALIAS unicode) target_include_directories(unicode PUBLIC $ @@ -161,7 +166,6 @@ add_executable(unicode_tablegen tablegen.cpp) set_target_properties(unicode_tablegen PROPERTIES CMAKE_BUILD_TYPE Release) target_link_libraries(unicode_tablegen PRIVATE unicode::loader) - # {{{ installation set(LIBUNICODE_CMAKE_DIR "${CMAKE_INSTALL_LIBDIR}/cmake/libunicode" CACHE PATH "Installation directory for cmake files, a relative path that will be joined with ${CMAKE_INSTALL_PREFIX} or an absolute path.") set(LIBUNICODE_INSTALL_CMAKE_FILES ${MASTER_PROJECT} CACHE BOOL "Decides whether or not to install CMake config and -version files.") @@ -220,9 +224,9 @@ if(LIBUNICODE_TESTING) capi_test.cpp convert_test.cpp emoji_segmenter_test.cpp + grapheme_line_segmenter_test.cpp grapheme_segmenter_test.cpp run_segmenter_test.cpp - scan_test.cpp script_segmenter_test.cpp test_main.cpp unicode_test.cpp @@ -247,8 +251,6 @@ if(LIBUNICODE_TESTING) endif() # }}} - - # {{{ unicode_test if(LIBUNICODE_BENCHMARK) if(NOT benchmark_FOUND) diff --git a/src/libunicode/benchmark.cpp b/src/libunicode/benchmark.cpp index f80e18a..908f7f1 100644 --- a/src/libunicode/benchmark.cpp +++ b/src/libunicode/benchmark.cpp @@ -1,5 +1,5 @@ #include -#include +#include #include #include @@ -14,7 +14,7 @@ static void benchmarkWithLength(benchmark::State& benchmarkState) auto TestText = std::string(L, 'a') + "\u00A9"; for (auto _: benchmarkState) { - benchmark::DoNotOptimize(unicode::detail::scan_for_text_ascii(TestText, L + 10)); + benchmark::DoNotOptimize(unicode::detail::process_only_ascii(std::string_view(TestText).substr(0, L + 10))); } } @@ -24,7 
+24,9 @@ static void benchmarkWithOffset(benchmark::State& benchmarkState) auto TestText = std::string(L, 'a') + "\U0001F600" + std::string(1000, 'a'); for (auto _: benchmarkState) { - benchmark::DoNotOptimize(unicode::detail::scan_for_text_ascii(TestText, L + 10)); + auto state = unicode::detail::unicode_process_state {}; + auto eventHandler = unicode::detail::EventHandler{}; + benchmark::DoNotOptimize(unicode::detail::process_only_complex_unicode(eventHandler, state, TestText, L + 10)); } } diff --git a/src/libunicode/grapheme_line_segmenter.h b/src/libunicode/grapheme_line_segmenter.h new file mode 100644 index 0000000..ee01e43 --- /dev/null +++ b/src/libunicode/grapheme_line_segmenter.h @@ -0,0 +1,825 @@ +/** + * This file is part of the "libunicode" project + * Copyright (c) 2024 Christian Parpart + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#if defined(LIBUNICODE_TRACE) + #include + #include + + #define LIBUNICODE_TRACE_SEGMENTER(...) std::cout << std::format(__VA_ARGS__) +#else + #define LIBUNICODE_TRACE_SEGMENTER(...) 
((void) 0) +#endif + +#if defined(LIBUNICODE_USE_INTRINSICS) + #include +#endif +#include +#include + +#include +#include +#include + +// clang-format off +#if __has_include() && defined(LIBUNICODE_USE_STD_SIMD) + #define LIBUNICODE_HAS_STD_SIMD + #include + namespace stdx = std::experimental; +#elif __has_include() && defined(LIBUNICODE_USE_STD_SIMD) + #define LIBUNICODE_HAS_STD_SIMD + #include + namespace stdx = std; +#endif +// clang-format on + +namespace unicode +{ + +// Represents the reason why the processing stopped. +enum class StopCondition +{ + // Unexpected input usually means control characters. + UnexpectedInput, + // End of input range is reached. + EndOfInput, + // Total number of columns (east asian widths) to process at most has been reached. + EndOfWidth, +}; + +// Represents the result of a single call to process(). +struct grapheme_segmentation_result +{ + // Represents the text that was scanned. + std::string_view text; + + // Represents the sum of all east asian widths of the grapheme clusters of the given text. + unsigned width; + + // Represents the reason why the processing stopped. + StopCondition stop_condition; + + constexpr auto operator<=>(grapheme_segmentation_result const& rhs) const noexcept = default; +}; + +// Convenience listener interface for grapheme segmentation events. 
+class grapheme_segmentation_listener +{ + public: + virtual ~grapheme_segmentation_listener() = default; + + virtual void on_invalid(std::string_view invalid) noexcept = 0; + virtual void on_ascii(std::string_view text) noexcept = 0; + virtual void on_grapheme_cluster(std::string_view text, unsigned width) noexcept = 0; +}; + +template +concept GraphemeSegmentationListenerConcept = requires(T t, T const& u) { + t.on_invalid(std::string_view {}); + t.on_ascii(std::string_view {}); + t.on_grapheme_cluster(std::string_view {}, unsigned {}); +}; + +template +concept OptionalGraphemeSegmentationListenerConcept = GraphemeSegmentationListenerConcept || std::same_as; + +// {{{ grapheme_line_segmenter details +namespace detail +{ + template + struct EventHandler; + + template + struct EventHandler + { + EventListener& listener; + constexpr void on_invalid(std::string_view s) noexcept { listener.on_invalid(s); } + constexpr void on_ascii(std::string_view s) noexcept { listener.on_ascii(s); } + constexpr void on_grapheme_cluster(std::string_view s, unsigned w) noexcept { listener.on_grapheme_cluster(s, w); } + }; + + template + EventHandler(EventListener) -> EventHandler; + + template <> + struct EventHandler + { + void on_invalid(std::string_view) noexcept {} + void on_ascii(std::string_view) noexcept {} + void on_grapheme_cluster(std::string_view, unsigned) noexcept {} + }; + + template <> + struct EventHandler<>: EventHandler + { + }; + + [[maybe_unused]] inline int countTrailingZeroBits(unsigned int value) noexcept + { +#if defined(_WIN32) + // return _tzcnt_u32(value); + // Don't do _tzcnt_u32, because that's only available on x86-64, but not on ARM64. + unsigned long r = 0; + _BitScanForward(&r, value); + return r; +#else + return __builtin_ctz(value); +#endif + } + + constexpr bool is_control(char ch) noexcept + { + return static_cast(ch) < 0x20; + } + + // Tests if given UTF-8 byte is part of a complex Unicode codepoint, that is, a value greater than U+7E. 
+ constexpr bool is_complex(char ch) noexcept + { + return static_cast(ch) & 0x80; + } + + // Tests if given UTF-8 byte is a single US-ASCII text codepoint. This excludes control characters. + constexpr bool is_ascii(char ch) noexcept + { + return !is_control(ch) && !is_complex(ch); + } + + enum class State + { + EndOfInput, + EndOfWidth, + C0, + ASCII, + ComplexUnicode, + }; + + constexpr State make_state(char const* input, char const* end, size_t processedTotalWidth, size_t maxWidth) noexcept + { +#if defined(__clang__) + // Clang's doing an AMAZING job here, because + // it's able to optimize this function to a single branchless instruction sequence on x86-64. + // + // I would prefer to use an if-chain with return statements here, but that would + // only work if we're NOT doing the pointer comparison. + // + // NOTE: The order of comparison is intentionally in reverse order! + State s = State::ASCII; + if (is_complex(*input)) + s = State::ComplexUnicode; + if (is_control(*input)) + s = State::C0; + if (processedTotalWidth == maxWidth) + s = State::EndOfWidth; + if (input == end) + s = State::EndOfInput; + return s; +#else + // This is the ideal implementation, but no compiler is able to produce branchless code. + // Clang fails here, because the pointer comparison breaks the optimization. + // GCC fails every attempt to optimize this to branchless code. + // MSVC also fails in every attempt to optimize this to branchless code. + if (input == end) + return State::EndOfInput; + if (processedTotalWidth == maxWidth) + return State::EndOfWidth; + if (is_control(*input)) + return State::C0; + if (is_complex(*input)) + return State::ComplexUnicode; + return State::ASCII; + +#endif + } + + struct unicode_process_state + { + // Holds the UTF-8 decoding state between calls to process(). + utf8_decoder_state utf8 {}; + + // Holds the pointer to the next UTF-8 byte to process when resuming processing. 
+ char const* utf8DecodeNext {}; + + // Start position of current grapheme cluster. + char const* currentClusterStart {}; + + // Start position of current codepoint. + char const* currentCodepointStart {}; + + // Current grapheme cluster's East Asian Width. + unsigned currentClusterWidth = 0; + + // Holds the previously processed codepoint. + // This information is used to decide if we have hit the boundary of a grapheme cluster + // of a complex Unicode codepoint sequence or not. + // This value is not needed when processing trivial (US-ASCII) codepoints. + char32_t lastCodepointHint = 0; + }; + + // Holds the result of complex Unicode processing. + struct unicode_process_result + { + // Sum of all east asian widths of the grapheme clusters processed. + unsigned totalWidth {}; + + // The end position of the last processed byte. + char const* end {}; + + StopCondition stop_condition = StopCondition::UnexpectedInput; + }; + + constexpr unicode_process_result make_scan_result(unsigned consumedWidths, + char const* const end, + StopCondition stopCondition) noexcept + { + return { .totalWidth = consumedWidths, .end = end, .stop_condition = stopCondition }; + } + + // Flushes out pending grapheme cluster, if any. + template + LIBUNICODE_INLINE unsigned flush_grapheme_cluster(EventHandlerT& eventHandler, + unicode_process_state& state, + char const* const current, + unsigned maxWidth) noexcept + { + if (state.utf8.expectedLength) // Incomplete UTF-8 sequence hit. + { + assert(state.currentClusterStart <= current); + if (1 <= maxWidth) + { + eventHandler.on_invalid(std::string_view(state.currentClusterStart, current)); + state = { + .utf8DecodeNext = current, + .currentClusterStart = current, + .currentCodepointStart = current, + }; + return 1; + } + else + return 0; + } + else if (state.currentClusterStart && state.currentClusterStart < current) + { + // Current grapheme cluster is complete. 
+ unsigned const width = state.currentClusterWidth; + if (width <= maxWidth) + { + assert(state.currentClusterStart < current); + LIBUNICODE_TRACE_SEGMENTER("grapheme_line_segmenter.flush_grapheme_cluster: \"{}\" ({} len, {} width)\n", + std::string_view(state.currentClusterStart, current), + (unsigned) std::distance(state.currentClusterStart, current), + width); + eventHandler.on_grapheme_cluster(std::string_view(state.currentClusterStart, current), width); + state = { + .utf8DecodeNext = current, + .currentClusterStart = current, + .currentCodepointStart = current, + }; + return width; + } + else + { + // Currently scanned grapheme cluster won't fit. Revert to cluster start. + auto const revertPoint = state.currentClusterStart; + state = { + .utf8DecodeNext = revertPoint, + .currentClusterStart = revertPoint, + .currentCodepointStart = revertPoint, + }; + return 0; + } + } + else + { + // Current grapheme cluster is empty. + assert(state.currentClusterStart == current); + return 0; + } + } + + enum class GraphemeClusterBoundary + { + Found, + NotFound, + }; + + /// Feeds the next Unicode codepoint into the grapheme cluster processor, + /// determining if the current grapheme cluster is complete (grapheme cluster boundary detected). + /// + /// @retval true on grapheme cluster boundary detected + /// @retval false on grapheme cluster boundary not detected + LIBUNICODE_INLINE GraphemeClusterBoundary feed_grapheme_cluster(unicode_process_state& state, + char32_t const nextCodepoint, + char const* const input) noexcept + { + // We've successfully decoded the next UTF-8 codepoint. 
+ + auto const prevCodepoint = state.lastCodepointHint; + state.lastCodepointHint = nextCodepoint; + + if (!prevCodepoint) + { + state.currentCodepointStart = input; + state.currentClusterWidth = unicode::width(nextCodepoint); + + return GraphemeClusterBoundary::NotFound; + } + + else if (grapheme_segmenter::breakable(prevCodepoint, nextCodepoint)) + { + // state.currentCodepointStart = input; + return GraphemeClusterBoundary::Found; + } + else + { + // Increase width on VS16 but do not decrease on VS15. + if (nextCodepoint == 0xFE0F) // VS16 + state.currentClusterWidth = 2; + else if (nextCodepoint == 0xFE0E) // VS15 + { +#if 0 + state.currentClusterWidth = 1; +#endif + } + // else + // state.currentClusterWidth = std::max(state.currentClusterWidth, unicode::width(nextCodepoint)); + + state.currentCodepointStart = input; + return GraphemeClusterBoundary::NotFound; + } + } + + // Flushes out pending grapheme cluster, if any. + // + // @param eventHandler the event listener interface to report events to + // @param state the current state of the complex Unicode processing + // @param maxWidth the maximum number of widths to fill in the current line + // + // @note A call to this function is idempotent. + // + // @returns the result of the processing + template + LIBUNICODE_INLINE grapheme_segmentation_result flush_complex_unicode(EventHandlerT& eventHandler, + unicode_process_state& state, + unsigned maxWidth) noexcept + { + auto const clusterStart = state.currentClusterStart; + auto const count = flush_grapheme_cluster(eventHandler, state, state.utf8DecodeNext, maxWidth); + return { + .text = std::string_view { clusterStart, state.utf8DecodeNext }, + .width = count, + .stop_condition = StopCondition::EndOfInput, + }; + } + + // Processes up to [start, end) ASCII characters. + // + // A call to this function will never process control characters nor non-ASCII (complex Unicode) + // characters. 
+ // + // @returns the number of ASCII characters processed (equal to the sum of East Asian Width for each). + LIBUNICODE_INLINE std::pair process_only_ascii(char const* start, char const* end) noexcept + { + auto input = start; + +#if defined(LIBUNICODE_HAS_STD_SIMD) + using char8_type = unsigned char; + constexpr auto batchSize = stdx::simd_abi::max_fixed_size; + using batch_type = stdx::fixed_size_simd; + while (input <= end - batchSize) + { + auto const batch = batch_type { input, stdx::element_aligned }; + auto const testPack = (batch < 0x20) | (batch >= 0x80); + if (stdx::popcount(testPack) > 0) + return { StopCondition::UnexpectedInput, input + stdx::find_first_set(testPack) }; + input += batchSize; + } +#elif defined(LIBUNICODE_USE_INTRINSICS) + auto constexpr BatchSize = sizeof(intrinsics::m128i); + auto const ControlCodeMax = intrinsics::set1_epi8(0x20); // 0..0x1F + auto const Complex = intrinsics::set1_epi8(-128); // equals to 0x80 (0b1000'0000) + + while (input <= end - BatchSize) + { + auto const batch = intrinsics::load_unaligned((intrinsics::m128i*) input); + auto const isControl = intrinsics::compare_less(batch, ControlCodeMax); + auto const isComplex = intrinsics::and128(batch, Complex); + auto const testPack = intrinsics::or128(isControl, isComplex); + if (auto const check = static_cast(intrinsics::movemask_epi8(testPack)); check != 0) + return { StopCondition::UnexpectedInput, input + countTrailingZeroBits(check) }; + input += BatchSize; + } +#endif + + while (true) + { + if (input == end) + return { StopCondition::EndOfInput, input }; + if (!is_ascii(*input)) + return { StopCondition::UnexpectedInput, input }; + ++input; + } + } + + LIBUNICODE_INLINE std::pair process_only_ascii(std::string_view text) noexcept + { + return process_only_ascii(text.data(), text.data() + text.size()); + } + + // Processes up to maxWidth grapheme clusters. 
+ // + // @param events event listener interface to report events to + // @param utf8 the UTF-8 decoding state to use + // @param lastCodepointHint the last codepoint to use resuming processing + // the first grapheme cluster. This value is 0 if there is no such hint. + // @param start the start of the text to process + // @param end the end of the text to process (this is one byte past the last byte) + // @param maxWidth the maximum number of widths to fill in the current line + // + // - The returned width is the sum of all east asian widths of the grapheme + // clusters of the returned text. + // - The returned text is a substring of the input text. + // - And the returned text is guaranteed to not contain any control characters, + // nor any incomplete UTF-8 sequences. + // + // @returns a sequence of grapheme clusters up to maxWidth width. + template + LIBUNICODE_INLINE auto process_only_complex_unicode( + EventHandlerT& eventHandler, unicode_process_state& state, char const* start, char const* end, unsigned maxWidth) noexcept + -> detail::unicode_process_result + { + if (!state.utf8DecodeNext) + { + // Initialize state, as it's the first call to process_only_complex_unicode() for this line. + state = { + .utf8DecodeNext = start, + .currentClusterStart = start, + .currentCodepointStart = start, + .currentClusterWidth = 0, + .lastCodepointHint = 0, + }; + } + + char const* input = state.utf8DecodeNext; // current input processing position + unsigned consumedWidths = 0; // sum of all widths consumed for the current line + + while (true) + { + if (not(input != end)) + { + // We've reached the end of the input. + // There may be an incomplete grapheme cluster left, that we need to resume processing with on the next call. 
+ state.utf8DecodeNext = input; + return make_scan_result(consumedWidths, input, StopCondition::EndOfInput); + } + assert(consumedWidths <= maxWidth); + + if (!detail::is_complex(*input)) + { + consumedWidths += flush_grapheme_cluster(eventHandler, state, input, maxWidth); + state.utf8DecodeNext = input; + return make_scan_result(consumedWidths, input, StopCondition::UnexpectedInput); + } + + auto const result = from_utf8(state.utf8, static_cast(*input++)); + + if (std::holds_alternative(result)) + { + continue; + } + else if (auto const* success = std::get_if(&result); success) + { + LIBUNICODE_TRACE_SEGMENTER( + "grapheme_line_segmenter.process_only_complex_unicode: got codepoint: U+{:X}, next: {}\n", + (unsigned) success->value, + (void*) input); + auto const currentCodepoint = success->value; + if (feed_grapheme_cluster(state, currentCodepoint, input) == GraphemeClusterBoundary::Found) + { + if (consumedWidths + state.currentClusterWidth <= maxWidth) + { + LIBUNICODE_TRACE_SEGMENTER( + "grapheme_line_segmenter.process_only_complex_unicode: grapheme cluster: \"{}\" ({} width)\n", + std::string_view(state.currentClusterStart, input), + state.currentClusterWidth); + consumedWidths += state.currentClusterWidth; + assert(state.currentClusterStart <= state.currentCodepointStart); + eventHandler.on_grapheme_cluster(std::string_view(state.currentClusterStart, state.currentCodepointStart), + state.currentClusterWidth); + state.lastCodepointHint = currentCodepoint; + state.currentClusterWidth = unicode::width(currentCodepoint); + state.currentClusterStart = state.currentCodepointStart; + state.currentCodepointStart = input; + + if (consumedWidths == maxWidth) + { + // We've reached the end of the line. + state.utf8DecodeNext = state.currentClusterStart; + return make_scan_result(consumedWidths, state.currentClusterStart, StopCondition::EndOfWidth); + } + } + else + { + // Currently scanned grapheme cluster won't fit. Revert to cluster start. 
+ state.utf8DecodeNext = state.currentClusterStart; + return make_scan_result(consumedWidths, state.currentClusterStart, StopCondition::EndOfWidth); + } + } + } + else if (std::holds_alternative(result)) + { + state.currentClusterWidth = 1; + if (consumedWidths + state.currentClusterWidth <= maxWidth) + { + eventHandler.on_invalid(std::string_view(state.currentClusterStart, input)); + consumedWidths += state.currentClusterWidth; + state.currentClusterStart = input; + } + else + { + // Currently scanned grapheme cluster won't fit. Revert to cluster start. + state.utf8DecodeNext = state.currentClusterStart; + return make_scan_result(consumedWidths, state.currentClusterStart, StopCondition::EndOfWidth); + } + } + } + } + + template + LIBUNICODE_INLINE unicode_process_result process_only_complex_unicode(EventHandlerT& eventHandler, + unicode_process_state& state, + std::string_view text, + unsigned maxWidth) noexcept + + { + return process_only_complex_unicode(eventHandler, state, text.data(), text.data() + text.size(), maxWidth); + } + +} // namespace detail +// }}} + +template +class grapheme_line_segmenter; + +// Segments UTF-8 encoded text into sequences of grapheme clusters up to a given total width. +// +// This grapheme cluster segmenter is stateful and can be used to scan text in chunks. +// It can resume scanning where it left off. +// +// A segment consists of a sequence of zero or more grapheme clusters, +// including zero or more ASCII characters, and including invalid UTF-8 sequences. +// +// Invalid UTF-8 sequences are reported as a single invalid grapheme cluster with +// an east asian width Narrow (width 1), because when used for rendering, they'll be +// rendered as a single replacement character (U+FFFD). +// +// A sequence of grapheme clusters will never contain control characters, +// and process() will never process control characters but stop at the first one. 
+// +// Use move_forward_to() to move the internal state forward to a given position, +// e.g. to skip over control characters. +template +class grapheme_line_segmenter +{ + public: + grapheme_line_segmenter() = default; + + template + explicit grapheme_line_segmenter(Listener& listener, std::string_view text) noexcept: _eventHandler { listener } + { + if (!text.empty()) + reset(text); + } + + // Resets the grapheme line segmenter and re-initializes it with the given buffer. + // + // @param buffer the buffer to scan. Its underlying storage must be used by the + // subsequent calls to process() + LIBUNICODE_INLINE void reset(std::string_view buffer) noexcept + { + _buffer = buffer; + + _complexUnicodeState = { + .utf8DecodeNext = buffer.data(), + .currentClusterStart = buffer.data(), + .currentCodepointStart = buffer.data(), + }; + } + + // Expands the internal buffer by the given number of bytes. + // + // @param size the number of bytes to expand the buffer by + LIBUNICODE_INLINE void expand_buffer_by(size_t size) noexcept + { + assert(size > 0); + _buffer = std::string_view(_buffer.data(), _buffer.size() + size); + } + + // Moves the internal state forward to the given position. + // + // A call to this function will also reset the internal UTF-8 decoding state, + // and will reset the last codepoint hint. + // + // @p pos must be a pointer to a position within the current buffer + LIBUNICODE_INLINE void move_forward_to(char const* pos) noexcept + { + assert(_buffer.data() <= pos && pos <= _buffer.data() + _buffer.size()); + _complexUnicodeState = { + .utf8DecodeNext = pos, + .currentClusterStart = pos, + .currentCodepointStart = pos, + .currentClusterWidth = 0, + .lastCodepointHint = 0, + }; + } + + // Processes the given text. + // + // Subsequent calls to this function will continue processing the text + // where the previous call left off. 
+ // + // @param maxWidth the maximum number of width to fill in the current line + // + // @returns the text processed by this call, the sum of its widths, and the reason why processing stopped + LIBUNICODE_INLINE auto process(unsigned maxWidth) noexcept -> grapheme_segmentation_result + { + using detail::State; + + if (_complexUnicodeState.utf8DecodeNext == end()) + return { .text = {}, .width = 0, .stop_condition = StopCondition::EndOfInput }; + + // Points to the beginning of a grapheme cluster. + char const* const resultStart = _complexUnicodeState.currentClusterStart; + + // Total number of widths used in the current line. + unsigned processedTotalWidth = 0; + + while (true) + { + switch (detail::make_state(next(), end(), processedTotalWidth, maxWidth)) + { + case State::EndOfInput: + return { .text = { resultStart, _complexUnicodeState.currentClusterStart }, + .width = processedTotalWidth, + .stop_condition = StopCondition::EndOfInput }; + case State::EndOfWidth: + return { .text = { resultStart, _complexUnicodeState.currentClusterStart }, + .width = processedTotalWidth, + .stop_condition = StopCondition::EndOfWidth }; + case State::C0: + return { .text = { resultStart, _complexUnicodeState.currentClusterStart }, + .width = processedTotalWidth, + .stop_condition = StopCondition::UnexpectedInput }; + case State::ASCII: { + assert(processedTotalWidth < maxWidth); + assert(_complexUnicodeState.utf8.expectedLength == 0); + assert(_complexUnicodeState.utf8.currentLength == 0); + // The ASCII scan limit must be derived from the current position and the columns still free. + // Complex Unicode consumes more bytes than columns, so a limit computed once from the initial + // position would be stale here (empty or even reversed range). + char const* const endAtMaxWidth = std::min(end(), next() + (maxWidth - processedTotalWidth)); + auto const [stop, consumedEnd] = detail::process_only_ascii(next(), endAtMaxWidth); + LIBUNICODE_TRACE_SEGMENTER("grapheme_line_segmenter.process: ascii: \"{}\" (len {}, stop {})\n", + std::string_view(next(), consumedEnd), + (long) std::distance(next(), consumedEnd), + (int) stop); + assert(consumedEnd > next()); + auto const consumedWidth = static_cast(std::distance(next(), consumedEnd)); + assert(processedTotalWidth + consumedWidth <= maxWidth); + auto const asciiTextChunk = std::string_view(next(), consumedWidth); + 
_eventHandler.on_ascii(asciiTextChunk); + _complexUnicodeState.utf8DecodeNext = consumedEnd; + _complexUnicodeState.currentClusterStart = consumedEnd; + _complexUnicodeState.currentCodepointStart = consumedEnd; + assert(!asciiTextChunk.empty()); + _complexUnicodeState.lastCodepointHint = asciiTextChunk.back(); + processedTotalWidth += consumedWidth; + assert(processedTotalWidth <= maxWidth); + break; + } + case State::ComplexUnicode: { + // We know we have complex UTF-8 codepoints here. + // We need to process them one by one to determine their width. + // We also need to check if the current grapheme cluster fits into the current line. + // If not, we need to stop processing and return the current line. + // If the input is exhausted, we need to return the current line. + assert(processedTotalWidth < maxWidth); + // Only the columns still free on this line may be filled, so pass the remaining width + // rather than the full line width. + auto const chunk = detail::process_only_complex_unicode( + _eventHandler, _complexUnicodeState, _complexUnicodeState.utf8DecodeNext, end(), maxWidth - processedTotalWidth); + assert(_complexUnicodeState.utf8DecodeNext <= chunk.end); + LIBUNICODE_TRACE_SEGMENTER("grapheme_line_segmenter.process: complex: \"{}\" (len {}, width {}, stop {})\n", + std::string_view(_complexUnicodeState.utf8DecodeNext, chunk.end), + (long) std::distance(_complexUnicodeState.utf8DecodeNext, chunk.end), + chunk.totalWidth, + (int) chunk.stop_condition); + processedTotalWidth += chunk.totalWidth; + assert(processedTotalWidth <= maxWidth); + if (chunk.stop_condition != StopCondition::UnexpectedInput) + // The most recent grapheme cluster does not fit into the current line or the input is exhausted. + return { .text = std::string_view { resultStart, _complexUnicodeState.currentClusterStart }, + .width = processedTotalWidth, + .stop_condition = chunk.stop_condition }; + break; + } + } + } + } + + // Flushes out pending grapheme cluster, if any. + // + // @param maxWidth the maximum number of widths to fill in the current line + // + // @note A call to this function is idempotent. 
+ // + // @returns the result of the processing + LIBUNICODE_INLINE grapheme_segmentation_result flush(unsigned maxWidth) noexcept + { + return flush_complex_unicode(_eventHandler, _complexUnicodeState, maxWidth); + } + + // Processes a single byte. + // + // Subsequent calls to this function will continue processing the text + // + // @param byte the byte to process + // + // @return the result of the processing + ConvertResult process_single_byte(uint8_t byte) noexcept { return from_utf8(_complexUnicodeState.utf8, byte); } + + // Tests whether there is an incomplete UTF-8 codepoint pending. + bool is_utf8_byte_pending() const noexcept { return _complexUnicodeState.utf8.expectedLength > 0; } + + // Returns a copy of the current UTF-8 decoding state. + [[nodiscard]] utf8_decoder_state utf8_state() const noexcept { return _complexUnicodeState.utf8; } + + // Returns the last processed UTF-32 codepoint. + constexpr char32_t last_codepoint_hint() const noexcept { return _complexUnicodeState.lastCodepointHint; } + + // Resets the last codepoint hint. + constexpr void reset_last_codepoint_hint(char32_t value = 0) noexcept { _complexUnicodeState.lastCodepointHint = value; } + + void update_next_utf8_decode(char const* next) noexcept { _complexUnicodeState.utf8DecodeNext = next; } + char const* next() const noexcept { return _complexUnicodeState.utf8DecodeNext; } + char const* end() const noexcept { return _buffer.data() + _buffer.size(); } + + static std::pair process_only_ascii(std::string_view text) noexcept + { + return detail::process_only_ascii(text.data(), text.data() + text.size()); + } + + private: + detail::unicode_process_state _complexUnicodeState {}; + + // The buffer to scan. Its underlying storage must be used by the subsequent calls to process(). + // While consuming the buffer, the front of the buffer will be moved forward. 
+ std::string_view _buffer; + + detail::EventHandler _eventHandler; +}; + +template <> +class grapheme_line_segmenter<>: public grapheme_line_segmenter +{ + public: + grapheme_line_segmenter() = default; +}; + +template +grapheme_line_segmenter(Listener) -> grapheme_line_segmenter; + +template +grapheme_line_segmenter(Listener&, std::string_view) -> grapheme_line_segmenter; + +} // namespace unicode + +// {{{ ostream support +namespace std +{ +inline std::ostream& operator<<(std::ostream& os, unicode::StopCondition value) +{ + std::string_view name; + switch (value) + { + case unicode::StopCondition::UnexpectedInput: name = "UnexpectedInput"; break; + case unicode::StopCondition::EndOfInput: name = "EndOfInput"; break; + case unicode::StopCondition::EndOfWidth: name = "EndOfWidth"; break; + default: name = "INVALID"; break; + } + return os << name; +} + +inline std::ostream& operator<<(std::ostream& os, unicode::grapheme_segmentation_result const& value) +{ + return os << "{text: \"" << std::string(value.text) << "\", len: " << value.text.size() << ", width: " << value.width + << ", stop: " << value.stop_condition << "}"; +} +} // namespace std +// }}} diff --git a/src/libunicode/grapheme_line_segmenter.md b/src/libunicode/grapheme_line_segmenter.md new file mode 100644 index 0000000..7bef730 --- /dev/null +++ b/src/libunicode/grapheme_line_segmenter.md @@ -0,0 +1,72 @@ + +# Processing UTF-8 byte sequences into grapheme clusters + +## The Objective + +👪👪👪 + +- Process a contiguous sequence of UTF-8 encoded text into a sequence of grapheme clusters +- Stop processing when one of the following conditions is met: + - end of input stream is reached + - a control character (such as newline or escape character) has been found + - the maximum number of grapheme clusters in narrow width (aka.
page width) have been consumed (while wide characters count as two narrow characters) +- Allow resuming processing text when we previously stopped in the middle of a grapheme cluster +- The algorithm must be as resource-efficient as possible: + - do not require any dynamic memory allocations during text processing + - reduce instruction branching as much as possible + - utilize SIMD to improve throughput performance +- Invalid codepoints are treated as having East Asian Width Narrow (1 column) +- The event-emitting mechanism must be zero-overhead. If these events are not needed, they must not penalize performance. + +## Corollaries + +- When the end of the input is reached while a UTF-8 sequence has not been fully processed yet, no event will be emitted. The subsequent call to `process()` will resume UTF-8 decoding. +- When the end of the input is reached before a grapheme boundary has been found, no grapheme cluster will be reported. The subsequent call that detects the boundary will report the **full** grapheme cluster, accumulated over all calls. + +## Implementation + +Scanning US-ASCII can be easily implemented using SIMD, increasing scanning performance dramatically. + +Scanning non-US-ASCII text, i.e. complex Unicode codepoints, is considerably more involved, for the reasons below. + +In order to reliably stop scanning at the page width, we must take into account +that the character we see on the screen is not necessarily just a single byte, +nor even a single UTF-32 codepoint, but rather a sequence of UTF-32 codepoints. +This is what we call a **grapheme cluster**. A grapheme cluster is a single user-perceived character +that consists of one or more Unicode codepoints. + +We therefore must be able to determine where one grapheme cluster ends and the next one begins.
+ +Because scanning US-ASCII text can be implemented using SIMD but complex Unicode cannot, we split the two +tasks into their own subtasks, and then alternate between them in order to scan the entire text. + +In this article, we'll be focusing on scanning for complex Unicode. + +We also must be able to suspend and resume scanning text at any arbitrary point +in time, because we are not guaranteed to always have all bytes available in a single call. + +## Example Processing: Family Emoji + +``` +UTF-8 | F0 9F 91 A8 | E2 80 8D | F0 9F 91 A9 | E2 80 8D | F0 9F 91 A7 | E2 80 8D | F0 9F 91 A6 +UTF-32 | U+1F468 (👨) | U+200D | U+1F469 (👩) | U+200D | U+1F467 (👧) | U+200D | U+1F466 (👦) +GC | 👨‍👩‍👧‍👦 +``` + +## Events + +### On complete valid UTF-32 codepoint + +- remember byte pointer to mark start of next Unicode codepoint +- if the last codepoint and the next codepoint are GC breakable, then report the grapheme cluster + +### On complete grapheme cluster + +- assert: last and next valid codepoint are GC breakable +- if the new grapheme cluster fits into the page width, then report the grapheme cluster +- otherwise, remember that GC range and terminate + + +## Test cases + +- Have a sequence of two wide emoji (e.g. U+1F600), but a page width of only 3. Only one emoji fits per line. diff --git a/src/libunicode/grapheme_line_segmenter_test.cpp b/src/libunicode/grapheme_line_segmenter_test.cpp new file mode 100644 index 0000000..742a1e7 --- /dev/null +++ b/src/libunicode/grapheme_line_segmenter_test.cpp @@ -0,0 +1,645 @@ +/** + * This file is part of the "libunicode" project + * Copyright (c) 2024 Christian Parpart + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include + +#include + +#include +#include + +#include +#include + +using namespace std::string_view_literals; +using namespace std::string_literals; +using std::pair; +using StopCondition = unicode::StopCondition; + +struct expectation +{ + size_t offset; + size_t size; + + unsigned width; + + auto operator<=>(expectation const&) const = default; +}; + +// {{{ operator<<(ostream, ...) overrides +namespace std +{ + +std::ostream& operator<<(std::ostream& os, expectation const& e) +{ + return os << "{ offset: " << e.offset << ", size: " << e.size << ", width: " << e.width << " }"; +} + +[[maybe_unused]] std::ostream& operator<<(std::ostream& os, std::pair const& v) +{ + return os << "{" << v.first << ", " << v.second << "}"; +} + +} // namespace std +// }}} + +// {{{ helpers +namespace +{ + +using segmentation_result = unicode::grapheme_segmentation_result; +using unicode::grapheme_line_segmenter; + +enum class NumericEscape +{ + Octal, + Hex +}; + +std::string escape(uint8_t ch, NumericEscape numericEscape = NumericEscape::Hex) +{ + switch (ch) + { + case '\\': return "\\\\"; + case 0x1B: return "\\e"; + case '\t': return "\\t"; + case '\r': return "\\r"; + case '\n': return "\\n"; + case '"': return "\\\""; + default: + if (0x20 <= ch && ch < 0x7E) + return fmt::format("{}", static_cast(ch)); + else if (numericEscape == NumericEscape::Hex) + return fmt::format("\\x{:02x}", static_cast(ch) & 0xFF); + else + return fmt::format("\\{:03o}", static_cast(ch) & 0xFF); + } +} + +inline std::string e(std::string_view s, NumericEscape numericEscape = NumericEscape::Hex) +{ + auto result = std::string {}; + for (char c: s) + 
result += escape(static_cast(c), numericEscape); + return result; +} + +struct invalid_sequence +{ + std::string_view value; + auto operator<=>(invalid_sequence const&) const = default; +}; + +struct ascii_sequence +{ + std::string_view value; + auto operator<=>(ascii_sequence const&) const = default; +}; + +struct complex_unicode_sequence +{ + std::string_view value; + unsigned width; + auto operator<=>(complex_unicode_sequence const&) const = default; +}; + +[[maybe_unused]] std::ostream& operator<<(std::ostream& os, complex_unicode_sequence const& seq) +{ + return os << "{ value: \"" << e(seq.value) << "\", width: " << seq.width << " }"; +} + +} // namespace + +namespace +{ + +using Record = std::variant; + +auto constexpr FamilyEmoji = U"\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466"sv; +auto constexpr SmileyEmoji = U"\U0001F600"sv; +auto constexpr CopyrightSign = U"\u00A9"sv; + +template +auto u8(T text) +{ + return unicode::convert_to(text); +} + +class event_logger final: public unicode::grapheme_segmentation_listener +{ + public: + static event_logger& get() + { + static event_logger instance; + return instance; + } + + void on_invalid(std::string_view invalid) noexcept + { + UNSCOPED_INFO(fmt::format("[event_logger] on_invalid: {}\n", e(invalid))); + } + + void on_ascii(std::string_view text) noexcept { UNSCOPED_INFO(fmt::format("[event_logger] on_ascii: {}\n", text)); } + + void on_grapheme_cluster(std::string_view text, unsigned width) noexcept + { + UNSCOPED_INFO(fmt::format("[event_logger] on_grapheme_cluster: {} (width: {})\n", text, width)); + } +}; + +class event_recorder final: public unicode::grapheme_segmentation_listener +{ + public: + explicit event_recorder(std::string_view text): _text(text) {} + + void reset(std::string_view text) noexcept + { + _text = text; + _records.clear(); + } + + std::string_view text() const noexcept { return _text; } + + void on_invalid(std::string_view sequence) noexcept + { + 
UNSCOPED_INFO(fmt::format("[event_logger] on_invalid: {}\n", e(sequence))); + _records.emplace_back(invalid_sequence { sequence }); + } + + void on_ascii(std::string_view sequence) noexcept + { + UNSCOPED_INFO(fmt::format("[event_logger] on_ascii: {}\n", sequence)); + _records.emplace_back(ascii_sequence { sequence }); + } + + void on_grapheme_cluster(std::string_view cluster, unsigned width) noexcept + { + UNSCOPED_INFO(fmt::format("[event_recorder] grapheme cluster {}+{} '{}' (width: {}, u8: {})\n", + std::distance(_text.data(), cluster.data()), + cluster.size(), + cluster, + width, + e(cluster))); + _records.emplace_back(complex_unicode_sequence { cluster, width }); + } + + std::vector const& records() const noexcept { return _records; } + size_t size() const noexcept { return _records.size(); } + + complex_unicode_sequence const& cluster(size_t i) const noexcept + { + return std::get(_records.at(i)); + } + + expectation at(size_t i) const noexcept + { + auto const& cu = cluster(i); + return expectation { .offset = (size_t) std::distance(_text.data(), cu.value.data()), + .size = cu.value.size(), + .width = cu.width }; + } + + private: + std::string_view _text; + std::vector _records; +}; + +segmentation_result scan_text(std::string_view text, unsigned width) +{ + auto recorder = event_recorder { text }; + auto segmenter = grapheme_line_segmenter { recorder, text }; + UNSCOPED_INFO(fmt::format("Processing {} bytes @{}: \"{}\"\n", text.size(), (void*) text.data(), e(text))); + auto const main = segmenter.process(width); + if (main.width == width) + return main; + auto const fin = segmenter.flush(width - main.width); + if (fin.text.empty()) + return main; + return segmentation_result { + .text = std::string_view { text.data(), fin.text.data() + fin.text.size() }, + .width = main.width + fin.width, + .stop_condition = main.stop_condition, + }; +} + +} // namespace +// }}} + +// {{{ helper method tests for ASCII-only +inline auto process_only_ascii(std::string_view 
text) +{ + auto const result = unicode::detail::process_only_ascii(text); + return pair { result.first, static_cast(std::distance(text.data(), result.second)) }; +} + +TEST_CASE("grapheme_line_segmenter.process_only_ascii") +{ + // clang-format off + // ensure SIMD-enabled processing stops at control characters and complex Unicode characters at the beginning + CHECK(process_only_ascii("\rABCD") == pair { StopCondition::UnexpectedInput, 0 }); + CHECK(process_only_ascii("\nABCD") == pair { StopCondition::UnexpectedInput, 0 }); + CHECK(process_only_ascii("\033ABCD") == pair { StopCondition::UnexpectedInput, 0 }); + CHECK(process_only_ascii(u8(CopyrightSign)) == pair { StopCondition::UnexpectedInput, 0 }); + CHECK(process_only_ascii(u8(SmileyEmoji)) == pair { StopCondition::UnexpectedInput, 0 }); + + // ensure SIMD-enabled processing stops at control characters + CHECK(process_only_ascii("1234\033") == pair { StopCondition::UnexpectedInput, 4 }); + CHECK(process_only_ascii("12345678\033") == pair { StopCondition::UnexpectedInput, 8 }); + CHECK(process_only_ascii("0123456789ABCDEF\033") == pair { StopCondition::UnexpectedInput, 16 }); + CHECK(process_only_ascii("0123456789ABCDEF1\033") == pair { StopCondition::UnexpectedInput, 17 }); + CHECK(process_only_ascii("0123456789ABCDEF1" + u8(SmileyEmoji)) == pair { StopCondition::UnexpectedInput, 17 }); + CHECK(process_only_ascii("0123456789ABCDEF0123456789ABCDE\033") == pair { StopCondition::UnexpectedInput, 31 }); + + // ensure SIMD-enabled processing stops at complex Unicode + CHECK(process_only_ascii("0123456789ABCDEF0123456789ABCDE\x80") == pair { StopCondition::UnexpectedInput, 31 }); + CHECK(process_only_ascii("0123456789ABCDEF0123456789ABCDE\x81") == pair { StopCondition::UnexpectedInput, 31 }); + CHECK(process_only_ascii("0123456789ABCDEF0123456789ABCDE\xFF") == pair { StopCondition::UnexpectedInput, 31 }); + + // test minimal input + CHECK(process_only_ascii("") == pair { StopCondition::EndOfInput, 0 }); + 
CHECK(process_only_ascii("0") == pair { StopCondition::EndOfInput, 1 }); + + // test at and around SIMD (SSE2) boundary + CHECK(process_only_ascii("0123456789ABCDE") == pair { StopCondition::EndOfInput, 15 }); + CHECK(process_only_ascii("0123456789ABCDEF") == pair { StopCondition::EndOfInput, 16 }); + CHECK(process_only_ascii("0123456789ABCDEF1") == pair { StopCondition::EndOfInput, 17 }); + // clang-format on +} +// }}} + +// {{{ helper method tests for complex unicode only +TEST_CASE("grapheme_line_segmenter.process_only_complex_unicode.0") +{ + auto const text = ""s; + auto recorder = event_recorder { text }; + auto state = unicode::detail::unicode_process_state {}; + auto const result = unicode::detail::process_only_complex_unicode(recorder, state, text, 80); + CHECK(result.end == text.data()); + CHECK(result.totalWidth == 0); + CHECK(recorder.size() == 0); + + unicode::detail::flush_complex_unicode(recorder, state, 80); + CHECK(recorder.size() == 0); +} + +TEST_CASE("grapheme_line_segmenter.process_only_complex_unicode.1") +{ + // 1 complex grapheme cluster and 1 simple grapheme cluster + auto const text = u8(SmileyEmoji) + "."s; + + auto recorder = event_recorder { text }; + auto state = unicode::detail::unicode_process_state {}; + auto const result = unicode::detail::process_only_complex_unicode(recorder, state, text, 80); + CHECK(result.end == text.data() + 4); + CHECK(result.totalWidth == 2); + CHECK(recorder.size() == 1); + CHECK(recorder.at(0) == expectation { .offset = 0, .size = 4, .width = 2 }); +} + +TEST_CASE("grapheme_line_segmenter.process_only_complex_unicode.2") +{ + // 2 complex grapheme cluster and 1 simple grapheme cluster + // NB: We append the trailing dot to force the grapheme line segmenter to process the last grapheme cluster. 
+ auto const text = u8(SmileyEmoji) + u8(SmileyEmoji) + "."s; + + auto recorder = event_recorder { text }; + auto state = unicode::detail::unicode_process_state {}; + auto const result = unicode::detail::process_only_complex_unicode(recorder, state, text, 80); + CHECK(result.end == text.data() + 8); + CHECK(result.totalWidth == 4); + CHECK(recorder.size() == 2); + CHECK(recorder.at(0) == expectation { .offset = 0, .size = 4, .width = 2 }); + CHECK(recorder.at(1) == expectation { .offset = 4, .size = 4, .width = 2 }); + + unicode::detail::flush_complex_unicode(recorder, state, 80); + CHECK(recorder.size() == 2); +} + +TEST_CASE("grapheme_line_segmenter.process_only_complex_unicode.3") +{ + // 3 simple grapheme clusters (and have no bytes next to them) + auto const text = u8(SmileyEmoji) + u8(SmileyEmoji) + u8(SmileyEmoji); + + auto recorder = event_recorder { text }; + auto state = unicode::detail::unicode_process_state {}; + auto const result = unicode::detail::process_only_complex_unicode(recorder, state, text, 80); + CHECK(std::distance(text.data(), result.end) == 3 * 4lu); + CHECK(result.totalWidth == 4); + CHECK(recorder.size() == 2); + CHECK(recorder.at(0) == expectation { .offset = 0, .size = 4, .width = 2 }); + CHECK(recorder.at(1) == expectation { .offset = 4, .size = 4, .width = 2 }); + REQUIRE(long(std::distance((char const*) text.data(), state.utf8DecodeNext)) == 12); + + unicode::detail::flush_complex_unicode(recorder, state, 80); + CHECK(recorder.size() == 3); + CHECK(recorder.at(2) == expectation { .offset = 8, .size = 4, .width = 2 }); +} + +TEST_CASE("grapheme_line_segmenter.process_only_complex_unicode.invalid.1") +{ + auto const text = "\xFF"sv; + auto recorder = event_recorder { text }; + auto state = unicode::detail::unicode_process_state {}; + auto const result = unicode::detail::process_only_complex_unicode(recorder, state, text, 80); + CHECK(result.end == text.data() + 1); + CHECK(result.totalWidth == 1); + REQUIRE(recorder.size() == 1); + 
REQUIRE(std::holds_alternative(recorder.records().at(0))); + CHECK(std::get(recorder.records().at(0)).value == "\xFF"sv); + + unicode::detail::flush_complex_unicode(recorder, state, 80); + CHECK(recorder.size() == 1); +} + +TEST_CASE("grapheme_line_segmenter.process_only_complex_unicode.invalid.2") +{ + auto const text = "\xFF\xFFx"sv; + auto recorder = event_recorder { text }; + auto state = unicode::detail::unicode_process_state {}; + auto const result = unicode::detail::process_only_complex_unicode(recorder, state, text, 80); + CHECK(result.end == text.data() + 2); + CHECK(result.totalWidth == 2); + CHECK(recorder.size() == 2); + + unicode::detail::flush_complex_unicode(recorder, state, 80); + CHECK(recorder.size() == 2); +} + +TEST_CASE("grapheme_line_segmenter.process_only_complex_unicode.sliced_calls") +{ + auto constexpr text = "\xF0\x9F\x98\x80\033\\0123456789ABCDEF"sv; // U+1F600 + auto constexpr splitOffset = 3; + auto constexpr chunk1 = text.substr(0, splitOffset); + + auto recorder = event_recorder { text }; + auto state = unicode::detail::unicode_process_state {}; + auto const r1 = unicode::detail::process_only_complex_unicode(recorder, state, chunk1, 80); + REQUIRE(state.utf8.expectedLength == 4); + REQUIRE(state.utf8.currentLength == 3); + // We must not have emitted any grapheme cluster yet. 
+ CHECK(std::distance(chunk1.data(), r1.end) == 3); + CHECK(r1.totalWidth == 0); + + auto constexpr chunk2 = text.substr(splitOffset); + auto const r2 = unicode::detail::process_only_complex_unicode(recorder, state, chunk2, 80); + CHECK(r2.end == chunk2.data() + 1); + CHECK(r2.totalWidth == 2); + CHECK(state.utf8.expectedLength == 0); + CHECK(state.utf8.currentLength == 0); +} + +// }}} + +TEST_CASE("grapheme_line_segmenter.nocallbacks") +{ + auto segmenter = unicode::grapheme_line_segmenter {}; + segmenter.reset("Hello\033[m"sv); + auto const result = segmenter.process(80); + CHECK(result.text == "Hello"sv); + CHECK(result.width == 5); + CHECK(result.stop_condition == StopCondition::UnexpectedInput); +} + +// {{{ test ASCII-only +TEST_CASE("grapheme_line_segmenter.ascii.empty") +{ + CHECK(scan_text(""sv, 4) == segmentation_result { "", 0, StopCondition::EndOfInput }); +} + +TEST_CASE("grapheme_line_segmenter.ascii.32") +{ + auto const text = "0123456789ABCDEF0123456789ABCDEF"sv; + + auto const shortStr = [&](size_t len) { + return text.substr(0, len); + }; + + // clang-format off + CHECK(scan_text(text, 32) == segmentation_result { .text = text, .width = 32, .stop_condition = StopCondition::EndOfInput }); + CHECK(scan_text(text, 16) == segmentation_result { .text = shortStr(16), .width = 16, .stop_condition = StopCondition::EndOfWidth }); + CHECK(scan_text(text, 8) == segmentation_result { .text = shortStr(8), .width = 8, .stop_condition = StopCondition::EndOfWidth }); + CHECK(scan_text(text, 1) == segmentation_result { .text = shortStr(1), .width = 1, .stop_condition = StopCondition::EndOfWidth }); + CHECK(scan_text(text, 0) == segmentation_result { .text = shortStr(0), .width = 0, .stop_condition = StopCondition::EndOfWidth }); + // clang-format on +} + +TEST_CASE("grapheme_line_segmenter.ascii.mixed_with_controls") +{ + // clang-format off + CHECK(scan_text("\0331234", 80) == segmentation_result { "", 0, StopCondition::UnexpectedInput }); + 
CHECK(scan_text("1234\033", 80) == segmentation_result { "1234", 4, StopCondition::UnexpectedInput }); + CHECK(scan_text("12345678\033", 80) == segmentation_result { "12345678", 8, StopCondition::UnexpectedInput }); + CHECK(scan_text("0123456789ABCDEF\033", 80) == segmentation_result { "0123456789ABCDEF", 16, StopCondition::UnexpectedInput }); + CHECK(scan_text("0123456789ABCDEF1\033", 80) == segmentation_result { "0123456789ABCDEF1", 17, StopCondition::UnexpectedInput }); + CHECK(scan_text("0123456789ABCDEF0123456789ABCD\033F", 80) == segmentation_result { "0123456789ABCDEF0123456789ABCD", 30, StopCondition::UnexpectedInput }); + // clang-format on +} +// }}} + +// {{{ test complex unicode only +TEST_CASE("grapheme_line_segmenter.complex.grapheme_cluster.1") +{ + auto constexpr OUmlaut = "\xC3\xB6"sv; + CHECK(scan_text(OUmlaut, 2) == segmentation_result { OUmlaut, 1, StopCondition::EndOfInput }); + + auto const familyEmoji8 = u8(FamilyEmoji); + auto const result = scan_text(familyEmoji8, 80); + CHECK(result == segmentation_result { familyEmoji8, 2, StopCondition::EndOfInput }); +} + +TEST_CASE("grapheme_line_segmenter.complex.grapheme_cluster.2") +{ + auto const familyEmoji8 = u8(FamilyEmoji) + u8(FamilyEmoji); + auto const result = scan_text(familyEmoji8, 80); + CHECK(result == segmentation_result { .text = familyEmoji8, .width = 4, .stop_condition = StopCondition::EndOfInput }); +} +// }}} + +// {{{ test invalid UTF-8 sequences +TEST_CASE("grapheme_line_segmenter.invalid_char.1") +{ + auto const text = "1234\x80"sv; + auto recorder = event_recorder { text }; + auto segmenter = grapheme_line_segmenter { recorder, text }; + auto const result = segmenter.process(80); + UNSCOPED_INFO("result: " << result); + REQUIRE(recorder.size() == 2); + REQUIRE(std::holds_alternative(recorder.records().at(0))); + REQUIRE(std::holds_alternative(recorder.records().at(1))); + REQUIRE(std::get(recorder.records().at(0)).value == "1234"sv); + 
REQUIRE(std::get(recorder.records().at(1)).value == "\x80"sv); +} +// }}} + +// {{{ mixed primitive ASCII and complex unicode +TEST_CASE("grapheme_line_segmenter.mixed.1") +{ + auto const text = "0123456789{\xE2\x94\x80}ABCDEF"sv; + auto recorder = event_recorder { text }; + auto segmenter = grapheme_line_segmenter { recorder, text }; + auto const result = segmenter.process(80); + segmenter.flush(80); + + UNSCOPED_INFO("result: " << result); + CHECK(e(result.text) == e(text)); + CHECK(result.width == 19); + + CHECK(recorder.size() == 3); + auto const& records = recorder.records(); + CHECK(std::get(records.at(0)) == ascii_sequence { "0123456789{"sv }); + CHECK(std::get(records.at(1)) == complex_unicode_sequence { "\xE2\x94\x80"sv, 1 }); + CHECK(std::get(records.at(2)) == ascii_sequence { "}ABCDEF"sv }); +} + +TEST_CASE("grapheme_line_segmenter.mixed.2") +{ + auto const text = u8(FamilyEmoji); + auto segmenter = grapheme_line_segmenter { event_logger::get(), text }; + auto const main = segmenter.process(80); + auto const fini = segmenter.flush(80); + CHECK(static_cast(std::distance(text.data(), segmenter.next())) == text.size()); + CHECK(main == segmentation_result { .text = "", .width = 0, .stop_condition = StopCondition::EndOfInput }); + CHECK(fini == segmentation_result { .text = text, .width = 2, .stop_condition = StopCondition::EndOfInput }); +} + +TEST_CASE("grapheme_line_segmenter.mixed.3") +{ + auto const text = u8(FamilyEmoji) + "ABC"s; + auto segmenter = grapheme_line_segmenter { event_logger::get(), text }; + + auto const main = segmenter.process(80); + CHECK(static_cast(std::distance(text.data(), segmenter.next())) == text.size()); + CHECK(main == segmentation_result { .text = text, .width = 5, .stop_condition = StopCondition::EndOfInput }); + + // Because we've scanned it all already in the main call, the flush() call will return an empty result. 
+ auto const fini = segmenter.flush(80); + CHECK(fini == segmentation_result { .text = {}, .width = 0, .stop_condition = StopCondition::EndOfInput }); +} + +TEST_CASE("grapheme_line_segmenter.mixed.4") +{ + auto const text = u8(FamilyEmoji) + "ABC"s + u8(FamilyEmoji); + auto const result = scan_text(text, 80); + CHECK(result == segmentation_result { .text = text, .width = 7, .stop_condition = StopCondition::EndOfInput }); +} +// }}} + +TEST_CASE("grapheme_line_segmenter.complex.half-overflowing") +{ + auto const oneEmoji = u8(SmileyEmoji); + auto const textStr = oneEmoji + oneEmoji + oneEmoji; + auto const text = std::string_view(textStr); + + INFO("match at boundary of first grapheme cluster"); + CHECK(scan_text(text, 2) == segmentation_result { text.substr(0, 1 * oneEmoji.size()), 2, StopCondition::EndOfWidth }); + + INFO("match at boundary of second grapheme cluster"); + CHECK(scan_text(text, 4) == segmentation_result { text.substr(0, 2 * oneEmoji.size()), 4, StopCondition::EndOfWidth }); + + INFO("second grapheme cluster is half overflowing"); + CHECK(scan_text(text, 3) == segmentation_result { text.substr(0, 1 * oneEmoji.size()), 2, StopCondition::EndOfWidth }); + + INFO("third grapheme cluster is half overflowing"); + CHECK(scan_text(text, 5) == segmentation_result { text.substr(0, 2 * oneEmoji.size()), 4, StopCondition::EndOfInput }); +} + +TEST_CASE("grapheme_line_segmenter.complex.half-overflowing.and-resume.1") +{ + auto const oneEmoji = u8(SmileyEmoji); + auto const textStr = oneEmoji + oneEmoji + oneEmoji; + auto const text = std::string_view(textStr); + auto recorder = event_recorder { text }; + auto segmenter = grapheme_line_segmenter { recorder, text }; + + auto const one = segmenter.process(3); // first emoji (second is overlapping) + CHECK(one.text == text.substr(0, 4)); + CHECK(one.width == 2); + CHECK((void*) one.text.data() == (void*) (text.data() + 0)); + CHECK(recorder.size() == 1); + CHECK(recorder.at(0) == expectation { .offset = 0, .size = 
4, .width = 2 }); + + auto const two = segmenter.process(2); // resume second emoji + CHECK(two.text == text.substr(4, 4)); + CHECK(two.width == 2); + CHECK((void*) two.text.data() == (void*) (text.data() + 4)); + CHECK(recorder.size() == 2); + CHECK(recorder.at(1) == expectation { .offset = 4, .size = 4, .width = 2 }); +} + +TEST_CASE("grapheme_line_segmenter.complex.half-overflowing.and-resume.2") +{ + auto const oneEmoji = u8(SmileyEmoji); + auto const textStr = oneEmoji + oneEmoji + oneEmoji; + auto const text = std::string_view(textStr); + auto recorder = event_recorder { text }; + auto segmenter = grapheme_line_segmenter { recorder, text }; + INFO(fmt::format("oneEmoji: {} ({}), text: {}", e(oneEmoji), oneEmoji.size(), e(text))); + + // Process first two emoji (third is overlapping) + auto const one = segmenter.process(5); + INFO("one: " << one); + CHECK(one.text == text.substr(0, 8)); + CHECK(one.width == 4); + CHECK((void*) one.text.data() == (void*) (text.data() + 0)); + CHECK(recorder.size() == 2); + CHECK(recorder.at(0) == expectation { .offset = 0, .size = 4, .width = 2 }); + CHECK(recorder.at(1) == expectation { .offset = 4, .size = 4, .width = 2 }); + + // Resume processing third emoji + // This emoji cannot be emitted yet, because it is not known yet whether it's a full emoji or not + // (e.g. due to variation selectors). + auto const two = segmenter.process(2); // resume third emoji + INFO("two: " << two); + CHECK(two.text.empty()); + CHECK(two.width == 0); + CHECK(recorder.size() == 2); + + // Inform the segmenter that we've explicitly reached the end of the input. + // Thus, the segmenter must emit the third emoji now.
+ auto const fin = segmenter.flush(2); + CHECK(fin.text == text.substr(8, 4)); + CHECK(fin.width == 2); + CHECK((void*) fin.text.data() == (void*) (text.data() + 8)); + CHECK(recorder.size() == 3); + CHECK(recorder.at(2) == expectation { .offset = 8, .size = 4, .width = 2 }); +} + +TEST_CASE("grapheme_line_segmenter.complex.sliced_calls") +{ + // auto const text = u8(SmileyEmoji) + "\033\\0123456789ABCDEF"s; // U+1F600 + auto constexpr text = "\xF0\x9F\x98\x80\033\\0123456789ABCDEF"sv; // U+1F600 + auto constexpr splitOffset = 3; + auto constexpr chunkOne = text.substr(0, splitOffset); + auto constexpr chunkTwo = text.substr(splitOffset); + + auto recorder = event_recorder { text }; + auto segmenter = grapheme_line_segmenter { recorder, chunkOne }; + auto const result1 = segmenter.process(3); + + // For the 4-byte sequence, we've only read 3 bytes, so we expect the next call to continue reading the 4th byte. + REQUIRE(segmenter.utf8_state().expectedLength == 4); + REQUIRE(segmenter.utf8_state().currentLength == 3); + CHECK(result1.width == 0); // We must not have emitted any grapheme cluster yet. + CHECK(result1.text == ""); + CHECK(result1.stop_condition == StopCondition::EndOfInput); + + segmenter.expand_buffer_by(chunkTwo.size()); + auto const result2 = segmenter.process(80); + + REQUIRE(segmenter.utf8_state().expectedLength == 0); + CHECK(result2.width == 2); + CHECK(result2.stop_condition == StopCondition::UnexpectedInput); // control character \033 + REQUIRE(e(result2.text) == e(u8(SmileyEmoji))); +} diff --git a/src/libunicode/scan.cpp b/src/libunicode/scan.cpp deleted file mode 100644 index c89b436..0000000 --- a/src/libunicode/scan.cpp +++ /dev/null @@ -1,329 +0,0 @@ -/** - * This file is part of the "libunicode" project - * Copyright (c) 2020 Christian Parpart - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -// clang-format off -#if __has_include() && defined(LIBUNICODE_USE_STD_SIMD) - #define USE_STD_SIMD - #include - namespace stdx = std::experimental; -#elif __has_include() && defined(LIBUNICODE_USE_STD_SIMD) - #define USE_STD_SIMD - #include - namespace stdx = std; -#elif defined(__SSE2__) - #include -#endif -// clang-format on - -using std::distance; -using std::get; -using std::holds_alternative; -using std::max; -using std::min; -using std::string_view; - -namespace unicode -{ - -namespace -{ - [[maybe_unused]] int countTrailingZeroBits(unsigned int value) noexcept - { -#if defined(_WIN32) - // return _tzcnt_u32(value); - // Don't do _tzcnt_u32, because that's only available on x86-64, but not on ARM64. - unsigned long r = 0; - _BitScanForward(&r, value); - return r; -#else - return __builtin_ctz(value); -#endif - } - - template - constexpr bool ascending(T low, T val, T high) noexcept - { - return low <= val && val <= high; - } - - constexpr bool is_control(char ch) noexcept - { - return static_cast(ch) < 0x20; - } - - // Tests if given UTF-8 byte is part of a complex Unicode codepoint, that is, a value greater than U+7E. - constexpr bool is_complex(char ch) noexcept - { - return static_cast(ch) & 0x80; - } - - // Tests if given UTF-8 byte is a single US-ASCII text codepoint. This excludes control characters. 
- constexpr bool is_ascii(char ch) noexcept - { - return !is_control(ch) && !is_complex(ch); - } -} // namespace - -size_t detail::scan_for_text_ascii(string_view text, size_t maxColumnCount) noexcept -{ - auto input = text.data(); - auto const end = text.data() + min(text.size(), maxColumnCount); -#if defined(USE_STD_SIMD) - constexpr int numberOfElements = stdx::simd_abi::max_fixed_size; - stdx::fixed_size_simd simd_text {}; - while (input < end - numberOfElements) - { - simd_text.copy_from(input, stdx::element_aligned); - - // check for control - // TODO check for complex - auto const simd_mask_text = (simd_text < 0x20); - if (stdx::popcount(simd_mask_text) > 0) - { - input += stdx::find_first_set(simd_mask_text); - break; - } - input += numberOfElements; - } -#elif defined(USE_INTRINSICS) - intrinsics::m128i const ControlCodeMax = intrinsics::set1_epi8(0x20); // 0..0x1F - intrinsics::m128i const Complex = intrinsics::set1_epi8(-128); // equals to 0x80 (0b1000'0000) - - while (input < end - sizeof(intrinsics::m128i)) - { - intrinsics::m128i batch = intrinsics::load_unaligned((intrinsics::m128i*) input); - intrinsics::m128i isControl = intrinsics::compare_less(batch, ControlCodeMax); - intrinsics::m128i isComplex = intrinsics::and128(batch, Complex); - // intrinsics::m128i isComplex = _mm_cmplt_epi8(batch, Complex); - intrinsics::m128i testPack = intrinsics::or128(isControl, isComplex); - if (int const check = intrinsics::movemask_epi8(testPack); check != 0) - { - int advance = countTrailingZeroBits(static_cast(check)); - input += advance; - break; - } - input += sizeof(intrinsics::m128i); - } -#endif - - while (input != end && is_ascii(*input)) - ++input; - - // if (static_cast(distance(text.data(), input))) - // fmt::print( - // "countAsciiTextChars: {} bytes: \"{}\"\n", - // static_cast(distance(text.data(), input)), - // (string_view(text.data(), static_cast(distance(text.data(), input))))); - - return static_cast(distance(text.data(), input)); -} - 
-scan_result detail::scan_for_text_nonascii(scan_state& state, - string_view text, - size_t maxColumnCount, - grapheme_cluster_receiver& receiver) noexcept -{ - size_t count = 0; - - char const* start = text.data(); - char const* end = start + text.size(); - char const* input = start; - char const* clusterStart = start; - char const* lastCodepointStart = start; - - unsigned byteCount = 0; // bytes consume for the current codepoint - - // TODO: move currentClusterWidth to scan_state. - size_t currentClusterWidth = 0; // current grapheme cluster's East Asian Width - - char const* resultStart = state.utf8.expectedLength ? start - state.utf8.currentLength : start; - char const* resultEnd = resultStart; - - while (input != end && count <= maxColumnCount) - { - if (is_control(*input) || !is_complex(*input)) - { - // Incomplete UTF-8 sequence hit. That's invalid as well. - if (state.utf8.expectedLength) - { - ++count; - receiver.receiveInvalidGraphemeCluster(); - state.utf8 = {}; - } - state.lastCodepointHint = 0; - resultEnd = input; - break; - } - - auto const result = from_utf8(state.utf8, static_cast(*input++)); - ++byteCount; - - if (holds_alternative(result)) - continue; - - if (holds_alternative(result)) - { - auto const prevCodepoint = state.lastCodepointHint; - auto const nextCodepoint = get(result).value; - auto const nextWidth = max(currentClusterWidth, static_cast(width(nextCodepoint))); - state.lastCodepointHint = nextCodepoint; - if (grapheme_segmenter::breakable(prevCodepoint, nextCodepoint)) - { - // Flush out current grapheme cluster's East Asian Width. - count += currentClusterWidth; - - if (count + nextWidth > maxColumnCount) - { - // Currently scanned grapheme cluster won't fit. Break at start. - currentClusterWidth = 0; - input -= byteCount; - break; - } - receiver.receiveGraphemeCluster(string_view(clusterStart, byteCount), currentClusterWidth); - - // And start a new grapheme cluster. 
- currentClusterWidth = nextWidth; - clusterStart = lastCodepointStart; - lastCodepointStart = input - byteCount; - byteCount = 0; - resultEnd = input; - } - else - { - resultEnd = input; - // Increase width on VS16 but do not decrease on VS15. - if (nextCodepoint == 0xFE0F) // VS16 - { - currentClusterWidth = 2; - if (count + currentClusterWidth > maxColumnCount) - { - // Rewinding by {byteCount} bytes (overflow due to VS16). - currentClusterWidth = 0; - input = clusterStart; - break; - } - } - - // Consumed {byteCount} bytes for grapheme cluster. - lastCodepointStart = input - byteCount; - } - } - else - { - assert(holds_alternative(result)); - count++; - receiver.receiveInvalidGraphemeCluster(); - currentClusterWidth = 0; - state.lastCodepointHint = 0; - state.utf8.expectedLength = 0; - byteCount = 0; - } - } - count += currentClusterWidth; - - assert(resultStart <= resultEnd); - - state.next = input; - return { count, resultStart, resultEnd }; -} - -scan_result scan_text(scan_state& state, std::string_view text, size_t maxColumnCount) noexcept -{ - return scan_text(state, text, maxColumnCount, null_receiver::get()); -} - -scan_result scan_text(scan_state& state, - std::string_view text, - size_t maxColumnCount, - grapheme_cluster_receiver& receiver) noexcept -{ - // ----(a)---> A -------> END - // ^ | - // | | - // Start (a) (b) - // | | - // | v - // ----(b)---> B -------> END - - enum class NextState - { - Trivial, - Complex - }; - - auto result = scan_result { 0, text.data(), text.data() }; - - if (state.next == nullptr) - state.next = text.data(); - - // If state indicates that we previously started consuming a UTF-8 sequence but did not complete yet, - // attempt to finish that one first. 
- if (state.utf8.expectedLength != 0) - { - result = detail::scan_for_text_nonascii(state, text, maxColumnCount, receiver); - text = std::string_view(result.end, - static_cast(std::distance(result.end, text.data() + text.size()))); - } - - if (text.empty()) - return result; - - auto nextState = is_complex(text.front()) ? NextState::Complex : NextState::Trivial; - while (result.count < maxColumnCount && state.next != (text.data() + text.size())) - { - switch (nextState) - { - case NextState::Trivial: { - auto const count = detail::scan_for_text_ascii(text, maxColumnCount - result.count); - if (!count) - return result; - receiver.receiveAsciiSequence(text.substr(0, count)); - result.count += count; - state.next += count; - result.end += count; - nextState = NextState::Complex; - text.remove_prefix(count); - break; - } - case NextState::Complex: { - auto const sub = - detail::scan_for_text_nonascii(state, text, maxColumnCount - result.count, receiver); - if (!sub.count) - return result; - nextState = NextState::Trivial; - result.count += sub.count; - result.end = sub.end; - text.remove_prefix(static_cast(std::distance(sub.start, sub.end))); - break; - } - } - } - - assert(result.start <= result.end); - assert(result.end <= state.next); - - return result; -} - -} // namespace unicode diff --git a/src/libunicode/scan.h b/src/libunicode/scan.h deleted file mode 100644 index 902cc1c..0000000 --- a/src/libunicode/scan.h +++ /dev/null @@ -1,112 +0,0 @@ -/** - * This file is part of the "libunicode" project - * Copyright (c) 2020 Christian Parpart - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include - -namespace unicode -{ - -/// Holds the result of a call to scan_test(). -struct scan_result -{ - /// Number of columns scanned. - /// One column equals a single narrow-width codepoint. - /// Codepoints with property East Asian Width Wide are treated as two columns. - size_t count; - - /// Pointer to UTF-8 grapheme cluster start. - char const* start; - - /// Pointer to UTF-8 grapheme cluster end, i.e. one byte behind - /// the last successfuly processed complete UTF-8 byte.. - char const* end; -}; - -/// Holds the state to keep through a consecutive sequence of calls to scan_test(). -/// -/// This state holds the UTF-8 decoding state, if processing had to be stopped -/// at an incomplete UTF-8 byte sequence, -/// and the last decoded Unicode codepoint necessary for grapheme cluster segmentation. -struct scan_state -{ - utf8_decoder_state utf8 {}; - char32_t lastCodepointHint {}; - - /// Pointer to one byte after the last scanned codepoint. - char const* next {}; -}; - -/// Callback-interface that allows precisely understanding the structure of a UTF-8 sequence. -class grapheme_cluster_receiver -{ - public: - virtual ~grapheme_cluster_receiver() = default; - - virtual void receiveAsciiSequence(std::string_view codepoints) noexcept = 0; - virtual void receiveGraphemeCluster(std::string_view codepoints, size_t columnCount) noexcept = 0; - virtual void receiveInvalidGraphemeCluster() noexcept = 0; -}; - -/// Quite obviousely, this grapheme_cluster_receiver will do nothing. 
-class null_receiver final: public grapheme_cluster_receiver -{ - public: - void receiveAsciiSequence(std::string_view) noexcept override {} - void receiveGraphemeCluster(std::string_view, size_t) noexcept override {} - void receiveInvalidGraphemeCluster() noexcept override {} - - static null_receiver& get() noexcept - { - static null_receiver instance {}; - return instance; - } -}; - -namespace detail -{ - size_t scan_for_text_ascii(std::string_view text, size_t maxColumnCount) noexcept; - scan_result scan_for_text_nonascii(scan_state& state, - std::string_view text, - size_t maxColumnCount, - grapheme_cluster_receiver& receiver) noexcept; -} // namespace detail - -/// Scans a sequence of UTF-8 encoded bytes. -/// -/// This call will return early one of the conditions is met: -/// -/// - given the input sequence, the right most invalid or complete UTF-8 sequence is processed, -/// - maxColumnCount is reached and the next grapheme cluster would exceed the given limit, -/// - a control character is about to be processed. -/// -/// When this function returns, it is guaranteed to not contain an incomplete UTF-8 sequence -/// at the end of the output sequence. -/// -/// Calling this function again with more bytes will resume decoding that UTF-8 sequence -/// with the help of the passed UTF-8 decoder state. -/// -/// @return scanned textual result. This is, a sequence of -/// either valid or invalid UTF-8 codepoints, -/// but not incomplete codepoints at the end. 
-scan_result scan_text(scan_state& state, std::string_view text, size_t maxColumnCount) noexcept; - -scan_result scan_text(scan_state& state, - std::string_view text, - size_t maxColumnCount, - grapheme_cluster_receiver& receiver) noexcept; - -} // namespace unicode diff --git a/src/libunicode/scan_test.cpp b/src/libunicode/scan_test.cpp deleted file mode 100644 index df40ff6..0000000 --- a/src/libunicode/scan_test.cpp +++ /dev/null @@ -1,413 +0,0 @@ -/** - * This file is part of the "libterminal" project - * Copyright (c) 2020 Christian Parpart - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include -#include -#include - -#include - -#include - -#include - -using std::string_view; - -using namespace std::string_literals; -using namespace std::string_view_literals; - -namespace -{ - -auto constexpr FamilyEmoji = U"\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466"sv; -auto constexpr SmileyEmoji = U"\U0001F600"sv; -auto constexpr CopyrightSign = U"\u00A9"sv; -auto constexpr ControlCodes = "\r\n"sv; // Used to ensure that C0 codes properly cancel scanning. 
- -template -auto u8(T text) -{ - return unicode::convert_to(text); -} - -std::string escape(uint8_t ch) -{ - switch (ch) - { - case '\\': return "\\\\"; - case 0x1B: return "\\e"; - case '\t': return "\\t"; - case '\r': return "\\r"; - case '\n': return "\\n"; - case '"': return "\\\""; - default: - if (ch < 0x20) - return fmt::format("\\{:03o}", static_cast(ch) & 0xFF); - else if (ch < 0x80) - return fmt::format("{}", static_cast(ch)); - else - return fmt::format("\\x{:02x}", static_cast(ch) & 0xFF); - } -} - -template -std::string escape(T begin, T end) -{ - static_assert(sizeof(*std::declval()) == 1, - "should be only 1 byte, such as: char, char8_t, uint8_t, byte, ..."); - auto result = std::string {}; - while (begin != end) - result += escape(static_cast(*begin++)); - return result; -} - -inline std::string escape(std::string_view s) -{ - return escape(begin(s), end(s)); -} - -class grapheme_cluster_collector final: public unicode::grapheme_cluster_receiver -{ - public: - std::vector output; - - void receiveAsciiSequence(std::string_view sequence) noexcept override - { - for (char const ch: sequence) - output.emplace_back(1, static_cast(ch)); - } - - void receiveGraphemeCluster(std::string_view cluster, size_t) noexcept override - { - output.emplace_back(unicode::convert_to(cluster)); - } - - void receiveInvalidGraphemeCluster() noexcept override - { - auto constexpr ReplacementCharacter = U'\uFFFD'; - output.emplace_back(1, ReplacementCharacter); - } -}; - -} // namespace - -using unicode::detail::scan_for_text_ascii; - -TEST_CASE("scan.ascii.empty") -{ - CHECK(scan_for_text_ascii("", 0) == 0); - CHECK(scan_for_text_ascii("", 1) == 0); -} - -TEST_CASE("scan.ascii.32") -{ - auto const text = "0123456789ABCDEF0123456789ABCDEF"sv; - CHECK(scan_for_text_ascii(text, 32) == 32); - CHECK(scan_for_text_ascii(text, 16) == 16); - CHECK(scan_for_text_ascii(text, 8) == 8); - CHECK(scan_for_text_ascii(text, 1) == 1); -} - -TEST_CASE("scan.ascii.mixed_with_controls") -{ - 
CHECK(scan_for_text_ascii("\0331234", 80) == 0); - CHECK(scan_for_text_ascii("1234\033", 80) == 4); - CHECK(scan_for_text_ascii("12345678\033", 80) == 8); - CHECK(scan_for_text_ascii("0123456789ABCDEF\033", 80) == 16); - CHECK(scan_for_text_ascii("0123456789ABCDEF1\033", 80) == 17); -} - -TEST_CASE("scan.ascii.until_complex") -{ - CHECK(scan_for_text_ascii("1234\x80", 80) == 4); - CHECK(scan_for_text_ascii("0123456789{\xE2\x94\x80}ABCDEF", 80) == 11); -} - -TEST_CASE("scan.complex.grapheme_cluster.1") -{ - auto state = unicode::scan_state {}; - auto const familyEmoji8 = u8(FamilyEmoji); - auto const result = - unicode::detail::scan_for_text_nonascii(state, familyEmoji8, 80, unicode::null_receiver::get()); - CHECK(result.count == 2); - CHECK(state.next == familyEmoji8.data() + familyEmoji8.size()); -} - -TEST_CASE("scan.complex.grapheme_cluster.2") -{ - auto state = unicode::scan_state {}; - auto const familyEmoji8 = u8(FamilyEmoji) + u8(FamilyEmoji); - auto const result = - unicode::detail::scan_for_text_nonascii(state, familyEmoji8, 80, unicode::null_receiver::get()); - CHECK(result.count == 4); - CHECK(state.next == familyEmoji8.data() + familyEmoji8.size()); -} - -TEST_CASE("scan.complex.mixed") -{ - auto state = unicode::scan_state {}; - auto const text = u8(FamilyEmoji) + "ABC"s + u8(FamilyEmoji); - auto const result = - unicode::detail::scan_for_text_nonascii(state, text, 80, unicode::null_receiver::get()); - CHECK(result.count == 2); - CHECK(state.next == text.data() + u8(FamilyEmoji).size()); -} - -TEST_CASE("scan.complex.half-overflowing") -{ - auto state = unicode::scan_state {}; - auto const oneEmoji = u8(SmileyEmoji); - auto const text = oneEmoji + oneEmoji + oneEmoji; - - // match at boundary - auto const result2 = - unicode::detail::scan_for_text_nonascii(state, text, 2, unicode::null_receiver::get()); - CHECK(result2.count == 2); - CHECK(state.next == text.data() + oneEmoji.size()); - - // one grapheme cluster is half overflowing - auto const result3 
= - unicode::detail::scan_for_text_nonascii(state, text, 3, unicode::null_receiver::get()); - CHECK(result3.count == 2); - CHECK(state.next == text.data() + oneEmoji.size()); - - // match buondary - auto const result4 = - unicode::detail::scan_for_text_nonascii(state, text, 4, unicode::null_receiver::get()); - CHECK(result4.count == 4); - CHECK(state.next == text.data() + 2 * oneEmoji.size()); -} - -TEST_CASE("scan.any.tiny") -{ - // Ensure that we're really only scanning up to the input's size (1 byte, here). - auto state = unicode::scan_state {}; - auto const storage = "X{0123456789ABCDEF}"sv; - auto const input = storage.substr(0, 1); - auto const result = unicode::scan_text(state, input, 80); - CHECK(result.count == 1); - CHECK(state.next == input.data() + input.size()); - CHECK(*state.next == '{'); -} - -TEST_CASE("scan.complex.sliced_calls") -{ - auto state = unicode::scan_state {}; - auto const text = "\xF0\x9F\x98\x80\033\\0123456789ABCDEF"sv; // U+1F600 - auto constexpr splitOffset = 3; - auto const chunkOne = std::string_view(text.data(), splitOffset); - - auto result = unicode::scan_text(state, chunkOne, 80); - - REQUIRE(state.utf8.expectedLength == 4); - REQUIRE(state.utf8.currentLength == 3); - CHECK(result.count == 0); - CHECK(result.start == text.data()); - CHECK(result.end == text.data()); - CHECK(state.next == (text.data() + splitOffset)); - - auto const chunkTwo = - std::string_view(state.next, (size_t) std::distance(state.next, text.data() + text.size())); - result = unicode::scan_text(state, chunkTwo, 80, unicode::null_receiver::get()); - - REQUIRE(state.utf8.expectedLength == 0); - CHECK(result.count == 2); - REQUIRE(result.start == text.data()); - REQUIRE(result.end == text.data() + 4); - REQUIRE(state.next == text.data() + 4); - auto const resultingText = - string_view(result.start, static_cast(std::distance(result.start, result.end))); - REQUIRE(resultingText == text.substr(0, 4)); -} - -TEST_CASE("scan.any.ascii_complex_repeat") -{ - auto 
const oneComplex = u8(SmileyEmoji); // 2 - auto const oneSimple = "0123456789ABCDEF0123"s; // 20 - - for (size_t i = 1; i <= 6; ++i) - { - auto s = ""s; - for (size_t k = 1; k <= i; ++k) - s += (k % 2) != 0 ? oneSimple : oneComplex; - s += ControlCodes; - - auto state = unicode::scan_state {}; - auto const result = scan_text(state, s, 80); - auto const countSimple = ((i + 1) / 2) * 20; - auto const countComplex = (i / 2) * 2; - - INFO(fmt::format("i = {}, ascii# {}, complex# {}, count {}, actual {}, s = \"{}\"", - i, - countSimple, - countComplex, - result.count, - countSimple + countComplex, - escape(s))); - - CHECK(result.count == countSimple + countComplex); - CHECK(state.next == s.data() + s.size() - ControlCodes.size()); - } -} - -TEST_CASE("scan.any.complex_ascii_repeat") -{ - auto const oneComplex = u8(SmileyEmoji); // 2 - auto const oneSimple = "0123456789ABCDEF0123"s; // 20 - - for (size_t i = 1; i <= 6; ++i) - { - auto s = ""s; - for (size_t k = 1; k <= i; ++k) - s += (k % 2) != 0 ? 
oneComplex : oneSimple; - s += ControlCodes; - - auto state = unicode::scan_state {}; - auto const result = unicode::scan_text(state, s, 80); - CHECK(result.count == (i / 2) * 20 + ((i + 1) / 2) * 2); - CHECK(state.next == s.data() + s.size() - ControlCodes.size()); - } -} - -TEST_CASE("scan.complex.VS16") -{ - auto const oneComplex = u8(CopyrightSign); - auto const modifierVS16 = u8(U"\uFE0F"sv); - - // // narrow copyright sign - auto state = unicode::scan_state {}; - auto const result1 = unicode::scan_text(state, oneComplex, 80); - CHECK(result1.count == 1); - CHECK(state.next == oneComplex.data() + oneComplex.size()); - - // copyright sign in emoji presentation - state = {}; - auto const s = oneComplex + modifierVS16; - auto const result = unicode::scan_text(state, s, 80); - CHECK(result.count == 2); - CHECK(state.next == s.data() + s.size()); - - state = {}; - auto const result3 = unicode::scan_text(state, s, 1); - CHECK(result3.count == 0); - CHECK(state.next == s.data()); -} - -#if 0 -namespace -{ - -// NOLINTNEXTLINE(readability-identifier-naming) -struct ColumnCount -{ - size_t value; -}; - -constexpr ColumnCount operator""_columns(unsigned long long value) noexcept -{ - return ColumnCount { static_cast(value) }; -} - -std::vector operator""_bvec(char const* value, size_t n) -{ - std::vector v; - v.reserve(n); - while (*value) - v.push_back(static_cast(*value++)); - return v; -} - -template -std::string hex(T const& text) -{ - std::string encodedText; - for (auto const ch: text) - { - if (!encodedText.empty()) - encodedText += ' '; - char buf[3]; - snprintf(buf, sizeof(buf), "%02X", static_cast(ch)); - encodedText.append(buf, 2); - } - return encodedText; -} - -// Single scan from clean start to stopByte. 
-void testScanText(int lineNo, - ColumnCount expectedColumnCount, - std::vector const& expectation, - uint8_t stopByte, - std::vector const& analyzedGraphemeClusters) -{ - INFO(fmt::format("Testing scan segment from line {}: {} ({:02X})", lineNo, hex(expectation), stopByte)); - auto const maxColumnCount = 80; - - std::string fullText; - fullText.insert(fullText.end(), expectation.begin(), expectation.end()); - fullText.push_back(static_cast(stopByte)); - - auto graphemeClusterCollector = grapheme_cluster_collector {}; - - auto state = unicode::scan_state {}; - auto const result = unicode::scan_text(state, fullText, maxColumnCount, graphemeClusterCollector); - auto const start = (char const*) fullText.data(); - - CHECK(size_t(result.start - start) == 0); - CHECK(size_t(result.end - start) == expectation.size()); - CHECK(result.count == expectedColumnCount.value); - CHECK(result.next[0] == stopByte); - CHECK(result.next == fullText.data() + expectation.size()); - - CHECK(graphemeClusterCollector.output.size() == analyzedGraphemeClusters.size()); - auto const iMax = std::min(analyzedGraphemeClusters.size(), graphemeClusterCollector.output.size()); - for (size_t i = 0; i < iMax; ++i) - { - INFO(fmt::format("i: {}, lhs: {}, rhs: {}", - i, - u8(std::u32string_view(graphemeClusterCollector.output[i].data(), - graphemeClusterCollector.output[i].size())), - u8(analyzedGraphemeClusters[i]))); - CHECK(graphemeClusterCollector.output[i] == analyzedGraphemeClusters[i]); - } -} - -} // namespace - -TEST_CASE("scan.invalid") -{ - auto constexpr LF = '\n'; - auto const RC = U"\uFFFD"sv; - - // 0xB1 is an invalid UTF-8 byte - // 0xF5 is valid beginning of a 4-byte UTF-8 sequence but incomplete if not finished and hence, invalid. 
- - // clang-format off - testScanText(__LINE__, 0_columns, {}, LF, {}); - testScanText(__LINE__, 1_columns, { 'A' }, LF, { U"A" }); - testScanText(__LINE__, 2_columns, { 'A', 'B' }, LF, { U"A", U"B" }); - testScanText(__LINE__, 3_columns, { 'A', 0xB1, 'B' }, LF, { U"A", RC, U"B" }); // invalid UTF-8 - testScanText(__LINE__, 4_columns, { 'A', 0xB1, 0xB1, 'B' }, LF, { U"A", RC, RC, U"B" }); // invalid UTF-8 - testScanText(__LINE__, 3_columns, { 'A', 0xF5, 'B' }, LF, { U"A", RC, U"B" }); // incomplete UTF-8 - testScanText(__LINE__, 4_columns, { 'A', 0xB1, 0xF5, 'B' }, LF, { U"A", RC, RC, U"B" }); // mixed case of the 2 above - testScanText(__LINE__, 6_columns, { 'A', 0xB1, 0xF5, 'H', 'e', 'y' }, LF, { U"A", RC, RC, U"H", U"e", U"y" }); - testScanText(__LINE__, 2_columns, "\xf0\x9f\x98\200"_bvec, LF, { U"\U0001F600" }); // U+1F600 - testScanText(__LINE__, - 18_columns, - "\xf0\x9f\x98\2000123456789ABCDEF"_bvec, - LF, - { U"\U0001F600", - U"0", U"1", U"2", U"3", U"4", U"5", U"6", U"7", U"8", U"9", - U"A", U"B", U"C", U"D", U"E", U"F" }); - // clang-format on -} -#endif diff --git a/src/libunicode/support.h b/src/libunicode/support.h index aef6926..4910213 100644 --- a/src/libunicode/support.h +++ b/src/libunicode/support.h @@ -14,16 +14,29 @@ #pragma once #include -#include #include namespace unicode { +// Generally, the compiler may decide to inline or not, +// but when debugging, we want to make sure that certain functions are not inlined. +#if !defined(NDEBUG) + #if defined(__GNUC__) || defined(__clang__) + #define LIBUNICODE_INLINE __attribute__((noinline)) inline + #elif defined(_MSC_VER) + #define LIBUNICODE_INLINE __declspec(noinline) inline + #else + #define LIBUNICODE_INLINE inline + #endif +#else + #define LIBUNICODE_INLINE inline +#endif + #if defined(__GNUC__) || defined(__clang__) #define LIBUNICODE_PACKED __attribute__((packed)) #else - #define LIBUNICODE_PACKED /*!*/ + #define LIBUNICODE_PACKED #endif #if defined(__cpp_char8_t)