Introduce new API grapheme_line_segmenter to replace scan API

Signed-off-by: Christian Parpart <[email protected]>
contour-terminal · Mar 17, 2024 · ffb35d3
1 parent 5914df6
commit ffb35d3
Show file tree

Hide file tree

Showing 12 changed files with 1,480 additions and 876 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -47,7 +47,7 @@ option(LIBUNICODE_BENCHMARK "libunicode: Enables building of benchmark for libun
 option(LIBUNICODE_TOOLS "libunicode: Builds CLI tools [default: ${MASTER_PROJECT}]" ${MASTER_PROJECT})
 option(LIBUNICODE_BUILD_STATIC "libunicode: provide static library instead of dynamic [default: ${LIBUNICODE_BUILD_STATIC_DEFAULT}]" ${LIBUNICODE_BUILD_STATIC_DEFAULT})
 option(LIBUNICODE_USE_INTRINSICS "libunicode: Use SIMD extenstion during text read [default: ON]" ON)
-option(LIBUNICODE_USE_STD_SIMD "libunicode: Use std::simd as SIMD extenstion during text read (takes precedence over own intrinsics) [default: ON]" ${LIBUNICODE_USE_INTRINSICS})
+option(LIBUNICODE_USE_STD_SIMD "libunicode: Use std::simd as SIMD extenstion during text read (takes precedence over own intrinsics) [default: ON]" ON)
 option(LIBUNICODE_TABLEGEN_FASTBUILD "libunicode: Use fast table generation (takes more memory in final tables) [default: OFF]" OFF)
 
 set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "Enable testing of the benchmark library." FORCE)

diff --git a/src/libunicode/CMakeLists.txt b/src/libunicode/CMakeLists.txt
@@ -102,7 +102,6 @@ add_library(unicode ${LIBUNICODE_LIB_MODE}
     codepoint_properties.cpp
     emoji_segmenter.cpp
     grapheme_segmenter.cpp
-    scan.cpp
     script_segmenter.cpp
     utf8.cpp
     width.cpp
@@ -114,22 +113,22 @@ add_library(unicode ${LIBUNICODE_LIB_MODE}
 )
 
 if(LIBUNICODE_USE_STD_SIMD)
-    target_compile_definitions(unicode PRIVATE LIBUNICODE_USE_STD_SIMD)
+    target_compile_definitions(unicode PUBLIC LIBUNICODE_USE_STD_SIMD)
 endif()
 if(LIBUNICODE_USE_INTRINSICS)
-    target_compile_definitions(unicode PRIVATE USE_INTRINSICS)
+    target_compile_definitions(unicode PUBLIC LIBUNICODE_USE_INTRINSICS)
 endif()
 
 set(public_headers
     capi.h
     codepoint_properties.h
     convert.h
     emoji_segmenter.h
+    grapheme_line_segmenter.h
     grapheme_segmenter.h
     intrinsics.h
     multistage_table_view.h
     run_segmenter.h
-    scan.h
     script_segmenter.h
     support.h
     utf8.h
@@ -161,7 +160,6 @@ add_executable(unicode_tablegen tablegen.cpp)
 set_target_properties(unicode_tablegen PROPERTIES CMAKE_BUILD_TYPE Release)
 target_link_libraries(unicode_tablegen PRIVATE unicode::loader)
 
-
 # {{{ installation
 set(LIBUNICODE_CMAKE_DIR "${CMAKE_INSTALL_LIBDIR}/cmake/libunicode" CACHE PATH "Installation directory for cmake files, a relative path that will be joined with ${CMAKE_INSTALL_PREFIX} or an absolute path.")
 set(LIBUNICODE_INSTALL_CMAKE_FILES ${MASTER_PROJECT} CACHE BOOL "Decides whether or not to install CMake config and -version files.")
@@ -220,9 +218,9 @@ if(LIBUNICODE_TESTING)
         capi_test.cpp
         convert_test.cpp
         emoji_segmenter_test.cpp
+        grapheme_line_segmenter_test.cpp
         grapheme_segmenter_test.cpp
         run_segmenter_test.cpp
-        scan_test.cpp
         script_segmenter_test.cpp
         test_main.cpp
         unicode_test.cpp
@@ -247,8 +245,6 @@ if(LIBUNICODE_TESTING)
 endif()
 # }}}
 
-
-
 # {{{ unicode_test
 if(LIBUNICODE_BENCHMARK)
     if(NOT benchmark_FOUND)

diff --git a/src/libunicode/benchmark.cpp b/src/libunicode/benchmark.cpp
@@ -1,5 +1,5 @@
 #include <libunicode/convert.h>
-#include <libunicode/scan.h>
+#include <libunicode/grapheme_line_segmenter.h>
 #include <libunicode/utf8.h>
 
 #include <string_view>
@@ -9,22 +9,26 @@
 using std::string_view;
 
 template <size_t L>
-static void benchmarkWithLength(benchmark::State& state)
+static void benchmarkWithLength(benchmark::State& benchmarkState)
 {
-    auto TestText = std::string(L, 'a') + "\u00A9";
-    for (auto _: state)
+    auto const TestTextString = std::string(L, 'a') + "\u00A9";
+    auto const TestText = std::string_view(TestTextString);
+    for (auto _: benchmarkState)
     {
-        benchmark::DoNotOptimize(unicode::detail::scan_for_text_ascii(TestText, L + 10));
+        benchmark::DoNotOptimize(unicode::detail::process_only_ascii(std::string_view(TestText).substr(0, L + 10)));
     }
 }
 
 template <size_t L>
-static void benchmarkWithOffset(benchmark::State& state)
+static void benchmarkWithOffset(benchmark::State& benchmarkState)
 {
-    auto TestText = std::string(L, 'a') + "\u0001F600" + std::string(1000, 'a');
-    for (auto _: state)
+    auto const TestTextString = std::string(L, 'a') + "\U0001F600" + std::string(1000, 'a');
+    auto const TestText = std::string_view(TestTextString);
+    for (auto _: benchmarkState)
     {
-        benchmark::DoNotOptimize(unicode::detail::scan_for_text_ascii(TestText, L + 10));
+        auto state = unicode::detail::unicode_process_state {};
+        auto eventHandler = unicode::detail::EventHandler{};
+        benchmark::DoNotOptimize(unicode::detail::process_only_complex_unicode(eventHandler, state, TestText, L + 10));
     }
 }
 

diff --git a/src/libunicode/capi.cpp b/src/libunicode/capi.cpp
@@ -49,7 +49,7 @@ int u32_gc_width(u32_char_t const* codepoints, size_t size, int mode)
     while (segmenter.codepointsAvailable())
     {
         auto const cluster = *segmenter;
-        int thisWidth = unicode::width(cluster.front());
+        int thisWidth = static_cast<int>(unicode::width(cluster.front()));
         if (mode != GC_WIDTH_MODE_NON_MODIFIABLE)
         {
             for (size_t i = 1; i < size; ++i)
@@ -60,7 +60,7 @@ int u32_gc_width(u32_char_t const* codepoints, size_t size, int mode)
                     {
                         case 0xFE0E: return 1;
                         case 0xFE0F: return 2;
-                        default: return unicode::width(codepoint);
+                        default: return static_cast<int>(unicode::width(codepoint));
                     }
                 }();
                 if (width && width != thisWidth)