From 9182545db7436601f0e37ca6ccfb1908338b289f Mon Sep 17 00:00:00 2001
From: Philipp Otterbein
Date: Sun, 15 Dec 2024 02:29:56 +0100
Subject: [PATCH] try to calculate character width

Estimate the number of terminal columns occupied by each UTF-8
character in filterANSIEscapes() with widechar_wcwidth() instead of
counting every character as one column. Ill-formed UTF-8 (bad start
byte, truncated sequence, overlong form, code point above U+10FFFF) is
counted as one column per byte.

---
 packaging/dependencies.nix    |  16 +++++++
 src/libutil-tests/terminal.cc |   5 ++
 src/libutil/meson.build       |   3 ++
 src/libutil/package.nix       |   2 +
 src/libutil/terminal.cc       | 109 +++++++++++++++++++++++++---------
 5 files changed, 106 insertions(+), 29 deletions(-)

diff --git a/packaging/dependencies.nix b/packaging/dependencies.nix
index a8005ce16c9d..8abde78f714d 100644
--- a/packaging/dependencies.nix
+++ b/packaging/dependencies.nix
@@ -199,6 +199,22 @@ scope: {
     meta.platforms = lib.platforms.all;
   });
 
+  widecharwidth = stdenv.mkDerivation {
+    name = "widecharwidth";
+    dontConfigure = true;
+    dontBuild = true;
+    installPhase = ''
+      mkdir -p $out/include
+      cp $src/widechar_width.h $out/include
+    '';
+    src = pkgs.fetchFromGitHub {
+      owner = "ridiculousfish";
+      repo = "widecharwidth";
+      rev = "533e50efb0b9b122a08f2273337dbf6b44b03cc7";
+      hash = "sha256-Vy1jCv0wqV/4sNCQIYGKiHq5A8QGE6Q+1v8k3Cn6sJ4=";
+    };
+  };
+
   inherit resolvePath filesetToSource;
 
   mkMesonDerivation =
diff --git a/src/libutil-tests/terminal.cc b/src/libutil-tests/terminal.cc
index 714d5a2378c5..f4fc6e770d21 100644
--- a/src/libutil-tests/terminal.cc
+++ b/src/libutil-tests/terminal.cc
@@ -55,6 +55,11 @@ TEST(filterANSIEscapes, utf8)
     ASSERT_EQ(filterANSIEscapes("fóóbär", true, 3), "fóó");
     ASSERT_EQ(filterANSIEscapes("f€€bär", true, 4), "f€€b");
     ASSERT_EQ(filterANSIEscapes("f𐍈𐍈bär", true, 4), "f𐍈𐍈b");
+    ASSERT_EQ(filterANSIEscapes("f🔍bar", true, 6), "f🔍bar");
+    ASSERT_EQ(filterANSIEscapes("f🔍bar", true, 3), "f🔍");
+    ASSERT_EQ(filterANSIEscapes("f🔍bar", true, 2), "f");
+    ASSERT_EQ(filterANSIEscapes("foo\u0301", true, 3), "foo\u0301");
+    ASSERT_EQ(filterANSIEscapes("a\xc0\x80z", true, 3), "a\xc0\x80");
 }
 
 TEST(filterANSIEscapes, osc8)
diff --git a/src/libutil/meson.build b/src/libutil/meson.build
index bbe7872cf104..8bde21d817f2 100644
--- a/src/libutil/meson.build
+++ b/src/libutil/meson.build
@@ -108,6 +108,9 @@ deps_private += cpuid
 nlohmann_json = dependency('nlohmann_json', version : '>= 3.9')
 deps_public += nlohmann_json
 
+cxx = meson.get_compiler('cpp')
+cxx.has_header('widechar_width.h', required : true)
+
 config_h = configure_file(
   configuration : configdata,
   output : 'config-util.hh',
diff --git a/src/libutil/package.nix b/src/libutil/package.nix
index 69ebbf726e90..348617bcc1a9 100644
--- a/src/libutil/package.nix
+++ b/src/libutil/package.nix
@@ -9,6 +9,7 @@
 , libsodium
 , nlohmann_json
 , openssl
+, widecharwidth
 
 # Configuration Options
 
@@ -42,6 +43,7 @@ mkMesonLibrary (finalAttrs: {
     brotli
     libsodium
     openssl
+    widecharwidth
   ] ++ lib.optional stdenv.hostPlatform.isx86_64 libcpuid
   ;
 
diff --git a/src/libutil/terminal.cc b/src/libutil/terminal.cc
index 4c127ddb0780..8a8373f1bf93 100644
--- a/src/libutil/terminal.cc
+++ b/src/libutil/terminal.cc
@@ -11,6 +11,72 @@
 # include <sys/ioctl.h>
 #endif
 #include <unistd.h>
+#include <widechar_width.h>
+
+namespace {
+
+/**
+ * Decode the UTF-8 sequence at the start of `s` and estimate how many
+ * terminal columns it occupies.
+ *
+ * @param s Text to examine; only its first character is decoded.
+ * @return `{width, bytes}`: the estimated width in columns and the number
+ * of bytes to consume. Ill-formed input counts as one column per byte;
+ * an empty `s` yields `{0, 0}`.
+ */
+inline std::pair<int, size_t> charWidthUTF8Helper(std::string_view s)
+{
+    if (s.empty())
+        return {0, 0};
+
+    size_t bytes = 1;
+    // Convert via `unsigned char` so bytes >= 0x80 are not sign-extended.
+    uint32_t ch = static_cast<unsigned char>(s[0]);
+    uint32_t min = 0; // smallest code point that needs `bytes` bytes
+    uint32_t max = 1U << 7; // one past the largest code point allowed
+    if ((ch & 0x80U) == 0U) {
+    } else if ((ch & 0xe0U) == 0xc0U) {
+        ch &= 0x1fU;
+        bytes = 2;
+        min = 1U << 7;
+        max = 1U << 11;
+    } else if ((ch & 0xf0U) == 0xe0U) {
+        ch &= 0x0fU;
+        bytes = 3;
+        min = 1U << 11;
+        max = 1U << 16;
+    } else if ((ch & 0xf8U) == 0xf0U) {
+        ch &= 0x07U;
+        bytes = 4;
+        min = 1U << 16;
+        max = 0x110000U;
+    } else {
+        return {1, 1}; // invalid UTF-8 start byte
+    }
+    for (size_t i = 1; i < bytes; i++) {
+        if (i < s.size() && (s[i] & 0xc0) == 0x80) {
+            ch = (ch << 6) | (s[i] & 0x3f);
+        } else {
+            // invalid UTF-8 encoding; assume one character per byte
+            return {static_cast<int>(i), i};
+        }
+    }
+    // Overlong or out-of-range sequence: assume one column per byte.
+    int width = static_cast<int>(bytes);
+    if (ch >= min && ch < max) {
+        width = widechar_wcwidth(ch);
+        if (width == widechar_ambiguous) {
+            width = 1; // just a guess...
+        } else if (width == widechar_widened_in_9) {
+            width = 2;
+        } else if (width < 0) {
+            width = 0;
+        }
+    }
+    return {width, bytes};
+}
+
+}
 
 namespace nix {
 
@@ -30,7 +96,7 @@ std::string filterANSIEscapes(std::string_view s, bool filterAll, unsigned int w
     size_t w = 0;
     auto i = s.begin();
 
-    while (w < (size_t) width && i != s.end()) {
+    while (i != s.end()) {
 
         if (*i == '\e') {
             std::string e;
@@ -61,10 +127,12 @@ std::string filterANSIEscapes(std::string_view s, bool filterAll, unsigned int w
         }
 
         else if (*i == '\t') {
-            i++; t += ' '; w++;
-            while (w < (size_t) width && w % 8) {
-                t += ' '; w++;
-            }
+            do {
+                if (++w > (size_t) width)
+                    return t;
+                t += ' ';
+            } while (w % 8);
+            i++;
         }
 
         else if (*i == '\r' || *i == '\a')
@@ -72,35 +140,18 @@ std::string filterANSIEscapes(std::string_view s, bool filterAll, unsigned int w
             i++;
 
         else {
-            w++;
-            // Copy one UTF-8 character.
-            if ((*i & 0xe0) == 0xc0) {
-                t += *i++;
-                if (i != s.end() && ((*i & 0xc0) == 0x80)) t += *i++;
-            } else if ((*i & 0xf0) == 0xe0) {
-                t += *i++;
-                if (i != s.end() && ((*i & 0xc0) == 0x80)) {
-                    t += *i++;
-                    if (i != s.end() && ((*i & 0xc0) == 0x80)) t += *i++;
-                }
-            } else if ((*i & 0xf8) == 0xf0) {
-                t += *i++;
-                if (i != s.end() && ((*i & 0xc0) == 0x80)) {
-                    t += *i++;
-                    if (i != s.end() && ((*i & 0xc0) == 0x80)) {
-                        t += *i++;
-                        if (i != s.end() && ((*i & 0xc0) == 0x80)) t += *i++;
-                    }
-                }
-            } else
-                t += *i++;
+            auto [chWidth, bytes] = charWidthUTF8Helper({i, s.end()});
+            w += chWidth;
+            if (w > (size_t) width) {
+                break;
+            }
+            t += {i, i + bytes};
+            i += bytes;
         }
     }
-
     return t;
 }
 
-
 //////////////////////////////////////////////////////////////////////
 
 static Sync<std::pair<unsigned short, unsigned short>> windowSize{{0, 0}};