From b7f6be5c60d8461c38397fb5f618f219c63ab7bd Mon Sep 17 00:00:00 2001
From: arch1t3cht
Date: Fri, 1 Dec 2023 02:55:44 +0100
Subject: [PATCH] Fix SplitText ICU logic

Include UBRK_WORD_IDEO and check the entire rules vec.
This now matches the logic of boost::locale.

When ICU reports U_BUFFER_OVERFLOW_ERROR, grow the status buffer to the
size it asked for before retrying, so the retry can actually succeed.
---
 libaegisub/ass/dialogue_parser.cpp | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/libaegisub/ass/dialogue_parser.cpp b/libaegisub/ass/dialogue_parser.cpp
index b11ebb1a0d..e9c60ff38f 100644
--- a/libaegisub/ass/dialogue_parser.cpp
+++ b/libaegisub/ass/dialogue_parser.cpp
@@ -106,19 +106,35 @@ class WordSplitter {
 
 	void SplitText(size_t &i) {
 		UErrorCode err = U_ZERO_ERROR;
-		thread_local std::unique_ptr<icu::BreakIterator>
-			bi(icu::BreakIterator::createWordInstance(icu::Locale::getDefault(), err));
+		thread_local std::unique_ptr<icu::BreakIterator> bi(icu::BreakIterator::createWordInstance(icu::Locale::getDefault(), err));
 		agi::UTextPtr ut(utext_openUTF8(nullptr, text.data() + pos, tokens[i].length, &err));
 		bi->setText(ut.get(), err);
 		if (U_FAILURE(err)) throw agi::InternalError(u_errorName(err));
 		size_t pos = 0;
 		while (bi->next() != UBRK_DONE) {
 			auto len = bi->current() - pos;
-			auto rule = bi->getRuleStatus(); // FIXME: getRuleStatusVec?
-			if (rule >= UBRK_WORD_LETTER && rule < UBRK_WORD_KANA_LIMIT)
-				SwitchTo(i, dt::WORD, len);
-			else
-				SwitchTo(i, dt::TEXT, len);
+
+			// More than one rule may have produced this boundary, so fetch every status value.
+			std::vector<int32_t> rules(8);
+			int32_t n = bi->getRuleStatusVec(rules.data(), static_cast<int32_t>(rules.size()), err);
+			if (err == U_BUFFER_OVERFLOW_ERROR) {
+				// On overflow n is the number of values available: grow the buffer to that size and retry.
+				err = U_ZERO_ERROR;
+				rules.resize(n);
+				n = bi->getRuleStatusVec(rules.data(), static_cast<int32_t>(rules.size()), err);
+			}
+
+			if (U_FAILURE(err)) throw agi::InternalError(u_errorName(err));
+
+			auto token_type = dt::TEXT;
+
+			for (int32_t r = 0; r < n; r++) {
+				if (rules[r] >= UBRK_WORD_LETTER && rules[r] < UBRK_WORD_IDEO_LIMIT) {
+					token_type = dt::WORD;
+					break;
+				}
+			}
+			SwitchTo(i, token_type, len);
 			pos = bi->current();
 		}
 	}