/////////////////////////////////////////////////////////////////////////////
// UTF conversion utilities, reconstructed from the patch
//   commit:  2e7290c1f497ed6bbc73ea2d07a909fd15ae7047
//   author:  Laurent Pugin
//   date:    Mon, 30 Sep 2024 13:59:39 +0200
//   subject: [PATCH] Drop usage of codecvt. Fixes compilation warnings
//
// The patch touches include/vrv/vrv.h (declaration of UTF16to8),
// src/editortoolkit_cmn.cpp and src/toolkit.cpp (one #include removed each),
// and src/vrv.cpp (hand-written replacements for std::wstring_convert).
// In src/toolkit.cpp, Toolkit::LoadUTF16File now builds its UTF-8 buffer with
// vrv::UTF16to8(u16data) instead of std::wstring_convert::to_bytes.
/////////////////////////////////////////////////////////////////////////////

#include <stdexcept>
#include <string>

//----------------------------------------------------------------------------
// Declarations (include/vrv/vrv.h)
//----------------------------------------------------------------------------

/**
 * Utility for converting UTF-32 (std::u32string) to UTF-8
 */
std::string UTF32to8(const std::u32string &in);

/**
 * Utility for converting UTF-8 to UTF-32 (std::u32string)
 */
std::u32string UTF8to32(const std::string &in);

/**
 * Utility for converting UTF16 (std::u16string) to UTF-8
 */
std::string UTF16to8(const std::u16string &in);

//----------------------------------------------------------------------------
// File-local helpers
//----------------------------------------------------------------------------

namespace {

/** Code point substituted for anything that cannot be represented (U+FFFD). */
constexpr char32_t UNICODE_REPLACEMENT = 0xFFFD;

/**
 * Append the UTF-8 encoding of one code point to out.
 * Surrogate code points (U+D800..U+DFFF) and values above U+10FFFF have no
 * valid UTF-8 form; they are encoded as U+FFFD so the output stays well-formed.
 */
void AppendUTF8(std::string &out, char32_t cp)
{
    const bool isSurrogate = (cp >= 0xD800) && (cp <= 0xDFFF);
    if (isSurrogate || (cp > 0x10FFFF)) {
        cp = UNICODE_REPLACEMENT;
    }

    if (cp < 0x80) { // One byte
        out.push_back(static_cast<char>(cp));
    }
    else if (cp < 0x800) { // Two bytes
        out.push_back(static_cast<char>((cp >> 6) | 0xC0));
        out.push_back(static_cast<char>((cp & 0x3F) | 0x80));
    }
    else if (cp < 0x10000) { // Three bytes
        out.push_back(static_cast<char>((cp >> 12) | 0xE0));
        out.push_back(static_cast<char>(((cp >> 6) & 0x3F) | 0x80));
        out.push_back(static_cast<char>((cp & 0x3F) | 0x80));
    }
    else { // Four bytes
        out.push_back(static_cast<char>((cp >> 18) | 0xF0));
        out.push_back(static_cast<char>(((cp >> 12) & 0x3F) | 0x80));
        out.push_back(static_cast<char>(((cp >> 6) & 0x3F) | 0x80));
        out.push_back(static_cast<char>((cp & 0x3F) | 0x80));
    }
}

} // namespace

//----------------------------------------------------------------------------
// Definitions (src/vrv.cpp)
//----------------------------------------------------------------------------

/**
 * Encode every code point of in as UTF-8.
 * Never throws on bad input: unrepresentable values become U+FFFD.
 */
std::string UTF32to8(const std::u32string &in)
{
    std::string out;
    out.reserve(in.size());
    for (const char32_t cp : in) {
        AppendUTF8(out, cp);
    }
    return out;
}

/**
 * Decode a UTF-8 byte string into code points.
 * A byte that cannot start a sequence, a sequence cut short by the end of the
 * string, or a sequence with a non-continuation byte yields one U+FFFD and
 * decoding resumes at the next byte. No byte past the end of in is ever read.
 */
std::u32string UTF8to32(const std::string &in)
{
    std::u32string out;
    out.reserve(in.size());

    const size_t size = in.size();
    size_t i = 0;
    while (i < size) {
        const unsigned char lead = static_cast<unsigned char>(in[i]);

        // Payload bits of the lead byte and number of continuation bytes
        char32_t cp = 0;
        size_t extra = 0;
        if (lead < 0x80) { // One byte
            cp = lead;
        }
        else if ((lead & 0xE0) == 0xC0) { // Two bytes
            cp = lead & 0x1F;
            extra = 1;
        }
        else if ((lead & 0xF0) == 0xE0) { // Three bytes
            cp = lead & 0x0F;
            extra = 2;
        }
        else if ((lead & 0xF8) == 0xF0) { // Four bytes
            cp = lead & 0x07;
            extra = 3;
        }
        else { // Stray continuation byte or invalid lead (0xF8..0xFF)
            out.push_back(UNICODE_REPLACEMENT);
            ++i;
            continue;
        }

        // size - i is at least 1 here, so the subtraction cannot wrap
        bool valid = (extra <= size - i - 1);
        for (size_t k = 1; valid && (k <= extra); ++k) {
            const unsigned char next = static_cast<unsigned char>(in[i + k]);
            if ((next & 0xC0) != 0x80) {
                valid = false;
            }
            else {
                cp = (cp << 6) | (next & 0x3F);
            }
        }

        if (valid) {
            out.push_back(cp);
            i += extra + 1;
        }
        else {
            out.push_back(UNICODE_REPLACEMENT);
            ++i;
        }
    }
    return out;
}

/**
 * Convert UTF-16 code units to UTF-8, combining surrogate pairs.
 * Throws std::runtime_error("Incomplete UTF-16 sequence") when the input ends
 * after a high surrogate, and std::runtime_error("Invalid UTF-16 sequence")
 * when a high surrogate is not followed by a low surrogate or when a low
 * surrogate appears without a preceding high surrogate.
 */
std::string UTF16to8(const std::u16string &in)
{
    std::string out;
    out.reserve(in.size());

    const size_t size = in.size();
    for (size_t i = 0; i < size; ++i) {
        const char32_t lead = in[i];
        char32_t cp = lead;

        if ((lead >= 0xD800) && (lead <= 0xDBFF)) {
            // High surrogate: the next unit must be the matching low surrogate
            if (i + 1 >= size) {
                throw std::runtime_error("Incomplete UTF-16 sequence");
            }
            const char32_t trail = in[++i];
            if ((trail < 0xDC00) || (trail > 0xDFFF)) {
                throw std::runtime_error("Invalid UTF-16 sequence");
            }
            cp = 0x10000 + ((lead - 0xD800) << 10) + (trail - 0xDC00);
        }
        else if ((lead >= 0xDC00) && (lead <= 0xDFFF)) {
            // Low surrogate with no high surrogate before it
            throw std::runtime_error("Invalid UTF-16 sequence");
        }

        AppendUTF8(out, cp);
    }

    return out;
}

//----------------------------------------------------------------------------
// Definitions (src/toolkit.cpp)
//----------------------------------------------------------------------------

/**
 * Convert UTF-16 to UTF-8.
 * Kept under its original name for existing callers; the work is done by
 * UTF16to8, so non-ASCII characters are converted instead of being dropped and
 * the same std::runtime_error is thrown on malformed input.
 */
std::string UTF16toUTF8(const std::u16string &input)
{
    return UTF16to8(input);
}