Drop usage of codecvt. Fixes compilation warnings

rism-digital · Sep 30, 2024 · 2e7290c · 2e7290c
1 parent 769a0bd
commit 2e7290c
Show file tree

Hide file tree

Showing 4 changed files with 134 additions and 5 deletions.
diff --git a/include/vrv/vrv.h b/include/vrv/vrv.h
@@ -86,6 +86,11 @@ std::string UTF32to8(const std::u32string &in);
  */
 std::u32string UTF8to32(const std::string &in);
 
+/**
+ * Utility for converting UTF-16 (std::u16string) to UTF-8.
+ * Throws std::runtime_error when a surrogate pair is malformed or incomplete.
+ */
+std::string UTF16to8(const std::u16string &in);
+
 /**
  * Format a string using vsnprintf.
  * The maximum length is given by STRING_FORMAT_MAX_LEN

diff --git a/src/editortoolkit_cmn.cpp b/src/editortoolkit_cmn.cpp
@@ -9,7 +9,6 @@
 
 //--------------------------------------------------------------------------------
 
-#include <codecvt>
 #include <locale>
 #include <set>
 

diff --git a/src/toolkit.cpp b/src/toolkit.cpp
@@ -10,7 +10,6 @@
 //----------------------------------------------------------------------------
 
 #include <cassert>
-#include <codecvt>
 #include <locale>
 #include <regex>
 
@@ -400,12 +399,28 @@ bool Toolkit::LoadUTF16File(const std::string &filename)
         u16data.erase(0, 1);
     }
 
-    std::wstring_convert<std::codecvt_utf8<char16_t>, char16_t> convert;
-    std::string utf8line = convert.to_bytes(u16data);
+    // std::wstring_convert<std::codecvt_utf8<char16_t>, char16_t> convert;
+    std::string utf8line = vrv::UTF16to8(u16data); // convert.to_bytes(u16data);
 
     return this->LoadData(utf8line, false);
 }
 
/**
 * Convert a UTF-16 string (std::u16string) to UTF-8.
 *
 * Surrogate pairs are combined into a single code point before encoding. A
 * surrogate that is not part of a valid high/low pair is replaced by U+FFFD
 * (REPLACEMENT CHARACTER) so that the result is always well-formed UTF-8;
 * malformed input is never reported through an exception.
 *
 * @param input the UTF-16 code units to convert
 * @return the UTF-8 encoded bytes
 */
std::string UTF16toUTF8(const std::u16string &input)
{
    constexpr char32_t replacement = 0xFFFD;

    std::string output;
    // Every code unit produces at least one byte, so this is a safe lower bound
    output.reserve(input.size());

    const size_t size = input.size();
    for (size_t i = 0; i < size; ++i) {
        char32_t cp = input[i];

        if (cp >= 0xD800 && cp <= 0xDBFF) {
            // High surrogate: only valid when directly followed by a low surrogate
            const bool paired = (i + 1 < size) && (input[i + 1] >= 0xDC00) && (input[i + 1] <= 0xDFFF);
            if (paired) {
                cp = 0x10000 + ((cp - 0xD800) << 10) + (input[i + 1] - 0xDC00);
                ++i; // The low surrogate has been consumed as well
            }
            else {
                cp = replacement;
            }
        }
        else if (cp >= 0xDC00 && cp <= 0xDFFF) {
            // Low surrogate without a preceding high surrogate
            cp = replacement;
        }

        // Encode the code point as UTF-8
        if (cp < 0x80) { // One byte
            output.push_back(static_cast<char>(cp));
        }
        else if (cp < 0x800) { // Two bytes
            output.push_back(static_cast<char>((cp >> 6) | 0xC0));
            output.push_back(static_cast<char>((cp & 0x3F) | 0x80));
        }
        else if (cp < 0x10000) { // Three bytes
            output.push_back(static_cast<char>((cp >> 12) | 0xE0));
            output.push_back(static_cast<char>(((cp >> 6) & 0x3F) | 0x80));
            output.push_back(static_cast<char>((cp & 0x3F) | 0x80));
        }
        else { // Four bytes
            output.push_back(static_cast<char>((cp >> 18) | 0xF0));
            output.push_back(static_cast<char>(((cp >> 12) & 0x3F) | 0x80));
            output.push_back(static_cast<char>(((cp >> 6) & 0x3F) | 0x80));
            output.push_back(static_cast<char>((cp & 0x3F) | 0x80));
        }
    }
    return output;
}
+
 bool Toolkit::IsZip(const std::string &filename)
 {
     std::ifstream fin(filename.c_str(), std::ios::in | std::ios::binary);

diff --git a/src/vrv.cpp b/src/vrv.cpp
@@ -11,7 +11,6 @@
 
 #include <cassert>
 #include <cmath>
-#include <codecvt>
 #include <cstdlib>
 #include <iostream>
 #include <locale>
@@ -264,14 +263,125 @@ std::string ExtractIDFragment(std::string refID)
 
 /**
  * Convert a UTF-32 string (std::u32string) to UTF-8.
  *
  * Values that are not Unicode scalar values (surrogates U+D800..U+DFFF and
  * anything above U+10FFFF) have no well-formed UTF-8 encoding and are replaced
  * by U+FFFD (REPLACEMENT CHARACTER).
  *
  * @param in the UTF-32 code points to convert
  * @return the UTF-8 encoded bytes
  */
 std::string UTF32to8(const std::u32string &in)
 {
     constexpr char32_t replacement = 0xFFFD;

     std::string out;
     // Every code point produces at least one byte, so this is a safe lower bound
     out.reserve(in.size());

     for (char32_t cp : in) {
         const bool isSurrogate = (cp >= 0xD800 && cp <= 0xDFFF);
         if (isSurrogate || cp > 0x10FFFF) {
             cp = replacement;
         }

         if (cp < 0x80) { // One byte
             out.push_back(static_cast<char>(cp));
         }
         else if (cp < 0x800) { // Two bytes
             out.push_back(static_cast<char>((cp >> 6) | 0xC0));
             out.push_back(static_cast<char>((cp & 0x3F) | 0x80));
         }
         else if (cp < 0x10000) { // Three bytes
             out.push_back(static_cast<char>((cp >> 12) | 0xE0));
             out.push_back(static_cast<char>(((cp >> 6) & 0x3F) | 0x80));
             out.push_back(static_cast<char>((cp & 0x3F) | 0x80));
         }
         else { // Four bytes
             out.push_back(static_cast<char>((cp >> 18) | 0xF0));
             out.push_back(static_cast<char>(((cp >> 12) & 0x3F) | 0x80));
             out.push_back(static_cast<char>(((cp >> 6) & 0x3F) | 0x80));
             out.push_back(static_cast<char>((cp & 0x3F) | 0x80));
         }
     }
     return out;
 }
 
 /**
  * Convert a UTF-8 string to UTF-32 (std::u32string).
  *
  * Well-formed input is decoded code point by code point. Each malformed
  * sequence (stray continuation byte, invalid lead byte, truncated sequence,
  * overlong form, encoded surrogate or value above U+10FFFF) yields one U+FFFD
  * (REPLACEMENT CHARACTER). The input is never read past its end.
  *
  * @param in the UTF-8 encoded bytes
  * @return the decoded code points
  */
 std::u32string UTF8to32(const std::string &in)
 {
     constexpr char32_t replacement = 0xFFFD;

     std::u32string out;
     // There are never more code points than bytes
     out.reserve(in.size());

     const size_t size = in.size();
     size_t i = 0;
     while (i < size) {
         const unsigned char lead = static_cast<unsigned char>(in[i]);
         ++i;

         int trailing = 0; // Number of continuation bytes expected after the lead
         char32_t cp = 0; // Payload bits accumulated so far
         char32_t minimum = 0; // Smallest value that legitimately needs this length

         if (lead <= 0x7F) { // One byte
             out.push_back(lead);
             continue;
         }
         else if (lead >= 0xC0 && lead <= 0xDF) { // Two bytes
             trailing = 1;
             cp = lead & 0x1F;
             minimum = 0x80;
         }
         else if (lead >= 0xE0 && lead <= 0xEF) { // Three bytes
             trailing = 2;
             cp = lead & 0x0F;
             minimum = 0x800;
         }
         else if (lead >= 0xF0 && lead <= 0xF7) { // Four bytes
             trailing = 3;
             cp = lead & 0x07;
             minimum = 0x10000;
         }
         else { // Stray continuation byte (0x80..0xBF) or invalid lead (0xF8..0xFF)
             out.push_back(replacement);
             continue;
         }

         // Consume continuation bytes only while they exist and look like 10xxxxxx;
         // a byte that does not fit is left in place to start the next sequence
         bool complete = true;
         for (int k = 0; k < trailing; ++k) {
             if (i >= size || (static_cast<unsigned char>(in[i]) & 0xC0) != 0x80) {
                 complete = false;
                 break;
             }
             cp = (cp << 6) | (static_cast<unsigned char>(in[i]) & 0x3F);
             ++i;
         }

         const bool isSurrogate = (cp >= 0xD800 && cp <= 0xDFFF);
         const bool valid = complete && (cp >= minimum) && (cp <= 0x10FFFF) && !isSurrogate;
         out.push_back(valid ? cp : replacement);
     }
     return out;
 }
+
/**
 * Convert a UTF-16 string (std::u16string) to UTF-8.
 *
 * Surrogate pairs are combined into a single code point before encoding.
 *
 * @param in the UTF-16 code units to convert
 * @return the UTF-8 encoded bytes
 * @throws std::runtime_error "Incomplete UTF-16 sequence" when the input ends
 *         right after a high surrogate, and "Invalid UTF-16 sequence" when a
 *         surrogate is not part of a valid high/low pair
 */
std::string UTF16to8(const std::u16string &in)
{
    std::string out;
    // Every code unit produces at least one byte, so this is a safe lower bound
    out.reserve(in.size());

    for (auto it = in.begin(); it != in.end(); ++it) {
        char32_t cp = *it;

        if (cp >= 0xD800 && cp <= 0xDBFF) {
            // High surrogate: the next unit must exist and be a low surrogate
            if (++it == in.end()) {
                throw std::runtime_error("Incomplete UTF-16 sequence");
            }
            const char16_t trail = *it;
            if (trail < 0xDC00 || trail > 0xDFFF) {
                throw std::runtime_error("Invalid UTF-16 sequence");
            }
            // Combine lead and trail to form the full code point
            cp = 0x10000 + ((cp - 0xD800) << 10) + (trail - 0xDC00);
        }
        else if (cp >= 0xDC00 && cp <= 0xDFFF) {
            // A low surrogate without a preceding high surrogate cannot be encoded
            // as well-formed UTF-8
            throw std::runtime_error("Invalid UTF-16 sequence");
        }

        // Convert code point to UTF-8
        if (cp < 0x80) { // One byte
            out.push_back(static_cast<char>(cp));
        }
        else if (cp < 0x800) { // Two bytes
            out.push_back(static_cast<char>((cp >> 6) | 0xC0));
            out.push_back(static_cast<char>((cp & 0x3F) | 0x80));
        }
        else if (cp < 0x10000) { // Three bytes
            out.push_back(static_cast<char>((cp >> 12) | 0xE0));
            out.push_back(static_cast<char>(((cp >> 6) & 0x3F) | 0x80));
            out.push_back(static_cast<char>((cp & 0x3F) | 0x80));
        }
        else { // Four bytes
            out.push_back(static_cast<char>((cp >> 18) | 0xF0));
            out.push_back(static_cast<char>(((cp >> 12) & 0x3F) | 0x80));
            out.push_back(static_cast<char>(((cp >> 6) & 0x3F) | 0x80));
            out.push_back(static_cast<char>((cp & 0x3F) | 0x80));
        }
    }

    return out;
}
 
 std::string GetFileVersion(int vmaj, int vmin, int vrev)