add omega_util_BOM_size function

ctc-oss · Oct 9, 2023 · 06ce7c3 · 06ce7c3
1 parent cd89ab8
commit 06ce7c3
Show file tree

Hide file tree

Showing 4 changed files with 38 additions and 8 deletions.
diff --git a/core/src/include/omega_edit/utility.h b/core/src/include/omega_edit/utility.h
@@ -171,7 +171,7 @@ omega_bom_t omega_util_detect_BOM_from_file(const char *filename);
 /**
  * Convert the given byte order mark (BOM) to a string
  * @param bom byte order mark (BOM) to convert
- * @return string representation of the given BOM
+ * @return string representation of the given BOM ("none", "UTF-8", "UTF-16LE", "UTF-16BE", "UTF-32LE", "UTF-32BE"), or "unknown" if the value is not a recognized BOM
  */
 char const *omega_util_BOM_to_string(omega_bom_t bom);
 
@@ -195,6 +195,13 @@ typedef struct {
     size_t length;
 } omega_byte_buffer_t;
 
+/**
+ * Given a byte order mark (BOM), return the size of the byte order mark (BOM) in bytes
+ * @param bom byte order mark (BOM) to get the size of
+ * @return size of the byte order mark (BOM) in bytes, or 0 for BOM_NONE or an unrecognized value
+ */
+size_t omega_util_BOM_size(omega_bom_t bom);
+
 /**
  * Get the byte order mark buffer (BOM) associated with the given byte order mark (BOM)
  * @param bom byte order mark (BOM) to get

diff --git a/core/src/lib/utility.c b/core/src/lib/utility.c
@@ -331,6 +331,7 @@ char const *omega_util_BOM_to_string(omega_bom_t bom) {
         case BOM_UTF32BE:
             return "UTF-32BE";
         default:
+            // Should never happen
             return "unknown";
     }
 }
@@ -436,14 +437,14 @@ void omega_util_count_characters(const unsigned char *data, size_t length, omega
             while (i + 1 < length) {
                 // Swap the bytes if the BOM is little endian
                 const uint16_t char16 = counts_ptr->bom == BOM_UTF16LE
-                                                ? (uint16_t) (data[i]) | (uint16_t) (data[i + 1]) << 8
-                                                : (uint16_t) (data[i]) << 8 | (uint16_t) (data[i + 1]);
+                                        ? (uint16_t) (data[i]) | (uint16_t) (data[i + 1]) << 8
+                                        : (uint16_t) (data[i]) << 8 | (uint16_t) (data[i + 1]);
 
                 if (is_lead_surrogate_UTF16_(char16)) {
                     if (i + 3 < length) {
                         const uint16_t next_char16 = counts_ptr->bom == BOM_UTF16LE
-                                                             ? (uint16_t) (data[i + 2]) | (uint16_t) (data[i + 3]) << 8
-                                                             : (uint16_t) (data[i + 2]) << 8 | (uint16_t) (data[i + 3]);
+                                                     ? (uint16_t) (data[i + 2]) | (uint16_t) (data[i + 3]) << 8
+                                                     : (uint16_t) (data[i + 2]) << 8 | (uint16_t) (data[i + 3]);
                         if (is_low_surrogate_UTF16_(next_char16)) {
                             ++counts_ptr->doubleByteChars;
                             i += 4;// skip the low surrogate as well
@@ -474,8 +475,8 @@ void omega_util_count_characters(const unsigned char *data, size_t length, omega
                 // Swap the bytes if the BOM is little endian
                 const uint32_t char32 =
                         counts_ptr->bom == BOM_UTF32LE
-                                ? (data[i] | (data[i + 1] << 8) | (data[i + 2] << 16) | (data[i + 3] << 24))
-                                : ((data[i] << 24) | (data[i + 1] << 16) | (data[i + 2] << 8) | data[i + 3]);
+                        ? (data[i] | (data[i + 1] << 8) | (data[i + 2] << 16) | (data[i + 3] << 24))
+                        : ((data[i] << 24) | (data[i + 1] << 16) | (data[i + 2] << 8) | data[i + 3]);
 
                 if ((char32 >= 0xD800 && char32 <= 0xDFFF) || char32 > 0x10FFFF) {
                     ++counts_ptr->invalidBytes;// surrogate pairs and characters above 0x10FFFF are invalid in UTF-32
@@ -497,6 +498,22 @@ void omega_util_count_characters(const unsigned char *data, size_t length, omega
     counts_ptr->invalidBytes += length - i;
 }
 
+size_t omega_util_BOM_size(omega_bom_t bom) {
+    // BOM_NONE and any unrecognized value carry no mark, so start from zero
+    size_t bom_length = 0;
+    if (bom == BOM_UTF8) {
+        // UTF-8 mark is the three bytes EF BB BF
+        bom_length = 3;
+    } else if (bom == BOM_UTF16LE || bom == BOM_UTF16BE) {
+        // Both UTF-16 byte orders use a two-byte mark
+        bom_length = 2;
+    } else if (bom == BOM_UTF32LE || bom == BOM_UTF32BE) {
+        // Both UTF-32 byte orders use a four-byte mark
+        bom_length = 4;
+    }
+    return bom_length;
+}
+
 const omega_byte_buffer_t *omega_util_BOM_to_buffer(omega_bom_t bom) {
     static const omega_byte_buffer_t utf8_bom = {.data = (omega_byte_t *) "\xEF\xBB\xBF", .length = 3};
     static const omega_byte_buffer_t utf16le_bom = {.data = (omega_byte_t *) "\xFF\xFE", .length = 2};

diff --git a/core/src/tests/omega_test.cpp b/core/src/tests/omega_test.cpp
@@ -591,36 +591,42 @@ TEST_CASE("Detect BOM", "[DetectBOM]") {
     REQUIRE(session_ptr);
     auto bom = omega_session_detect_BOM(session_ptr);
     REQUIRE(bom == BOM_NONE);
+    REQUIRE(0 == omega_util_BOM_size(bom));
     REQUIRE(0 == strcmp("none", omega_util_BOM_to_string(bom)));
     omega_edit_destroy_session(session_ptr);
     session_ptr = omega_edit_create_session("data/utf-8bom_1.txt", nullptr, nullptr, 0, nullptr);
     REQUIRE(session_ptr);
     bom = omega_session_detect_BOM(session_ptr);
     REQUIRE(bom == BOM_UTF8);
+    REQUIRE(3 == omega_util_BOM_size(bom));
     REQUIRE(0 == strcmp("UTF-8", omega_util_BOM_to_string(bom)));
     omega_edit_destroy_session(session_ptr);
     session_ptr = omega_edit_create_session("data/utf-16le_1.txt", nullptr, nullptr, 0, nullptr);
     REQUIRE(session_ptr);
     bom = omega_session_detect_BOM(session_ptr);
     REQUIRE(bom == BOM_UTF16LE);
+    REQUIRE(2 == omega_util_BOM_size(bom));
     REQUIRE(0 == strcmp("UTF-16LE", omega_util_BOM_to_string(bom)));
     omega_edit_destroy_session(session_ptr);
     session_ptr = omega_edit_create_session("data/utf-16be_1.txt", nullptr, nullptr, 0, nullptr);
     REQUIRE(session_ptr);
     bom = omega_session_detect_BOM(session_ptr);
     REQUIRE(bom == BOM_UTF16BE);
+    REQUIRE(2 == omega_util_BOM_size(bom));
     REQUIRE(0 == strcmp("UTF-16BE", omega_util_BOM_to_string(bom)));
     omega_edit_destroy_session(session_ptr);
     session_ptr = omega_edit_create_session("data/utf-32le_1.txt", nullptr, nullptr, 0, nullptr);
     REQUIRE(session_ptr);
     bom = omega_session_detect_BOM(session_ptr);
     REQUIRE(bom == BOM_UTF32LE);
+    REQUIRE(4 == omega_util_BOM_size(bom));
     REQUIRE(0 == strcmp("UTF-32LE", omega_util_BOM_to_string(bom)));
     omega_edit_destroy_session(session_ptr);
     session_ptr = omega_edit_create_session("data/utf-32be_1.txt", nullptr, nullptr, 0, nullptr);
     REQUIRE(session_ptr);
     bom = omega_session_detect_BOM(session_ptr);
     REQUIRE(bom == BOM_UTF32BE);
+    REQUIRE(4 == omega_util_BOM_size(bom));
     REQUIRE(0 == strcmp("UTF-32BE", omega_util_BOM_to_string(bom)));
     omega_edit_destroy_session(session_ptr);
     session_ptr = omega_edit_create_session("data/ascii_1.txt", nullptr, nullptr, 0, nullptr);

diff --git a/server/scala/serv/src/main/scala/com/ctc/omega_edit/grpc/EditorService.scala b/server/scala/serv/src/main/scala/com/ctc/omega_edit/grpc/EditorService.scala
@@ -126,7 +126,7 @@ class EditorService(implicit val system: ActorSystem) extends Editor {
       val buffer = new Array[Byte](8192)
       val bytesRead = file.read(buffer)
       file.close
-      // Convert the bytes read into a String, sssuming the file is UTF-8 encoded; adjust encoding as necessary
+      // Convert the bytes read into a String, assuming the file is UTF-8 encoded; adjust encoding as necessary
       val text = new String(buffer, 0, bytesRead, if (bom == "unknown" || bom == "none") "UTF-8" else bom)
       val detector = new OptimaizeLangDetector().loadModels
       val languageResult = detector.detect(text)