diff --git a/core/src/include/omega_edit/utility.h b/core/src/include/omega_edit/utility.h index d47e762c3..b8559f67c 100644 --- a/core/src/include/omega_edit/utility.h +++ b/core/src/include/omega_edit/utility.h @@ -171,7 +171,7 @@ omega_bom_t omega_util_detect_BOM_from_file(const char *filename); /** * Convert the given byte order mark (BOM) to a string * @param bom byte order mark (BOM) to convert - * @return string representation of the given BOM + * @return string representation of the given BOM ("none", "UTF-8", "UTF-16LE", "UTF-16BE", "UTF-32LE", "UTF-32BE") */ char const *omega_util_BOM_to_string(omega_bom_t bom); @@ -195,6 +195,13 @@ typedef struct { size_t length; } omega_byte_buffer_t; +/** + * Given a byte order mark (BOM), return the size of the byte order mark (BOM) in bytes + * @param bom byte order mark (BOM) to get the size of + * @return size of the byte order mark (BOM) in bytes + */ +size_t omega_util_BOM_size(omega_bom_t bom); + /** * Get the byte order mark buffer (BOM) associated with the given byte order mark (BOM) * @param bom byte order mark (BOM) to get diff --git a/core/src/lib/utility.c b/core/src/lib/utility.c index a3b5ca9aa..95cfad706 100644 --- a/core/src/lib/utility.c +++ b/core/src/lib/utility.c @@ -331,6 +331,7 @@ char const *omega_util_BOM_to_string(omega_bom_t bom) { case BOM_UTF32BE: return "UTF-32BE"; default: + // Should never happen return "unknown"; } } @@ -436,14 +437,14 @@ void omega_util_count_characters(const unsigned char *data, size_t length, omega while (i + 1 < length) { // Swap the bytes if the BOM is little endian const uint16_t char16 = counts_ptr->bom == BOM_UTF16LE - ? (uint16_t) (data[i]) | (uint16_t) (data[i + 1]) << 8 - : (uint16_t) (data[i]) << 8 | (uint16_t) (data[i + 1]); + ? (uint16_t) (data[i]) | (uint16_t) (data[i + 1]) << 8 + : (uint16_t) (data[i]) << 8 | (uint16_t) (data[i + 1]); if (is_lead_surrogate_UTF16_(char16)) { if (i + 3 < length) { const uint16_t next_char16 = counts_ptr->bom == BOM_UTF16LE - ? (uint16_t) (data[i + 2]) | (uint16_t) (data[i + 3]) << 8 - : (uint16_t) (data[i + 2]) << 8 | (uint16_t) (data[i + 3]); + ? (uint16_t) (data[i + 2]) | (uint16_t) (data[i + 3]) << 8 + : (uint16_t) (data[i + 2]) << 8 | (uint16_t) (data[i + 3]); if (is_low_surrogate_UTF16_(next_char16)) { ++counts_ptr->doubleByteChars; i += 4;// skip the low surrogate as well @@ -474,8 +475,8 @@ void omega_util_count_characters(const unsigned char *data, size_t length, omega // Swap the bytes if the BOM is little endian const uint32_t char32 = counts_ptr->bom == BOM_UTF32LE - ? (data[i] | (data[i + 1] << 8) | (data[i + 2] << 16) | (data[i + 3] << 24)) - : ((data[i] << 24) | (data[i + 1] << 16) | (data[i + 2] << 8) | data[i + 3]); + ? (data[i] | (data[i + 1] << 8) | (data[i + 2] << 16) | (data[i + 3] << 24)) + : ((data[i] << 24) | (data[i + 1] << 16) | (data[i + 2] << 8) | data[i + 3]); if ((char32 >= 0xD800 && char32 <= 0xDFFF) || char32 > 0x10FFFF) { ++counts_ptr->invalidBytes;// surrogate pairs and characters above 0x10FFFF are invalid in UTF-32 @@ -497,6 +498,22 @@ void omega_util_count_characters(const unsigned char *data, size_t length, omega counts_ptr->invalidBytes += length - i; } +size_t omega_util_BOM_size(omega_bom_t bom) { + switch (bom) { + case BOM_UTF8: + return 3; + case BOM_UTF16LE: // fall through + case BOM_UTF16BE: + return 2; + case BOM_UTF32LE: // fall through + case BOM_UTF32BE: + return 4; + case BOM_NONE: // fall through + default: + return 0; + } +} + const omega_byte_buffer_t *omega_util_BOM_to_buffer(omega_bom_t bom) { static const omega_byte_buffer_t utf8_bom = {.data = (omega_byte_t *) "\xEF\xBB\xBF", .length = 3}; static const omega_byte_buffer_t utf16le_bom = {.data = (omega_byte_t *) "\xFF\xFE", .length = 2}; diff --git a/core/src/tests/omega_test.cpp b/core/src/tests/omega_test.cpp index 97539ad58..cdd116447 100644 --- a/core/src/tests/omega_test.cpp +++ b/core/src/tests/omega_test.cpp @@ -591,36 +591,42 @@ TEST_CASE("Detect BOM", "[DetectBOM]") { REQUIRE(session_ptr); auto bom = omega_session_detect_BOM(session_ptr); REQUIRE(bom == BOM_NONE); + REQUIRE(0 == omega_util_BOM_size(bom)); REQUIRE(0 == strcmp("none", omega_util_BOM_to_string(bom))); omega_edit_destroy_session(session_ptr); session_ptr = omega_edit_create_session("data/utf-8bom_1.txt", nullptr, nullptr, 0, nullptr); REQUIRE(session_ptr); bom = omega_session_detect_BOM(session_ptr); REQUIRE(bom == BOM_UTF8); + REQUIRE(3 == omega_util_BOM_size(bom)); REQUIRE(0 == strcmp("UTF-8", omega_util_BOM_to_string(bom))); omega_edit_destroy_session(session_ptr); session_ptr = omega_edit_create_session("data/utf-16le_1.txt", nullptr, nullptr, 0, nullptr); REQUIRE(session_ptr); bom = omega_session_detect_BOM(session_ptr); REQUIRE(bom == BOM_UTF16LE); + REQUIRE(2 == omega_util_BOM_size(bom)); REQUIRE(0 == strcmp("UTF-16LE", omega_util_BOM_to_string(bom))); omega_edit_destroy_session(session_ptr); session_ptr = omega_edit_create_session("data/utf-16be_1.txt", nullptr, nullptr, 0, nullptr); REQUIRE(session_ptr); bom = omega_session_detect_BOM(session_ptr); REQUIRE(bom == BOM_UTF16BE); + REQUIRE(2 == omega_util_BOM_size(bom)); REQUIRE(0 == strcmp("UTF-16BE", omega_util_BOM_to_string(bom))); omega_edit_destroy_session(session_ptr); session_ptr = omega_edit_create_session("data/utf-32le_1.txt", nullptr, nullptr, 0, nullptr); REQUIRE(session_ptr); bom = omega_session_detect_BOM(session_ptr); REQUIRE(bom == BOM_UTF32LE); + REQUIRE(4 == omega_util_BOM_size(bom)); REQUIRE(0 == strcmp("UTF-32LE", omega_util_BOM_to_string(bom))); omega_edit_destroy_session(session_ptr); session_ptr = omega_edit_create_session("data/utf-32be_1.txt", nullptr, nullptr, 0, nullptr); REQUIRE(session_ptr); bom = omega_session_detect_BOM(session_ptr); REQUIRE(bom == BOM_UTF32BE); + REQUIRE(4 == omega_util_BOM_size(bom)); REQUIRE(0 == strcmp("UTF-32BE", omega_util_BOM_to_string(bom))); omega_edit_destroy_session(session_ptr); session_ptr = omega_edit_create_session("data/ascii_1.txt", nullptr, nullptr, 0, nullptr); diff --git a/server/scala/serv/src/main/scala/com/ctc/omega_edit/grpc/EditorService.scala b/server/scala/serv/src/main/scala/com/ctc/omega_edit/grpc/EditorService.scala index e7930d388..a76d213b0 100644 --- a/server/scala/serv/src/main/scala/com/ctc/omega_edit/grpc/EditorService.scala +++ b/server/scala/serv/src/main/scala/com/ctc/omega_edit/grpc/EditorService.scala @@ -126,7 +126,7 @@ class EditorService(implicit val system: ActorSystem) extends Editor { val buffer = new Array[Byte](8192) val bytesRead = file.read(buffer) file.close - // Convert the bytes read into a String, sssuming the file is UTF-8 encoded; adjust encoding as necessary + // Convert the bytes read into a String, assuming the file is UTF-8 encoded; adjust encoding as necessary val text = new String(buffer, 0, bytesRead, if (bom == "unknown" || bom == "none") "UTF-8" else bom) val detector = new OptimaizeLangDetector().loadModels val languageResult = detector.detect(text)