Skip to content

Commit

Permalink
add omega_util_BOM_size function
Browse files Browse the repository at this point in the history
  • Loading branch information
scholarsmate committed Oct 9, 2023
1 parent cd89ab8 commit 06ce7c3
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 8 deletions.
9 changes: 8 additions & 1 deletion core/src/include/omega_edit/utility.h
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ omega_bom_t omega_util_detect_BOM_from_file(const char *filename);
/**
* Convert the given byte order mark (BOM) to a string
* @param bom byte order mark (BOM) to convert
* @return string representation of the given BOM
* @return string representation of the given BOM ("none", "UTF-8", "UTF-16LE", "UTF-16BE", "UTF-32LE", "UTF-32BE")
*/
char const *omega_util_BOM_to_string(omega_bom_t bom);

Expand All @@ -195,6 +195,13 @@ typedef struct {
size_t length;
} omega_byte_buffer_t;

/**
* Given a byte order mark (BOM), return the size of the byte order mark (BOM) in bytes
* @param bom byte order mark (BOM) to get the size of
* @return size of the byte order mark (BOM) in bytes: 3 for BOM_UTF8, 2 for BOM_UTF16LE and BOM_UTF16BE, 4 for
* BOM_UTF32LE and BOM_UTF32BE, and 0 for BOM_NONE or any other value
*/
size_t omega_util_BOM_size(omega_bom_t bom);

/**
* Get the byte order mark buffer (BOM) associated with the given byte order mark (BOM)
* @param bom byte order mark (BOM) to get
Expand Down
29 changes: 23 additions & 6 deletions core/src/lib/utility.c
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,7 @@ char const *omega_util_BOM_to_string(omega_bom_t bom) {
case BOM_UTF32BE:
return "UTF-32BE";
default:
// Should never happen
return "unknown";
}
}
Expand Down Expand Up @@ -436,14 +437,14 @@ void omega_util_count_characters(const unsigned char *data, size_t length, omega
while (i + 1 < length) {
// Swap the bytes if the BOM is little endian
const uint16_t char16 = counts_ptr->bom == BOM_UTF16LE
? (uint16_t) (data[i]) | (uint16_t) (data[i + 1]) << 8
: (uint16_t) (data[i]) << 8 | (uint16_t) (data[i + 1]);
? (uint16_t) (data[i]) | (uint16_t) (data[i + 1]) << 8
: (uint16_t) (data[i]) << 8 | (uint16_t) (data[i + 1]);

if (is_lead_surrogate_UTF16_(char16)) {
if (i + 3 < length) {
const uint16_t next_char16 = counts_ptr->bom == BOM_UTF16LE
? (uint16_t) (data[i + 2]) | (uint16_t) (data[i + 3]) << 8
: (uint16_t) (data[i + 2]) << 8 | (uint16_t) (data[i + 3]);
? (uint16_t) (data[i + 2]) | (uint16_t) (data[i + 3]) << 8
: (uint16_t) (data[i + 2]) << 8 | (uint16_t) (data[i + 3]);
if (is_low_surrogate_UTF16_(next_char16)) {
++counts_ptr->doubleByteChars;
i += 4;// skip the low surrogate as well
Expand Down Expand Up @@ -474,8 +475,8 @@ void omega_util_count_characters(const unsigned char *data, size_t length, omega
// Swap the bytes if the BOM is little endian
const uint32_t char32 =
counts_ptr->bom == BOM_UTF32LE
? (data[i] | (data[i + 1] << 8) | (data[i + 2] << 16) | (data[i + 3] << 24))
: ((data[i] << 24) | (data[i + 1] << 16) | (data[i + 2] << 8) | data[i + 3]);
? (data[i] | (data[i + 1] << 8) | (data[i + 2] << 16) | (data[i + 3] << 24))
: ((data[i] << 24) | (data[i + 1] << 16) | (data[i + 2] << 8) | data[i + 3]);

if ((char32 >= 0xD800 && char32 <= 0xDFFF) || char32 > 0x10FFFF) {
++counts_ptr->invalidBytes;// surrogate pairs and characters above 0x10FFFF are invalid in UTF-32
Expand All @@ -497,6 +498,22 @@ void omega_util_count_characters(const unsigned char *data, size_t length, omega
counts_ptr->invalidBytes += length - i;
}

/**
* Report how many bytes the given byte order mark (BOM) occupies
* @param bom byte order mark (BOM) to measure
* @return 3 for UTF-8, 2 for either UTF-16 variant, 4 for either UTF-32 variant, and 0 for BOM_NONE or any other value
*/
size_t omega_util_BOM_size(omega_bom_t bom) {
    if (bom == BOM_UTF8) { return 3; }
    if (bom == BOM_UTF16LE || bom == BOM_UTF16BE) { return 2; }
    if (bom == BOM_UTF32LE || bom == BOM_UTF32BE) { return 4; }
    // BOM_NONE, and any value that is not a known BOM, has no BOM bytes
    return 0;
}

const omega_byte_buffer_t *omega_util_BOM_to_buffer(omega_bom_t bom) {
static const omega_byte_buffer_t utf8_bom = {.data = (omega_byte_t *) "\xEF\xBB\xBF", .length = 3};
static const omega_byte_buffer_t utf16le_bom = {.data = (omega_byte_t *) "\xFF\xFE", .length = 2};
Expand Down
6 changes: 6 additions & 0 deletions core/src/tests/omega_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -591,36 +591,42 @@ TEST_CASE("Detect BOM", "[DetectBOM]") {
REQUIRE(session_ptr);
auto bom = omega_session_detect_BOM(session_ptr);
REQUIRE(bom == BOM_NONE);
REQUIRE(0 == omega_util_BOM_size(bom));
REQUIRE(0 == strcmp("none", omega_util_BOM_to_string(bom)));
omega_edit_destroy_session(session_ptr);
session_ptr = omega_edit_create_session("data/utf-8bom_1.txt", nullptr, nullptr, 0, nullptr);
REQUIRE(session_ptr);
bom = omega_session_detect_BOM(session_ptr);
REQUIRE(bom == BOM_UTF8);
REQUIRE(3 == omega_util_BOM_size(bom));
REQUIRE(0 == strcmp("UTF-8", omega_util_BOM_to_string(bom)));
omega_edit_destroy_session(session_ptr);
session_ptr = omega_edit_create_session("data/utf-16le_1.txt", nullptr, nullptr, 0, nullptr);
REQUIRE(session_ptr);
bom = omega_session_detect_BOM(session_ptr);
REQUIRE(bom == BOM_UTF16LE);
REQUIRE(2 == omega_util_BOM_size(bom));
REQUIRE(0 == strcmp("UTF-16LE", omega_util_BOM_to_string(bom)));
omega_edit_destroy_session(session_ptr);
session_ptr = omega_edit_create_session("data/utf-16be_1.txt", nullptr, nullptr, 0, nullptr);
REQUIRE(session_ptr);
bom = omega_session_detect_BOM(session_ptr);
REQUIRE(bom == BOM_UTF16BE);
REQUIRE(2 == omega_util_BOM_size(bom));
REQUIRE(0 == strcmp("UTF-16BE", omega_util_BOM_to_string(bom)));
omega_edit_destroy_session(session_ptr);
session_ptr = omega_edit_create_session("data/utf-32le_1.txt", nullptr, nullptr, 0, nullptr);
REQUIRE(session_ptr);
bom = omega_session_detect_BOM(session_ptr);
REQUIRE(bom == BOM_UTF32LE);
REQUIRE(4 == omega_util_BOM_size(bom));
REQUIRE(0 == strcmp("UTF-32LE", omega_util_BOM_to_string(bom)));
omega_edit_destroy_session(session_ptr);
session_ptr = omega_edit_create_session("data/utf-32be_1.txt", nullptr, nullptr, 0, nullptr);
REQUIRE(session_ptr);
bom = omega_session_detect_BOM(session_ptr);
REQUIRE(bom == BOM_UTF32BE);
REQUIRE(4 == omega_util_BOM_size(bom));
REQUIRE(0 == strcmp("UTF-32BE", omega_util_BOM_to_string(bom)));
omega_edit_destroy_session(session_ptr);
session_ptr = omega_edit_create_session("data/ascii_1.txt", nullptr, nullptr, 0, nullptr);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ class EditorService(implicit val system: ActorSystem) extends Editor {
val buffer = new Array[Byte](8192)
val bytesRead = file.read(buffer)
file.close
// Convert the bytes read into a String, sssuming the file is UTF-8 encoded; adjust encoding as necessary
// Convert the bytes read into a String, assuming the file is UTF-8 encoded; adjust encoding as necessary
val text = new String(buffer, 0, bytesRead, if (bom == "unknown" || bom == "none") "UTF-8" else bom)
val detector = new OptimaizeLangDetector().loadModels
val languageResult = detector.detect(text)
Expand Down

0 comments on commit 06ce7c3

Please sign in to comment.