Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add omega_util_BOM_size function #775

Merged
merged 1 commit into from
Oct 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion core/src/include/omega_edit/utility.h
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ omega_bom_t omega_util_detect_BOM_from_file(const char *filename);
/**
* Convert the given byte order mark (BOM) to a string
* @param bom byte order mark (BOM) to convert
* @return string representation of the given BOM
* @return string representation of the given BOM ("none", "UTF-8", "UTF-16LE", "UTF-16BE", "UTF-32LE", "UTF-32BE")
*/
char const *omega_util_BOM_to_string(omega_bom_t bom);

Expand All @@ -195,6 +195,13 @@ typedef struct {
size_t length;
} omega_byte_buffer_t;

/**
* Given a byte order mark (BOM), return the size of the byte order mark (BOM) in bytes
* @param bom byte order mark (BOM) to get the size of
* @return size of the byte order mark (BOM) in bytes: 3 for BOM_UTF8, 2 for BOM_UTF16LE and BOM_UTF16BE, 4 for
* BOM_UTF32LE and BOM_UTF32BE, and 0 for BOM_NONE or any unrecognized value
*/
size_t omega_util_BOM_size(omega_bom_t bom);

/**
* Get the byte order mark buffer (BOM) associated with the given byte order mark (BOM)
* @param bom byte order mark (BOM) to get
Expand Down
29 changes: 23 additions & 6 deletions core/src/lib/utility.c
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,7 @@ char const *omega_util_BOM_to_string(omega_bom_t bom) {
case BOM_UTF32BE:
return "UTF-32BE";
default:
// Should never happen
return "unknown";
}
}
Expand Down Expand Up @@ -436,14 +437,14 @@ void omega_util_count_characters(const unsigned char *data, size_t length, omega
while (i + 1 < length) {
// Swap the bytes if the BOM is little endian
const uint16_t char16 = counts_ptr->bom == BOM_UTF16LE
? (uint16_t) (data[i]) | (uint16_t) (data[i + 1]) << 8
: (uint16_t) (data[i]) << 8 | (uint16_t) (data[i + 1]);
? (uint16_t) (data[i]) | (uint16_t) (data[i + 1]) << 8
: (uint16_t) (data[i]) << 8 | (uint16_t) (data[i + 1]);

if (is_lead_surrogate_UTF16_(char16)) {
if (i + 3 < length) {
const uint16_t next_char16 = counts_ptr->bom == BOM_UTF16LE
? (uint16_t) (data[i + 2]) | (uint16_t) (data[i + 3]) << 8
: (uint16_t) (data[i + 2]) << 8 | (uint16_t) (data[i + 3]);
? (uint16_t) (data[i + 2]) | (uint16_t) (data[i + 3]) << 8
: (uint16_t) (data[i + 2]) << 8 | (uint16_t) (data[i + 3]);
if (is_low_surrogate_UTF16_(next_char16)) {
++counts_ptr->doubleByteChars;
i += 4;// skip the low surrogate as well
Expand Down Expand Up @@ -474,8 +475,8 @@ void omega_util_count_characters(const unsigned char *data, size_t length, omega
// Swap the bytes if the BOM is little endian
const uint32_t char32 =
counts_ptr->bom == BOM_UTF32LE
? (data[i] | (data[i + 1] << 8) | (data[i + 2] << 16) | (data[i + 3] << 24))
: ((data[i] << 24) | (data[i + 1] << 16) | (data[i + 2] << 8) | data[i + 3]);
? (data[i] | (data[i + 1] << 8) | (data[i + 2] << 16) | (data[i + 3] << 24))
: ((data[i] << 24) | (data[i + 1] << 16) | (data[i + 2] << 8) | data[i + 3]);

if ((char32 >= 0xD800 && char32 <= 0xDFFF) || char32 > 0x10FFFF) {
++counts_ptr->invalidBytes;// surrogate pairs and characters above 0x10FFFF are invalid in UTF-32
Expand All @@ -497,6 +498,22 @@ void omega_util_count_characters(const unsigned char *data, size_t length, omega
counts_ptr->invalidBytes += length - i;
}

/**
 * Report how many bytes the given byte order mark (BOM) occupies.
 * @param bom byte order mark (BOM) to measure
 * @return 3 for UTF-8, 2 for either UTF-16 variant, 4 for either UTF-32 variant, and 0 for BOM_NONE or any other
 * value
 */
size_t omega_util_BOM_size(omega_bom_t bom) {
    if (bom == BOM_UTF8) { return 3; }
    if (bom == BOM_UTF16LE || bom == BOM_UTF16BE) { return 2; }
    if (bom == BOM_UTF32LE || bom == BOM_UTF32BE) { return 4; }
    // BOM_NONE and any value outside the known set carry no marker bytes
    return 0;
}

const omega_byte_buffer_t *omega_util_BOM_to_buffer(omega_bom_t bom) {
static const omega_byte_buffer_t utf8_bom = {.data = (omega_byte_t *) "\xEF\xBB\xBF", .length = 3};
static const omega_byte_buffer_t utf16le_bom = {.data = (omega_byte_t *) "\xFF\xFE", .length = 2};
Expand Down
6 changes: 6 additions & 0 deletions core/src/tests/omega_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -591,36 +591,42 @@ TEST_CASE("Detect BOM", "[DetectBOM]") {
REQUIRE(session_ptr);
auto bom = omega_session_detect_BOM(session_ptr);
REQUIRE(bom == BOM_NONE);
REQUIRE(0 == omega_util_BOM_size(bom));
REQUIRE(0 == strcmp("none", omega_util_BOM_to_string(bom)));
omega_edit_destroy_session(session_ptr);
session_ptr = omega_edit_create_session("data/utf-8bom_1.txt", nullptr, nullptr, 0, nullptr);
REQUIRE(session_ptr);
bom = omega_session_detect_BOM(session_ptr);
REQUIRE(bom == BOM_UTF8);
REQUIRE(3 == omega_util_BOM_size(bom));
REQUIRE(0 == strcmp("UTF-8", omega_util_BOM_to_string(bom)));
omega_edit_destroy_session(session_ptr);
session_ptr = omega_edit_create_session("data/utf-16le_1.txt", nullptr, nullptr, 0, nullptr);
REQUIRE(session_ptr);
bom = omega_session_detect_BOM(session_ptr);
REQUIRE(bom == BOM_UTF16LE);
REQUIRE(2 == omega_util_BOM_size(bom));
REQUIRE(0 == strcmp("UTF-16LE", omega_util_BOM_to_string(bom)));
omega_edit_destroy_session(session_ptr);
session_ptr = omega_edit_create_session("data/utf-16be_1.txt", nullptr, nullptr, 0, nullptr);
REQUIRE(session_ptr);
bom = omega_session_detect_BOM(session_ptr);
REQUIRE(bom == BOM_UTF16BE);
REQUIRE(2 == omega_util_BOM_size(bom));
REQUIRE(0 == strcmp("UTF-16BE", omega_util_BOM_to_string(bom)));
omega_edit_destroy_session(session_ptr);
session_ptr = omega_edit_create_session("data/utf-32le_1.txt", nullptr, nullptr, 0, nullptr);
REQUIRE(session_ptr);
bom = omega_session_detect_BOM(session_ptr);
REQUIRE(bom == BOM_UTF32LE);
REQUIRE(4 == omega_util_BOM_size(bom));
REQUIRE(0 == strcmp("UTF-32LE", omega_util_BOM_to_string(bom)));
omega_edit_destroy_session(session_ptr);
session_ptr = omega_edit_create_session("data/utf-32be_1.txt", nullptr, nullptr, 0, nullptr);
REQUIRE(session_ptr);
bom = omega_session_detect_BOM(session_ptr);
REQUIRE(bom == BOM_UTF32BE);
REQUIRE(4 == omega_util_BOM_size(bom));
REQUIRE(0 == strcmp("UTF-32BE", omega_util_BOM_to_string(bom)));
omega_edit_destroy_session(session_ptr);
session_ptr = omega_edit_create_session("data/ascii_1.txt", nullptr, nullptr, 0, nullptr);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ class EditorService(implicit val system: ActorSystem) extends Editor {
val buffer = new Array[Byte](8192)
val bytesRead = file.read(buffer)
file.close
// Convert the bytes read into a String, sssuming the file is UTF-8 encoded; adjust encoding as necessary
// Convert the bytes read into a String, assuming the file is UTF-8 encoded; adjust encoding as necessary
val text = new String(buffer, 0, bytesRead, if (bom == "unknown" || bom == "none") "UTF-8" else bom)
val detector = new OptimaizeLangDetector().loadModels
val languageResult = detector.detect(text)
Expand Down