Skip to content

Commit

Permalink
add omega_util_BOM_size function
Browse files Browse the repository at this point in the history
  • Loading branch information
scholarsmate committed Oct 7, 2023
1 parent cd89ab8 commit a0062f3
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 13 deletions.
9 changes: 8 additions & 1 deletion core/src/include/omega_edit/utility.h
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ omega_bom_t omega_util_detect_BOM_from_file(const char *filename);
/**
* Convert the given byte order mark (BOM) to a string
* @param bom byte order mark (BOM) to convert
* @return string representation of the given BOM
* @return string representation of the given BOM ("none", "UTF-8", "UTF-16LE", "UTF-16BE", "UTF-32LE", "UTF-32BE")
*/
char const *omega_util_BOM_to_string(omega_bom_t bom);

Expand All @@ -195,6 +195,13 @@ typedef struct {
size_t length;
} omega_byte_buffer_t;

/**
* Given a byte order mark (BOM), return the size of the byte order mark (BOM) in bytes
* @param bom byte order mark (BOM) to get the size of
* @return size of the byte order mark (BOM) in bytes: 3 for UTF-8, 2 for UTF-16 (LE or BE), 4 for UTF-32 (LE or BE),
* and 0 for any other value
*/
size_t omega_util_BOM_size(omega_bom_t bom);

/**
* Get the byte order mark buffer (BOM) associated with the given byte order mark (BOM)
* @param bom byte order mark (BOM) to get
Expand Down
43 changes: 32 additions & 11 deletions core/src/lib/utility.c
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,7 @@ char const *omega_util_BOM_to_string(omega_bom_t bom) {
case BOM_UTF32BE:
return "UTF-32BE";
default:
// Should never happen
return "unknown";
}
}
Expand Down Expand Up @@ -436,14 +437,14 @@ void omega_util_count_characters(const unsigned char *data, size_t length, omega
while (i + 1 < length) {
// Swap the bytes if the BOM is little endian
const uint16_t char16 = counts_ptr->bom == BOM_UTF16LE
? (uint16_t) (data[i]) | (uint16_t) (data[i + 1]) << 8
: (uint16_t) (data[i]) << 8 | (uint16_t) (data[i + 1]);
? (uint16_t) (data[i]) | (uint16_t) (data[i + 1]) << 8
: (uint16_t) (data[i]) << 8 | (uint16_t) (data[i + 1]);

if (is_lead_surrogate_UTF16_(char16)) {
if (i + 3 < length) {
const uint16_t next_char16 = counts_ptr->bom == BOM_UTF16LE
? (uint16_t) (data[i + 2]) | (uint16_t) (data[i + 3]) << 8
: (uint16_t) (data[i + 2]) << 8 | (uint16_t) (data[i + 3]);
? (uint16_t) (data[i + 2]) | (uint16_t) (data[i + 3]) << 8
: (uint16_t) (data[i + 2]) << 8 | (uint16_t) (data[i + 3]);
if (is_low_surrogate_UTF16_(next_char16)) {
++counts_ptr->doubleByteChars;
i += 4;// skip the low surrogate as well
Expand Down Expand Up @@ -474,8 +475,8 @@ void omega_util_count_characters(const unsigned char *data, size_t length, omega
// Swap the bytes if the BOM is little endian
const uint32_t char32 =
counts_ptr->bom == BOM_UTF32LE
? (data[i] | (data[i + 1] << 8) | (data[i + 2] << 16) | (data[i + 3] << 24))
: ((data[i] << 24) | (data[i + 1] << 16) | (data[i + 2] << 8) | data[i + 3]);
? (data[i] | (data[i + 1] << 8) | (data[i + 2] << 16) | (data[i + 3] << 24))
: ((data[i] << 24) | (data[i + 1] << 16) | (data[i + 2] << 8) | data[i + 3]);

if ((char32 >= 0xD800 && char32 <= 0xDFFF) || char32 > 0x10FFFF) {
++counts_ptr->invalidBytes;// surrogate pairs and characters above 0x10FFFF are invalid in UTF-32
Expand All @@ -497,12 +498,32 @@ void omega_util_count_characters(const unsigned char *data, size_t length, omega
counts_ptr->invalidBytes += length - i;
}

/*
 * Report how many bytes the given byte order mark (BOM) occupies.
 * Returns 3 for UTF-8, 2 for either UTF-16 variant, 4 for either UTF-32
 * variant, and 0 for every other value.
 */
size_t omega_util_BOM_size(omega_bom_t bom) {
    /* UTF-8 is the only three-byte mark */
    if (bom == BOM_UTF8) {
        return 3;
    }
    /* Both UTF-16 byte orders use a two-byte mark */
    if (bom == BOM_UTF16LE || bom == BOM_UTF16BE) {
        return 2;
    }
    /* Both UTF-32 byte orders use a four-byte mark */
    if (bom == BOM_UTF32LE || bom == BOM_UTF32BE) {
        return 4;
    }
    /* Anything else carries no mark bytes */
    return 0;
}

const omega_byte_buffer_t *omega_util_BOM_to_buffer(omega_bom_t bom) {
static const omega_byte_buffer_t utf8_bom = {.data = (omega_byte_t *) "\xEF\xBB\xBF", .length = 3};
static const omega_byte_buffer_t utf16le_bom = {.data = (omega_byte_t *) "\xFF\xFE", .length = 2};
static const omega_byte_buffer_t utf16be_bom = {.data = (omega_byte_t *) "\xFE\xFF", .length = 2};
static const omega_byte_buffer_t utf32le_bom = {.data = (omega_byte_t *) "\xFF\xFE\x00\x00", .length = 4};
static const omega_byte_buffer_t utf32be_bom = {.data = (omega_byte_t *) "\x00\x00\xFE\xFF", .length = 4};
static const omega_byte_buffer_t utf8_bom = {.data = (omega_byte_t *) "\xEF\xBB\xBF", .length = omega_util_BOM_size(
BOM_UTF8))};
static const omega_byte_buffer_t utf16le_bom = {.data = (omega_byte_t *) "\xFF\xFE", .length = omega_util_BOM_size(
BOM_UTF16LE))};
static const omega_byte_buffer_t utf16be_bom = {.data = (omega_byte_t *) "\xFE\xFF", .length = omega_util_BOM_size(
BOM_UTF16BE))};
static const omega_byte_buffer_t utf32le_bom = {.data = (omega_byte_t *) "\xFF\xFE\x00\x00", .length = omega_util_BOM_size(
BOM_UTF32LE))};
static const omega_byte_buffer_t utf32be_bom = {.data = (omega_byte_t *) "\x00\x00\xFE\xFF", .length = omega_util_BOM_size(
BOM_UTF32BE))};

switch (bom) {
case BOM_UTF8:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ class EditorService(implicit val system: ActorSystem) extends Editor {
val buffer = new Array[Byte](8192)
val bytesRead = file.read(buffer)
file.close
// Convert the bytes read into a String, sssuming the file is UTF-8 encoded; adjust encoding as necessary
// Convert the bytes read into a String, assuming the file is UTF-8 encoded; adjust encoding as necessary
val text = new String(buffer, 0, bytesRead, if (bom == "unknown" || bom == "none") "UTF-8" else bom)
val detector = new OptimaizeLangDetector().loadModels
val languageResult = detector.detect(text)
Expand Down

0 comments on commit a0062f3

Please sign in to comment.