Skip to content

Commit

Permalink
More profiling services (#784)
Browse files Browse the repository at this point in the history
split out BOM, language, and content type detection into separate services
  • Loading branch information
scholarsmate authored Oct 11, 2023
1 parent 5b0be84 commit aed17ae
Show file tree
Hide file tree
Showing 28 changed files with 675 additions and 262 deletions.
5 changes: 5 additions & 0 deletions build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,11 @@ export OE_LIB_DIR="$(readlink -f "$install_dir-shared-$type/lib")"
yarn install
yarn lint

# Point the local _install symlink at the shared-library install directory,
# but only when that directory actually exists.
if [[ -d "$OE_LIB_DIR" ]]; then
# Remove any existing _install entry first so the link can be re-created
rm -f _install
ln -s "$OE_LIB_DIR" _install
fi

# Build, test, and package Scala server node module
yarn workspace @omega-edit/server package

Expand Down
3 changes: 2 additions & 1 deletion core/src/examples/count_characters.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ int main(int argc, char *argv[]) {
omega_session_t *session_ptr = omega_edit_create_session(argv[1], NULL, NULL, NO_EVENTS, NULL);
omega_character_counts_t *character_counts_ptr = omega_character_counts_create();
omega_session_character_counts(session_ptr, character_counts_ptr, 0,
omega_session_get_computed_file_size(session_ptr));
omega_session_get_computed_file_size(session_ptr),
omega_session_detect_BOM(session_ptr, 0));

printf("File: %s, BOM: %s\n", argv[1],
omega_util_BOM_to_string(omega_character_counts_get_BOM(character_counts_ptr)));
Expand Down
4 changes: 3 additions & 1 deletion core/src/examples/peek.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@

using namespace std;

enum class display_mode_t { BIT_MODE, BYTE_MODE, CHAR_MODE };
// Display mode selector. Enumerator order (BIT, BYTE, CHAR) is kept so the
// underlying values stay the same.
enum class display_mode_t {
    BIT_MODE,
    BYTE_MODE,
    CHAR_MODE,
};

// Holds the currently selected display mode; a fresh instance starts in BYTE_MODE.
struct view_mode_t {
    display_mode_t display_mode{display_mode_t::BYTE_MODE};
};
Expand Down
6 changes: 4 additions & 2 deletions core/src/examples/play.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,9 @@ void session_change_cbk(const omega_session_t *session_ptr, omega_session_event_
}
}

enum class display_mode_t { BIT_MODE, BYTE_MODE, CHAR_MODE };
// Display mode selector. Enumerator order (BIT, BYTE, CHAR) is kept so the
// underlying values stay the same.
enum class display_mode_t {
    BIT_MODE,
    BYTE_MODE,
    CHAR_MODE,
};

// Display settings that vpt_change_cbk reads back from a viewport's user data
// pointer; a fresh instance starts in CHAR_MODE.
struct view_mode_t {
    display_mode_t display_mode{display_mode_t::CHAR_MODE};
};
Expand Down Expand Up @@ -144,7 +146,7 @@ void vpt_change_cbk(const omega_viewport_t *viewport_ptr,
<< " offset: " << omega_viewport_get_offset(viewport_ptr) << endl;
if (omega_viewport_get_user_data_ptr(viewport_ptr)) {
switch (reinterpret_cast<const view_mode_t *>(omega_viewport_get_user_data_ptr(viewport_ptr))
->display_mode) {
->display_mode) {
case display_mode_t::BIT_MODE:
clog << " BIT MODE [";
write_pretty_bits(omega_viewport_get_data(viewport_ptr),
Expand Down
6 changes: 3 additions & 3 deletions core/src/examples/simple.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@ inline void vpt_change_cbk(const omega_viewport_t *viewport_ptr, omega_viewport_
case VIEWPORT_EVT_CREATE:
case VIEWPORT_EVT_EDIT: {
char change_kind = (viewport_event_ptr)
? omega_change_get_kind_as_char(
reinterpret_cast<const omega_change_t *>(viewport_event_ptr))
: 'R';
? omega_change_get_kind_as_char(
reinterpret_cast<const omega_change_t *>(viewport_event_ptr))
: 'R';
clog << change_kind << ": [" << omega_viewport_get_string(viewport_ptr) << "]" << endl;
break;
}
Expand Down
4 changes: 2 additions & 2 deletions core/src/examples/simple_c.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ void vpt_change_cbk(const omega_viewport_t *viewport_ptr, omega_viewport_event_t
case VIEWPORT_EVT_CREATE:
case VIEWPORT_EVT_EDIT: {
char change_kind = viewport_event_ptr
? omega_change_get_kind_as_char((const omega_change_t *) (viewport_event_ptr))
: 'R';
? omega_change_get_kind_as_char((const omega_change_t *) (viewport_event_ptr))
: 'R';
fprintf(stdout, "%c: [%s]\n", change_kind, omega_viewport_get_data(viewport_ptr));
break;
}
Expand Down
7 changes: 5 additions & 2 deletions core/src/include/omega_edit/character_counts.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,10 @@ void omega_character_counts_destroy(omega_character_counts_t *counts_ptr);
/**
* Reset an omega_character_counts_t object
* @param counts_ptr omega_character_counts_t object to reset
* @return given omega_character_counts_t object
* @note This function does not reset the byte order mark (BOM)
*/
void omega_character_counts_reset(omega_character_counts_t *counts_ptr);
omega_character_counts_t *omega_character_counts_reset(omega_character_counts_t *counts_ptr);

/**
* Get the byte order mark (BOM) for the given omega_character_counts_t object
Expand All @@ -62,8 +64,9 @@ omega_bom_t omega_character_counts_get_BOM(const omega_character_counts_t *count
* Set the byte order mark (BOM) for the given omega_character_counts_t object
* @param counts_ptr omega_character_counts_t object to set the BOM for
* @param bom BOM to set for the given omega_character_counts_t object
* @return given omega_character_counts_t object
*/
void omega_character_counts_set_BOM(omega_character_counts_t *counts_ptr, omega_bom_t bom);
omega_character_counts_t *omega_character_counts_set_BOM(omega_character_counts_t *counts_ptr, omega_bom_t bom);

/**
* Get the number of BOM bytes found for the given omega_character_counts_t object
Expand Down
2 changes: 1 addition & 1 deletion core/src/include/omega_edit/fwd_defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ typedef enum {
typedef enum { MASK_AND, MASK_OR, MASK_XOR } omega_mask_kind_t;

/** Byte order mark (BOM) types */
typedef enum { BOM_NONE = 0, BOM_UTF8, BOM_UTF16LE, BOM_UTF16BE, BOM_UTF32LE, BOM_UTF32BE } omega_bom_t;
typedef enum { BOM_UNKNOWN = 0, BOM_NONE, BOM_UTF8, BOM_UTF16LE, BOM_UTF16BE, BOM_UTF32LE, BOM_UTF32BE } omega_bom_t;

/** Opaque character counts */
typedef struct omega_character_counts_struct omega_character_counts_t;
Expand Down
6 changes: 4 additions & 2 deletions core/src/include/omega_edit/session.h
Original file line number Diff line number Diff line change
Expand Up @@ -242,9 +242,10 @@ void omega_session_notify(const omega_session_t *session_ptr, omega_session_even
/**
* Given a session, return the detected byte order marker (BOM)
* @param session_ptr session to get the BOM from
* @param offset where in the session to begin detecting the BOM
* @return detected byte order marker (BOM)
*/
omega_bom_t omega_session_detect_BOM(const omega_session_t *session_ptr);
omega_bom_t omega_session_detect_BOM(const omega_session_t *session_ptr, int64_t offset);

/**
* Given a session, offset and length, populate a byte frequency profile
Expand All @@ -263,10 +264,11 @@ int omega_session_byte_frequency_profile(const omega_session_t *session_ptr,
* @param counts_ptr pointer to the character counts to populate
* @param offset where in the session to begin counting characters
* @param length number of bytes from the offset to stop counting characters (if 0, it will count to the end of the session)
* @param bom byte order marker (BOM) to use when counting characters
* @return zero on success and non-zero otherwise
*/
int omega_session_character_counts(const omega_session_t *session_ptr, omega_character_counts_t *counts_ptr,
int64_t offset, int64_t length);
int64_t offset, int64_t length, omega_bom_t bom);

/**
* Given a session, return the checkpoint directory
Expand Down
2 changes: 2 additions & 0 deletions core/src/include/omega_edit/utility.h
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,8 @@ omega_bom_t omega_util_detect_BOM_from_file(const char *filename);
*/
char const *omega_util_BOM_to_string(omega_bom_t bom);

/**
 * Convert the given string to a byte order mark (BOM)
 * @param str string to convert; recognized names are "none", "UTF-8", "UTF-16LE", "UTF-16BE", "UTF-32LE" and
 * "UTF-32BE" (compared using omega_util_strnicmp, presumably case-insensitive)
 * @return byte order mark (BOM) matching the given string, or BOM_UNKNOWN if the string is not recognized
 */
omega_bom_t omega_util_string_to_BOM(char const *str);

/**
* Count the number of single byte, and multi-byte characters in the given data
* @param data data to count the characters in
Expand Down
8 changes: 5 additions & 3 deletions core/src/lib/character_counts.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#include <stdlib.h>

omega_character_counts_t *omega_character_counts_create() {
// use calloc to initialize all fields to zero
// use calloc to initialize all fields to zero, BOM is set to BOM_UNKNOWN
omega_character_counts_t *counts_ptr = (omega_character_counts_t *) calloc(1, sizeof(omega_character_counts_t));
assert(counts_ptr);
return counts_ptr;
Expand All @@ -29,24 +29,26 @@ void omega_character_counts_destroy(omega_character_counts_t *counts_ptr) {
free(counts_ptr);
}

void omega_character_counts_reset(omega_character_counts_t *counts_ptr) {
omega_character_counts_t *omega_character_counts_reset(omega_character_counts_t *counts_ptr) {
    assert(counts_ptr);
    /* Zero the byte/character tallies below; the bom field is left as-is so a previously set BOM survives */
    counts_ptr->invalidBytes = 0;
    counts_ptr->quadByteChars = 0;
    counts_ptr->tripleByteChars = 0;
    counts_ptr->doubleByteChars = 0;
    counts_ptr->singleByteChars = 0;
    counts_ptr->bomBytes = 0;
    /* Hand the same object back so the call can be nested inside another call */
    return counts_ptr;
}

/* Accessor: report the BOM currently recorded in the given counts object */
omega_bom_t omega_character_counts_get_BOM(const omega_character_counts_t *counts_ptr) {
    assert(counts_ptr);
    const omega_bom_t recorded_bom = counts_ptr->bom;
    return recorded_bom;
}

void omega_character_counts_set_BOM(omega_character_counts_t *counts_ptr, omega_bom_t bom) {
omega_character_counts_t *omega_character_counts_set_BOM(omega_character_counts_t *counts_ptr, omega_bom_t bom) {
    assert(counts_ptr);
    /* Record the BOM, then return the same object so the call can be chained */
    omega_character_counts_t *const self = counts_ptr;
    self->bom = bom;
    return self;
}

int64_t omega_character_counts_bom_bytes(const omega_character_counts_t *counts_ptr) {
Expand Down
12 changes: 7 additions & 5 deletions core/src/lib/session.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -265,10 +265,13 @@ void omega_session_notify(const omega_session_t *session_ptr, omega_session_even
}
}

omega_bom_t omega_session_detect_BOM(const omega_session_t *session_ptr) {
omega_bom_t omega_session_detect_BOM(const omega_session_t *session_ptr, int64_t offset) {
assert(session_ptr);
// get the first 4 bytes at the given offset
const auto segment_ptr = omega_segment_create(4);
omega_session_get_segment(session_ptr, segment_ptr, 0);
omega_session_get_segment(session_ptr, segment_ptr, offset);

// detect the BOM from the first 4 bytes
const auto bom = omega_util_detect_BOM_from_memory(omega_segment_get_data(segment_ptr),
omega_segment_get_length(segment_ptr));
omega_segment_destroy(segment_ptr);
Expand Down Expand Up @@ -304,15 +307,14 @@ int omega_session_byte_frequency_profile(const omega_session_t *session_ptr,
}

int omega_session_character_counts(const omega_session_t *session_ptr, omega_character_counts_t *counts_ptr,
int64_t offset, int64_t length) {
int64_t offset, int64_t length, omega_bom_t bom) {
assert(session_ptr);
assert(counts_ptr);
assert(0 <= offset);
length = length ? length : omega_session_get_computed_file_size(session_ptr) - offset;
assert(0 <= length);
assert(offset + length <= omega_session_get_computed_file_size(session_ptr));
omega_character_counts_reset(counts_ptr);
omega_character_counts_set_BOM(counts_ptr, omega_session_detect_BOM(session_ptr));
omega_character_counts_set_BOM(omega_character_counts_reset(counts_ptr), bom);
const auto segment_ptr = omega_segment_create(std::min(length, static_cast<int64_t>(BUFSIZ)));
while (length) {
if (const auto rc = omega_session_get_segment(session_ptr, segment_ptr, offset) != 0) { return rc; }
Expand Down
56 changes: 39 additions & 17 deletions core/src/lib/utility.c
Original file line number Diff line number Diff line change
Expand Up @@ -330,12 +330,30 @@ char const *omega_util_BOM_to_string(omega_bom_t bom) {
return "UTF-32LE";
case BOM_UTF32BE:
return "UTF-32BE";
case BOM_UNKNOWN: // fall through
default:
// Should never happen
return "unknown";
}
}

/**
 * Convert a BOM name into the corresponding omega_bom_t value
 * @param str BOM name to convert ("none", "UTF-8", "UTF-16LE", "UTF-16BE", "UTF-32LE" or "UTF-32BE"), may be NULL
 * @return the matching omega_bom_t, or BOM_UNKNOWN if str is NULL or does not start with a recognized BOM name
 * @note Each comparison is limited to the length of the candidate name, so (assuming omega_util_strnicmp follows
 * strncasecmp-style semantics) characters after a recognized name are ignored
 */
omega_bom_t omega_util_string_to_BOM(char const *str) {
    // Guard against a NULL string instead of handing it to omega_util_strnicmp
    if (!str) { return BOM_UNKNOWN; }

    if (0 == omega_util_strnicmp(str, "none", 4)) { return BOM_NONE; }
    if (0 == omega_util_strnicmp(str, "UTF-8", 5)) { return BOM_UTF8; }
    if (0 == omega_util_strnicmp(str, "UTF-16LE", 8)) { return BOM_UTF16LE; }
    if (0 == omega_util_strnicmp(str, "UTF-16BE", 8)) { return BOM_UTF16BE; }
    if (0 == omega_util_strnicmp(str, "UTF-32LE", 8)) { return BOM_UTF32LE; }
    if (0 == omega_util_strnicmp(str, "UTF-32BE", 8)) { return BOM_UTF32BE; }

    // Nothing matched
    return BOM_UNKNOWN;
}

static inline int is_lead_surrogate_UTF16_(uint16_t word) {
// https://en.wikipedia.org/wiki/UTF-16#Code_points_from_U+010000_to_U+10FFFF
return word >= 0xD800 && word <= 0xDBFF ? 1 : 0;
Expand All @@ -352,49 +370,51 @@ void omega_util_count_characters(const unsigned char *data, size_t length, omega
assert(counts_ptr);

// Skip the BOM if present (the BOM is metadata, not part of the text)
const size_t bomSize = omega_util_BOM_size(counts_ptr->bom);
switch (counts_ptr->bom) {
case BOM_UTF8:
if (length >= 3 && data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF) {
data += 3;
length -= 3;
counts_ptr->bomBytes = 3;
data += bomSize;
length -= bomSize;
counts_ptr->bomBytes = bomSize;
}
break;
case BOM_UTF16LE:
if (length >= 2 && data[0] == 0xFF && data[1] == 0xFE) {
data += 2;
length -= 2;
counts_ptr->bomBytes = 2;
data += bomSize;
length -= bomSize;
counts_ptr->bomBytes = bomSize;
}
break;
case BOM_UTF16BE:
if (length >= 2 && data[0] == 0xFE && data[1] == 0xFF) {
data += 2;
length -= 2;
counts_ptr->bomBytes = 2;
data += bomSize;
length -= bomSize;
counts_ptr->bomBytes = bomSize;
}
break;
case BOM_UTF32LE:
if (length >= 4 && data[0] == 0xFF && data[1] == 0xFE && data[2] == 0x00 && data[3] == 0x00) {
data += 4;
length -= 4;
counts_ptr->bomBytes = 4;
data += bomSize;
length -= bomSize;
counts_ptr->bomBytes = bomSize;
}
break;
case BOM_UTF32BE:
if (length >= 4 && data[0] == 0x00 && data[1] == 0x00 && data[2] == 0xFE && data[3] == 0xFF) {
data += 4;
length -= 4;
counts_ptr->bomBytes = 4;
data += bomSize;
length -= bomSize;
counts_ptr->bomBytes = bomSize;
}
break;
default:
// No BOM specified, do nothing
// No actual BOM specified, do nothing
break;
}
size_t i = 0;
switch (counts_ptr->bom) {
case BOM_NONE:// fall through, assume UTF-8 if the BOM is not specified
case BOM_UNKNOWN:// fall through, assume UTF-8 if the BOM is unknown
case BOM_NONE:// fall through, assume UTF-8 if the BOM is none
case BOM_UTF8:
while (i < length) {
if ((data[i] & 0x80) == 0) {
Expand Down Expand Up @@ -532,6 +552,8 @@ const omega_byte_buffer_t *omega_util_BOM_to_buffer(omega_bom_t bom) {
return &utf32le_bom;
case BOM_UTF32BE:
return &utf32be_bom;
case BOM_NONE: // fall through
case BOM_UNKNOWN: // fall through
default:
return NULL;
}
Expand Down
Loading

0 comments on commit aed17ae

Please sign in to comment.