Skip to content

Commit

Permalink
Add UTF-16 LE I/O routines
Browse files Browse the repository at this point in the history
These are intended to replace UTF-16 uses of mbstowcs() / wcstombs()
  • Loading branch information
matt335672 committed Sep 20, 2023
1 parent 79efda9 commit 71fa683
Show file tree
Hide file tree
Showing 7 changed files with 705 additions and 0 deletions.
258 changes: 258 additions & 0 deletions common/parse.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,47 @@
#include "arch.h"
#include "parse.h"
#include "log.h"
#include "string_calls.h"
#include "unicode_defines.h"

/******************************************************************************/

/*
 * out_uint16_le_unchecked(s, v)
 *
 * Stores the low 16 bits of v at the stream's current position in
 * little-endian order, then advances the position by two octets.
 * No check is made that the stream has room for the two octets.
 */
#if !defined(B_ENDIAN) && !defined(NEED_ALIGN)
/* Neither B_ENDIAN nor NEED_ALIGN is defined: store the word directly */
#define out_uint16_le_unchecked(s, v) do \
{ \
    *((unsigned short*)((s)->p)) = (unsigned short)(v); \
    (s)->p += 2; \
} while (0)
#else
/* Octet-at-a-time version: low octet first, then high octet */
#define out_uint16_le_unchecked(s, v) do \
{ \
    (s)->p[0] = (unsigned char)((v) >> 0); \
    (s)->p[1] = (unsigned char)((v) >> 8); \
    (s)->p += 2; \
} while (0)
#endif

/******************************************************************************/
/*
 * in_uint16_le_unchecked(s, v)
 *
 * Loads a little-endian 16-bit word from the stream's current position
 * into v, then advances the position by two octets. No check is made
 * that two octets are actually available.
 */
#if !defined(B_ENDIAN) && !defined(NEED_ALIGN)
/* Neither B_ENDIAN nor NEED_ALIGN is defined: load the word directly */
#define in_uint16_le_unchecked(s, v) do \
{ \
    (v) = *((unsigned short*)((s)->p)); \
    (s)->p += 2; \
} while (0)
#else
/* Octet-at-a-time version: the first octet is the least significant */
#define in_uint16_le_unchecked(s, v) do \
{ \
    (v) = (unsigned short) \
          ( \
              (((unsigned char*)((s)->p))[0] << 0) | \
              (((unsigned char*)((s)->p))[1] << 8) \
          ); \
    (s)->p += 2; \
} while (0)
#endif

/******************************************************************************/
void
parser_stream_overflow_check(const struct stream *s, int n, int is_out,
const char *file, int line)
Expand Down Expand Up @@ -64,3 +104,221 @@ parser_stream_overflow_check(const struct stream *s, int n, int is_out,
}
}
}

/******************************************************************************/
void
out_utf8_as_utf16_le_proc(struct stream *s, const char *v,
                          unsigned int vn,
                          const char *file, int line)
{
    /* Development builds only: equivalent of S_CHECK_REM_OUT() for the
     * complete UTF-16 output, but reporting the caller's file and line
     * rather than this one */
#ifdef USE_DEVEL_STREAMCHECK
    int required = utf8_as_utf16_word_count(v, vn) * 2;
    parser_stream_overflow_check(s, required, 1, file, line);
#endif

    /* v and vn are passed by address so utf8_get_next_char() can step
     * through the input; the loop finishes when no input is left */
    while (vn > 0)
    {
        char32_t ch = utf8_get_next_char(&v, &vn);

        if (ch >= 0x10000)
        {
            /* Does not fit in one 16-bit word. Emit a surrogate pair,
             * high surrogate first */
            char16_t high = HIGH_SURROGATE_FROM_C32(ch);
            char16_t low = LOW_SURROGATE_FROM_C32(ch);
            out_uint16_le_unchecked(s, high);
            out_uint16_le_unchecked(s, low);
        }
        else
        {
            /* Fits in a single 16-bit word */
            char16_t unit = (char16_t)ch;
            out_uint16_le_unchecked(s, unit);
        }
    }
}

/******************************************************************************/
/**
 * Gets the next Unicode character from a code stream
 * @param s Stream
 * @return Unicode character
 *
 * Non-characters and illegally coded characters are mapped to
 * UTF_REPLACEMENT_CHARACTER
 *
 * One 16-bit word is always consumed. A second word is consumed only
 * when the first is a high surrogate and the second is a low surrogate.
 * A high surrogate which is not followed by a low surrogate (or which is
 * the last word in the stream) yields UTF_REPLACEMENT_CHARACTER, and the
 * following word is left in the stream for the next call.
 *
 * @pre Two bytes are assumed to be available on the stream on entry
 */
static char32_t
get_c32_from_stream(struct stream *s)
{
    char32_t c32 = UTF_REPLACEMENT_CHARACTER; // Assume failure
    char16_t w;

    in_uint16_le_unchecked(s, w);

    if (IS_HIGH_SURROGATE(w))
    {
        /* The second half of the pair may not be present in the stream */
        if (s_check_rem(s, 2))
        {
            char16_t low;
            in_uint16_le_unchecked(s, low);
            if (IS_LOW_SURROGATE(low))
            {
                /* Valid surrogate pair */
                char32_t v = C32_FROM_SURROGATE_PAIR(low, w);

                /* Ignore some values which can be successfully encoded
                 * in this way.
                 *
                 * The test must be made on the decoded value 'v'. At
                 * this point 'c32' still holds the failure default, so
                 * testing it would never examine the character read
                 * from the stream */
                if (!IS_PLANE_END_NON_CHARACTER(v))
                {
                    c32 = v;
                }
            }
            else
            {
                /* Invalid low surrogate - pop character back so it is
                 * parsed in its own right by the next call */
                s->p -= 2;
            }
        }
    }
    else if (!IS_LOW_SURROGATE(w) &&
             !IS_PLANE_END_NON_CHARACTER(w) &&
             !IS_ARABIC_NON_CHARACTER(w))
    {
        /* Character from the Basic Multilingual Plane */
        c32 = (char32_t)w;
    }

    return c32;
}

/******************************************************************************/
/*
 * Reads up to n UTF-16 LE words from the stream and converts them to
 * UTF-8 in v (capacity vn bytes).
 *
 * Reading stops after n words, or at the end of the stream if that comes
 * first. Each character is obtained from get_c32_from_stream().
 *
 * A character is only stored if it fits in v with room left for a
 * terminator. Once one character has been dropped no later character is
 * stored, but the stream is still read to the limit so the return value
 * covers the whole input.
 *
 * Returns the number of bytes the full UTF-8 result needs, including one
 * for the terminator. The terminator is written when vn > 0. Calling
 * with v == NULL and vn == 0 therefore stores nothing and just returns
 * the size.
 *
 * file and line identify the caller, and are only used for the overflow
 * check made when USE_DEVEL_STREAMCHECK is defined.
 */
unsigned int
in_utf16_le_fixed_as_utf8_proc(struct stream *s, unsigned int n,
                               char *v, unsigned int vn,
                               const char *file, int line)
{
    unsigned int rv = 0;
    char32_t c32;
    char u8str[MAXLEN_UTF8_CHAR];
    unsigned int u8len;
    char *saved_s_end = s->end;

    // Expansion of S_CHECK_REM(s, n*2) using passed-in file and line
#ifdef USE_DEVEL_STREAMCHECK
    parser_stream_overflow_check(s, n * 2, 0, file, line);
#endif
    // Temporarily set the stream end pointer to allow us to use
    // s_check_rem() when reading in UTF-16 words
    //
    // The comparison is made in units of words rather than bytes so that
    // 'n * 2' is never evaluated as an unsigned int. A large n could
    // otherwise wrap (or go negative when cast to int) and place the
    // temporary end pointer in the wrong place. If fewer than n whole
    // words remain, the end pointer is left alone and the stream end
    // is the limit.
    if (s->end > s->p && (unsigned long)(s->end - s->p) / 2 >= n)
    {
        // Pointer arithmetic, so no integer multiplication is needed
        s->end = s->p + n + n;
    }

    while (s_check_rem(s, 2))
    {
        c32 = get_c32_from_stream(s);

        u8len = utf_char32_to_utf8(c32, u8str);
        if (u8len + 1 <= vn)
        {
            /* Room for this character and a terminator. Add the character */
            unsigned int i;
            for (i = 0 ; i < u8len ; ++i)
            {
                v[i] = u8str[i];
            }
            vn -= u8len;
            v += u8len;
        }
        else if (vn > 1)
        {
            /* We've skipped a character, but there's more than one byte
             * remaining in the output buffer. Mark the output buffer as
             * full so we don't get a smaller character being squeezed into
             * the remaining space */
            vn = 1;
        }

        /* Counted whether or not the character was stored */
        rv += u8len;
    }

    // Restore stream to full length
    s->end = saved_s_end;

    if (vn > 0)
    {
        *v = '\0';
    }
    ++rv; // Account for the terminator
    return rv;
}

/******************************************************************************/
/*
 * Dry run of in_utf16_le_fixed_as_utf8(): returns the size it reports
 * for n words of input, and leaves the stream position where it was.
 */
unsigned int
in_utf16_le_fixed_as_utf8_length(struct stream *s, unsigned int n)
{
    /* Note the starting position so it can be put back afterwards */
    char *const start = s->p;
    unsigned int needed;

    /* A NULL buffer of size zero means nothing is stored */
    needed = in_utf16_le_fixed_as_utf8(s, n, NULL, 0);

    s->p = start;
    return needed;
}

/******************************************************************************/
/*
 * Reads UTF-16 LE words from the stream until a zero character has been
 * read or fewer than two bytes remain, converting the characters to
 * UTF-8 in v (capacity vn bytes).
 *
 * Returns the number of bytes the full UTF-8 result needs, including
 * one for the terminator. The terminator is written when vn > 0.
 */
unsigned int
in_utf16_le_terminated_as_utf8(struct stream *s,
                               char *v, unsigned int vn)
{
    unsigned int total = 1; /* Starts at one to count the terminator */
    char encoded[MAXLEN_UTF8_CHAR];

    while (s_check_rem(s, 2))
    {
        unsigned int len;
        char32_t ch = get_c32_from_stream(s);

        if (ch == 0)
        {
            /* Terminator read from the stream - nothing more to do */
            break;
        }

        len = utf_char32_to_utf8(ch, encoded);
        total += len; /* Counted whether or not the character is stored */

        if (vn >= len + 1)
        {
            /* The character fits, with a byte to spare for the
             * terminator */
            unsigned int i;
            for (i = 0 ; i < len ; ++i)
            {
                *v++ = encoded[i];
            }
            vn -= len;
        }
        else
        {
            /* The character is dropped. If spare bytes remain, give them
             * up (keeping just the terminator's) so that a shorter
             * character later in the input cannot be stored after the
             * gap */
            if (vn > 1)
            {
                vn = 1;
            }
        }
    }

    if (vn > 0)
    {
        *v = '\0';
    }

    return total;
}

/******************************************************************************/
/*
 * Dry run of in_utf16_le_terminated_as_utf8(): returns the size it
 * reports, and leaves the stream position where it was.
 */
unsigned int
in_utf16_le_terminated_as_utf8_length(struct stream *s)
{
    /* Note the starting position so it can be put back afterwards */
    char *const start = s->p;
    unsigned int needed;

    /* A NULL buffer of size zero means nothing is stored */
    needed = in_utf16_le_terminated_as_utf8(s, NULL, 0);

    s->p = start;
    return needed;
}
96 changes: 96 additions & 0 deletions common/parse.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,102 @@ parser_stream_overflow_check(const struct stream *s, int n, int is_out,
# define S_CHECK_REM_OUT(s,n)
#endif

/******************************************************************************/
/**
 * Copies a UTF-8 string to a stream as little-endian UTF-16
 *
 * @param s Stream
 * @param v UTF-8 string
 * @param vn Length of UTF-8 string.
 * @param file Caller location (from __FILE__)
 * @param line Caller location (from __LINE__)
 *
 * Characters below 0x10000 are written as a single 16-bit word. Other
 * characters are written as a surrogate pair, high surrogate first.
 *
 * Caller is expected to check there is room for the result in s. The
 * words are written without a space check; file and line are only used
 * for the overflow check made when USE_DEVEL_STREAMCHECK is defined.
 */
void
out_utf8_as_utf16_le_proc(struct stream *s, const char *v,
unsigned int vn,
const char *file, int line);

/* Calls out_utf8_as_utf16_le_proc(), supplying the location of the call
 * as the file and line arguments */
#define out_utf8_as_utf16_le(s,v,vn) \
out_utf8_as_utf16_le_proc((s), (v), (vn), __FILE__, __LINE__)


/******************************************************************************/
/**
 * Copies a fixed-size little-endian UTF-16 string from a stream as UTF-8
 *
 * @param s Stream
 * @param n Number of 16-bit words to copy
 * @param v Pointer to result buffer
 * @param vn Max size of result buffer
 * @param file Caller location (from __FILE__)
 * @param line Caller location (from __LINE__)
 *
 * @return number of bytes which would be written to v if it were large
 *         enough, INCLUDING an additional terminator. This can be used
 *         to check for a buffer overflow. A terminator is added whether
 *         or not the input includes one.
 *
 * Output is NUL-terminated whenever vn is greater than zero. When vn is
 * zero nothing at all is written to v.
 * Input is not checked for NULs - these are copied verbatim.
 * Reading stops early if the end of the stream is reached.
 *
 * A character is only stored if it fits in v along with a terminator.
 * Once a character has been left out, no later characters are stored,
 * although they are still read and counted in the return value.
 *
 * file and line are only used for the overflow check made when
 * USE_DEVEL_STREAMCHECK is defined.
 */
unsigned int
in_utf16_le_fixed_as_utf8_proc(struct stream *s, unsigned int n,
char *v, unsigned int vn,
const char *file, int line);

/* Calls in_utf16_le_fixed_as_utf8_proc(), supplying the location of the
 * call as the file and line arguments */
#define in_utf16_le_fixed_as_utf8(s,n,v,vn) \
in_utf16_le_fixed_as_utf8_proc((s), (n), (v), (vn), __FILE__, __LINE__)

/******************************************************************************/
/**
 * Returns the size of the buffer needed to store a fixed-size
 * little-endian UTF-16 string in a stream as a UTF-8 string
 *
 * @param s Stream
 * @param n Number of 16-bit words to consider
 * @return number of bytes needed to store the UTF-8 string. This
 *         includes a terminator, which is written whether the parsed
 *         string includes one or not.
 * @post Stream position is not moved between start and end of this call
 *
 * This is in_utf16_le_fixed_as_utf8() called with a NULL buffer of size
 * zero, with the stream position saved beforehand and restored
 * afterwards.
 */
unsigned int
in_utf16_le_fixed_as_utf8_length(struct stream *s, unsigned int n);

/******************************************************************************/
/**
 * Copies a terminated little-endian UTF-16 string from a stream as UTF-8
 *
 * @param s Stream
 * @param v Pointer to result buffer
 * @param vn Max size of result buffer
 *
 * @return number of bytes which would be written to v if it were large
 *         enough, INCLUDING the terminator. This can be used to check
 *         for a buffer overflow.
 *
 * Output is NUL-terminated whenever vn is greater than zero. When vn is
 * zero nothing at all is written to v.
 * Input processing stops when a NUL is encountered, or the end of the buffer
 * is reached. A NUL which stops processing has been read from the
 * stream, so the stream position is left after it.
 *
 * A character is only stored if it fits in v along with a terminator.
 * Once a character has been left out, no later characters are stored,
 * although they are still read and counted in the return value.
 */
unsigned int
in_utf16_le_terminated_as_utf8(struct stream *s,
char *v, unsigned int vn);

/******************************************************************************/
/**
 * Returns the size of the buffer needed to store a terminated
 * little-endian UTF-16 string in a stream as a terminated UTF-8 string
 *
 * @param s Stream
 * @return number of bytes needed to store the UTF-8 string,
 *         including the terminator
 * @post Stream position is not moved between start and end of this call
 *
 * Input processing stops when a NUL is encountered, or the end of the buffer
 * is reached.
 *
 * This is in_utf16_le_terminated_as_utf8() called with a NULL buffer of
 * size zero, with the stream position saved beforehand and restored
 * afterwards.
 */
unsigned int
in_utf16_le_terminated_as_utf8_length(struct stream *s);

/******************************************************************************/
/* Evaluates to non-zero if advancing the stream's current position (p)
 * by n would not take it past the stream's end pointer. Note that s is
 * evaluated twice. */
#define s_check_rem(s, n) ((s)->p + (n) <= (s)->end)

Expand Down
Loading

0 comments on commit 71fa683

Please sign in to comment.