Add UTF-16 LE I/O routines

These are intended to replace UTF-16 uses of mbstowcs() / wcstombs()
neutrinolabs · Sep 20, 2023 · 71fa683 · 71fa683
1 parent 79efda9
commit 71fa683
Show file tree

Hide file tree

Showing 7 changed files with 705 additions and 0 deletions.
diff --git a/common/parse.c b/common/parse.c
@@ -27,7 +27,47 @@
 #include "arch.h"
 #include "parse.h"
 #include "log.h"
+#include "string_calls.h"
+#include "unicode_defines.h"
 
+/******************************************************************************/
+
+/* Stores the low 16 bits of v at s->p and advances s->p by two octets.
+ * No check is made that there is room in the stream. */
+#if defined(B_ENDIAN) || defined(NEED_ALIGN)
+/* Octet-at-a-time store, least-significant octet first: independent of
+ * host byte order and alignment */
+#define out_uint16_le_unchecked(s, v) do \
+    { \
+        (s)->p[0] = (unsigned char)((v) >> 0); \
+        (s)->p[1] = (unsigned char)((v) >> 8); \
+        (s)->p += 2; \
+    } while (0)
+#else
+/* Single 16-bit store in host byte order through a cast pointer.
+ * NOTE(review): presumably only selected on little-endian hosts which
+ * tolerate unaligned stores - confirm against arch.h */
+#define out_uint16_le_unchecked(s, v) do \
+    { \
+        ((unsigned short *)((s)->p))[0] = (unsigned short)(v); \
+        (s)->p += 2; \
+    } while (0)
+#endif
+
+/******************************************************************************/
+/* Loads a 16-bit value from s->p into v and advances s->p by two octets.
+ * No check is made that two octets remain in the stream. */
+#if defined(B_ENDIAN) || defined(NEED_ALIGN)
+/* Octet-at-a-time load, least-significant octet first: independent of
+ * host byte order and alignment */
+#define in_uint16_le_unchecked(s, v) do \
+    { \
+        (v) = (unsigned short) \
+              ( \
+                (((unsigned char *)((s)->p))[0] << 0) | \
+                (((unsigned char *)((s)->p))[1] << 8) \
+              ); \
+        (s)->p += 2; \
+    } while (0)
+#else
+/* Single 16-bit load in host byte order through a cast pointer.
+ * NOTE(review): presumably only selected on little-endian hosts which
+ * tolerate unaligned loads - confirm against arch.h */
+#define in_uint16_le_unchecked(s, v) do \
+    { \
+        (v) = ((unsigned short *)((s)->p))[0]; \
+        (s)->p += 2; \
+    } while (0)
+#endif
+
+/******************************************************************************/
 void
 parser_stream_overflow_check(const struct stream *s, int n, int is_out,
                              const char *file, int line)
@@ -64,3 +104,221 @@ parser_stream_overflow_check(const struct stream *s, int n, int is_out,
         }
     }
 }
+
+/******************************************************************************/
+/* Writes the UTF-8 string v (length vn) to stream s as UTF-16 LE words.
+ * Code points of 0x10000 and above are written as a surrogate pair, high
+ * word first. No terminator is appended. */
+void
+out_utf8_as_utf16_le_proc(struct stream *s, const char *v,
+                          unsigned int vn,
+                          const char *file, int line)
+{
+    // Expansion of S_CHECK_REM_OUT(s, <octet_count>) using passed-in
+    // file and line. Each UTF-16 word occupies two octets.
+#ifdef USE_DEVEL_STREAMCHECK
+    parser_stream_overflow_check(s, utf8_as_utf16_word_count(v, vn) * 2,
+                                 1, file, line);
+#endif
+
+    while (vn > 0)
+    {
+        /* utf8_get_next_char() moves v and vn on past the character */
+        const char32_t ch = utf8_get_next_char(&v, &vn);
+
+        if (ch >= 0x10000)
+        {
+            /* Outside the BMP - a surrogate pair is needed */
+            const char16_t lo = LOW_SURROGATE_FROM_C32(ch);
+            const char16_t hi = HIGH_SURROGATE_FROM_C32(ch);
+            out_uint16_le_unchecked(s, hi);
+            out_uint16_le_unchecked(s, lo);
+        }
+        else
+        {
+            const char16_t word = (char16_t)ch;
+            out_uint16_le_unchecked(s, word);
+        }
+    }
+}
+
+/******************************************************************************/
+/**
+ * Gets the next Unicode character from a code stream
+ * @param s Stream
+ * @return Unicode character
+ *
+ * Non-characters and illegally coded characters are mapped to
+ * UTF_REPLACEMENT_CHARACTER
+ *
+ * One UTF-16 word is always consumed. A second word is consumed only
+ * when the first is a high surrogate and the second is a low surrogate;
+ * a following word which is not a low surrogate is left on the stream.
+ *
+ * @pre Two bytes are assumed to be available on the stream on entry
+ */
+static char32_t
+get_c32_from_stream(struct stream *s)
+{
+    char32_t c32 = UTF_REPLACEMENT_CHARACTER; // Assume failure
+    char16_t w;
+
+    in_uint16_le_unchecked(s, w);
+
+    if (IS_HIGH_SURROGATE(w))
+    {
+        /* The low surrogate is optional as far as the stream length
+         * goes - a truncated pair yields the replacement character */
+        if (s_check_rem(s, 2))
+        {
+            char16_t low;
+            in_uint16_le_unchecked(s, low);
+            if (IS_LOW_SURROGATE(low))
+            {
+                /* Valid surrogate pair */
+                char32_t v = C32_FROM_SURROGATE_PAIR(low, w);
+
+                /* Ignore some values which can be successfully encoded
+                 * in this way. The decoded value v must be tested here;
+                 * c32 still holds the replacement character at this
+                 * point, so testing it would never reject anything */
+                if (!IS_PLANE_END_NON_CHARACTER(v))
+                {
+                    c32 = v;
+                }
+            }
+            else
+            {
+                /* Invalid low surrogate  - pop character back */
+                s->p -= 2;
+            }
+        }
+    }
+    else if (!IS_LOW_SURROGATE(w) &&
+             !IS_PLANE_END_NON_CHARACTER(w) &&
+             !IS_ARABIC_NON_CHARACTER(w))
+    {
+        /* Character from the Basic Multilingual Plane */
+        c32 = (char32_t)w;
+    }
+
+    return c32;
+}
+
+/******************************************************************************/
+/* Reads up to n UTF-16 LE words from s and stores them in v as UTF-8.
+ * The return value counts every UTF-8 octet produced plus one for the
+ * terminator, whether or not they all fitted into the vn octets of v. */
+unsigned int
+in_utf16_le_fixed_as_utf8_proc(struct stream *s, unsigned int n,
+                               char *v, unsigned int vn,
+                               const char *file, int line)
+{
+    char *const real_end = s->end;
+    const int window = (int)(n * 2); /* Octets occupied by n words */
+    unsigned int total = 0;
+
+    // Expansion of S_CHECK_REM(s, n*2) using passed-in file and line
+#ifdef USE_DEVEL_STREAMCHECK
+    parser_stream_overflow_check(s, window, 0, file, line);
+#endif
+    // Narrow the stream to the n-word window for the duration of the
+    // loop, so s_check_rem() stops the reader at the end of the string
+    if (real_end - s->p > window)
+    {
+        s->end = s->p + window;
+    }
+
+    while (s_check_rem(s, 2))
+    {
+        char encoded[MAXLEN_UTF8_CHAR];
+        const char32_t ch = get_c32_from_stream(s);
+        const unsigned int len = utf_char32_to_utf8(ch, encoded);
+
+        if (vn >= len + 1)
+        {
+            /* Both this character and a later terminator will fit */
+            unsigned int k = 0;
+            while (k < len)
+            {
+                *v++ = encoded[k++];
+            }
+            vn -= len;
+        }
+        else if (vn > 1)
+        {
+            /* This character was dropped. Declare the output full, so
+             * a later, shorter character cannot be slotted into the
+             * space left over */
+            vn = 1;
+        }
+
+        /* Count the character whether or not it was stored */
+        total += len;
+    }
+
+    // Put the real end-of-stream pointer back
+    s->end = real_end;
+
+    if (vn > 0)
+    {
+        *v = '\0';
+    }
+
+    /* Allow for the terminator in the result */
+    return total + 1;
+}
+
+/******************************************************************************/
+/* Measures the UTF-8 size (terminator included) of the next n UTF-16 LE
+ * words on s, leaving the stream read position where it was. */
+unsigned int
+in_utf16_le_fixed_as_utf8_length(struct stream *s, unsigned int n)
+{
+    char *const start = s->p;
+    /* With no output buffer the conversion only counts */
+    const unsigned int needed = in_utf16_le_fixed_as_utf8(s, n, NULL, 0);
+
+    s->p = start;
+    return needed;
+}
+
+/******************************************************************************/
+/* Reads UTF-16 LE words from s until a zero word or the end of the stream,
+ * storing them in v as UTF-8. The return value counts every UTF-8 octet
+ * produced plus one for the terminator, whether or not they all fitted
+ * into the vn octets of v. */
+unsigned int
+in_utf16_le_terminated_as_utf8(struct stream *s,
+                               char *v, unsigned int vn)
+{
+    unsigned int total = 0;
+
+    while (s_check_rem(s, 2))
+    {
+        char encoded[MAXLEN_UTF8_CHAR];
+        unsigned int len;
+        const char32_t ch = get_c32_from_stream(s);
+
+        if (ch == 0)
+        {
+            break;  // Terminator encountered
+        }
+
+        len = utf_char32_to_utf8(ch, encoded);
+        if (vn >= len + 1)
+        {
+            /* Both this character and a later terminator will fit */
+            unsigned int k = 0;
+            while (k < len)
+            {
+                *v++ = encoded[k++];
+            }
+            vn -= len;
+        }
+        else if (vn > 1)
+        {
+            /* This character was dropped. Declare the output full, so
+             * a later, shorter character cannot be slotted into the
+             * space left over */
+            vn = 1;
+        }
+
+        /* Count the character whether or not it was stored */
+        total += len;
+    }
+
+    if (vn > 0)
+    {
+        *v = '\0';
+    }
+
+    /* Allow for the terminator in the result */
+    return total + 1;
+}
+
+/******************************************************************************/
+/* Measures the UTF-8 size (terminator included) of the terminated UTF-16 LE
+ * string on s, leaving the stream read position where it was. */
+unsigned int
+in_utf16_le_terminated_as_utf8_length(struct stream *s)
+{
+    char *const start = s->p;
+    /* With no output buffer the conversion only counts */
+    const unsigned int needed = in_utf16_le_terminated_as_utf8(s, NULL, 0);
+
+    s->p = start;
+    return needed;
+}
diff --git a/common/parse.h b/common/parse.h
@@ -89,6 +89,102 @@ parser_stream_overflow_check(const struct stream *s, int n, int is_out,
 #   define S_CHECK_REM_OUT(s,n)
 #endif
 
+/******************************************************************************/
+/**
+ * Copies a UTF-8 string to a stream as little-endian UTF-16
+ *
+ * @param s Stream
+ * @param v UTF-8 string
+ * @param vn Length of UTF-8 string.
+ * @param file Caller location (from __FILE__)
+ * @param line Caller location (from __LINE__)
+ *
+ * Characters of 0x10000 and above are written as a surrogate pair, high
+ * word first. No terminator is added to the output.
+ *
+ * Caller is expected to check there is room for the result in s. The
+ * routine itself only checks this (reporting file and line) when built
+ * with USE_DEVEL_STREAMCHECK defined.
+ */
+void
+out_utf8_as_utf16_le_proc(struct stream *s, const char *v,
+                          unsigned int vn,
+                          const char *file, int line);
+
+/* Wrapper which supplies the location of the call site */
+#define out_utf8_as_utf16_le(s,v,vn) \
+    out_utf8_as_utf16_le_proc((s), (v), (vn), __FILE__, __LINE__)
+
+
+/******************************************************************************/
+/**
+ * Copies a fixed-size little-endian UTF-16 string from a stream as UTF-8
+ *
+ * @param s Stream
+ * @param n Number of 16-bit words to copy
+ * @param v Pointer to result buffer
+ * @param vn Max size of result buffer
+ * @param file Caller location (from __FILE__)
+ * @param line Caller location (from __LINE__)
+ *
+ * @return number of chars (UTF-8 octets) which would be written to v,
+ *         INCLUDING an additional terminator. This can be used to check
+ *         for a buffer overflow. A terminator is added whether or not
+ *         the input includes one.
+ *
+ * Output is NULL-terminated whenever vn is greater than zero.
+ * Input is not checked for NULLs - these are copied verbatim
+ *
+ * If a character does not fit in v (leaving room for the terminator), it
+ * and all following characters are omitted from v, but are still counted
+ * in the return value.
+ *
+ * Reading stops early if the stream ends before n words have been read.
+ */
+unsigned int
+in_utf16_le_fixed_as_utf8_proc(struct stream *s, unsigned int n,
+                               char *v, unsigned int vn,
+                               const char *file, int line);
+
+/* Wrapper which supplies the location of the call site */
+#define in_utf16_le_fixed_as_utf8(s,n,v,vn) \
+    in_utf16_le_fixed_as_utf8_proc((s), (n), (v), (vn), __FILE__, __LINE__)
+
+/******************************************************************************/
+/**
+ * Returns the size of the buffer needed to store a fixed-size
+ * little-endian UTF-16 string in a stream as a UTF-8 string
+ *
+ * @param s Stream
+ * @param n Number of 16-bit words to consider
+ * @return number of chars (UTF-8 octets) needed to store the UTF-8
+ *         string. This includes a terminator, which is written whether
+ *         the parsed string includes one or not.
+ * @post Stream position is not moved between start and end of this call
+ *
+ * Fewer than n words are considered if the stream ends first.
+ */
+unsigned int
+in_utf16_le_fixed_as_utf8_length(struct stream *s, unsigned int n);
+
+/******************************************************************************/
+/**
+ * Copies a terminated little-endian UTF-16 string from a stream as UTF-8
+ *
+ * @param s Stream
+ * @param v Pointer to result buffer
+ * @param vn Max size of result buffer
+ *
+ * @return number of chars (UTF-8 octets) which would be written to v,
+ *         INCLUDING the terminator. This can be used to check for a
+ *         buffer overflow.
+ *
+ * Output is NULL-terminated whenever vn is greater than zero.
+ * Input processing stops when a NULL is encountered, or the end of the buffer
+ * is reached. A NULL which stops processing is consumed from the stream.
+ *
+ * If a character does not fit in v (leaving room for the terminator), it
+ * and all following characters are omitted from v, but are still counted
+ * in the return value.
+ */
+unsigned int
+in_utf16_le_terminated_as_utf8(struct stream *s,
+                               char *v, unsigned int vn);
+
+/******************************************************************************/
+/**
+ * Returns the size of the buffer needed to store a terminated
+ * little-endian UTF-16 string in a stream as a terminated UTF-8 string
+ *
+ * @param s Stream
+ * @return number of chars (UTF-8 octets) needed to store the UTF-8
+ *         string, including the terminator
+ * @post Stream position is not moved between start and end of this call
+ *
+ * Input processing stops when a NULL is encountered, or the end of the buffer
+ * is reached.
+ */
+unsigned int
+in_utf16_le_terminated_as_utf8_length(struct stream *s);
+
 /******************************************************************************/
 #define s_check_rem(s, n) ((s)->p + (n) <= (s)->end)