Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve Unicode support #2794

Merged
merged 10 commits into from
Nov 2, 2023
1 change: 1 addition & 0 deletions common/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ libcommon_la_SOURCES = \
thread_calls.h \
trans.c \
trans.h \
unicode_defines.h \
$(PIXMAN_SOURCES)

libcommon_la_LIBADD = \
Expand Down
15 changes: 12 additions & 3 deletions common/arch.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,17 @@ typedef unsigned long uintptr_t;

typedef int bool_t;

// Define Unicode character types
//
// char16_t and char32_t are the types this code base uses for UTF-16
// code units and UTF-32 code points respectively. They are obtained
// from the first of these sources which is available:-
// 1) The C11 header <uchar.h>, when HAVE_UCHAR_H is defined
// 2) The <stdint.h> "least" types, when HAVE_STDINT_H is defined.
//    These are the same underlying types C11 specifies for
//    char16_t and char32_t
// 3) The exact-width names uint16_t and uint32_t.
//    NOTE(review): with neither macro defined, these names must have
//    been declared before this point - presumably by fallback typedefs
//    earlier in this header. Confirm.
#if defined(HAVE_UCHAR_H)
#include <uchar.h>
#elif defined(HAVE_STDINT_H)
typedef uint_least16_t char16_t;
typedef uint_least32_t char32_t;
#else
typedef uint16_t char16_t;
typedef uint32_t char32_t;
#endif

/* you can define L_ENDIAN or B_ENDIAN and NEED_ALIGN or NO_NEED_ALIGN
in the makefile to override */

Expand Down Expand Up @@ -134,12 +145,10 @@ typedef bool_t tbool;
typedef intptr_t tbus;
typedef intptr_t tintptr;

/* wide char, socket */
/* socket */
#if defined(_WIN32)
typedef unsigned short twchar;
typedef unsigned int tsock;
#else
typedef int twchar;
typedef int tsock;
#endif
#endif /* DEFINED_Ts */
Expand Down
14 changes: 0 additions & 14 deletions common/os_calls.c
Original file line number Diff line number Diff line change
Expand Up @@ -181,20 +181,6 @@ g_init(const char *app_name)

WSAStartup(2, &wsadata);
#endif

/* In order to get g_mbstowcs and g_wcstombs to work properly with
UTF-8 non-ASCII characters, LC_CTYPE cannot be "C" or blank.
To select UTF-8 encoding without specifying any countries/languages,
"C.UTF-8" is used but provided in few systems.

See also: https://sourceware.org/glibc/wiki/Proposals/C.UTF-8 */
char *lc_ctype;
lc_ctype = setlocale(LC_CTYPE, "C.UTF-8");
if (lc_ctype == NULL)
{
/* use en_US.UTF-8 instead if not available */
setlocale(LC_CTYPE, "en_US.UTF-8");
}
}

/*****************************************************************************/
Expand Down
258 changes: 258 additions & 0 deletions common/parse.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,47 @@
#include "arch.h"
#include "parse.h"
#include "log.h"
#include "string_calls.h"
#include "unicode_defines.h"

/******************************************************************************/

/*
 * out_uint16_le_unchecked(s, v)
 *
 * Stores the low 16 bits of 'v' at the stream's write pointer (s)->p in
 * little-endian byte order, and advances the pointer by two bytes. No
 * check is made that the stream has room for the two bytes.
 *
 * Both arguments are evaluated more than once, so they must be free of
 * side effects.
 */
#if defined(B_ENDIAN) || defined(NEED_ALIGN)
/* Portable variant: least significant byte first, one byte at a time */
#define out_uint16_le_unchecked(s, v) do \
    { \
        (s)->p[0] = (unsigned char)((v) >> 0); \
        (s)->p[1] = (unsigned char)((v) >> 8); \
        (s)->p += 2; \
    } while (0)
#else
/* Direct variant: a single 16-bit store through the write pointer */
#define out_uint16_le_unchecked(s, v) do \
    { \
        ((unsigned short *)((s)->p))[0] = (unsigned short)(v); \
        (s)->p += 2; \
    } while (0)
#endif

/******************************************************************************/
/*
 * in_uint16_le_unchecked(s, v)
 *
 * Reads a 16-bit little-endian value from the stream's read pointer
 * (s)->p into 'v', and advances the pointer by two bytes. No check is
 * made that two bytes are available on the stream.
 *
 * 's' is evaluated more than once, so it must be free of side effects.
 */
#if defined(B_ENDIAN) || defined(NEED_ALIGN)
/* Portable variant: assemble the value from two separate byte reads */
#define in_uint16_le_unchecked(s, v) do \
    { \
        (v) = (unsigned short) \
              ( \
                  (((unsigned char *)((s)->p))[0] << 0) | \
                  (((unsigned char *)((s)->p))[1] << 8) \
              ); \
        (s)->p += 2; \
    } while (0)
#else
/* Direct variant: a single 16-bit load through the read pointer */
#define in_uint16_le_unchecked(s, v) do \
    { \
        (v) = ((unsigned short *)((s)->p))[0]; \
        (s)->p += 2; \
    } while (0)
#endif

/******************************************************************************/
void
parser_stream_overflow_check(const struct stream *s, int n, int is_out,
const char *file, int line)
Expand Down Expand Up @@ -64,3 +104,221 @@ parser_stream_overflow_check(const struct stream *s, int n, int is_out,
}
}
}

/******************************************************************************/
void
out_utf8_as_utf16_le_proc(struct stream *s, const char *v,
unsigned int vn,
const char *file, int line)
{
// Expansion of S_CHECK_REM_OUT(s, <octet_count>) using passed-in
// file and line
#ifdef USE_DEVEL_STREAMCHECK
int octet_cnt = utf8_as_utf16_word_count(v, vn) * 2;
parser_stream_overflow_check(s, octet_cnt, 1, file, line);
#endif

while (vn > 0)
{
char32_t c32 = utf8_get_next_char(&v, &vn);
char16_t low;
if (c32 < 0x10000)
{
low = (char16_t)c32;
}
else
{
/* Need a surrogate pair */
low = LOW_SURROGATE_FROM_C32(c32);
char16_t high = HIGH_SURROGATE_FROM_C32(c32);
out_uint16_le_unchecked(s, high);
}
out_uint16_le_unchecked(s, low);
}
}

/******************************************************************************/
/**
 * Gets the next Unicode character from a code stream
 * @param s Stream
 * @return Unicode character
 *
 * Non-characters and illegally coded characters are mapped to
 * UCS_REPLACEMENT_CHARACTER. Specifically:-
 * - A high surrogate with no following word available on the stream
 * - A high surrogate followed by something other than a low surrogate.
 *   In this case the following word is left on the stream so it can be
 *   decoded in its own right by the next call.
 * - A low surrogate which is not preceded by a high surrogate
 * - A valid surrogate pair which decodes to a plane-end non-character
 * - A single word which is a plane-end or Arabic non-character
 *
 * @pre Two bytes are assumed to be available on the stream on entry
 */
static char32_t
get_c32_from_stream(struct stream *s)
{
    char32_t c32 = UCS_REPLACEMENT_CHARACTER; // Assume failure
    char16_t w;

    in_uint16_le_unchecked(s, w);

    if (IS_HIGH_SURROGATE(w))
    {
        /* The precondition only covers the first word, so check
         * there's a second word available before reading it */
        if (s_check_rem(s, 2))
        {
            char16_t low;
            in_uint16_le_unchecked(s, low);
            if (IS_LOW_SURROGATE(low))
            {
                /* Valid surrogate pair */
                char32_t v = C32_FROM_SURROGATE_PAIR(low, w);

                /* Ignore some values which can be successfully encoded
                 * in this way.
                 *
                 * The check must be made on the decoded value 'v'. At
                 * this point 'c32' still holds the replacement
                 * character, so checking that would never reject
                 * anything */
                if (!IS_PLANE_END_NON_CHARACTER(v))
                {
                    c32 = v;
                }
            }
            else
            {
                /* Invalid low surrogate - pop character back */
                s->p -= 2;
            }
        }
    }
    else if (!IS_LOW_SURROGATE(w) &&
             !IS_PLANE_END_NON_CHARACTER(w) &&
             !IS_ARABIC_NON_CHARACTER(w))
    {
        /* Character from the Basic Multilingual Plane */
        c32 = (char32_t)w;
    }

    return c32;
}

/******************************************************************************/
/**
 * Reads a fixed-size block of little-endian UTF-16 words from a stream,
 * converting it to a UTF-8 string
 *
 * @param s Stream to read from
 * @param n Number of 16-bit words in the block
 * @param v Output buffer. Only written to if vn is non-zero
 * @param vn Size of the output buffer in bytes
 * @param file Caller's source file (used for development stream checks)
 * @param line Caller's source line (used for development stream checks)
 * @return Number of bytes needed to hold the whole conversion, including
 *         the terminator. This can exceed vn, in which case the output
 *         has been truncated
 *
 * Output is only ever truncated between characters, and is terminated
 * whenever vn is non-zero.
 */
unsigned int
in_utf16_le_fixed_as_utf8_proc(struct stream *s, unsigned int n,
                               char *v, unsigned int vn,
                               const char *file, int line)
{
    char encoded[MAXLEN_UTF8_CHAR];
    char *const real_end = s->end;
    const int window = (int)(n * 2);
    unsigned int needed = 1; /* Always count the terminator */

    // Expansion of S_CHECK_REM(s, n*2) using passed-in file and line
#ifdef USE_DEVEL_STREAMCHECK
    parser_stream_overflow_check(s, window, 0, file, line);
#endif

    /* Narrow the stream to the fixed-size block while it is decoded, so
     * that s_check_rem() can't see past the end of the block */
    if (s->end - s->p > window)
    {
        s->end = s->p + window;
    }

    while (s_check_rem(s, 2))
    {
        const char32_t ch = get_c32_from_stream(s);
        const unsigned int len = utf_char32_to_utf8(ch, encoded);

        if (vn >= len + 1)
        {
            /* The character fits, leaving space for a terminator */
            unsigned int idx = 0;
            while (idx < len)
            {
                *v++ = encoded[idx++];
            }
            vn -= len;
        }
        else if (vn > 1)
        {
            /* The character doesn't fit. Pretend only the terminator
             * space is left, to prevent a later, shorter character from
             * being written into the gap */
            vn = 1;
        }

        needed += len;
    }

    /* Put the real end of the stream back */
    s->end = real_end;

    if (vn > 0)
    {
        *v = '\0';
    }

    return needed;
}

/******************************************************************************/
/**
 * Gets the buffer size needed to convert a fixed-size block of UTF-16
 * words on a stream to UTF-8, without consuming the block
 *
 * @param s Stream
 * @param n Number of 16-bit words in the block
 * @return Value returned by in_utf16_le_fixed_as_utf8() for the block
 *         when it is called with no output buffer
 */
unsigned int
in_utf16_le_fixed_as_utf8_length(struct stream *s, unsigned int n)
{
    char *const start = s->p;

    /* Perform the conversion with no output buffer to get the size... */
    const unsigned int needed = in_utf16_le_fixed_as_utf8(s, n, NULL, 0);

    /* ...and then put the read pointer back where it started */
    s->p = start;

    return needed;
}

/******************************************************************************/
/**
 * Reads a terminated string of little-endian UTF-16 words from a stream,
 * converting it to a UTF-8 string
 *
 * @param s Stream to read from
 * @param v Output buffer. Only written to if vn is non-zero
 * @param vn Size of the output buffer in bytes
 * @return Number of bytes needed to hold the whole conversion, including
 *         the terminator. This can exceed vn, in which case the output
 *         has been truncated
 *
 * Reading stops after a zero character has been read, or when fewer
 * than two bytes remain on the stream. Output is only ever truncated
 * between characters, and is terminated whenever vn is non-zero.
 */
unsigned int
in_utf16_le_terminated_as_utf8(struct stream *s,
                               char *v, unsigned int vn)
{
    char encoded[MAXLEN_UTF8_CHAR];
    unsigned int needed = 1; /* Always count the terminator */

    while (s_check_rem(s, 2))
    {
        const char32_t ch = get_c32_from_stream(s);
        unsigned int len;

        if (ch == 0)
        {
            break; // Terminator encountered
        }

        len = utf_char32_to_utf8(ch, encoded);

        if (vn >= len + 1)
        {
            /* The character fits, leaving space for a terminator */
            unsigned int idx = 0;
            while (idx < len)
            {
                *v++ = encoded[idx++];
            }
            vn -= len;
        }
        else if (vn > 1)
        {
            /* The character doesn't fit. Pretend only the terminator
             * space is left, to prevent a later, shorter character from
             * being written into the gap */
            vn = 1;
        }

        needed += len;
    }

    if (vn > 0)
    {
        *v = '\0';
    }

    return needed;
}

/******************************************************************************/
/**
 * Gets the buffer size needed to convert a terminated UTF-16 string on
 * a stream to UTF-8, without consuming the string
 *
 * @param s Stream
 * @return Value returned by in_utf16_le_terminated_as_utf8() for the
 *         string when it is called with no output buffer
 */
unsigned int
in_utf16_le_terminated_as_utf8_length(struct stream *s)
{
    char *const start = s->p;

    /* Perform the conversion with no output buffer to get the size... */
    const unsigned int needed = in_utf16_le_terminated_as_utf8(s, NULL, 0);

    /* ...and then put the read pointer back where it started */
    s->p = start;

    return needed;
}
Loading