Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix leak, broken collation of UTF-8 string in non-UTF8 locale #22963

Open
wants to merge 10 commits into
base: blead
Choose a base branch
from
285 changes: 144 additions & 141 deletions locale.c
Original file line number Diff line number Diff line change
Expand Up @@ -9609,6 +9609,8 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
bool utf8 /* Is the input in UTF-8? */
)
{
PERL_ARGS_ASSERT_MEM_COLLXFRM_;

/* mem_collxfrm_() is like strxfrm() but with two important differences.
* First, it handles embedded NULs. Second, it allocates a bit more memory
* than needed for the transformed data itself. The real transformed data
Expand Down Expand Up @@ -9637,11 +9639,18 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
locale_t constructed_locale = (locale_t) 0;
# endif

PERL_ARGS_ASSERT_MEM_COLLXFRM_;

/* Must be NUL-terminated */
assert(*(input_string + len) == '\0');

/* We may have to allocate memory to hold modified versions of the input.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Link in the commit message takes me to a list of tickets, not a comment.

* Initialize to NULL here, and before any return, free them all. Those
* that do get allocated will be non-NULL then, and get freed */
char * sans_nuls = NULL; /* NULs changed to lowest collating ctrl */
char * sans_highs = NULL; /* >0xFF changed to highest collating byte
for non-UTF8 locales */
void * free_me = NULL; /* some called functions may allocate memory that
this function then is required to free */

if (PL_collxfrm_mult == 0) { /* unknown or bad */
if (PL_collxfrm_base != 0) { /* bad collation => skip */
DEBUG_L(PerlIO_printf(Perl_debug_log,
Expand All @@ -9659,10 +9668,10 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
* otherwise contain that character, but otherwise there may be
* less-than-perfect results with that character and NUL. This is
* unavoidable unless we replace strxfrm with our own implementation. */

if (UNLIKELY(s_strlen < len)) { /* Only execute if there is an embedded
NUL */
char * e = s + len;
char * sans_nuls;
STRLEN sans_nuls_len;
int try_non_controls;
char this_replacement_char[] = "?\0"; /* Room for a two-byte string,
Expand Down Expand Up @@ -9825,148 +9834,139 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
len = strlen(s);
} /* End of replacing NULs */

/* Make sure the UTF8ness of the string and locale match */
if (utf8 != PL_in_utf8_COLLATE_locale) {
/* XXX convert above Unicode to 10FFFF? */
const char * const t = s; /* Temporary so we can later find where the
input was */
if (! utf8) {

/* Here they don't match. Change the string's to be what the locale is
* expecting */

if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */
s = (char *) bytes_to_utf8((const U8 *) s, &len);
/* When the locale is UTF-8, strxfrm() is expecting a UTF-8 string.
* Here, the string isn't. Convert it to be so. */
if (PL_in_utf8_COLLATE_locale) {
s = (char *) bytes_to_utf8_free_me((const U8 *) s, &len, &free_me);
utf8 = TRUE;
}
else { /* locale is not UTF-8; but input is; downgrade the input */

s = (char *) bytes_from_utf8((const U8 *) s, &len, &utf8);

/* If the downgrade was successful we are done, but if the input
* contains things that require UTF-8 to represent, have to do
* damage control ... */
if (UNLIKELY(utf8)) {

/* What we do is construct a non-UTF-8 string with
* 1) the characters representable by a single byte converted
* to be so (if necessary);
* 2) and the rest converted to collate the same as the
* highest collating representable character. That makes
* them collate at the end. This is similar to how we
* handle embedded NULs, but we use the highest collating
* code point instead of the smallest. Like the NUL case,
* this isn't perfect, but is the best we can reasonably
* do. Every above-255 code point will sort the same as
* the highest-sorting 0-255 code point. If that code
* point can combine in a sequence with some other code
* points for weight calculations, us changing something to
* be it can adversely affect the results. But in most
* cases, it should work reasonably. And note that this is
* really an illegal situation: using code points above 255
* on a locale where only 0-255 are valid. If two strings
* sort entirely equal, then the sort order for the
* above-255 code points will be in code point order. */

utf8 = FALSE;

/* If we haven't calculated the code point with the maximum
* collating order for this locale, do so now */
if (! PL_strxfrm_max_cp) {
int j;

/* The current transformed string that collates the
* highest (except it also includes the prefixed collation
* index. */
char * cur_max_x = NULL;

/* Look through all legal code points (NUL isn't) */
for (j = 1; j < 256; j++) {
char * x;
STRLEN x_len;
char cur_source[] = { '\0', '\0' };

/* Create a 1-char string of the current code point */
cur_source[0] = (char) j;

/* Then transform it */
x = mem_collxfrm_(cur_source, 1, &x_len, FALSE);

/* If something went wrong (which it shouldn't), just
* ignore this code point */
if (! x) {
continue;
}

/* If this character's transformation is higher than
* the current highest, this one becomes the highest */
if ( cur_max_x == NULL
|| strGT(x + COLLXFRM_HDR_LEN,
cur_max_x + COLLXFRM_HDR_LEN))
{
PL_strxfrm_max_cp = j;
Safefree(cur_max_x);
cur_max_x = x;
}
else {
Safefree(x);
}
/* We are ready to call strxfrm() */
}
else if (! PL_in_utf8_COLLATE_locale) {

/* Here, the string is UTF-8, but the locale isn't. strxfrm() is
* expecting a non-UTF-8 string. Convert the string to bytes. If
* that succeeds, we are ready to call strxfrm() */
utf8 = FALSE;
if (UNLIKELY(! utf8_to_bytes_new_pv((const U8 **) &s, &len, &free_me)))
{
/* But here, it didn't succeed; have to do damage control ...
*
* What we do is construct a non-UTF-8 string with
* 1) the characters representable by a single byte converted to
* be so (if not already);
* 2) and the rest converted to collate the same as the highest
* collating representable character. That makes them collate
* at the end. This is similar to how we handle embedded NULs,
* but we use the highest collating code point instead of the
* smallest. Like the NUL case, this isn't perfect, but is the
* best we can reasonably do. Every above-255 code point will
* sort the same as the highest-sorting 0-255 code point. If
* that code point can combine in a sequence with some other
* code points for weight calculations, us changing something
* to be it can adversely affect the results. But in most
* cases, it should work reasonably. And note that this is
* really an illegal situation: using code points above 255 on
* a locale where only 0-255 are valid. If two strings sort
* entirely equal, then the sort order for the above-255 code
* points will be in code point order.
*
* If we haven't calculated the code point with the maximum
* collating order for this locale, do so now */
if (! PL_strxfrm_max_cp) {
int j;

/* The current transformed string that collates the
* highest (except it also includes the prefixed collation
* index. */
char * cur_max_x = NULL;

/* Look through all legal code points (NUL isn't) */
for (j = 1; j < 256; j++) {
char * x;
STRLEN x_len;
char cur_source[] = { '\0', '\0' };

/* Create a 1-char string of the current code point */
cur_source[0] = (char) j;

/* Then transform it */
x = mem_collxfrm_(cur_source, 1, &x_len, FALSE);

/* If something went wrong (which it shouldn't), just
* ignore this code point */
if (! x) {
continue;
}

if (! cur_max_x) {
DEBUG_L(PerlIO_printf(Perl_debug_log,
"mem_collxfrm_: Couldn't find any character to"
" replace above-Latin1 chars in locale %s with",
PL_collation_name));
goto bad;
/* If this character's transformation is higher than
* the current highest, this one becomes the highest */
if ( cur_max_x == NULL
|| strGT(x + COLLXFRM_HDR_LEN,
cur_max_x + COLLXFRM_HDR_LEN))
{
PL_strxfrm_max_cp = j;
Safefree(cur_max_x);
cur_max_x = x;
}
else {
Safefree(x);
}
}

if (! cur_max_x) {
DEBUG_L(PerlIO_printf(Perl_debug_log,
"mem_collxfrm_: highest 1-byte collating character"
" in locale %s is 0x%02X\n",
PL_collation_name,
PL_strxfrm_max_cp));

Safefree(cur_max_x);
"mem_collxfrm_: Couldn't find any character to"
" replace above-Latin1 chars in locale %s with",
PL_collation_name));
goto bad;
}

/* Here we know which legal code point collates the highest.
* We are ready to construct the non-UTF-8 string. The length
* will be at least 1 byte smaller than the input string
* (because we changed at least one 2-byte character into a
* single byte), but that is eaten up by the trailing NUL */
Newx(s, len, char);
DEBUG_L(PerlIO_printf(Perl_debug_log,
"mem_collxfrm_: highest 1-byte collating character"
" in locale %s is 0x%02X\n",
PL_collation_name,
PL_strxfrm_max_cp));

{
STRLEN i;
STRLEN d= 0;
char * e = (char *) t + len;

for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
U8 cur_char = t[i];
if (UTF8_IS_INVARIANT(cur_char)) {
s[d++] = cur_char;
}
else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
}
else { /* Replace illegal cp with highest collating
one */
s[d++] = PL_strxfrm_max_cp;
}
}
s[d++] = '\0';
Renew(s, d, char); /* Free up unused space */
}
Safefree(cur_max_x);
}
}

/* Here, we have constructed a modified version of the input. It could
* be that we already had a modified copy before we did this version.
* If so, that copy is no longer needed */
if (t != input_string) {
Safefree(t);
/* Here we know which legal code point collates the highest. We
* are ready to construct the non-UTF-8 string. The length will be
* at least 1 byte smaller than the input string (because we
* changed at least one 2-byte character into a single byte), but
* that is eaten up by the trailing NUL
*
* May shrink; will never grow */
Newx(sans_highs, len, char);

STRLEN i;
STRLEN d= 0;
const char * const t = s; /* Temporary so we can later find where
the input was */
char * e = (char *) t + len;
s = sans_highs;

for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
U8 cur_char = t[i];
if (UTF8_IS_INVARIANT(cur_char)) {
s[d++] = cur_char;
}
else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
}
else { /* Replace illegal cp's with highest collating one */
s[d++] = PL_strxfrm_max_cp;
}
}
s[d++] = '\0';
}
}
/* else // Here both the locale and string are UTF-8 */
/* XXX convert above Unicode to 10FFFF? */

length_in_chars = (utf8)
? utf8_length((U8 *) s, (U8 *) s + len)
Expand All @@ -9988,6 +9988,13 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
/* Store the collation id */
*(PERL_UINTMAX_T *)xbuf = PL_collation_ix;

# define CLEANUP_STRXFRM_COMMON \
STMT_START { \
Safefree(free_me); \
Safefree(sans_nuls); \
Safefree(sans_highs); \
} STMT_END

# if defined(USE_POSIX_2008_LOCALE) && defined HAS_STRXFRM_L
# ifdef USE_LOCALE_CTYPE

Expand All @@ -10004,6 +10011,7 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
STMT_START { \
if (constructed_locale != (locale_t) 0) \
freelocale(constructed_locale); \
CLEANUP_STRXFRM_COMMON; \
} STMT_END
# else
# define my_strxfrm(dest, src, n) strxfrm(dest, src, n)
Expand All @@ -10012,9 +10020,12 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, PL_collation_name);

# define CLEANUP_STRXFRM \
restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale)
STMT_START { \
restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale); \
CLEANUP_STRXFRM_COMMON; \
} STMT_END
# else
# define CLEANUP_STRXFRM NOOP
# define CLEANUP_STRXFRM CLEANUP_STRXFRM_COMMON
# endif
# endif

Expand Down Expand Up @@ -10154,28 +10165,20 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
first_time = FALSE;
}

CLEANUP_STRXFRM;

DEBUG_L(print_collxfrm_input_and_return(s, s + len, xbuf, *xlen, utf8));
CLEANUP_STRXFRM;

/* Free up unneeded space; retain enough for trailing NUL */
Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);

if (s != input_string) {
Safefree(s);
}

return xbuf;

bad:

CLEANUP_STRXFRM;
DEBUG_L(print_collxfrm_input_and_return(s, s + len, NULL, 0, utf8));
CLEANUP_STRXFRM;

Safefree(xbuf);
if (s != input_string) {
Safefree(s);
}
*xlen = 0;

return NULL;
Expand Down