Perl · khwilliamson · Jan 29, 2025 · Jan 29, 2025 · Jan 30, 2025 · Jan 30, 2025
diff --git a/locale.c b/locale.c
@@ -9609,6 +9609,8 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
                          bool utf8      /* Is the input in UTF-8? */
                    )
 {
+    PERL_ARGS_ASSERT_MEM_COLLXFRM_;
+
     /* mem_collxfrm_() is like strxfrm() but with two important differences.
      * First, it handles embedded NULs. Second, it allocates a bit more memory
      * than needed for the transformed data itself.  The real transformed data
@@ -9637,11 +9639,18 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
     locale_t constructed_locale = (locale_t) 0;
 #  endif
 
-    PERL_ARGS_ASSERT_MEM_COLLXFRM_;
-
     /* Must be NUL-terminated */
     assert(*(input_string + len) == '\0');
 
+    /* We may have to allocate memory to hold modified versions of the input.
+     * Initialize to NULL here, and before any return, free them all.  Those
+     * that do get allocated will be non-NULL then, and get freed */
+    char * sans_nuls = NULL;       /* NULs changed to lowest collating ctrl */
+    char * sans_highs = NULL;   /* >0xFF changed to highest collating byte
+                                      for non-UTF8 locales */
+    void * free_me = NULL;  /* some called functions may allocate memory that
+                               this function then is required to free */
+
     if (PL_collxfrm_mult == 0) {     /* unknown or bad */
         if (PL_collxfrm_base != 0) { /* bad collation => skip */
             DEBUG_L(PerlIO_printf(Perl_debug_log,
@@ -9659,10 +9668,10 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
      * otherwise contain that character, but otherwise there may be
      * less-than-perfect results with that character and NUL.  This is
      * unavoidable unless we replace strxfrm with our own implementation. */
+
     if (UNLIKELY(s_strlen < len)) {   /* Only execute if there is an embedded
                                          NUL */
         char * e = s + len;
-        char * sans_nuls;
         STRLEN sans_nuls_len;
         int try_non_controls;
         char this_replacement_char[] = "?\0";   /* Room for a two-byte string,
@@ -9825,148 +9834,139 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
         len = strlen(s);
     } /* End of replacing NULs */
 
-    /* Make sure the UTF8ness of the string and locale match */
-    if (utf8 != PL_in_utf8_COLLATE_locale) {
-        /* XXX convert above Unicode to 10FFFF? */
-        const char * const t = s;   /* Temporary so we can later find where the
-                                       input was */
+    if (! utf8) {
 
-        /* Here they don't match.  Change the string's to be what the locale is
-         * expecting */
-
-        if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */
-            s = (char *) bytes_to_utf8((const U8 *) s, &len);
+        /* When the locale is UTF-8, strxfrm() is expecting a UTF-8 string.
+         * Here, the string isn't.  Convert it to be so. */
+        if (PL_in_utf8_COLLATE_locale) {
+            s = (char *) bytes_to_utf8_free_me((const U8 *) s, &len, &free_me);
             utf8 = TRUE;
         }
-        else {   /* locale is not UTF-8; but input is; downgrade the input */
-
-            s = (char *) bytes_from_utf8((const U8 *) s, &len, &utf8);
-
-            /* If the downgrade was successful we are done, but if the input
-             * contains things that require UTF-8 to represent, have to do
-             * damage control ... */
-            if (UNLIKELY(utf8)) {
-
-                /* What we do is construct a non-UTF-8 string with
-                 *  1) the characters representable by a single byte converted
-                 *     to be so (if necessary);
-                 *  2) and the rest converted to collate the same as the
-                 *     highest collating representable character.  That makes
-                 *     them collate at the end.  This is similar to how we
-                 *     handle embedded NULs, but we use the highest collating
-                 *     code point instead of the smallest.  Like the NUL case,
-                 *     this isn't perfect, but is the best we can reasonably
-                 *     do.  Every above-255 code point will sort the same as
-                 *     the highest-sorting 0-255 code point.  If that code
-                 *     point can combine in a sequence with some other code
-                 *     points for weight calculations, us changing something to
-                 *     be it can adversely affect the results.  But in most
-                 *     cases, it should work reasonably.  And note that this is
-                 *     really an illegal situation: using code points above 255
-                 *     on a locale where only 0-255 are valid.  If two strings
-                 *     sort entirely equal, then the sort order for the
-                 *     above-255 code points will be in code point order. */
-
-                utf8 = FALSE;
-
-                /* If we haven't calculated the code point with the maximum
-                 * collating order for this locale, do so now */
-                if (! PL_strxfrm_max_cp) {
-                    int j;
-
-                    /* The current transformed string that collates the
-                     * highest (except it also includes the prefixed collation
-                     * index. */
-                    char * cur_max_x = NULL;
-
-                    /* Look through all legal code points (NUL isn't) */
-                    for (j = 1; j < 256; j++) {
-                        char * x;
-                        STRLEN x_len;
-                        char cur_source[] = { '\0', '\0' };
-
-                        /* Create a 1-char string of the current code point */
-                        cur_source[0] = (char) j;
-
-                        /* Then transform it */
-                        x = mem_collxfrm_(cur_source, 1, &x_len, FALSE);
-
-                        /* If something went wrong (which it shouldn't), just
-                         * ignore this code point */
-                        if (! x) {
-                            continue;
-                        }
 
-                        /* If this character's transformation is higher than
-                         * the current highest, this one becomes the highest */
-                        if (   cur_max_x == NULL
-                            || strGT(x         + COLLXFRM_HDR_LEN,
-                                     cur_max_x + COLLXFRM_HDR_LEN))
-                        {
-                            PL_strxfrm_max_cp = j;
-                            Safefree(cur_max_x);
-                            cur_max_x = x;
-                        }
-                        else {
-                            Safefree(x);
-                        }
+        /* We are ready to call strxfrm() */
+    }
+    else if (! PL_in_utf8_COLLATE_locale) {
+
+        /* Here, the string is UTF-8, but the locale isn't.  strxfrm() is
+         * expecting a non-UTF-8 string.  Convert the string to bytes.  If
+         * that succeeds, we are ready to call strxfrm() */
+        utf8 = FALSE;
+        if (UNLIKELY(! utf8_to_bytes_new_pv((const U8 **) &s, &len, &free_me)))
+        {
+            /* But here, it didn't succeed; have to do damage control ...
+             *
+             * What we do is construct a non-UTF-8 string with
+             *  1) the characters representable by a single byte converted to
+             *     be so (if not already);
+             *  2) and the rest converted to collate the same as the highest
+             *     collating representable character.  That makes them collate
+             *     at the end.  This is similar to how we handle embedded NULs,
+             *     but we use the highest collating code point instead of the
+             *     smallest.  Like the NUL case, this isn't perfect, but is the
+             *     best we can reasonably do.  Every above-255 code point will
+             *     sort the same as the highest-sorting 0-255 code point.  If
+             *     that code point can combine in a sequence with some other
+             *     code points for weight calculations, us changing something
+             *     to be it can adversely affect the results.  But in most
+             *     cases, it should work reasonably.  And note that this is
+             *     really an illegal situation: using code points above 255 on
+             *     a locale where only 0-255 are valid.  If two strings sort
+             *     entirely equal, then the sort order for the above-255 code
+             *     points will be in code point order.
+             *
+             * If we haven't calculated the code point with the maximum
+             * collating order for this locale, do so now */
+            if (! PL_strxfrm_max_cp) {
+                int j;
+
+                /* The current transformed string that collates the
+                 * highest (except it also includes the prefixed collation
+                 * index. */
+                char * cur_max_x = NULL;
+
+                /* Look through all legal code points (NUL isn't) */
+                for (j = 1; j < 256; j++) {
+                    char * x;
+                    STRLEN x_len;
+                    char cur_source[] = { '\0', '\0' };
+
+                    /* Create a 1-char string of the current code point */
+                    cur_source[0] = (char) j;
+
+                    /* Then transform it */
+                    x = mem_collxfrm_(cur_source, 1, &x_len, FALSE);
+
+                    /* If something went wrong (which it shouldn't), just
+                     * ignore this code point */
+                    if (! x) {
+                        continue;
                     }
 
-                    if (! cur_max_x) {
-                        DEBUG_L(PerlIO_printf(Perl_debug_log,
-                            "mem_collxfrm_: Couldn't find any character to"
-                            " replace above-Latin1 chars in locale %s with",
-                            PL_collation_name));
-                        goto bad;
+                    /* If this character's transformation is higher than
+                     * the current highest, this one becomes the highest */
+                    if (   cur_max_x == NULL
+                        || strGT(x         + COLLXFRM_HDR_LEN,
+                                 cur_max_x + COLLXFRM_HDR_LEN))
+                    {
+                        PL_strxfrm_max_cp = j;
+                        Safefree(cur_max_x);
+                        cur_max_x = x;
+                    }
+                    else {
+                        Safefree(x);
                     }
+                }
 
+                if (! cur_max_x) {
                     DEBUG_L(PerlIO_printf(Perl_debug_log,
-                            "mem_collxfrm_: highest 1-byte collating character"
-                            " in locale %s is 0x%02X\n",
-                            PL_collation_name,
-                            PL_strxfrm_max_cp));
-
-                    Safefree(cur_max_x);
+                        "mem_collxfrm_: Couldn't find any character to"
+                        " replace above-Latin1 chars in locale %s with",
+                        PL_collation_name));
+                    goto bad;
                 }
 
-                /* Here we know which legal code point collates the highest.
-                 * We are ready to construct the non-UTF-8 string.  The length
-                 * will be at least 1 byte smaller than the input string
-                 * (because we changed at least one 2-byte character into a
-                 * single byte), but that is eaten up by the trailing NUL */
-                Newx(s, len, char);
+                DEBUG_L(PerlIO_printf(Perl_debug_log,
+                        "mem_collxfrm_: highest 1-byte collating character"
+                        " in locale %s is 0x%02X\n",
+                        PL_collation_name,
+                        PL_strxfrm_max_cp));
 
-                {
-                    STRLEN i;
-                    STRLEN d= 0;
-                    char * e = (char *) t + len;
-
-                    for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
-                        U8 cur_char = t[i];
-                        if (UTF8_IS_INVARIANT(cur_char)) {
-                            s[d++] = cur_char;
-                        }
-                        else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
-                            s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
-                        }
-                        else {  /* Replace illegal cp with highest collating
-                                   one */
-                            s[d++] = PL_strxfrm_max_cp;
-                        }
-                    }
-                    s[d++] = '\0';
-                    Renew(s, d, char);   /* Free up unused space */
-                }
+                Safefree(cur_max_x);
             }
-        }
 
-        /* Here, we have constructed a modified version of the input.  It could
-         * be that we already had a modified copy before we did this version.
-         * If so, that copy is no longer needed */
-        if (t != input_string) {
-            Safefree(t);
+            /* Here we know which legal code point collates the highest.  We
+             * are ready to construct the non-UTF-8 string.  The length will be
+             * at least 1 byte smaller than the input string (because we
+             * changed at least one 2-byte character into a single byte), but
+             * that is eaten up by the trailing NUL
+             *
+             * May shrink; will never grow */
+            Newx(sans_highs, len, char);
+
+            STRLEN i;
+            STRLEN d= 0;
+            const char * const t = s;   /* Temporary so we can later find where
+                                           the input was */
+            char * e = (char *) t + len;
+            s = sans_highs;
+
+            for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
+                U8 cur_char = t[i];
+                if (UTF8_IS_INVARIANT(cur_char)) {
+                    s[d++] = cur_char;
+                }
+                else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
+                    s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
+                }
+                else {  /* Replace illegal cp's with highest collating one */
+                    s[d++] = PL_strxfrm_max_cp;
+                }
+            }
+            s[d++] = '\0';
         }
     }
+    /* else   // Here both the locale and string are UTF-8 */
+    /* XXX convert above Unicode to 10FFFF? */
 
     length_in_chars = (utf8)
                       ? utf8_length((U8 *) s, (U8 *) s + len)
@@ -9988,6 +9988,13 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
     /* Store the collation id */
     *(PERL_UINTMAX_T *)xbuf = PL_collation_ix;
 
+#  define CLEANUP_STRXFRM_COMMON                                        \
+        STMT_START {                                                    \
+            Safefree(free_me);                                          \
+            Safefree(sans_nuls);                                        \
+            Safefree(sans_highs);                                       \
+        } STMT_END
+
 #  if defined(USE_POSIX_2008_LOCALE) && defined HAS_STRXFRM_L
 #    ifdef USE_LOCALE_CTYPE
 
@@ -10004,6 +10011,7 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
         STMT_START {                                                    \
             if (constructed_locale != (locale_t) 0)                     \
                 freelocale(constructed_locale);                         \
+            CLEANUP_STRXFRM_COMMON;                                     \
         } STMT_END
 #  else
 #    define my_strxfrm(dest, src, n)  strxfrm(dest, src, n)
@@ -10012,9 +10020,12 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
     orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, PL_collation_name);
 
 #      define CLEANUP_STRXFRM                                           \
-                restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale)
+        STMT_START {                                                    \
+                restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);  \
+                CLEANUP_STRXFRM_COMMON;                                 \
+        } STMT_END
 #    else
-#      define CLEANUP_STRXFRM  NOOP
+#      define CLEANUP_STRXFRM  CLEANUP_STRXFRM_COMMON
 #    endif
 #  endif
 
@@ -10154,28 +10165,20 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
         first_time = FALSE;
     }
 
-    CLEANUP_STRXFRM;
-
     DEBUG_L(print_collxfrm_input_and_return(s, s + len, xbuf, *xlen, utf8));
+    CLEANUP_STRXFRM;
 
     /* Free up unneeded space; retain enough for trailing NUL */
     Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
 
-    if (s != input_string) {
-        Safefree(s);
-    }
-
     return xbuf;
 
   bad:
 
-    CLEANUP_STRXFRM;
     DEBUG_L(print_collxfrm_input_and_return(s, s + len, NULL, 0, utf8));
+    CLEANUP_STRXFRM;
 
     Safefree(xbuf);
-    if (s != input_string) {
-        Safefree(s);
-    }
     *xlen = 0;
 
     return NULL;