Add new function bytes_to_utf8_free_me

This is like bytes_to_utf8, but if the representation of the input string is the same in UTF-8 as it is in native format, the allocation of new memory is skipped. This presents optimization possibilities.
Perl · Dec 5, 2024 · df1151c · df1151c
1 parent fcc9d7e
commit df1151c
Show file tree

Hide file tree

Showing 5 changed files with 51 additions and 19 deletions.
diff --git a/embed.fnc b/embed.fnc
@@ -794,8 +794,12 @@ Adp	|int	|bytes_cmp_utf8 |NN const U8 *b 			\
 Adp	|U8 *	|bytes_from_utf8|NN const U8 *s 			\
 				|NN STRLEN *lenp			\
 				|NN bool *is_utf8p
-Adp	|U8 *	|bytes_to_utf8	|NN const U8 *s 			\
+Admp	|U8 *	|bytes_to_utf8	|NN const U8 *s 			\
 				|NN STRLEN *lenp
+Adp	|U8 *	|bytes_to_utf8_free_me					\
+				|NN const U8 *s 			\
+				|NN STRLEN *lenp			\
+				|NULLOK const U8 **free_me
 AOdp	|SSize_t|call_argv	|NN const char *sub_name		\
 				|I32 flags				\
 				|NN char **argv

diff --git a/embed.h b/embed.h
@@ -156,7 +156,8 @@
 # define block_start(a)                         Perl_block_start(aTHX_ a)
 # define bytes_cmp_utf8(a,b,c,d)                Perl_bytes_cmp_utf8(aTHX_ a,b,c,d)
 # define bytes_from_utf8(a,b,c)                 Perl_bytes_from_utf8(aTHX_ a,b,c)
-# define bytes_to_utf8(a,b)                     Perl_bytes_to_utf8(aTHX_ a,b)
+# define bytes_to_utf8(a,b)                     Perl_bytes_to_utf8(aTHX,a,b)
+# define bytes_to_utf8_free_me(a,b,c)           Perl_bytes_to_utf8_free_me(aTHX_ a,b,c)
 # define c9strict_utf8_to_uv                    Perl_c9strict_utf8_to_uv
 # define call_argv(a,b,c)                       Perl_call_argv(aTHX_ a,b,c)
 # define call_atexit(a,b)                       Perl_call_atexit(aTHX_ a,b)

diff --git a/proto.h b/proto.h
diff --git a/utf8.c b/utf8.c
@@ -3182,20 +3182,32 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p)
 }
 
 /*
-=for apidoc bytes_to_utf8
-
-Converts a string C<s> of length C<*lenp> bytes from the native encoding into
-UTF-8.
-Returns a pointer to the newly-created string, and sets C<*lenp> to
-reflect the new length in bytes.  The caller is responsible for arranging for
-the memory used by this string to get freed.
+=for apidoc      bytes_to_utf8
+=for apidoc_item bytes_to_utf8_free_me
+
+These each convert a string C<s> of length C<*lenp> bytes from the native
+encoding into UTF-8 (UTF-EBCDIC on EBCDIC platforms), returning a pointer to
+the UTF-8 string, and setting C<*lenp> to its length in bytes, while making
+sure that the string is terminated by a C<NUL> character.
+
+They differ in that C<bytes_to_utf8_free_me> takes an extra parameter
+C<free_me>.  If that parameter is NULL, this function behaves identically to
+C<bytes_to_utf8>.  But if not NULL, the function skips allocating new memory if
+the input string already is C<NUL>-terminated, and its UTF-8 representation is
+the same as its native representation.  In other words it returns the input
+string if converting the string would be a no-op.  It sets C<*free_me> to NULL
+in that case.  Otherwise C<*free_me> is set to the address of the newly
+allocated memory.  Note that in both cases, you can pass that result to
+C<L</Safefree>> and it will do the right thing.
+
+Note that when new memory is allocated, the caller is responsible for arranging
+for that memory to get freed.  (This is transparent to the caller if
+C<Safefree> is called with C<free_me>.)
 
 Upon return, the number of variants in the string can be computed by
 having saved the value of C<*lenp> before the call, and subtracting it from the
 after-call value of C<*lenp>.
 
-A C<NUL> character will be written after the end of the string.
-
 If you want to convert to UTF-8 from encodings other than
 the native (Latin1 or EBCDIC),
 see L</sv_recode_to_utf8>().
@@ -3204,17 +3216,24 @@ see L</sv_recode_to_utf8>().
 */
 
 U8*
-Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
+Perl_bytes_to_utf8_free_me(pTHX_ const U8 *s, Size_t *lenp,
+                                 const U8 ** free_me_ptr)
 {
-    const U8 * const send = s + (*lenp);
+    PERL_ARGS_ASSERT_BYTES_TO_UTF8_FREE_ME;
+    PERL_UNUSED_CONTEXT;
+
+    const U8 * const send = s + *lenp;
+    Size_t variant_count = variant_under_utf8_count(s, send);
+    if (free_me_ptr != NULL && variant_count == 0 && s[*lenp-1] == '\0') {
+        *free_me_ptr = NULL;
+        return (U8 *) s;
+    }
+
     U8 *d;
     U8 *dst;
 
-    PERL_ARGS_ASSERT_BYTES_TO_UTF8;
-    PERL_UNUSED_CONTEXT;
-
     /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
-    Newx(d, (*lenp) + variant_under_utf8_count(s, send) + 1, U8);
+    Newx(d, (*lenp) + variant_count + 1, U8);
     dst = d;
 
     while (s < send) {
@@ -3225,6 +3244,10 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
     *d = '\0';
     *lenp = d-dst;
 
+    if (free_me_ptr != NULL) {
+        *free_me_ptr = dst;
+    }
+
     return dst;
 }
 

diff --git a/utf8.h b/utf8.h
@@ -1328,6 +1328,7 @@ point's representation.
 
 #define Perl_is_utf8_char_buf(buf, buf_end) isUTF8_CHAR(buf, buf_end)
 
+#define Perl_bytes_to_utf8(mTHX, s, lenp)  Perl_bytes_to_utf8_free_me(aTHX_ s, lenp, NULL)
 typedef enum {
     PL_utf8_to_bytes_overwrite = 0,
     PL_utf8_to_bytes_new_memory,