Skip to content

Commit

Permalink
Add new function bytes_to_utf8_free_me
Browse files Browse the repository at this point in the history
This is like bytes_to_utf8, but if the representation of the input
string is the same in UTF-8 as it is in native format, the allocation of
new memory is skipped.

This presents optimization possibilities.
  • Loading branch information
khwilliamson committed Dec 5, 2024
1 parent fcc9d7e commit df1151c
Show file tree
Hide file tree
Showing 5 changed files with 51 additions and 19 deletions.
6 changes: 5 additions & 1 deletion embed.fnc
Original file line number Diff line number Diff line change
Expand Up @@ -794,8 +794,12 @@ Adp |int |bytes_cmp_utf8 |NN const U8 *b \
Adp |U8 * |bytes_from_utf8|NN const U8 *s \
|NN STRLEN *lenp \
|NN bool *is_utf8p
Adp |U8 * |bytes_to_utf8 |NN const U8 *s \
Admp |U8 * |bytes_to_utf8 |NN const U8 *s \
|NN STRLEN *lenp
Adp |U8 * |bytes_to_utf8_free_me \
|NN const U8 *s \
|NN STRLEN *lenp \
|NULLOK const U8 **free_me
AOdp |SSize_t|call_argv |NN const char *sub_name \
|I32 flags \
|NN char **argv
Expand Down
3 changes: 2 additions & 1 deletion embed.h
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,8 @@
# define block_start(a) Perl_block_start(aTHX_ a)
# define bytes_cmp_utf8(a,b,c,d) Perl_bytes_cmp_utf8(aTHX_ a,b,c,d)
# define bytes_from_utf8(a,b,c) Perl_bytes_from_utf8(aTHX_ a,b,c)
# define bytes_to_utf8(a,b) Perl_bytes_to_utf8(aTHX_ a,b)
# define bytes_to_utf8(a,b) Perl_bytes_to_utf8(aTHX,a,b)
# define bytes_to_utf8_free_me(a,b,c) Perl_bytes_to_utf8_free_me(aTHX_ a,b,c)
# define c9strict_utf8_to_uv Perl_c9strict_utf8_to_uv
# define call_argv(a,b,c) Perl_call_argv(aTHX_ a,b,c)
# define call_atexit(a,b) Perl_call_atexit(aTHX_ a,b)
Expand Down
7 changes: 5 additions & 2 deletions proto.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

53 changes: 38 additions & 15 deletions utf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -3182,20 +3182,32 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p)
}

/*
=for apidoc bytes_to_utf8
Converts a string C<s> of length C<*lenp> bytes from the native encoding into
UTF-8.
Returns a pointer to the newly-created string, and sets C<*lenp> to
reflect the new length in bytes. The caller is responsible for arranging for
the memory used by this string to get freed.
=for apidoc bytes_to_utf8
=for apidoc_item bytes_to_utf8_free_me
These each convert a string C<s> of length C<*lenp> bytes from the native
encoding into UTF-8 (UTF-EBCDIC on EBCDIC platforms), returning a pointer to
the UTF-8 string, and setting C<*lenp> to its length in bytes, while making
sure that the string is terminated by a C<NUL> character.
They differ in that C<bytes_to_utf8_free_me> takes an extra parameter
C<free_me>. If that parameter is NULL, this function behaves identically to
C<bytes_to_utf8>. But if not NULL, the function skips allocating new memory if
the input string already is C<NUL>-terminated, and its UTF-8 representation is
the same as its native representation. In other words it returns the input
string if converting the string would be a no-op. It sets C<*free_me> to NULL
in that case. Otherwise C<*free_me> is set to the address of the newly
allocated memory. Note that in both cases, you can pass C<*free_me> to
C<L</Safefree>> and it will do the right thing.
Note that when new memory is allocated, the caller is responsible for arranging
for that memory to get freed. (This is transparent to the caller if
C<Safefree> is called with C<free_me>.)
Upon return, the number of variants in the string can be computed by
having saved the value of C<*lenp> before the call, and subtracting it from the
after-call value of C<*lenp>.
A C<NUL> character will be written after the end of the string.
If you want to convert to UTF-8 from encodings other than
the native (Latin1 or EBCDIC),
see L</sv_recode_to_utf8>().
Expand All @@ -3204,17 +3216,24 @@ see L</sv_recode_to_utf8>().
*/

U8*
Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
Perl_bytes_to_utf8_free_me(pTHX_ const U8 *s, Size_t *lenp,
const U8 ** free_me_ptr)
{
const U8 * const send = s + (*lenp);
PERL_ARGS_ASSERT_BYTES_TO_UTF8_FREE_ME;
PERL_UNUSED_CONTEXT;

const U8 * const send = s + *lenp;
Size_t variant_count = variant_under_utf8_count(s, send);
if (free_me_ptr != NULL && variant_count == 0 && s[*lenp-1] == '\0') {
*free_me_ptr = NULL;
return (U8 *) s;
}

U8 *d;
U8 *dst;

PERL_ARGS_ASSERT_BYTES_TO_UTF8;
PERL_UNUSED_CONTEXT;

/* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
Newx(d, (*lenp) + variant_under_utf8_count(s, send) + 1, U8);
Newx(d, (*lenp) + variant_count + 1, U8);
dst = d;

while (s < send) {
Expand All @@ -3225,6 +3244,10 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
*d = '\0';
*lenp = d-dst;

if (free_me_ptr != NULL) {
*free_me_ptr = dst;
}

return dst;
}

Expand Down
1 change: 1 addition & 0 deletions utf8.h
Original file line number Diff line number Diff line change
Expand Up @@ -1328,6 +1328,7 @@ point's representation.

#define Perl_is_utf8_char_buf(buf, buf_end) isUTF8_CHAR(buf, buf_end)

/* Perl_bytes_to_utf8 is a macro wrapper: it forwards 's' and 'lenp' to
 * Perl_bytes_to_utf8_free_me with NULL as the final (free_me) argument.
 * The mTHX parameter is not used in the expansion, which supplies aTHX_
 * itself. */
#define Perl_bytes_to_utf8(mTHX, s, lenp) Perl_bytes_to_utf8_free_me(aTHX_ s, lenp, NULL)
typedef enum {
PL_utf8_to_bytes_overwrite = 0,
PL_utf8_to_bytes_new_memory,
Expand Down

0 comments on commit df1151c

Please sign in to comment.