From 51a86b2dd80038f6f350f473ea4cf8688d3b95dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20Kr=C3=B6ll?= Date: Mon, 15 Feb 2016 16:10:52 +0100 Subject: [PATCH] Avoid generating malformed UTF-8 and replacement characters by interpolating the variable as-is if it is not a valid UTF-8 sequence (nc), Github issue #88 --- MANIFEST | 1 + lib/Text/Xslate/PP/State.pm | 4 ++-- src/Text-Xslate.xs | 32 +++++++++++++++++++++++++++++--- t/900_bugs/046_issue88.t | 23 +++++++++++++++++++++++ 4 files changed, 55 insertions(+), 5 deletions(-) create mode 100644 t/900_bugs/046_issue88.t diff --git a/MANIFEST b/MANIFEST index d70c6b4e..85a812f4 100644 --- a/MANIFEST +++ b/MANIFEST @@ -311,6 +311,7 @@ t/900_bugs/042_perl59_issue.t t/900_bugs/043_issue107.t t/900_bugs/044_empty_result.t t/900_bugs/045_issue130.t +t/900_bugs/046_issue88.t t/900_bugs/issue79/tmpl/contentA.tt t/900_bugs/issue79/tmpl/contentB.tt t/900_bugs/issue79/tmpl/wrapperA.tt diff --git a/lib/Text/Xslate/PP/State.pm b/lib/Text/Xslate/PP/State.pm index 43d4b609..7f797961 100644 --- a/lib/Text/Xslate/PP/State.pm +++ b/lib/Text/Xslate/PP/State.pm @@ -177,7 +177,7 @@ sub print { if(defined ${$sv}) { $st->{output} .= (utf8::is_utf8($st->{output}) && !utf8::is_utf8(${$sv})) - ? $st->encoding->decode(${$sv}) + ? eval {$st->encoding->decode(${$sv}, Encode::FB_CROAK())} || ${$sv} : ${$sv}; } else { @@ -188,7 +188,7 @@ sub print { $sv =~ s/($Text::Xslate::PP::html_metachars)/$Text::Xslate::PP::html_escape{$1}/xmsgeo; $st->{output} .= (utf8::is_utf8($st->{output}) && !utf8::is_utf8($sv)) - ? $st->encoding->decode($sv) + ? 
eval {$st->encoding->decode($sv, Encode::FB_CROAK())} || $sv : $sv; } else { diff --git a/src/Text-Xslate.xs b/src/Text-Xslate.xs index 6fc2e114..159f4472 100644 --- a/src/Text-Xslate.xs +++ b/src/Text-Xslate.xs @@ -541,19 +541,39 @@ tx_unmark_raw(pTHX_ SV* const str) { /* does sv_catsv_nomg(dest, src), but significantly faster */ STATIC_INLINE void tx_sv_cat(pTHX_ SV* const dest, SV* const src) { + STRLEN len; + const char* pv = SvPV_const(src, len); + if(!SvUTF8(dest) && SvUTF8(src)) { sv_utf8_upgrade(dest); } - { - STRLEN len; - const char* const pv = SvPV_const(src, len); + if(SvUTF8(dest) == SvUTF8(src) + || is_utf8_string((const U8 *)pv, len)) { STRLEN const dest_cur = SvCUR(dest); char* const d = SvGROW(dest, dest_cur + len + 1 /* count '\0' */); SvCUR_set(dest, dest_cur + len); Copy(pv, d + dest_cur, len + 1 /* copy '\0' */, char); } + else { + STRLEN const dest_cur = SvCUR(dest); + /* Longest UTF-8 representation of each char is 2 octets. */ + char* const d_start = SvGROW(dest, dest_cur + 2 * len + 1 /* count '\0' */); + char* d = d_start + dest_cur; + + while(len--) { + const U8 c = *pv++; + if (UTF8_IS_INVARIANT(c)) { + *(d++) = c; + } else { + *(d++) = UTF8_EIGHT_BIT_HI(c); + *(d++) = UTF8_EIGHT_BIT_LO(c); + } + } + *d = '\0'; + SvCUR_set(dest, d - d_start); + } } static void /* doesn't care about raw-ness */ @@ -563,6 +583,8 @@ tx_sv_cat_with_html_escape_force(pTHX_ SV* const dest, SV* const src) { const char* const end = cur + len; STRLEN const dest_cur = SvCUR(dest); char* d; + const U32 upgrade_on_copy = SvUTF8(dest) && !SvUTF8(src) + && !is_utf8_string((const U8 *)cur, len); (void)SvGROW(dest, dest_cur + ( len * ( sizeof("&quot;") - 1) ) + 1); if(!SvUTF8(dest) && SvUTF8(src)) { @@ -595,6 +617,10 @@ tx_sv_cat_with_html_escape_force(pTHX_ SV* const dest, SV* const src) { // CopyToken("&apos;", d); CopyToken("&#39;", d); } + else if (upgrade_on_copy && !UTF8_IS_INVARIANT(c)) { + *(d++) = UTF8_EIGHT_BIT_HI(c); + *(d++) = UTF8_EIGHT_BIT_LO(c); + } else { *(d++) = c; 
} diff --git a/t/900_bugs/046_issue88.t b/t/900_bugs/046_issue88.t new file mode 100644 index 00000000..cbcc0547 --- /dev/null +++ b/t/900_bugs/046_issue88.t @@ -0,0 +1,23 @@ +#!perl +# https://github.com/xslate/p5-Text-Xslate/issues/88 +use strict; +use warnings; +use Test::More; + +use utf8; +use Text::Xslate 'mark_raw'; +my $xslate = Text::Xslate->new(); + +is $xslate->render_string('<: $string :>', {string => "Ä"}) => 'Ä'; +is $xslate->render_string('<: $string :>', {string => "\x{c4}"}) => 'Ä'; + +is $xslate->render_string('あ<: $string :>', {string => "Ä"}) => 'あÄ'; +is $xslate->render_string('あ<: $string :>', {string => "\x{c4}"}) => 'あÄ'; + +is $xslate->render_string('<: $string :>', {string => mark_raw("Ä")}) => 'Ä'; +is $xslate->render_string('<: $string :>', {string => mark_raw("\x{c4}")}) => 'Ä'; + +is $xslate->render_string('あ<: $string :>', {string => mark_raw("Ä")}) => 'あÄ'; +is $xslate->render_string('あ<: $string :>', {string => mark_raw("\x{c4}")}) => 'あÄ'; + +done_testing();