Skip to content

Commit

Permalink
Simplify BOM handling in charset conversions
Browse files Browse the repository at this point in the history
  • Loading branch information
plorkyeran authored and CoffeeFlux committed Nov 24, 2023
1 parent 4e117b8 commit 8b63241
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 77 deletions.
62 changes: 6 additions & 56 deletions libaegisub/common/charset_conv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,55 +66,15 @@ namespace {
});

if (enc != std::end(pretty_names) && strcmp(enc->pretty, name) == 0)
return enc->real;
name = enc->real;
// UTF-16 and UTF-32 encode a BOM, which we don't want
if (boost::iequals(name, "utf-16"))
name = "utf-16be";
if (boost::iequals(name, "utf-32"))
name = "utf-32be";
return name;
}

// Measure the size in bytes of the BOM that the converter `cd` emits ahead of
// converted text; returns 0 if no BOM bytes are detected.
//
// Works by converting a single NUL byte into an 8-byte scratch buffer and
// inspecting what comes out.
// NOTE(review): assumes `cd(...)` follows iconv() conventions (advances the
// pointers, decrements the byte counts, returns iconv_failed on error) --
// confirm against the Iconv wrapper.
size_t get_bom_size(Iconv& cd) {
// Most (but not all) iconv implementations automatically insert a BOM
// at the beginning of text converted to UTF-8, UTF-16 and UTF-32, but
// we usually don't want this, as some of the wxString-using code
// assumes there is no BOM (as the exact encoding is known externally).
// As such, when doing conversions we will strip the BOM if it exists,
// then manually add it when writing files.

char buff[8];
// The empty literal supplies exactly one byte of input: its NUL terminator
const char* src = "";
char *dst = buff;
size_t srcLen = 1;
size_t dstLen = 8;

size_t res = cd(&src, &srcLen, &dst, &dstLen);
// These checks are only active in builds where assert() is enabled
assert(res != iconv_failed);
assert(srcLen == 0);

// Count the non-zero output bytes; the encoded NUL is expected to contribute
// only zero bytes, so anything non-zero is attributed to the BOM
size_t size = 0;
for (src = buff; src < dst; ++src) {
if (*src) ++size;
}
if (size) {
// If there is a BOM, it will always be at least as big as the NUL.
// (8 - dstLen) is the total number of bytes written (BOM plus NUL), so half
// of that is a lower bound on the BOM size; this covers BOMs that contain
// zero bytes themselves (e.g. UTF-32's 00 00 FE FF)
size = std::max(size, (8 - dstLen) / 2);
}
return size;
}

// Run a throwaway conversion through `cd` so that a forced BOM is emitted into
// a scratch buffer rather than into the caller's output.
//
// Does nothing unless bomSize > 0 and the caller supplies a non-null,
// non-zero *inbytesleft. The conversion operates on local copies of the input
// pointer and length and writes to a local buffer, so this function does not
// itself modify *inbuf, *inbytesleft, *outbuf or *outbytesleft; `outbuf` is not
// used at all. Note that `inbuf` and `outbytesleft` are dereferenced without a
// null check once the guard passes.
void eat_bom(Iconv& cd, size_t bomSize, const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) {
// If this encoding has a forced BOM (i.e. it's UTF-16 or UTF-32 without
// a specified byte order), skip over it
if (bomSize > 0 && inbytesleft && *inbytesleft) {
// libiconv marks the bom as written after writing the first
// character after the bom rather than when it writes the bom, so
// convert at least one extra character
char bom[8];
char *dst = bom;
// Never offer more than the 8 bytes the scratch buffer holds
// NOTE(review): the bomSize + *outbytesleft bound presumably mirrors how much
// the real conversion could write after the BOM -- confirm intent
size_t dstSize = std::min((size_t)8, bomSize + *outbytesleft);
const char *src = *inbuf;
size_t srcSize = *inbytesleft;
// The return value is deliberately ignored; only the converter's internal
// state change matters here
cd(&src, &srcSize, &dst, &dstSize);
}
}

// Calculate the size of NUL in the given character set
size_t nul_size(const char *encoding) {
// We need a character set to convert from with a known encoding of NUL
Expand All @@ -137,22 +97,16 @@ namespace {

#ifdef ICONV_POSIX
class ConverterImpl final : public Converter {
size_t bomSize;
Iconv cd;
public:
// subst is not used here because POSIX doesn't let you disable substitution
ConverterImpl(bool, const char* sourceEncoding, const char* destEncoding)
{
const char *dstEnc = get_real_encoding_name(destEncoding);
cd = Iconv("utf-8", dstEnc);

bomSize = get_bom_size(cd);
cd = Iconv(get_real_encoding_name(sourceEncoding), dstEnc);
}

size_t Convert(const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) {
eat_bom(cd, bomSize, inbuf, inbytesleft, outbuf, outbytesleft);

size_t res = cd(inbuf, inbytesleft, outbuf, outbytesleft);

// This loop never does anything useful with a POSIX-compliant iconv
Expand All @@ -170,7 +124,6 @@ namespace {
#else

class ConverterImpl final : public iconv_fallbacks, public Converter {
size_t bomSize;
char invalidRep[8];
size_t invalidRepSize;
Iconv cd;
Expand All @@ -197,8 +150,6 @@ namespace {
const char *dstEnc = get_real_encoding_name(destEncoding);
cd = Iconv("utf-8", dstEnc);

bomSize = get_bom_size(cd);

// Get fallback character
const char sbuff[] = "?";
const char *src = sbuff;
Expand Down Expand Up @@ -228,7 +179,6 @@ namespace {
}

size_t Convert(const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) override {
eat_bom(cd, bomSize, inbuf, inbytesleft, outbuf, outbytesleft);
size_t res = cd(inbuf, inbytesleft, outbuf, outbytesleft);

if (res == iconv_failed && errno == E2BIG && *outbytesleft == 0) {
Expand Down
1 change: 1 addition & 0 deletions libaegisub/common/line_iterator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ line_iterator_base::line_iterator_base(std::istream &stream, const char *encodin
c.Convert("\r", cr);
width = c.Convert("\n", lf);
conv = std::make_shared<agi::charset::IconvWrapper>(encoding, "utf-8");
assert(width != 0);
}
}

Expand Down
4 changes: 0 additions & 4 deletions libaegisub/include/libaegisub/charset_conv.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,6 @@
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

/// @file charset_conv.h
/// @brief Wrapper for libiconv to present a more C++-friendly API
/// @ingroup libaegisub

#pragma once

#include <memory>
Expand Down
26 changes: 9 additions & 17 deletions tests/tests/line_iterator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,34 +24,26 @@
#include <vector>

template<typename T>
void test_values(agi::line_iterator<T>& iter) {
EXPECT_EQ(iter, end(iter));
}

template<typename T, typename First, typename... Values>
void test_values(agi::line_iterator<T>& iter, First first, Values... values) {
EXPECT_FALSE(iter == end(iter));
EXPECT_EQ(*iter, first);
EXPECT_NO_THROW(++iter);
test_values(iter, values...);
}

template<typename T, typename... Values>
void test(std::string const& str, const char *encoding, Values... values) {
void test(std::string const& str, const char *encoding, std::initializer_list<T> values) {
std::stringstream ss(str);
agi::line_iterator<T> iter;
EXPECT_NO_THROW(iter = agi::line_iterator<T>(ss, encoding));
test_values(iter, values...);
for (auto&& value : values) {
ASSERT_NE(iter, end(iter));
EXPECT_EQ(*iter, value);
EXPECT_NO_THROW(++iter);
}
EXPECT_EQ(iter, end(iter));
}

template<typename T, typename... Values>
void expect_eq(const char *str, Values... values) {
std::string utf8(str);
test<T>(utf8, "utf-8", values...);
test<T>(utf8, "utf-8", {values...});

agi::charset::IconvWrapper conv("utf-8", "utf-16");
auto utf16 = conv.Convert(utf8);
test<T>(utf16, "utf-16", values...);
test<T>(utf16, "utf-16", {values...});
}

TEST(lagi_line, int) {
Expand Down

0 comments on commit 8b63241

Please sign in to comment.