From 24720a792f0f1aebb08fed203e701d5cb6230088 Mon Sep 17 00:00:00 2001 From: xiaoyifang <105986+xiaoyifang@users.noreply.github.com> Date: Fri, 3 Jan 2025 21:26:50 +0800 Subject: [PATCH] opt: remove QTextCodec from indexedzip.cc (#2057) --- src/common/iconv.cc | 11 ++++ src/common/iconv.hh | 4 +- src/dict/utils/indexedzip.cc | 106 +++++------------------------------ 3 files changed, 29 insertions(+), 92 deletions(-) diff --git a/src/common/iconv.cc b/src/common/iconv.cc index adb7c60ce..365f75bdb 100644 --- a/src/common/iconv.cc +++ b/src/common/iconv.cc @@ -121,3 +121,14 @@ QString Iconv::toQString( char const * fromEncoding, void const * fromData, size Iconv ic( fromEncoding ); return ic.convert( fromData, dataSize ); } +QString Iconv::findValidEncoding( const QStringList & encodings ) +{ + for ( const QString & encoding : encodings ) { + iconv_t cd = iconv_open( "UTF-8", encoding.toUtf8().constData() ); + if ( cd != (iconv_t)-1 ) { + iconv_close( cd ); + return encoding; + } + } + return {}; +} diff --git a/src/common/iconv.hh b/src/common/iconv.hh index 7c6a1b639..872d69079 100644 --- a/src/common/iconv.hh +++ b/src/common/iconv.hh @@ -6,6 +6,7 @@ #include "ex.hh" #include "text.hh" #include +#include #include /// "Internationalization conversion" for char encoding conversion, currently implemented with iconv() @@ -34,7 +35,8 @@ public: static std::string toUtf8( char const * fromEncoding, std::u32string_view str ); static QString toQString( char const * fromEncoding, void const * fromData, size_t dataSize ); - + // tries to find a valid encoding from the given list of encodings. 
+ static QString findValidEncoding( const QStringList & encodings ); // Copying/assigning isn't supported Q_DISABLE_COPY_MOVE( Iconv ); }; diff --git a/src/dict/utils/indexedzip.cc b/src/dict/utils/indexedzip.cc index 494127a28..85872e18e 100644 --- a/src/dict/utils/indexedzip.cc +++ b/src/dict/utils/indexedzip.cc @@ -6,8 +6,6 @@ #include #include "text.hh" #include "iconv.hh" -#include - #include using namespace BtreeIndexing; @@ -160,10 +158,6 @@ bool IndexedZip::indexFile( BtreeIndexing::IndexedWords & zipFileNames, quint32 } // File seems to be a valid zip file - - - QTextCodec * localeCodec = QTextCodec::codecForLocale(); - ZipFile::CentralDirEntry entry; bool alreadyCounted; @@ -177,102 +171,32 @@ bool IndexedZip::indexFile( BtreeIndexing::IndexedWords & zipFileNames, quint32 continue; } - // Check if the file name has some non-ascii letters. - - unsigned char const * ptr = (unsigned char const *)entry.fileName.constData(); - - bool hasNonAscii = false; - - for ( ;; ) { - if ( *ptr & 0x80 ) { - hasNonAscii = true; - break; - } - else if ( !*ptr++ ) { - break; - } - } - - alreadyCounted = false; - - if ( !hasNonAscii ) { - // Add entry as is - + if ( entry.fileNameInUTF8 ) { zipFileNames.addSingleWord( Text::toUtf32( entry.fileName.data() ), entry.localHeaderOffset ); if ( filesCount ) { *filesCount += 1; } } else { - // Try assuming different encodings. Those are UTF8, system locale and two - // Russian ones (Windows and Windows OEM). Unfortunately, zip - // files do not say which encoding they utilize. - - // Utf8 try { - std::u32string decoded = Text::toUtf32( entry.fileName.constData() ); - - zipFileNames.addSingleWord( decoded, entry.localHeaderOffset ); - if ( filesCount != 0 && !alreadyCounted ) { - *filesCount += 1; - alreadyCounted = true; + //detect encoding. 
+ auto encoding = Iconv::findValidEncoding( { "LOCAL", "IBM437", "CP866", "CP1251", "UTF-8" } ); + if ( encoding.isEmpty() ) { + qWarning() << "Zip warning: failed to detect encoding -- skipping file" << entry.fileName.data(); + continue; } - } - catch ( Text::exCantDecode & ) { - // Failed to decode - } + std::u32string nameInSystemLocale = + Iconv::toWstring( encoding.toUtf8().constData(), entry.fileName.constData(), entry.fileName.size() ); + if ( !nameInSystemLocale.empty() ) { + zipFileNames.addSingleWord( nameInSystemLocale, entry.localHeaderOffset ); - if ( !entry.fileNameInUTF8 ) { - std::u32string nameInSystemLocale; - - // System locale - if ( localeCodec ) { - QString name = localeCodec->toUnicode( entry.fileName.constData(), entry.fileName.size() ); - nameInSystemLocale = name.toStdU32String(); - if ( !nameInSystemLocale.empty() ) { - zipFileNames.addSingleWord( nameInSystemLocale, entry.localHeaderOffset ); - - if ( filesCount != 0 && !alreadyCounted ) { - *filesCount += 1; - alreadyCounted = true; - } + if ( filesCount != 0 ) { + *filesCount += 1; } } - - - // CP866 - try { - std::u32string decoded = Iconv::toWstring( "CP866", entry.fileName.constData(), entry.fileName.size() ); - - if ( nameInSystemLocale != decoded ) { - zipFileNames.addSingleWord( decoded, entry.localHeaderOffset ); - - if ( filesCount != 0 && !alreadyCounted ) { - *filesCount += 1; - alreadyCounted = true; - } - } - } - catch ( Iconv::Ex & ) { - // Failed to decode - } - - // CP1251 - try { - std::u32string decoded = Iconv::toWstring( "CP1251", entry.fileName.constData(), entry.fileName.size() ); - - if ( nameInSystemLocale != decoded ) { - zipFileNames.addSingleWord( decoded, entry.localHeaderOffset ); - - if ( filesCount != 0 && !alreadyCounted ) { - *filesCount += 1; - alreadyCounted = true; - } - } - } - catch ( Iconv::Ex & ) { - // Failed to decode - } + } + catch ( Iconv::Ex & ) { + // Failed to decode } } }