Skip to content

Commit

Permalink
fix: indexedzip indexFile method rewrite (#2061)
Browse files Browse the repository at this point in the history
  • Loading branch information
xiaoyifang authored Jan 4, 2025
1 parent 24720a7 commit 13b910a
Show file tree
Hide file tree
Showing 9 changed files with 48 additions and 81 deletions.
4 changes: 3 additions & 1 deletion src/dict/btreeidx.hh
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@ enum {
/// This is to be bumped up each time the internal format changes.
/// The value isn't used here by itself, it is supposed to be added
/// to each dictionary's internal format version.
FormatVersion = 4
FormatVersion = 4,
// Version of the IndexedZip parsing logic; like FormatVersion, it is meant to be added to each dictionary's format version.
ZipParseLogicVersion = 1
};

// These exceptions which might be thrown during the index traversal
Expand Down
2 changes: 1 addition & 1 deletion src/dict/dsl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ DEF_EX_STR( exDictzipError, "DICTZIP error", Dictionary::Ex )

enum {
Signature = 0x584c5344, // DSLX on little-endian, XLSD on big-endian
CurrentFormatVersion = 23 + BtreeIndexing::FormatVersion + Folding::Version,
CurrentFormatVersion = 23 + BtreeIndexing::FormatVersion + Folding::Version + BtreeIndexing::ZipParseLogicVersion,
CurrentZipSupportVersion = 2,
CurrentFtsIndexVersion = 7
};
Expand Down
2 changes: 1 addition & 1 deletion src/dict/gls.cc
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ DEF_EX_STR( exDictzipError, "DICTZIP error", Dictionary::Ex )

enum {
Signature = 0x58534c47, // GLSX on little-endian, XSLG on big-endian
CurrentFormatVersion = 1 + BtreeIndexing::FormatVersion + Folding::Version,
CurrentFormatVersion = 1 + BtreeIndexing::FormatVersion + Folding::Version + BtreeIndexing::ZipParseLogicVersion,
CurrentZipSupportVersion = 2,
CurrentFtsIndexVersion = 1
};
Expand Down
2 changes: 1 addition & 1 deletion src/dict/stardict.cc
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ struct Ifo

enum {
Signature = 0x58444953, // SIDX on little-endian, XDIS on big-endian
CurrentFormatVersion = 9 + BtreeIndexing::FormatVersion + Folding::Version
CurrentFormatVersion = 9 + BtreeIndexing::FormatVersion + Folding::Version + BtreeIndexing::ZipParseLogicVersion
};

#pragma pack( push, 1 )
Expand Down
18 changes: 12 additions & 6 deletions src/dict/utils/indexedzip.cc
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,13 @@ bool IndexedZip::loadFile( uint32_t offset, vector< char > & data )
return false;
}

// The offset is the position of the entry's central directory header.
ZipFile::LocalFileHeader header;

if ( !ZipFile::readLocalHeader( zip, header ) ) {
if ( !ZipFile::readLocalHeaderFromCentral( zip, header ) ) {
vector< string > zipFileNames;
zip.getFilenames( zipFileNames );
qDebug( "Failed to load header" );
qDebug() << "Failed to load header";
string filename;
if ( zip.getCurrentFile() < zipFileNames.size() ) {
filename = zipFileNames.at( zip.getCurrentFile() );
Expand All @@ -73,11 +74,16 @@ bool IndexedZip::loadFile( uint32_t offset, vector< char > & data )
return false;
}

// Which algorithm was used?
zip.seek( header.offset );
if ( !ZipFile::skipLocalHeader( zip ) ) {
qDebug() << "Failed to skip local header";
return false;
}

// Which algorithm was used?
switch ( header.compressionMethod ) {
case ZipFile::Uncompressed:
qDebug( "Uncompressed" );
qDebug() << "Uncompressed";
data.resize( header.uncompressedSize );
return (size_t)zip.read( &data.front(), data.size() ) == data.size();

Expand Down Expand Up @@ -172,7 +178,7 @@ bool IndexedZip::indexFile( BtreeIndexing::IndexedWords & zipFileNames, quint32
}

if ( entry.fileNameInUTF8 ) {
zipFileNames.addSingleWord( Text::toUtf32( entry.fileName.data() ), entry.localHeaderOffset );
zipFileNames.addSingleWord( Text::toUtf32( entry.fileName.data() ), entry.centralHeaderOffset );
if ( filesCount ) {
*filesCount += 1;
}
Expand All @@ -188,7 +194,7 @@ bool IndexedZip::indexFile( BtreeIndexing::IndexedWords & zipFileNames, quint32
std::u32string nameInSystemLocale =
Iconv::toWstring( encoding.toUtf8().constData(), entry.fileName.constData(), entry.fileName.size() );
if ( !nameInSystemLocale.empty() ) {
zipFileNames.addSingleWord( nameInSystemLocale, entry.localHeaderOffset );
zipFileNames.addSingleWord( nameInSystemLocale, entry.centralHeaderOffset );

if ( filesCount != 0 ) {
*filesCount += 1;
Expand Down
87 changes: 23 additions & 64 deletions src/dict/utils/zipfile.cc
Original file line number Diff line number Diff line change
Expand Up @@ -48,23 +48,11 @@ __attribute__( ( packed ) )
#endif
;

struct DataDescriptor
{
quint32 crc32;
quint32 compressedSize;
quint32 uncompressedSize;
}
#ifndef _MSC_VER
__attribute__( ( packed ) )
#endif
;

#pragma pack( pop )

static quint32 const endOfCdirRecordSignatureValue = qToLittleEndian( 0x06054b50 );
static quint32 const centralFileHeaderSignature = qToLittleEndian( 0x02014b50 );
static quint32 const localFileHeaderSignature = qToLittleEndian( 0x04034b50 );
static quint32 const dataDescriptorHeaderSignature = qToLittleEndian( 0x08074b50 );

static CompressionMethod getCompressionMethod( quint16 compressionMethod )
{
Expand Down Expand Up @@ -148,6 +136,8 @@ bool readNextEntry( SplitZipFile & zip, CentralDirEntry & entry )
{
CentralFileHeaderRecord record;

auto centralDirOffset = zip.pos();

if ( zip.read( (char *)&record, sizeof( record ) ) != sizeof( record ) ) {
return false;
}
Expand All @@ -172,6 +162,7 @@ bool readNextEntry( SplitZipFile & zip, CentralDirEntry & entry )
return false;
}

entry.centralHeaderOffset = zip.calcAbsoluteOffset( centralDirOffset, qFromLittleEndian( record.diskNumberStart ) );
entry.localHeaderOffset = zip.calcAbsoluteOffset( qFromLittleEndian( record.offsetOfLocalHeader ),
qFromLittleEndian( record.diskNumberStart ) );
entry.compressedSize = qFromLittleEndian( record.compressedSize );
Expand All @@ -182,7 +173,7 @@ bool readNextEntry( SplitZipFile & zip, CentralDirEntry & entry )
return true;
}

bool readLocalHeader( SplitZipFile & zip, LocalFileHeader & entry )
bool skipLocalHeader( SplitZipFile & zip )
{
LocalFileHeaderRecord record;

Expand All @@ -194,69 +185,37 @@ bool readLocalHeader( SplitZipFile & zip, LocalFileHeader & entry )
return false;
}

// Read file name

// skip file name
int fileNameLength = qFromLittleEndian( record.fileNameLength );
entry.fileName = zip.read( fileNameLength );
// Skip extra field
return zip.seek( zip.pos() + fileNameLength + qFromLittleEndian( record.extraFieldLength ) );
}

if ( entry.fileName.size() != fileNameLength ) {
bool readLocalHeaderFromCentral( SplitZipFile & zip, LocalFileHeader & entry )
{
CentralFileHeaderRecord record;

if ( zip.read( (char *)&record, sizeof( record ) ) != sizeof( record ) ) {
return false;
}

// Skip extra field

if ( !zip.seek( zip.pos() + qFromLittleEndian( record.extraFieldLength ) ) ) {
if ( record.signature != centralFileHeaderSignature ) {
return false;
}
// Check if the data descriptor is present
quint16 gpBits = qFromLittleEndian( record.gpBits );

// Bit 3 means a data descriptor is present, which is usual for streamed files.
// The data descriptor follows the actual file data: skip the file data and check the data descriptor signature.
// According to the ZIP format description, the signature is optional!
bool hasDataDescriptor = ( gpBits & 0x0008 ) != 0;

if ( hasDataDescriptor && ( record.compressedSize == 0 ) ) {
auto current_pos = zip.pos();
// If compressedSize is 0, we need to find the data descriptor
QByteArray dataDescriptorSignature( (char const *)&dataDescriptorHeaderSignature, sizeof( quint32 ) );

QByteArray buffer;
while ( true ) {
char byte;
if ( zip.read( &byte, sizeof( byte ) ) != sizeof( byte ) ) {
return false;
}
buffer.append( byte );

if ( buffer.size() >= dataDescriptorSignature.size() ) {
QByteArray lastBytes = buffer.right( sizeof( dataDescriptorSignature ) );
if ( lastBytes == dataDescriptorSignature ) {
// Found the data descriptor signature
break;
}
buffer.remove( 0, 1 );
}
}

DataDescriptor dataDescriptor;

if ( zip.read( (char *)&dataDescriptor, sizeof( dataDescriptor ) ) != sizeof( dataDescriptor ) ) {
return false;
}

entry.compressedSize = qFromLittleEndian( dataDescriptor.compressedSize );
entry.uncompressedSize = qFromLittleEndian( dataDescriptor.uncompressedSize );
// Read file name
int fileNameLength = qFromLittleEndian( record.fileNameLength );
entry.fileName = zip.read( fileNameLength );

//restore
zip.seek( current_pos );
}
else {
entry.compressedSize = qFromLittleEndian( record.compressedSize );
entry.uncompressedSize = qFromLittleEndian( record.uncompressedSize );
if ( entry.fileName.size() != fileNameLength ) {
return false;
}

entry.compressedSize = qFromLittleEndian( record.compressedSize );
entry.uncompressedSize = qFromLittleEndian( record.uncompressedSize );
entry.compressionMethod = getCompressionMethod( record.compressionMethod );
entry.offset = zip.calcAbsoluteOffset( qFromLittleEndian( record.offsetOfLocalHeader ),
qFromLittleEndian( record.diskNumberStart ) );

return true;
}
Expand Down
10 changes: 5 additions & 5 deletions src/dict/utils/zipfile.hh
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ enum CompressionMethod {
struct CentralDirEntry
{
QByteArray fileName;

quint32 centralHeaderOffset;
quint32 localHeaderOffset, compressedSize, uncompressedSize;
CompressionMethod compressionMethod;
bool fileNameInUTF8;
Expand All @@ -51,6 +51,7 @@ struct LocalFileHeader

quint32 compressedSize, uncompressedSize;
CompressionMethod compressionMethod;
quint32 offset;
};

/// Finds the central directory in the given file and positions it at its
Expand All @@ -65,9 +66,8 @@ bool positionAtCentralDir( SplitZipFile & );
/// Returns true on success, false otherwise.
bool readNextEntry( SplitZipFile &, CentralDirEntry & );

/// Reads local file header from the zip at its current offset. The file gets
/// advanced by the size of entry and starts pointing to file data.
/// Returns true on success, false otherwise.
bool readLocalHeader( SplitZipFile &, LocalFileHeader & );
/// Skips the local header of the file at the current position. The file data follows the header.
bool skipLocalHeader( SplitZipFile & zip );
bool readLocalHeaderFromCentral( SplitZipFile &, LocalFileHeader & );

} // namespace ZipFile
2 changes: 1 addition & 1 deletion src/dict/xdxf.cc
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ DEF_EX_STR( exDictzipError, "DICTZIP error", Dictionary::Ex )

enum {
Signature = 0x46584458, // XDXF on little-endian, FXDX on big-endian
CurrentFormatVersion = 6 + BtreeIndexing::FormatVersion + Folding::Version
CurrentFormatVersion = 6 + BtreeIndexing::FormatVersion + Folding::Version + BtreeIndexing::ZipParseLogicVersion
};

enum ArticleFormat {
Expand Down
2 changes: 1 addition & 1 deletion src/dict/zipsounds.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ DEF_EX( exInvalidData, "Invalid data encountered", Dictionary::Ex )

enum {
Signature = 0x5350495a, // ZIPS on little-endian, SPIZ on big-endian
CurrentFormatVersion = 6 + BtreeIndexing::FormatVersion
CurrentFormatVersion = 6 + BtreeIndexing::FormatVersion + BtreeIndexing::ZipParseLogicVersion
};

#pragma pack( push, 1 )
Expand Down

0 comments on commit 13b910a

Please sign in to comment.