Skip to content

Commit

Permalink
Add facility to extract and set ZIM metadata (#1133)
Browse files Browse the repository at this point in the history
  • Loading branch information
Jaifroid authored Oct 17, 2023
1 parent 671cd9a commit 108f118
Show file tree
Hide file tree
Showing 4 changed files with 111 additions and 59 deletions.
2 changes: 1 addition & 1 deletion tests/unit/spec/tests.js
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ var runTests = function () {
QUnit.module('zim_direntry_search_and_read');
QUnit.test("check DirEntry.fromStringId 'A Fool for You'", function (assert) {
var done = assert.async();
var aFoolForYouDirEntry = zimDirEntry.DirEntry.fromStringId(localZimArchive._file, '5856|7|A|0|2|A_Fool_for_You.html|A Fool for You|false|undefined');
var aFoolForYouDirEntry = zimDirEntry.DirEntry.fromStringId(localZimArchive.file, '5856|7|A|0|2|A_Fool_for_You.html|A Fool for You|false|undefined');

assert.expect(2);
var callbackFunction = function (dirEntry, htmlArticle) {
Expand Down
32 changes: 18 additions & 14 deletions www/js/app.js
Original file line number Diff line number Diff line change
Expand Up @@ -1232,11 +1232,8 @@ function setLocalArchiveFromArchiveList () {
}
}
resetCssCache();
selectedArchive = zimArchiveLoader.loadArchiveFromDeviceStorage(selectedStorage, archiveDirectory, function () {
settingsStore.setItem('lastSelectedArchive', archiveDirectory, Infinity);
// The archive is set : go back to home page to start searching
document.getElementById('btnHome').click();
}, function (message, label) {
settingsStore.setItem('lastSelectedArchive', archiveDirectory, Infinity);
zimArchiveLoader.loadArchiveFromDeviceStorage(selectedStorage, archiveDirectory, archiveReadyCallback, function (message, label) {
// callbackError which is called in case of an error
uiUtil.systemAlert(message, label);
});
Expand Down Expand Up @@ -1339,17 +1336,24 @@ function setLocalArchiveFromFileList (files) {
}
}
resetCssCache();
selectedArchive = null;
selectedArchive = zimArchiveLoader.loadArchiveFromFiles(files, function () {
// The archive is set : go back to home page to start searching
document.getElementById('btnHome').click();
document.getElementById('downloadInstruction').style.display = 'none';
}, function (message, label) {
zimArchiveLoader.loadArchiveFromFiles(files, archiveReadyCallback, function (message, label) {
// callbackError which is called in case of an error
uiUtil.systemAlert(message, label);
});
}

/**
 * Callback handed to the archive loaders; it is invoked with the archive once loading has completed.
 * It records the archive as the currently selected one, navigates back to the home page by
 * clicking the Home button, and hides the download instructions.
 *
 * @param {ZIMArchive} archive The ZIM archive that has just been loaded
 */
function archiveReadyCallback (archive) {
    // Make the freshly loaded archive the active one for the rest of the app
    selectedArchive = archive;
    // With the archive in place, return to the home page so that the user can start searching
    var homeButton = document.getElementById('btnHome');
    homeButton.click();
    // The download instructions are no longer relevant once an archive is selected
    var downloadInstruction = document.getElementById('downloadInstruction');
    downloadInstruction.style.display = 'none';
}

/**
* Sets the localArchive from the File selects populated by user
*/
Expand Down Expand Up @@ -1654,7 +1658,7 @@ function readArticle (dirEntry) {
}

// We put the ZIM filename as a prefix in the URL, so that browser caches are separate for each ZIM file
iframeArticleContent.src = '../' + selectedArchive._file.name + '/' + dirEntry.namespace + '/' + encodedUrl;
iframeArticleContent.src = '../' + selectedArchive.file.name + '/' + dirEntry.namespace + '/' + encodedUrl;
} else {
// In jQuery mode, we read the article content in the backend and manually insert it in the iframe
if (dirEntry.isRedirect()) {
Expand Down Expand Up @@ -2183,7 +2187,7 @@ function goToRandomArticle () {
// We fall back to the old A namespace to support old ZIM files without a text/html MIME type for articles
// DEV: If articlePtrPos is defined in zimFile, then we are using a v1 article-only title listing. By definition,
// all dirEntries in an article-only listing must be articles.
if (selectedArchive._file.articlePtrPos || dirEntry.getMimetype() === 'text/html' || dirEntry.namespace === 'A') {
if (selectedArchive.file.articlePtrPos || dirEntry.getMimetype() === 'text/html' || dirEntry.namespace === 'A') {
params.isLandingPage = false;
var activeContent = document.getElementById('activeContent');
if (activeContent) activeContent.style.display = 'none';
Expand Down Expand Up @@ -2214,7 +2218,7 @@ function goToMainArticle () {
document.getElementById('welcomeText').style.display = '';
} else {
// For now, this code doesn't support reading Zimit archives without error, so we warn the user and suggest some solutions
if (selectedArchive._file.zimType === 'zimit') {
if (selectedArchive.zimType === 'zimit') {
uiUtil.systemAlert(translateUI.t('dialog-unsupported-archivetype-message') || '<p>You are attempting to open a Zimit-style archive, which is currently unsupported in this app.</p>' +
'<p>There is experimental support for this kind of archive in the Kiwix JS PWA. Go to: ' +
'<a href="https://pwa.kiwix.org" target="_blank">https://pwa.kiwix.org</a>.</p>' +
Expand Down
134 changes: 92 additions & 42 deletions www/js/lib/zimArchive.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/**
* zimArchive.js: Support for archives in ZIM format.
*
* Copyright 2015 Mossroy and contributors
* Copyright 2015-2023 Mossroy, Jaifroid and contributors
* Licence GPL v3:
*
* This file is part of Kiwix.
Expand Down Expand Up @@ -33,10 +33,17 @@ import utf8 from './utf8.js';
/**
* ZIM Archive
*
*
* @typedef ZIMArchive
* @property {ZIMFile} _file The ZIM file (instance of ZIMFile, that might physically be split into several actual files)
* @property {String} _language Language of the content
* @property {ZIMFile} file The ZIM file (instance of ZIMFile, that might physically be split into several actual files)
* @property {String} counter Counter of various types of content in the archive
* @property {String} creator Creator of the content
* @property {String} date Date of the creation of the archive
* @property {String} description Description of the content
* @property {String} language Language of the content
* @property {String} name Name of the archive
* @property {String} publisher Publisher of the content
* @property {String} title Title of the content
* @property {String} zimType Extended property: currently either 'open' for OpenZIM file type, or 'zimit' for the warc2zim file type used by Zimit
*/

/**
Expand Down Expand Up @@ -65,17 +72,16 @@ var LZ;
*/
function ZIMArchive (storage, path, callbackReady, callbackError) {
var that = this;
that._file = null;
that._language = ''; // @TODO
that.file = null;
var createZimfile = function (fileArray) {
zimfile.fromFileArray(fileArray).then(function (file) {
that._file = file;
that.file = file;
// Clear the previous libzimWorker
LZ = null;
// Set a global parameter to report the search provider type
params.searchProvider = 'title';
// File has been created, but we need to add any Listings which extend the archive metadata
that._file.setListings([
that.file.setListings([
// Provide here any Listings for which we need to extract metadata as key:value objects to be added to the file
// 'ptrName' and 'countName' contain the key names to be set in the archive file object
{
Expand All @@ -99,14 +105,14 @@ function ZIMArchive (storage, path, callbackReady, callbackError) {
}
]).then(function () {
// There is currently an exception thrown in the libzim wasm if we attempt to load a split ZIM archive, so we work around
var isSplitZim = /\.zima.$/i.test(that._file._files[0].name);
if (that._file.fullTextIndex && (params.debugLibzimASM || !isSplitZim && typeof Atomics !== 'undefined' &&
var isSplitZim = /\.zima.$/i.test(that.file._files[0].name);
if (that.file.fullTextIndex && (params.debugLibzimASM || !isSplitZim && typeof Atomics !== 'undefined' &&
// Note that Android and NWJS currently throw due to problems with Web Worker context
!/Android/.test(params.appType) && !(window.nw && that._file._files[0].readMode === 'electron'))) {
!/Android/.test(params.appType) && !(window.nw && that.file._files[0].readMode === 'electron'))) {
var libzimReaderType = params.debugLibzimASM || ('WebAssembly' in self ? 'wasm' : 'asm');
console.log('Instantiating libzim ' + libzimReaderType + ' Web Worker...');
LZ = new Worker('js/lib/libzim-' + libzimReaderType + '.js');
that.callLibzimWorker({ action: 'init', files: that._file._files }).then(function (msg) {
that.callLibzimWorker({ action: 'init', files: that.file._files }).then(function (msg) {
// console.debug(msg);
params.searchProvider = 'fulltext: ' + libzimReaderType;
// Update the API panel
Expand All @@ -117,25 +123,52 @@ function ZIMArchive (storage, path, callbackReady, callbackError) {
});
} else {
// var message = 'Full text searching is not available because ';
if (!that._file.fullTextIndex) {
if (!that.file.fullTextIndex) {
params.searchProvider += ': no_fulltext'; // message += 'this ZIM does not have a full-text index.';
} else if (isSplitZim) {
params.searchProvider += ': split_zim'; // message += 'the ZIM archive is split.';
} else if (typeof Atomics === 'undefined') {
params.searchProvider += ': no_atomics'; // message += 'this browser does not support Atomic operations.';
} else if (/Android/.test(params.appType)) {
params.searchProvider += ': no_sharedArrayBuffer';
} else if (params.debugLibzimASM === 'disable') {
params.searchProvider += ': disabled';
} else {
params.searchProvider += ': unknown';
}
uiUtil.reportSearchProviderToAPIStatusPanel(params.searchProvider);
}
// Set the archive file type ('open' or 'zimit')
that.setZimType();
// Add time-critical metadata from the M/ namespace that you need early access to here
// Note that adding metadata here delays the reporting of the ZIM archive as ready
// Further metadata are added in the background below, and can be accessed later
Promise.all([
that.addMetadataToZIMFile('Creator'),
that.addMetadataToZIMFile('Language')
]).then(function () {
console.debug('ZIMArchive ready, metadata will be added in the background');
// All listings should be loaded, so we can now call the callback
callbackReady(that);
});
// Add non-time-critical metadata to archive in background so as not to delay opening of the archive
// DEV: Note that it does not make sense to extract illustration (icon) metadata here. Instead, if you implement use of the illustration
// metadata as icons for the loaded ZIM [kiwix-js #886], you should simply use the ZIMArchive.getMetadata() function when needed
setTimeout(function () {
Promise.all([
that.addMetadataToZIMFile('Counter'),
that.addMetadataToZIMFile('Date'),
that.addMetadataToZIMFile('Description'),
that.addMetadataToZIMFile('Name'),
that.addMetadataToZIMFile('Publisher'),
that.addMetadataToZIMFile('Title')
]).then(function () {
console.debug('ZIMArchive metadata loaded:', that);
});
}, 1500);
}).catch(function (err) {
console.warn('Error setting archive listings: ', err);
});
// Set the archive file type ('open' or 'zimit')
params.zimType = that.setZimType();
// DEV: Currently, extended listings are only used for title (=article) listings when the user searches
// for an article or uses the Random button, by which time the listings will have been extracted.
// If, in the future, listings are used in a more time-critical manner, consider forcing a wait before
// declaring the archive to be ready, by chaining the following callback in a .then() function of setListings.
callbackReady(that);
});
};
if (storage && !path) {
Expand Down Expand Up @@ -189,27 +222,27 @@ ZIMArchive.prototype._searchArchiveParts = function (storage, prefixPath) {
* @returns {Boolean}
*/
ZIMArchive.prototype.isReady = function () {
return this._file !== null;
return this.file !== null;
};

/**
* Detects whether the supplied archive is a Zimit-style archive or an OpenZIM archive and
* sets a _file.zimType property accordingly; also returns the detected type. Extends ZIMFile.
* sets a zimType property accordingly; also returns the detected type. Extends ZIMArchive.
* @returns {String} Either 'zimit' for a Zimit archive, or 'open' for an OpenZIM archive
*/
ZIMArchive.prototype.setZimType = function () {
var fileType = null;
var archiveType = null;
if (this.isReady()) {
fileType = 'open';
this._file.mimeTypes.forEach(function (v) {
if (/warc-headers/i.test(v)) fileType = 'zimit';
archiveType = 'open';
this.file.mimeTypes.forEach(function (v) {
if (/warc-headers/i.test(v)) archiveType = 'zimit';
});
this._file.zimType = fileType;
console.debug('Archive type set to: ' + fileType);
this.zimType = archiveType;
console.debug('Archive type set to: ' + archiveType);
} else {
console.error('ZIMArchive is not ready! Cannot set ZIM type.');
}
return fileType;
return archiveType;
};

/**
Expand All @@ -219,8 +252,8 @@ ZIMArchive.prototype.setZimType = function () {
*/
ZIMArchive.prototype.getMainPageDirEntry = function (callback) {
if (this.isReady()) {
var mainPageUrlIndex = this._file.mainPage;
this._file.dirEntryByUrlIndex(mainPageUrlIndex).then(callback);
var mainPageUrlIndex = this.file.mainPage;
this.file.dirEntryByUrlIndex(mainPageUrlIndex).then(callback);
}
};

Expand All @@ -230,7 +263,7 @@ ZIMArchive.prototype.getMainPageDirEntry = function (callback) {
* @returns {DirEntry}
*/
ZIMArchive.prototype.parseDirEntryId = function (dirEntryId) {
return zimDirEntry.DirEntry.fromStringId(this._file, dirEntryId);
return zimDirEntry.DirEntry.fromStringId(this.file, dirEntryId);
};

/**
Expand Down Expand Up @@ -335,7 +368,7 @@ ZIMArchive.prototype.findDirEntriesWithPrefix = function (search, callback, noIn
ZIMArchive.prototype.getContentNamespace = function () {
var errorText;
if (this.isReady()) {
var ver = this._file.minorVersion;
var ver = this.file.minorVersion;
// DEV: There are currently only two defined values for minorVersion in the OpenZIM specification
// If this changes, adapt the error checking and return values
if (ver > 1) {
Expand All @@ -360,9 +393,9 @@ ZIMArchive.prototype.findDirEntriesWithPrefixCaseSensitive = function (prefix, s
var that = this;
var cns = this.getContentNamespace();
// Search v1 article listing if available, otherwise fallback to v0
var articleCount = this._file.articleCount || this._file.entryCount;
var articleCount = this.file.articleCount || this.file.entryCount;
util.binarySearch(0, articleCount, function (i) {
return that._file.dirEntryByTitleIndex(i).then(function (dirEntry) {
return that.file.dirEntryByTitleIndex(i).then(function (dirEntry) {
if (search.status === 'cancelled') return 0;
var ns = dirEntry.namespace;
// DEV: This search is redundant if we managed to populate articlePtrLst and articleCount, but it only takes two instructions and
Expand All @@ -387,7 +420,7 @@ ZIMArchive.prototype.findDirEntriesWithPrefixCaseSensitive = function (prefix, s
nextStart: index
};
}
return that._file.dirEntryByTitleIndex(index).then(function (dirEntry) {
return that.file.dirEntryByTitleIndex(index).then(function (dirEntry) {
search.scanCount++;
var title = dirEntry.getTitleOrUrl();
// Only return dirEntries with titles that actually begin with prefix
Expand Down Expand Up @@ -488,7 +521,7 @@ ZIMArchive.prototype.callLibzimWorker = function (parameters) {
* @param {callbackDirEntry} callback
*/
ZIMArchive.prototype.resolveRedirect = function (dirEntry, callback) {
this._file.dirEntryByUrlIndex(dirEntry.redirectTarget).then(callback);
this.file.dirEntryByUrlIndex(dirEntry.redirectTarget).then(callback);
};

/**
Expand Down Expand Up @@ -530,8 +563,8 @@ ZIMArchive.prototype.readBinaryFile = function (dirEntry, callback) {
*/
ZIMArchive.prototype.getDirEntryByPath = function (path) {
var that = this;
return util.binarySearch(0, this._file.entryCount, function (i) {
return that._file.dirEntryByUrlIndex(i).then(function (dirEntry) {
return util.binarySearch(0, this.file.entryCount, function (i) {
return that.file.dirEntryByUrlIndex(i).then(function (dirEntry) {
var url = dirEntry.namespace + '/' + dirEntry.url;
if (path < url) {
return -1;
Expand All @@ -543,7 +576,7 @@ ZIMArchive.prototype.getDirEntryByPath = function (path) {
});
}).then(function (index) {
if (index === null) return null;
return that._file.dirEntryByUrlIndex(index);
return that.file.dirEntryByUrlIndex(index);
}).then(function (dirEntry) {
return dirEntry;
});
Expand All @@ -555,9 +588,9 @@ ZIMArchive.prototype.getDirEntryByPath = function (path) {
*/
ZIMArchive.prototype.getRandomDirEntry = function (callback) {
// Prefer an article-only (v1) title pointer list, if available
var articleCount = this._file.articleCount || this._file.entryCount;
var articleCount = this.file.articleCount || this.file.entryCount;
var index = Math.floor(Math.random() * articleCount);
this._file.dirEntryByTitleIndex(index).then(callback);
this.file.dirEntryByTitleIndex(index).then(callback);
};

/**
Expand All @@ -582,6 +615,23 @@ ZIMArchive.prototype.getMetadata = function (key, callback) {
});
};

/**
 * Reads one metadata entry via getMetadata() and stores it as a property of this ZIMArchive instance.
 * Note that, despite the function's name, the value is set on the archive object itself.
 * The property name is the lower-cased key, so that, e.g., 'Title' is stored as archive.title.
 * @param {String} key The key of the metadata to extract (e.g. 'Creator' or 'Language')
 * @returns {Promise<String>} A Promise that resolves with the metadata string, or with an empty string
 *     if getMetadata() supplied a falsy value for the key
 */
ZIMArchive.prototype.addMetadataToZIMFile = function (key) {
    var that = this;
    // DEV: Use the locale-independent conversion here. toLocaleLowerCase() follows the host locale, so under
    // Turkic locales an 'I' becomes a dotless 'ı' and the property name would differ from machine to machine
    var lcaseKey = key.toLowerCase();
    return new Promise(function (resolve) {
        that.getMetadata(key, function (data) {
            // Normalize missing metadata to an empty string so that the property is always a String
            var value = data || '';
            that[lcaseKey] = value;
            resolve(value);
        });
    });
};

export default {
ZIMArchive: ZIMArchive
};
Loading

0 comments on commit 108f118

Please sign in to comment.