Skip to content

Commit

Permalink
Updated scribe.extractText to work with xml/hocr inputs and added test
Browse files Browse the repository at this point in the history
  • Loading branch information
Balearica committed Sep 2, 2024
1 parent 5909d06 commit df50f09
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 1 deletion.
2 changes: 2 additions & 0 deletions js/recognizeConvert.js
Original file line number Diff line number Diff line change
Expand Up @@ -519,6 +519,8 @@ export async function recognizeAllPages(legacy = true, lstm = true, mainData = f
* @param {boolean} [options.vanillaMode=false] - Whether to use the vanilla Tesseract.js model.
*/
export async function recognize(options = {}) {
if (!inputData.pdfMode && !inputData.imageMode) throw new Error('No PDF or image data found to recognize.');

await gs.getGeneralScheduler();

const combineMode = options && options.combineMode ? options.combineMode : 'data';
Expand Down
4 changes: 3 additions & 1 deletion scribe.js
Original file line number Diff line number Diff line change
Expand Up @@ -79,12 +79,14 @@ const init = async (params) => {
* @param {boolean} [options.skipRecPDFTextOCR=false] - If the input is an image-native PDF with existing OCR layer, skip recognition and return the existing text.
*/
const extractText = async (files, langs = ['eng'], outputFormat = 'txt', options = {}) => {
  // The default parameter only covers `undefined`; optional chaining also tolerates an explicit `null`.
  const skipRecPDFTextNative = options?.skipRecPDFTextNative ?? true;
  const skipRecPDFTextOCR = options?.skipRecPDFTextOCR ?? false;

  // NOTE(review): `init` is declared async but is intentionally left un-awaited here, as in the
  // original; confirm that `importFiles`/`recognize` wait on whatever resources it sets up.
  init({ ocr: true, font: true });
  await importFiles(files, { extractPDFTextNative: skipRecPDFTextNative, extractPDFTextOCR: skipRecPDFTextOCR });

  // Validate only after the import: the mode flags on `inputData` are read below to describe the
  // files that were just imported (presumably set by `importFiles` -- verify), so checking them
  // before the import would test state left over from before this call.
  if (!inputData.xmlMode[0] && !inputData.imageMode && !inputData.pdfMode) throw new Error('No relevant files to process.');

  // A PDF whose existing text layer the caller asked to reuse does not need recognition.
  const skipRecPDF = inputData.pdfMode
    && ((ImageCache.pdfType === 'text' && skipRecPDFTextNative) || (ImageCache.pdfType === 'ocr' && skipRecPDFTextOCR));
  // Only OCR data (xml/hocr) was provided, with no image or PDF: there is nothing to recognize.
  const skipRecOCR = inputData.xmlMode[0] && !inputData.imageMode && !inputData.pdfMode;

  // Single recognition call, gated on both skip conditions.
  if (!skipRecPDF && !skipRecOCR) await recognize({ langs });
  return exportData(outputFormat);
};

Expand Down
12 changes: 12 additions & 0 deletions tests/module/misc.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,15 @@ describe('Check cleanup functions allow for resetting module.', function () {
await scribe.terminate();
});
}).timeout(120000);

// Verifies that `scribe.extractText` works when the only input is an existing OCR data file (.xml).
describe('extractText function can be used with .xml imports.', function () {
  // Regular function (not an arrow) so that `this` is the Mocha suite context.
  this.timeout(10000);
  it('Should extract text from .xml import using single function', async () => {
    const txt = await scribe.extractText([`${ASSETS_PATH_KARMA}/econometrica_example_abbyy.xml`]);
    // Only the leading characters are compared.
    assert.strictEqual(txt.slice(0, 17), 'Check for updates');
  }).timeout(10000);

  // Release module resources after the suite has run.
  after(async () => {
    await scribe.terminate();
  });
}).timeout(120000);

0 comments on commit df50f09

Please sign in to comment.