Updated scribe.extractText to work with xml/hocr inputs and added test

scribeocr · Sep 2, 2024 · df50f09 · df50f09
1 parent 5909d06
commit df50f09
Show file tree

Hide file tree

Showing 3 changed files with 17 additions and 1 deletion.
diff --git a/js/recognizeConvert.js b/js/recognizeConvert.js
@@ -519,6 +519,8 @@ export async function recognizeAllPages(legacy = true, lstm = true, mainData = f
  * @param {boolean} [options.vanillaMode=false] - Whether to use the vanilla Tesseract.js model.
  */
 export async function recognize(options = {}) {
+  if (!inputData.pdfMode && !inputData.imageMode) throw new Error('No PDF or image data found to recognize.');
+
   await gs.getGeneralScheduler();
 
   const combineMode = options && options.combineMode ? options.combineMode : 'data';

diff --git a/scribe.js b/scribe.js
@@ -79,12 +79,14 @@ const init = async (params) => {
  * @param {boolean} [options.skipRecPDFTextOCR=false] - If the input is an image-native PDF with existing OCR layer, skip recognition and return the existing text.
  */
 const extractText = async (files, langs = ['eng'], outputFormat = 'txt', options = {}) => {
   const skipRecPDFTextNative = options?.skipRecPDFTextNative ?? true;
   const skipRecPDFTextOCR = options?.skipRecPDFTextOCR ?? false;
   init({ ocr: true, font: true });
   await importFiles(files, { extractPDFTextNative: skipRecPDFTextNative, extractPDFTextOCR: skipRecPDFTextOCR });
+  // Validate after the import so the check reflects `files` (assumes `importFiles` is what sets the `inputData` modes), not state from before this call.
+  if (!inputData.xmlMode[0] && !inputData.imageMode && !inputData.pdfMode) throw new Error('No relevant files to process.');
   const skipRecPDF = inputData.pdfMode && (ImageCache.pdfType === 'text' && skipRecPDFTextNative || ImageCache.pdfType === 'ocr' && skipRecPDFTextOCR);
-  if (!skipRecPDF) await recognize({ langs });
+  // `recognize` throws when there is no PDF or image data, so it is skipped for xml-only input.
+  const skipRecOCR = inputData.xmlMode[0] && !inputData.imageMode && !inputData.pdfMode;
+  if (!skipRecPDF && !skipRecOCR) await recognize({ langs });
   return exportData(outputFormat);
 };
 

diff --git a/tests/module/misc.spec.js b/tests/module/misc.spec.js
@@ -24,3 +24,15 @@ describe('Check cleanup functions allow for resetting module.', function () {
     await scribe.terminate();
   });
 }).timeout(120000);
+
+describe('extractText function can be used with .xml imports.', function () {
+  this.timeout(10000);
+  it('Should recognize basic .jpg image using single function', async () => {
+    const txt = await scribe.extractText([`${ASSETS_PATH_KARMA}/econometrica_example_abbyy.xml`]);
+    assert.strictEqual(txt.slice(0, 17), 'Check for updates');
+  }).timeout(10000);
+
+  after(async () => {
+    await scribe.terminate();
+  });
+}).timeout(120000);