Refactored convert functions and added convertOCRPage to API

scribeocr · Sep 7, 2024 · 15f8a84 · 15f8a84
1 parent 65e4f40
commit 15f8a84
Show file tree

Hide file tree

Showing 5 changed files with 37 additions and 31 deletions.
diff --git a/js/extractPDFText.js b/js/extractPDFText.js
@@ -1,6 +1,6 @@
+import { ocrAll, ocrAllRaw } from './containers/dataContainer.js';
 import { ImageCache } from './containers/imageContainer.js';
-import { convertOCRAll } from './recognizeConvert.js';
-import { ocrAllRaw, ocrAll } from './containers/dataContainer.js';
+import { convertOCR } from './recognizeConvert.js';
 
 /**
  * Extract raw text content from currently loaded PDF.
@@ -102,7 +102,7 @@ export const extractInternalPDFText = async (options = {}) => {
   const format = 'stext';
 
   // Process HOCR using web worker, reading from file first if that has not been done already
-  await convertOCRAll(ocrAllRaw.active, true, format, 'pdf', false);
+  await convertOCR(ocrAllRaw.active, true, format, 'pdf', false);
 
   res.content = ocrAll.pdf;
 

diff --git a/js/generalWorkerMain.js b/js/generalWorkerMain.js
@@ -154,6 +154,24 @@ export class gs {
     }
   };
 
+  /**
+   * Queue a `convertPageHocr` job on `gs.schedulerInner` and resolve with whatever `addJob` resolves to.
+   * NOTE(review): this wrapper does not itself await `gs.getGeneralScheduler()`; presumably the
+   * scheduler must already be initialized before this is called — confirm against callers.
+   *
+   * @param {Parameters<typeof import('./import/convertPageHocr.js').convertPageHocr>[0]} args
+   * @returns {ReturnType<typeof import('./import/convertPageHocr.js').convertPageHocr>}
+   */
+  static convertPageHocr = async (args) => (await gs.schedulerInner.addJob('convertPageHocr', args));
+
+  /**
+   * Queue a `convertPageAbbyy` job on `gs.schedulerInner` and resolve with whatever `addJob` resolves to.
+   * NOTE(review): this wrapper does not itself await `gs.getGeneralScheduler()`; presumably the
+   * scheduler must already be initialized before this is called — confirm against callers.
+   *
+   * @param {Parameters<typeof import('./import/convertPageAbbyy.js').convertPageAbbyy>[0]} args
+   * @returns {ReturnType<typeof import('./import/convertPageAbbyy.js').convertPageAbbyy>}
+   */
+  static convertPageAbbyy = async (args) => (await gs.schedulerInner.addJob('convertPageAbbyy', args));
+
+  /**
+   * Queue a `convertPageStext` job on `gs.schedulerInner` and resolve with whatever `addJob` resolves to.
+   * NOTE(review): this wrapper does not itself await `gs.getGeneralScheduler()`; presumably the
+   * scheduler must already be initialized before this is called — confirm against callers.
+   *
+   * @param {Parameters<typeof import('./import/convertPageStext.js').convertPageStext>[0]} args
+   * @returns {ReturnType<typeof import('./import/convertPageStext.js').convertPageStext>}
+   */
+  static convertPageStext = async (args) => (await gs.schedulerInner.addJob('convertPageStext', args));
+
   /**
    * @param {Parameters<typeof import('./worker/optimizeFontModule.js').optimizeFont>[0]} args
    * @returns {ReturnType<typeof import('./worker/optimizeFontModule.js').optimizeFont>}

diff --git a/js/import/import.js b/js/import/import.js
@@ -23,7 +23,7 @@ import { gs } from '../generalWorkerMain.js';
 import { imageUtils } from '../objects/imageObjects.js';
 import { LayoutDataTablePage, LayoutPage } from '../objects/layoutObjects.js';
 import { PageMetrics } from '../objects/pageMetricsObjects.js';
-import { checkCharWarn, convertOCRAll } from '../recognizeConvert.js';
+import { checkCharWarn, convertOCR } from '../recognizeConvert.js';
 import { replaceObjectProperties } from '../utils/miscUtils.js';
 import { importOCRFiles } from './importOCR.js';
 
@@ -436,7 +436,7 @@ export async function importFiles(files, options = {}) {
     if (stextMode) format = 'stext';
 
     // Process HOCR using web worker, reading from file first if that has not been done already
-    await convertOCRAll(ocrAllRaw.active, true, format, oemName, scribeMode).then(async () => {
+    await convertOCR(ocrAllRaw.active, true, format, oemName, scribeMode).then(async () => {
       // Skip this step if optimization info was already restored from a previous session, or if using stext (which is character-level but not visually accurate).
       if (!existingOpt && !stextMode) {
         await checkCharWarn(convertPageWarn);
@@ -487,5 +487,5 @@ export async function importFilesSupp(files, ocrName) {
   if (ocrData.abbyyMode) format = 'abbyy';
   if (ocrData.stextMode) format = 'stext';
 
-  await convertOCRAll(ocrData.hocrRaw, false, format, ocrName, scribeMode);
+  await convertOCR(ocrData.hocrRaw, false, format, ocrName, scribeMode);
 }
diff --git a/js/recognizeConvert.js b/js/recognizeConvert.js
@@ -299,34 +299,20 @@ export function checkCharWarn(warnArr) {
  * @param {boolean} mainData - Whether this is the "main" data that document metrics are calculated from.
  *  For imports of user-provided data, the first data provided should be flagged as the "main" data.
  *  For Tesseract.js recognition, the Tesseract Legacy results should be flagged as the "main" data.
- * @param {("hocr"|"abbyy"|"stext"|"blocks")} format - Format of raw data.
+ * @param {("hocr"|"abbyy"|"stext")} format - Format of raw data.
  * @param {string} engineName - Name of OCR engine.
  * @param {boolean} [scribeMode=false] - Whether this is HOCR data from this program.
  */
-async function convertOCRPage(ocrRaw, n, mainData, format, engineName, scribeMode = false) {
-  let func = 'convertPageHocr';
-  if (format === 'abbyy') {
-    func = 'convertPageAbbyy';
-  } else if (format === 'stext') {
-    func = 'convertPageStext';
-  } else if (format === 'blocks') {
-    func = 'convertPageBlocks';
-  }
-
-  // Imports are always run in workers in actual use, however for debugging purposes they can be run in the main thread.
+export async function convertOCRPage(ocrRaw, n, mainData, format, engineName, scribeMode = false) {
   let res;
-  const parallel = true;
-  if (parallel) {
-    await gs.getGeneralScheduler();
-    res = await gs.schedulerInner.addJob(func, { ocrStr: ocrRaw, n, scribeMode });
-  } else if (func === 'convertPageHocr') {
-    res = await import('./import/convertPageHocr.js').then((m) => m.convertPageHocr({ ocrStr: ocrRaw, n, scribeMode }));
-  } else if (func === 'convertPageAbbyy') {
-    res = await import('./import/convertPageAbbyy.js').then((m) => m.convertPageAbbyy({ ocrStr: ocrRaw, n, scribeMode }));
-  } else if (func === 'convertPageStext') {
-    res = await import('./import/convertPageStext.js').then((m) => m.convertPageStext({ ocrStr: ocrRaw, n, scribeMode }));
-  } else if (func === 'convertPageBlocks') {
-    res = await import('./import/convertPageBlocks.js').then((m) => m.convertPageBlocks({ ocrStr: ocrRaw, n, scribeMode }));
+  if (format === 'hocr') {
+    res = await gs.convertPageHocr({ ocrStr: ocrRaw, n, scribeMode });
+  } else if (format === 'abbyy') {
+    res = await gs.convertPageAbbyy({ ocrStr: ocrRaw, n, scribeMode });
+  } else if (format === 'stext') {
+    res = await gs.convertPageStext({ ocrStr: ocrRaw, n, scribeMode });
+  } else {
+    throw new Error(`Invalid format: ${format}`);
   }
 
   await convertPageCallback(res, n, mainData, engineName);
@@ -385,7 +371,7 @@ export async function convertPageCallback({
  * @param {string} engineName - Name of OCR engine.
  * @param {boolean} [scribeMode=false] - Whether this is HOCR data from this program.
  */
-export async function convertOCRAll(ocrRawArr, mainData, format, engineName, scribeMode) {
+export async function convertOCR(ocrRawArr, mainData, format, engineName, scribeMode) {
   // For each page, process OCR using web worker
   const promiseArr = [];
   for (let n = 0; n < ocrRawArr.length; n++) {

diff --git a/scribe.js b/scribe.js
@@ -23,6 +23,7 @@ import ocr from './js/objects/ocrObjects.js';
 import {
   calcEvalStatsDoc,
   compareOCR,
+  convertOCRPage,
   evalOCRPage,
   recognize, recognizePage,
 } from './js/recognizeConvert.js';
@@ -242,6 +243,7 @@ export default {
   clear,
   combineOCRPage,
   compareOCR,
+  convertOCRPage,
   data,
   enableFontOpt,
   evalOCRPage,