Skip to content

Commit

Permalink
Refactored convert functions and added convertOCRPage to API
Browse files Browse the repository at this point in the history
  • Loading branch information
Balearica committed Sep 7, 2024
1 parent 65e4f40 commit 15f8a84
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 31 deletions.
6 changes: 3 additions & 3 deletions js/extractPDFText.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { ocrAll, ocrAllRaw } from './containers/dataContainer.js';
import { ImageCache } from './containers/imageContainer.js';
import { convertOCRAll } from './recognizeConvert.js';
import { ocrAllRaw, ocrAll } from './containers/dataContainer.js';
import { convertOCR } from './recognizeConvert.js';

/**
* Extract raw text content from currently loaded PDF.
Expand Down Expand Up @@ -102,7 +102,7 @@ export const extractInternalPDFText = async (options = {}) => {
const format = 'stext';

// Process HOCR using web worker, reading from file first if that has not been done already
await convertOCRAll(ocrAllRaw.active, true, format, 'pdf', false);
await convertOCR(ocrAllRaw.active, true, format, 'pdf', false);

res.content = ocrAll.pdf;

Expand Down
18 changes: 18 additions & 0 deletions js/generalWorkerMain.js
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,24 @@ export class gs {
}
};

/**
* Adds a `convertPageHocr` job to `gs.schedulerInner`, forwarding `args` unchanged,
* and resolves with the value that job resolves to.
*
* NOTE(review): this reads `gs.schedulerInner` directly and does not initialize it.
* Presumably the scheduler must already exist (e.g. after `gs.getGeneralScheduler()`
* has been awaited) before this is called — confirm against callers.
*
* @param {Parameters<typeof import('./import/convertPageHocr.js').convertPageHocr>[0]} args - Job payload, passed through as-is.
* @returns {ReturnType<typeof import('./import/convertPageHocr.js').convertPageHocr>}
*/
static convertPageHocr = async (args) => (await gs.schedulerInner.addJob('convertPageHocr', args));

/**
* Adds a `convertPageAbbyy` job to `gs.schedulerInner`, forwarding `args` unchanged,
* and resolves with the value that job resolves to.
*
* NOTE(review): this reads `gs.schedulerInner` directly and does not initialize it.
* Presumably the scheduler must already exist (e.g. after `gs.getGeneralScheduler()`
* has been awaited) before this is called — confirm against callers.
*
* @param {Parameters<typeof import('./import/convertPageAbbyy.js').convertPageAbbyy>[0]} args - Job payload, passed through as-is.
* @returns {ReturnType<typeof import('./import/convertPageAbbyy.js').convertPageAbbyy>}
*/
static convertPageAbbyy = async (args) => (await gs.schedulerInner.addJob('convertPageAbbyy', args));

/**
* Adds a `convertPageStext` job to `gs.schedulerInner`, forwarding `args` unchanged,
* and resolves with the value that job resolves to.
*
* NOTE(review): this reads `gs.schedulerInner` directly and does not initialize it.
* Presumably the scheduler must already exist (e.g. after `gs.getGeneralScheduler()`
* has been awaited) before this is called — confirm against callers.
*
* @param {Parameters<typeof import('./import/convertPageStext.js').convertPageStext>[0]} args - Job payload, passed through as-is.
* @returns {ReturnType<typeof import('./import/convertPageStext.js').convertPageStext>}
*/
static convertPageStext = async (args) => (await gs.schedulerInner.addJob('convertPageStext', args));

/**
* @param {Parameters<typeof import('./worker/optimizeFontModule.js').optimizeFont>[0]} args
* @returns {ReturnType<typeof import('./worker/optimizeFontModule.js').optimizeFont>}
Expand Down
6 changes: 3 additions & 3 deletions js/import/import.js
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ import { gs } from '../generalWorkerMain.js';
import { imageUtils } from '../objects/imageObjects.js';
import { LayoutDataTablePage, LayoutPage } from '../objects/layoutObjects.js';
import { PageMetrics } from '../objects/pageMetricsObjects.js';
import { checkCharWarn, convertOCRAll } from '../recognizeConvert.js';
import { checkCharWarn, convertOCR } from '../recognizeConvert.js';
import { replaceObjectProperties } from '../utils/miscUtils.js';
import { importOCRFiles } from './importOCR.js';

Expand Down Expand Up @@ -436,7 +436,7 @@ export async function importFiles(files, options = {}) {
if (stextMode) format = 'stext';

// Process HOCR using web worker, reading from file first if that has not been done already
await convertOCRAll(ocrAllRaw.active, true, format, oemName, scribeMode).then(async () => {
await convertOCR(ocrAllRaw.active, true, format, oemName, scribeMode).then(async () => {
// Skip this step if optimization info was already restored from a previous session, or if using stext (which is character-level but not visually accurate).
if (!existingOpt && !stextMode) {
await checkCharWarn(convertPageWarn);
Expand Down Expand Up @@ -487,5 +487,5 @@ export async function importFilesSupp(files, ocrName) {
if (ocrData.abbyyMode) format = 'abbyy';
if (ocrData.stextMode) format = 'stext';

await convertOCRAll(ocrData.hocrRaw, false, format, ocrName, scribeMode);
await convertOCR(ocrData.hocrRaw, false, format, ocrName, scribeMode);
}
36 changes: 11 additions & 25 deletions js/recognizeConvert.js
Original file line number Diff line number Diff line change
Expand Up @@ -299,34 +299,20 @@ export function checkCharWarn(warnArr) {
* @param {boolean} mainData - Whether this is the "main" data that document metrics are calculated from.
* For imports of user-provided data, the first data provided should be flagged as the "main" data.
* For Tesseract.js recognition, the Tesseract Legacy results should be flagged as the "main" data.
* @param {("hocr"|"abbyy"|"stext"|"blocks")} format - Format of raw data.
* @param {("hocr"|"abbyy"|"stext")} format - Format of raw data.
* @param {string} engineName - Name of OCR engine.
* @param {boolean} [scribeMode=false] - Whether this is HOCR data from this program.
*/
async function convertOCRPage(ocrRaw, n, mainData, format, engineName, scribeMode = false) {
let func = 'convertPageHocr';
if (format === 'abbyy') {
func = 'convertPageAbbyy';
} else if (format === 'stext') {
func = 'convertPageStext';
} else if (format === 'blocks') {
func = 'convertPageBlocks';
}

// Imports are always run in workers in actual use, however for debugging purposes they can be run in the main thread.
export async function convertOCRPage(ocrRaw, n, mainData, format, engineName, scribeMode = false) {
let res;
const parallel = true;
if (parallel) {
await gs.getGeneralScheduler();
res = await gs.schedulerInner.addJob(func, { ocrStr: ocrRaw, n, scribeMode });
} else if (func === 'convertPageHocr') {
res = await import('./import/convertPageHocr.js').then((m) => m.convertPageHocr({ ocrStr: ocrRaw, n, scribeMode }));
} else if (func === 'convertPageAbbyy') {
res = await import('./import/convertPageAbbyy.js').then((m) => m.convertPageAbbyy({ ocrStr: ocrRaw, n, scribeMode }));
} else if (func === 'convertPageStext') {
res = await import('./import/convertPageStext.js').then((m) => m.convertPageStext({ ocrStr: ocrRaw, n, scribeMode }));
} else if (func === 'convertPageBlocks') {
res = await import('./import/convertPageBlocks.js').then((m) => m.convertPageBlocks({ ocrStr: ocrRaw, n, scribeMode }));
if (format === 'hocr') {
res = await gs.convertPageHocr({ ocrStr: ocrRaw, n, scribeMode });
} else if (format === 'abbyy') {
res = await gs.convertPageAbbyy({ ocrStr: ocrRaw, n, scribeMode });
} else if (format === 'stext') {
res = await gs.convertPageStext({ ocrStr: ocrRaw, n, scribeMode });
} else {
throw new Error(`Invalid format: ${format}`);
}

await convertPageCallback(res, n, mainData, engineName);
Expand Down Expand Up @@ -385,7 +371,7 @@ export async function convertPageCallback({
* @param {string} engineName - Name of OCR engine.
* @param {boolean} [scribeMode=false] - Whether this is HOCR data from this program.
*/
export async function convertOCRAll(ocrRawArr, mainData, format, engineName, scribeMode) {
export async function convertOCR(ocrRawArr, mainData, format, engineName, scribeMode) {
// For each page, process OCR using web worker
const promiseArr = [];
for (let n = 0; n < ocrRawArr.length; n++) {
Expand Down
2 changes: 2 additions & 0 deletions scribe.js
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import ocr from './js/objects/ocrObjects.js';
import {
calcEvalStatsDoc,
compareOCR,
convertOCRPage,
evalOCRPage,
recognize, recognizePage,
} from './js/recognizeConvert.js';
Expand Down Expand Up @@ -242,6 +243,7 @@ export default {
clear,
combineOCRPage,
compareOCR,
convertOCRPage,
data,
enableFontOpt,
evalOCRPage,
Expand Down

0 comments on commit 15f8a84

Please sign in to comment.