From 90fe37e5e33d3208beabad32dbd14136d55614b3 Mon Sep 17 00:00:00 2001 From: Balearica Date: Mon, 19 Aug 2024 19:10:44 -0700 Subject: [PATCH] Fixed bug with OCR comparison requiring image; added test --- js/containers/imageContainer.js | 2 +- js/export/exportDebugCsv.js | 8 +- js/import/import.js | 4 +- js/recognizeConvert.js | 16 +- js/worker/compareOCRModule.js | 17 +- package-lock.json | 46 +- package.json | 2 +- scribe.js | 4 +- tests/assets/complaint_1.truth.hocr | 250 ++++++ tests/assets/complaint_1.xml | 1216 +++++++++++++++++++++++++++ tests/module/evaluate.spec.js | 37 + 11 files changed, 1566 insertions(+), 36 deletions(-) create mode 100644 tests/assets/complaint_1.truth.hocr create mode 100644 tests/assets/complaint_1.xml create mode 100644 tests/module/evaluate.spec.js diff --git a/js/containers/imageContainer.js b/js/containers/imageContainer.js index 59d6ccd..5e31a62 100644 --- a/js/containers/imageContainer.js +++ b/js/containers/imageContainer.js @@ -300,7 +300,7 @@ export class ImageCache { page: n + 1, dpi, color, skipText: skipTextMode, }).then((res) => new ImageWrapper(n, res, color ? 'color' : 'gray')); } - throw new Error('No input mode set'); + throw new Error('Attempted to render image without image input provided.'); }; /** diff --git a/js/export/exportDebugCsv.js b/js/export/exportDebugCsv.js index ce0810c..1dfed21 100644 --- a/js/export/exportDebugCsv.js +++ b/js/export/exportDebugCsv.js @@ -22,10 +22,10 @@ const escapeCSVField = (field) => { }; /** - * Converts an array of objects with atomic properties (string, number, boolean) to a CSV string. - * @param {Array} data - The array of data objects. - * @returns {string} - The CSV string. - */ + * Converts an array of objects with atomic properties (string, number, boolean) to a CSV string. + * @param {Array} data - The array of data objects. + * @returns {string} - The CSV string. + */ export const convertToCSV = (data) => { if (data.length === 0) { return ''; diff --git a/js/import/import.js b/js/import/import.js index ce5ca4d..159fe4d 100644 --- a/js/import/import.js +++ b/js/import/import.js @@ -465,6 +465,8 @@ export async function importFilesSupp(files, ocrName) { const ocrData = await importOCRFiles(ocrFilesAll); + const scribeMode = ocrData.scribeMode; + const pageCountHOCR = ocrData.hocrRaw.length; // If both OCR data and image data are present, confirm they have the same number of pages @@ -478,5 +480,5 @@ export async function importFilesSupp(files, ocrName) { if (ocrData.abbyyMode) format = 'abbyy'; if (ocrData.stextMode) format = 'stext'; - convertOCRAll(ocrData.hocrRaw, false, format, ocrName); + await convertOCRAll(ocrData.hocrRaw, false, format, ocrName, scribeMode); } diff --git a/js/recognizeConvert.js b/js/recognizeConvert.js index 6318cf4..70040db 100644 --- a/js/recognizeConvert.js +++ b/js/recognizeConvert.js @@ -24,7 +24,19 @@ import { replaceObjectProperties } from './utils/miscUtils.js'; */ export const compareOCRPage = async (pageA, pageB, options) => { const func = typeof process !== 'undefined' ? (await import('./worker/compareOCRModule.js')).compareOCRPageImp : gs.scheduler.compareOCRPageImp; - const binaryImage = await ImageCache.getBinary(pageA.n); + + // Some combinations of options require the image to be provided, and some do not. + // We skip sending the image for those that do not, as in addition to helping performance, + // this is also necessary to run basic comparison scripts (e.g. benchmarking accuracy) without providing the image. + // TODO: Rework the options so this works better with types. + // At present TypeScript has no way of knowing that certain combinations of options go with each other. + const mode = options?.mode || 'stats'; + const evalConflicts = options?.evalConflicts ?? true; + const supplementComp = options?.supplementComp ?? false; + const skipImage = (mode === 'stats' && !supplementComp) || (mode === 'comb' && !evalConflicts && !supplementComp); + + const binaryImage = skipImage ? null : await ImageCache.getBinary(pageA.n); + const pageMetricsObj = pageMetricsArr[pageA.n]; return func({ pageA, pageB, binaryImage, pageMetricsObj, options, @@ -51,7 +63,7 @@ export const evalOCRPage = async (params) => { * Compare two sets of OCR data. * @param {Array} ocrA * @param {Array} ocrB - * @param {Parameters[0]['options']} options + * @param {Parameters[0]['options']} [options] */ export const compareOCR = async (ocrA, ocrB, options) => { /** @type {Parameters[2]} */ diff --git a/js/worker/compareOCRModule.js b/js/worker/compareOCRModule.js index 01d4130..fe09a20 100644 --- a/js/worker/compareOCRModule.js +++ b/js/worker/compareOCRModule.js @@ -486,10 +486,19 @@ async function penalizeWord(wordObjs) { export async function compareOCRPageImp({ pageA, pageB, binaryImage, pageMetricsObj, options = {}, }) { - const binaryImageBit = binaryImage.imageBitmap || await getImageBitmap(binaryImage.src); - - const imageUpscaled = binaryImage.upscaled; - const imageRotated = binaryImage.rotated; + // The `binaryImage` argument is not sent for certain operations, which do not require it. + // For example, running a basic comparison between a page and the ground truth does not require having the image. + // The types do not currently reflect this, so this should be reworked at some point. + /** @type {?ImageBitmap} */ + let binaryImageBit = null; + let imageUpscaled = false; + let imageRotated = false; + + if (binaryImage) { + binaryImageBit = binaryImage.imageBitmap || await getImageBitmap(binaryImage.src); + imageUpscaled = binaryImage.upscaled; + imageRotated = binaryImage.rotated; + } const mode = options?.mode === undefined ? 'stats' : options?.mode; const editConf = options?.editConf === undefined ? false : options?.editConf; diff --git a/package-lock.json b/package-lock.json index 9a15fb6..f3f85e1 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,10 +9,10 @@ "version": "0.2.0", "license": "AGPL-3.0", "dependencies": { + "@scribe.js/tesseract.js": "^5.0.4", "canvas": "^2.11.2", "commander": "^11.1.0", "puppeteer": "^22.13.0", - "tesseract.js": "scribeocr/tesseract.js#2065fd6", "web-worker": "~1.2.0" }, "devDependencies": { @@ -739,6 +739,29 @@ "node": ">=12" } }, + "node_modules/@scribe.js/tesseract.js": { + "version": "5.0.4", + "resolved": "https://registry.npmjs.org/@scribe.js/tesseract.js/-/tesseract.js-5.0.4.tgz", + "integrity": "sha512-RLYqg1Yapg6xHhTmnp7lHCXfeoj0hD4W3N5fAIReCumzLRv3gn3pgZrWlFblMUZFq85t1cZ2jDZHC2SfSm30Sg==", + "hasInstallScript": true, + "dependencies": { + "@scribe.js/tesseract.js-core": "^5.1.0", + "bmp-js": "^0.1.0", + "idb-keyval": "^6.2.0", + "is-electron": "^2.2.2", + "is-url": "^1.2.4", + "node-fetch": "^2.6.9", + "opencollective-postinstall": "^2.0.3", + "regenerator-runtime": "^0.13.3", + "wasm-feature-detect": "^1.2.11", + "zlibjs": "^0.3.1" + } + }, + "node_modules/@scribe.js/tesseract.js-core": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/@scribe.js/tesseract.js-core/-/tesseract.js-core-5.1.0.tgz", + "integrity": "sha512-gP2nvNMTyDbMtTECQbxAi2Zqn444TUzMNOOzYl3o8q+TiL1KbtYXC3nXD35nF3+8E6KZRg5yLdcdnyW5SylkYQ==" + }, "node_modules/@sideway/address": { "version": "4.1.5", "resolved": "https://registry.npmjs.org/@sideway/address/-/address-4.1.5.tgz", @@ -8390,27 +8413,6 @@ "streamx": "^2.15.0" } }, - "node_modules/tesseract.js": { - "version": "5.0.4", - "resolved": "git+ssh://git@github.com/scribeocr/tesseract.js.git#2065fd67c09c99d593096f66b475f840590203d3", - "hasInstallScript": true, - "dependencies": { - "bmp-js": "^0.1.0", - "idb-keyval": "^6.2.0", - "is-electron": "^2.2.2", - "is-url": "^1.2.4", - "node-fetch": "^2.6.9", - "opencollective-postinstall": "^2.0.3", - "regenerator-runtime": "^0.13.3", - "tesseract.js-core": "scribeocr/tesseract.js-core#ed2b922", - "wasm-feature-detect": "^1.2.11", - "zlibjs": "^0.3.1" - } - }, - "node_modules/tesseract.js-core": { - "version": "5.1.0", - "resolved": "git+ssh://git@github.com/scribeocr/tesseract.js-core.git#ed2b9220ab784c8772bd8ee8002f94e5ebe577df" - }, "node_modules/text-decoder": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/text-decoder/-/text-decoder-1.1.1.tgz", diff --git a/package.json b/package.json index ced2015..8702dd5 100644 --- a/package.json +++ b/package.json @@ -52,7 +52,7 @@ "canvas": "^2.11.2", "commander": "^11.1.0", "puppeteer": "^22.13.0", - "tesseract.js": "scribeocr/tesseract.js#2065fd6", + "@scribe.js/tesseract.js": "^5.0.4", "web-worker": "~1.2.0" } } diff --git a/scribe.js b/scribe.js index a63de50..65f856d 100644 --- a/scribe.js +++ b/scribe.js @@ -11,7 +11,7 @@ import { ImageCache } from './js/containers/imageContainer.js'; import coords from './js/coordinates.js'; import { drawDebugImages } from './js/debug.js'; import { download, exportData } from './js/export/export.js'; -import { writeDebugCsv } from './js/export/exportDebugCsv.js'; +import { writeDebugCsv, convertToCSV } from './js/export/exportDebugCsv.js'; import { extractSingleTableContent } from './js/export/exportWriteTabular.js'; import { loadBuiltInFontsRaw, enableFontOpt } from './js/fontContainerMain.js'; import { gs } from './js/generalWorkerMain.js'; @@ -131,6 +131,8 @@ class utils { // Misc utils static calcBoxOverlap = calcBoxOverlap; + static convertToCSV = convertToCSV; + static replaceSmartQuotes = replaceSmartQuotes; static getRandomAlphanum = getRandomAlphanum; diff --git a/tests/assets/complaint_1.truth.hocr b/tests/assets/complaint_1.truth.hocr new file mode 100644 index 0000000..20ce813 --- /dev/null +++ b/tests/assets/complaint_1.truth.hocr @@ -0,0 +1,250 @@ + + + + + + + + + + + + + + + + +
+ + Case + 1:13-cv-01317-TWT-ECS + Document + 1 + Filed + 04/22/13 + Page + 1 + of + 10 + + + IN + THE + UNITED + STATES + DISTRICT + COURT + + + FOR + THE + NORTHERN + DISTRICT + OF + GEORGIA + + + ATLANTA + DIVISION + + + EQUAL + EMPLOYMENT + + + OPPORTUNITY + COMMISSION, + + + Plaintiff, + CIVIL + ACTION + NO. + + + v. + + + HIRE + DYNAMICS, + LLC, + JURY + TRIAL + DEMANDED + + + Defendant. + + + COMPLAINT + + + This + is + an + action + under + Title + VII + of + the + Civil + Rights + Act + of + 1964 + (“Title + + + VII”), + as + amended, + and + Title + I + of + the + Civil + Rights + Act + of + 1991 + to + correct + unlawful + + + employment + practices + on + the + basis + of + retaliation + and + to + provide + appropriate + relief + to + + + Christopher + Wood + (“Wood”), + who + was + adversely + affected + by + such + practices. + The + + + Plaintiff + alleges + that + while + employed + with + Defendant + Hire + Dynamics, + LLC + + + (“Defendant”), + a + staffing + company, + as + a + Quality + Auditor + assigned + to + work + for + one + + + of + its + clients, + Wood + was + suspended + for + one + week + and, + as + a + result, + he + complained + of + + + discrimination + and + informed + his + supervisor + at + Defendant + that + he + was + going + to + the + + + EEOC + to + complain + of + discrimination. + Wood + did + actually + file + a + charge + of + + + discrimination. + Defendant + never + allowed + Wood + to + return + to + work + after + his + + + suspension + and + denied + him + all + future + assignments + even + though + Wood + consistently + + + 1 + +
+ + \ No newline at end of file diff --git a/tests/assets/complaint_1.xml b/tests/assets/complaint_1.xml new file mode 100644 index 0000000..f989d1b --- /dev/null +++ b/tests/assets/complaint_1.xml @@ -0,0 +1,1216 @@ + + + + + + + +C +a +s +e + +l +: +1 +3 +- +c +v +- +0 +1 +3 +1 +7 +- +T +W +T +- +E +C +S + +D +o +c +u +m +e +n +t + +I + +F +i +l +e +d + +0 +4 +/ +2 +2 +/ +1 +3 + +P +a +g +e + +1 + +o +f + +1 +0 + + + + + + +I +N + +T +H +E + +U +N +I +T +E +D + +S +T +A +T +E +S + +D +I +S +T +R +I +C +T + +C +O +U +R +T + +F +O +R + +T +H +E + +N +O +R +T +H +E +R +N + +D +I +S +T +R +I +C +T + +O +F + +G +E +O +R +G +I +A + +A +T +L +A +N +T +A + +D +I +V +I +S +I +O +N + + + + + + +E +Q +U +A +L + +E +M +P +L +O +Y +M +E +N +T + +O +P +P +O +R +T +U +N +I +T +Y + +C +O +M +M +I +S +S +I +O +N +, + + +P +l +a +i +n +t +i +f +f +, + + +v +. + + +H +I +R +E + +D +Y +N +A +M +I +C +S +, + +L +L +C +, + + +D +e +f +e +n +d +a +n +t +. + + + + + + +C +I +V +I +L + +A +C +T +I +O +N + +N +O +. + + + + + + +J +U +R +Y + +T +R +I +A +L + +D +E +M +A +N +D +E +D + + + + + + +C +O +M +P +L +A +I +N +T + + +T +h +i +s + +i +s + +a +n + +a +c +t +i +o +n + +u +n +d +e +r + +T +i +t +l +e + +V +I +I + +o +f + +t +h +e + +C +i +v +i +l + +R +i +g +h +t +s + +A +c +t + +o +f + +1 +9 +6 +4 + +( + +T +i +t +l +e + +V +I +I + +) +, + +a +s + +a +m +e +n +d +e +d +, + +a +n +d + +T +i +t +l +e + +I + +o +f + +t +h +e + +C +i +v +i +l + +R +i +g +h +t +s + +A +c +t + +o +f + +1 +9 +9 +1 + +t +o + +c +o +r +r +e +c +t + +u +n +l +a +w +f +u +l + +e +m +p +l +o +y +m +e +n +t + +p +r +a +c +t +i +c +e +s + +o +n + +t +h +e + +b +a +s +i +s + +o +f + +r +e +t +a +l +i +a +t +i +o +n + +a +n +d + +t +o + +p +r +o +v +i +d +e + +a +p +p +r +o +p +r +i +a +t +e + +r +e +l +i +e +f + +t +o + +C +h +r +i +s +t +o +p +h +e +r + +W +o +o +d + +( + +W +o +o +d + +) +, + +w +h +o + +w +a +s + +a +d +v +e +r +s +e +l +y + +a +f +f +e +c +t +e +d + +b +y + +s +u +c +h + +p +r +a +c +t +i +c +e +s +. + +T +h +e + +P +l +a +i +n +t +i +f +f + +a +l +l +e +g +e +s + +t +h +a +t + +w +h +i +l +e + +e +m +p +l +o +y +e +d + +w +i +t +h + +D +e +f +e +n +d +a +n +t + +H +i +r +e + +D +y +n +a +m +i +c +s +, + +L +L +C + +( + +D +e +f +e +n +d +a +n +t + +) +, + +a + +s +t +a +f +f +i +n +g + +c +o +m +p +a +n +y +, + +a +s + +a + +Q +u +a +l +i +t +y + +A +u +d +i +t +o +r + +a +s +s +i +g +n +e +d + +t +o + +w +o +r +k + +f +o +r + +o +n +e + +o +f + +i +t +s + +c +l +i +e +n +t +s +, + +W +o +o +d + +w +a +s + +s +u +s +p +e +n +d +e +d + +f +o +r + +o +n +e + +w +e +e +k + +a +n +d +, + +a +s + +a + +r +e +s +u +l +t +, + +h +e + +c +o +m +p +l +a +i +n +e +d + +o +f + +d +i +s +c +r +i +m +i +n +a +t +i +o +n + +a +n +d + +i +n +f +o +r +m +e +d + +h +i +s + +s +u +p +e +r +v +i +s +o +r + +a +t + +D +e +f +e +n +d +a +n +t + +t +h +a +t + +h +e + +w +a +s + +g +o +i +n +g + +t +o + +t +h +e + +E +E +O +C + +t +o + +c +o +m +p +l +a +i +n + +o +f + +d +i +s +c +r +i +m +i +n +a +t +i +o +n +. + +W +o +o +d + +d +i +d + +a +c +t +u +a +l +l +y + +f +i +l +e + +a + +c +h +a +r +g +e + +o +f + +d +i +s +c +r +i +m +i +n +a +t +i +o +n +. + +D +e +f +e +n +d +a +n +t + +n +e +v +e +r + +a +l +l +o +w +e +d + +W +o +o +d + +t +o + +r +e +t +u +r +n + +t +o + +w +o +r +k + +a +f +t +e +r + +h +i +s + +s +u +s +p +e +n +s +i +o +n + +a +n +d + +d +e +n +i +e +d + +h +i +m + +a +l +l + +f +u +t +u +r +e + +a +s +s +i +g +n +m +e +n +t +s + +e +v +e +n + +t +h +o +u +g +h + +W +o +o +d + +c +o +n +s +i +s +t +e +n +t +l +y + + + + + + +1 + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/module/evaluate.spec.js b/tests/module/evaluate.spec.js new file mode 100644 index 0000000..70b1bb8 --- /dev/null +++ b/tests/module/evaluate.spec.js @@ -0,0 +1,37 @@ +// Relative imports are required to run in browser. +/* eslint-disable import/no-relative-packages */ +import { assert, config } from '../../node_modules/chai/chai.js'; +// import mocha from '../../node_modules/mocha/mocha.js'; +import scribe from '../../scribe.js'; +import { ASSETS_PATH_KARMA } from '../constants.js'; + +config.truncateThreshold = 0; // Disable truncation for actual/expected values on assertion failure. + +// Using arrow functions breaks references to `this`. +/* eslint-disable prefer-arrow-callback */ +/* eslint-disable func-names */ + +describe('Check evaluate function.', function () { + this.timeout(10000); + before(async () => { + }); + + it('Should correctly compare page to ground truth', async () => { + await scribe.importFiles([`${ASSETS_PATH_KARMA}/complaint_1.xml`]); + await scribe.importFilesSupp([`${ASSETS_PATH_KARMA}/complaint_1.truth.hocr`], 'Ground Truth'); + + const res = await scribe.compareOCR(scribe.data.ocr.active, scribe.data.ocr['Ground Truth']); + + const evalStatsDoc = scribe.utils.calcEvalStatsDoc(res.metrics); + + assert.strictEqual(evalStatsDoc.total, 183); + assert.strictEqual(evalStatsDoc.correct, 181); + assert.strictEqual(evalStatsDoc.incorrect, 2); + assert.strictEqual(evalStatsDoc.missed, 0); + assert.strictEqual(evalStatsDoc.extra, 0); + }).timeout(10000); + + after(async () => { + await scribe.terminate(); + }); +}).timeout(120000);