Commit

Added PDF text extraction to API; replaced with
Balearica committed Aug 17, 2024
1 parent d43810d commit fad9cdc
Showing 16 changed files with 189 additions and 121 deletions.
6 changes: 6 additions & 0 deletions .eslintrc.json
@@ -67,6 +67,12 @@
// "one-var": "off",
// "one-var-declaration-per-line": "off",

// If this is enabled eslint breaks our import statements, such that they no longer run natively in the browser.
"import/no-relative-packages": "off",

// Using blocks for purely organizational purposes (e.g. when in-lining a function) is fine.
"no-lone-blocks": "off",

// This rule was deprecated
"no-return-await": "off",

2 changes: 1 addition & 1 deletion README.md
@@ -25,7 +25,7 @@ import scribe from 'node_modules/scribe.js-ocr/scribe.js';
import scribe from 'scribe.js-ocr';

// Basic usage
scribe.recognizeFiles(['https://tesseract.projectnaptha.com/img/eng_bw.png'])
scribe.extractText(['https://tesseract.projectnaptha.com/img/eng_bw.png'])
.then((res) => console.log(res))
```

17 changes: 13 additions & 4 deletions docs/API.md
@@ -4,7 +4,7 @@

* [init][1]
* [Parameters][2]
* [recognizeFiles][3]
* [extractText][3]
* [Parameters][4]
* [clear][5]
* [terminate][6]
@@ -35,16 +35,21 @@ Initialize the program and optionally pre-load resources.
The PDF renderer and OCR engine are automatically loaded when needed.
Therefore, the only reason to set `pdf` or `ocr` to `true` is to pre-load them. (optional, default `false`)
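
For example, pre-loading both engines before any files are imported looks like the following minimal sketch, based only on the parameters documented above:

```js
import scribe from 'scribe.js-ocr';

// Pre-load the PDF renderer and OCR engine so later calls do not pay the lazy-loading cost.
await scribe.init({ pdf: true, ocr: true });
```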

## recognizeFiles
## extractText

Helper function for recognizing files with a single function call.
Function for extracting text from image and PDF files with a single function call.
By default, existing text content is extracted for text-native PDF files; otherwise text is extracted using OCR.
For more control, use `init`, `importFiles`, `recognize`, and `exportData` separately.

### Parameters

* `files`  
* `langs` **[Array][21]<[string][22]>** (optional, default `['eng']`)
* `outputFormat` (optional, default `'txt'`)
* `options` **[Object][19]?** (optional, default `{}`)

* `options.skipRecPDFTextNative` **[boolean][20]** If the input is a text-native PDF, skip recognition and return the existing text. (optional, default `true`)
* `options.skipRecPDFTextOCR` **[boolean][20]** If the input is an image-native PDF with existing OCR layer, skip recognition and return the existing text. (optional, default `false`)
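
Putting the parameters above together, a call might look like the following minimal sketch (the file path is a placeholder; the other arguments are the documented defaults):

```js
import scribe from 'scribe.js-ocr';

// For a text-native PDF the existing text layer is returned directly;
// image-only PDFs fall through to OCR using the 'eng' language model.
const text = await scribe.extractText(['sample.pdf'], ['eng'], 'txt', {
  skipRecPDFTextNative: true, // skip OCR when the PDF already contains real text
  skipRecPDFTextOCR: false,   // still re-recognize PDFs that only have an OCR layer
});
console.log(text);
```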

## clear

@@ -100,6 +105,10 @@ Alternatively, for `File` objects (browser) and file paths (Node.js), a single a
### Parameters

* `files` **([Array][21]\<File> | FileList | [Array][21]<[string][22]> | [SortedInputFiles][11])**&#x20;
* `options` **[Object][19]?** (optional, default `{}`)

* `options.extractPDFTextNative` **[boolean][20]** Extract text from text-native PDF documents. (optional, default `false`)
* `options.extractPDFTextOCR` **[boolean][20]** Extract text from image-native PDF documents with existing OCR text layers. (optional, default `false`)
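
As a hedged sketch of this lower-level path, the new options can be passed when importing (the file name is a placeholder):

```js
import scribe from 'scribe.js-ocr';

// Import a PDF and pull in any existing text rather than deferring everything to later OCR.
await scribe.importFiles(['scan.pdf'], {
  extractPDFTextNative: true, // use the embedded text when the PDF is text-native
  extractPDFTextOCR: true,    // also reuse an existing OCR layer in image-native PDFs
});
```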

## recognizePage

@@ -135,7 +144,7 @@ The results of recognition can be exported by calling `exportFiles` after this f

[2]: #parameters

[3]: #recognizefiles
[3]: #extracttext

[4]: #parameters-1

2 changes: 1 addition & 1 deletion examples/browser/recognize-basic.js
@@ -5,6 +5,6 @@ await scribe.init({ ocr: true, font: true });
const elm = /** @type {HTMLInputElement} */ (document.getElementById('uploader'));
elm.addEventListener('change', async () => {
if (!elm.files) return;
const text = await scribe.recognizeFiles(elm.files);
const text = await scribe.extractText(elm.files);
console.log(text);
});
2 changes: 1 addition & 1 deletion examples/node/recognize-basic.js
@@ -5,7 +5,7 @@ import scribe from '../../scribe.js';
const [,, imagePath] = process.argv;

(async () => {
const res = await scribe.recognizeFiles([imagePath]);
const res = await scribe.extractText([imagePath]);
console.log(res);
await scribe.terminate();
})();
3 changes: 0 additions & 3 deletions js/containers/app.js
@@ -74,9 +74,6 @@ export class inputData {
/** `true` if user re-uploaded HOCR data created by Scribe OCR */
static resumeMode = false;

/** `true` if stext is extracted from a PDF (rather than text layer uploaded separately) */
static extractTextMode = false;

/** `true` if ground truth data is uploaded */
static evalMode = false;

78 changes: 11 additions & 67 deletions js/containers/imageContainer.js
@@ -216,52 +216,19 @@ export class ImageCache {
static pageCount = 0;

/**
* The dimensions that each page would be, if it was rendered at 300 DPI.
* @type {Array<dims>}
*/
static pdfDims300Arr = [];
* The dimensions that each page would be, if it was rendered at 300 DPI.
* @type {Array<dims>}
*/
static pdfDims300 = [];

static inputModes = {
pdf: false,
image: false,
};

static pdfContentStats = {
/** Total number of letters in the source PDF. */
letterCountTotal: 0,
/** Total number of visible letters in the source PDF. */
letterCountVis: 0,
/** Total number of pages with 100+ letters in the source PDF. */
pageCountTotalText: 0,
/** Total number of pages with 100+ visible letters in the source PDF. */
pageCountVisText: 0,
};

/** @type {?('text'|'ocr'|'image')} */
static pdfType = null;

static setPdfType = () => {
// The PDF is considered text-native if:
// (1) The total number of visible letters is at least 100 per page on average.
// (2) The total number of visible letters is at least 90% of the total number of letters.
// (3) The total number of pages with 100+ visible letters is at least half of the total number of pages.
if (ImageCache.pdfContentStats.letterCountTotal >= ImageCache.pageCount * 100
&& ImageCache.pdfContentStats.letterCountVis >= ImageCache.pdfContentStats.letterCountTotal * 0.9
&& ImageCache.pdfContentStats.pageCountVisText >= ImageCache.pageCount / 2) {
ImageCache.pdfType = 'text';
// The PDF is considered ocr-native if:
// (1) The total number of letters is at least 100 per page on average.
// (2) The total number of letters is at least half of the total number of letters.
} else if (ImageCache.pdfContentStats.letterCountTotal >= ImageCache.pageCount * 100
&& ImageCache.pdfContentStats.letterCountVis >= ImageCache.pageCount / 2) {
ImageCache.pdfType = 'ocr';
// Otherwise, the PDF is considered image-native.
// This includes both literally image-only PDFs, as well as PDFs that have invalid encodings or other issues that prevent valid text extraction.
} else {
ImageCache.pdfType = 'image';
}
};

static colorModeDefault = 'gray';

static cacheRenderPages = 3;
@@ -327,7 +294,7 @@ export class ImageCache {
} if (ImageCache.inputModes.pdf) {
const pageMetrics = pageMetricsArr[n];
const targetWidth = pageMetrics.dims.width;
const dpi = 300 * (targetWidth / ImageCache.pdfDims300Arr[n].width);
const dpi = 300 * (targetWidth / ImageCache.pdfDims300[n].width);
const muPDFScheduler = await ImageCache.getMuPDFScheduler();
return muPDFScheduler.drawPageAsPNG({
page: n + 1, dpi, color, skipText: skipTextMode,
@@ -566,14 +533,10 @@ export class ImageCache {
ImageCache.inputModes.image = false;
ImageCache.inputModes.pdf = false;
ImageCache.pageCount = 0;
ImageCache.pdfDims300Arr.length = 0;
ImageCache.pdfDims300.length = 0;
ImageCache.loadCount = 0;
ImageCache.nativeProps.length = 0;
ImageCache.binaryProps.length = 0;
ImageCache.pdfContentStats.letterCountTotal = 0;
ImageCache.pdfContentStats.letterCountVis = 0;
ImageCache.pdfContentStats.pageCountTotalText = 0;
ImageCache.pdfContentStats.pageCountVisText = 0;
};

static terminate = async () => {
@@ -600,9 +563,8 @@
*
* @param {ArrayBuffer} fileData
* @param {Boolean} [skipText=false] - Whether to skip native text when rendering PDF to image.
* @param {Boolean} [extractStext=false]
*/
static openMainPDF = async (fileData, skipText = false, extractStext = false) => {
static openMainPDF = async (fileData, skipText = false) => {
const muPDFScheduler = await ImageCache.getMuPDFScheduler(3);

await ImageCache.#loadFileMuPDFScheduler(fileData);
@@ -611,9 +573,9 @@

const pageDims1 = await muPDFScheduler.workers[0].pageSizes([300]);

ImageCache.pdfDims300Arr.length = 0;
ImageCache.pdfDims300.length = 0;
pageDims1.forEach((x) => {
ImageCache.pdfDims300Arr.push({ width: x[0], height: x[1] });
ImageCache.pdfDims300.push({ width: x[0], height: x[1] });
});

ImageCache.inputModes.pdf = true;
@@ -627,10 +589,10 @@

// For reasons that are unclear, a small number of pages have been rendered into massive files
// so a hard-cap on resolution must be imposed.
const pageDPI = ImageCache.pdfDims300Arr.map((x) => 300 * 2000 / x.width, 2000);
const pageDPI = ImageCache.pdfDims300.map((x) => 300 * 2000 / x.width, 2000);

// In addition to capping the resolution, also switch the width/height
ImageCache.pdfDims300Arr.forEach((x, i) => {
ImageCache.pdfDims300.forEach((x, i) => {
const pageDims = { width: Math.round(x.width * pageDPI[i] / 300), height: Math.round(x.height * pageDPI[i] / 300) };
pageMetricsArr[i] = new PageMetrics(pageDims);
});
@@ -674,23 +636,5 @@
await setUploadFontsWorker(gs.schedulerInner);
});
}

if (extractStext) {
ocrAllRaw.active = Array(ImageCache.pageCount);
const resArr = pageDPI.map(async (x, i) => {
// While using `pageTextJSON` would save some parsing, unfortunately that format only includes line-level granularity.
// The XML format is the only built-in mupdf format that includes character-level granularity.
const res = await muPDFScheduler.pageText({
page: i + 1, dpi: x, format: 'xml', calcStats: true,
});
ImageCache.pdfContentStats.letterCountTotal += res.letterCountTotal;
ImageCache.pdfContentStats.letterCountVis += res.letterCountVis;
if (res.letterCountTotal >= 100) ImageCache.pdfContentStats.pageCountTotalText++;
if (res.letterCountVis >= 100) ImageCache.pdfContentStats.pageCountVisText++;
ocrAllRaw.active[i] = res.content;
});
await Promise.all(resArr);
ImageCache.setPdfType();
}
};
}
110 changes: 110 additions & 0 deletions js/extractPDFText.js
@@ -0,0 +1,110 @@
import { ImageCache } from './containers/imageContainer.js';
import { convertOCRAll } from './recognizeConvert.js';
import { ocrAllRaw, ocrAll } from './containers/dataContainer.js';

/**
* Extract raw text content from currently loaded PDF.
* Reports whether PDF is text-native, contains invisible OCR text, or is image-only.
*/
const extractInternalPDFTextRaw = async () => {
const muPDFScheduler = await ImageCache.getMuPDFScheduler(3);

const pdfContentStats = {
/** Total number of letters in the source PDF. */
letterCountTotal: 0,
/** Total number of visible letters in the source PDF. */
letterCountVis: 0,
/** Total number of pages with 100+ letters in the source PDF. */
pageCountTotalText: 0,
/** Total number of pages with 100+ visible letters in the source PDF. */
pageCountVisText: 0,
};

const stextArr = /** @type {Array<string>} */ ([]);
const pageDPI = ImageCache.pdfDims300.map((x) => 300 * 2000 / x.width, 2000);
const resArr = pageDPI.map(async (x, i) => {
// While using `pageTextJSON` would save some parsing, unfortunately that format only includes line-level granularity.
// The XML format is the only built-in mupdf format that includes character-level granularity.
const res = await muPDFScheduler.pageText({
page: i + 1, dpi: x, format: 'xml', calcStats: true,
});
pdfContentStats.letterCountTotal += res.letterCountTotal;
pdfContentStats.letterCountVis += res.letterCountVis;
if (res.letterCountTotal >= 100) pdfContentStats.pageCountTotalText++;
if (res.letterCountVis >= 100) pdfContentStats.pageCountVisText++;
stextArr[i] = res.content;
});
await Promise.all(resArr);

/** @type {"image" | "text" | "ocr"} */
let type = 'image';

// Determine whether the PDF is text-native, image-only, or image + OCR.
{
// The PDF is considered text-native if:
// (1) The total number of visible letters is at least 100 per page on average.
// (2) The total number of visible letters is at least 90% of the total number of letters.
// (3) The total number of pages with 100+ visible letters is at least half of the total number of pages.
if (pdfContentStats.letterCountTotal >= ImageCache.pageCount * 100
&& pdfContentStats.letterCountVis >= pdfContentStats.letterCountTotal * 0.9
&& pdfContentStats.pageCountVisText >= ImageCache.pageCount / 2) {
type = 'text';
// The PDF is considered ocr-native if:
// (1) The total number of letters is at least 100 per page on average.
// (2) The total number of letters is at least half of the total number of letters.
} else if (pdfContentStats.letterCountTotal >= ImageCache.pageCount * 100
&& pdfContentStats.letterCountVis >= ImageCache.pageCount / 2) {
type = 'ocr';
// Otherwise, the PDF is considered image-native.
// This includes both literally image-only PDFs, as well as PDFs that have invalid encodings or other issues that prevent valid text extraction.
} else {
type = 'image';
}
}

return { contentRaw: stextArr, content: /** @type {?Array<OcrPage>} */ (null), type };
};

/**
* Extract and parse text from currently loaded PDF.
* @param {Object} [options]
* @param {boolean} [options.extractPDFTextNative=true] - Extract text from text-native PDF documents.
* @param {boolean} [options.extractPDFTextOCR=false] - Extract text from image-native PDF documents with existing OCR text layers.
* @param {boolean} [options.extractPDFTextImage=false] - Extract text from image-native PDF documents with no existing OCR layer.
* This option exists because documents may still contain some text even if they are determined to be image-native (for example, scanned documents with a text-native header).
* @param {boolean} [options.setActive=false] - Set the active OCR data to the extracted text.
*/
export const extractInternalPDFText = async (options = {}) => {
const extractPDFTextNative = options?.extractPDFTextNative ?? true;
const extractPDFTextOCR = options?.extractPDFTextOCR ?? false;
const extractPDFTextImage = options?.extractPDFTextImage ?? false;

const setActive = options?.setActive ?? false;

const res = await extractInternalPDFTextRaw();

ImageCache.pdfType = res.type;
ocrAllRaw.pdf = res.contentRaw;

if (!extractPDFTextImage && res.type === 'image') return res;

if (!extractPDFTextOCR && res.type === 'ocr') return res;

if (!extractPDFTextNative && res.type === 'text') return res;

ocrAll.pdf = Array(ImageCache.pageCount);

if (setActive) {
ocrAllRaw.active = ocrAllRaw.pdf;
ocrAll.active = ocrAll.pdf;
}

const format = 'stext';

// Process HOCR using web worker, reading from file first if that has not been done already
await convertOCRAll(ocrAllRaw.active, true, format, 'pdf', false);

res.content = ocrAll.pdf;

return res;
};
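
For orientation, a hypothetical use of this new module might look like the sketch below, assuming a PDF has already been loaded (for example via `importFiles`); the import path and option values are illustrative only:

```js
import { extractInternalPDFText } from './js/extractPDFText.js';

// Classify the loaded PDF and, unless it is image-only, parse its text layer into OcrPage objects.
const res = await extractInternalPDFText({ extractPDFTextOCR: true, setActive: true });
console.log(res.type);    // 'text', 'ocr', or 'image'
console.log(res.content); // parsed pages, or null when extraction was skipped
```
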
4 changes: 2 additions & 2 deletions js/generalWorkerMain.js
@@ -263,12 +263,12 @@ export class gs {
static getGeneralScheduler = async () => {
if (gs.schedulerReady) {
await gs.schedulerReady;
return gs.scheduler;
return /** @type {GeneralScheduler} */ (gs.scheduler);
}

await gs.init();

return gs.scheduler;
return /** @type {GeneralScheduler} */ (gs.scheduler);
};

static terminate = async () => {