Updates to module and docs

scribeocr · Aug 16, 2024 · 06072f7 · 06072f7
1 parent 4e7859f
commit 06072f7
Show file tree

Hide file tree

Showing 23 changed files with 3,240 additions and 104 deletions.
diff --git a/README.md b/README.md
@@ -9,4 +9,50 @@ Common use cases:
 3. Write `.pdf` files that include a high-quality invisible text layer.
 	1. scribe.js can insert text into an existing `.pdf` file, making it searchable.
 
-Scribe.js is a library intended for developers.  End users who want to scan documents should see the officially-supported GUI at [scribeocr.com](https://scribeocr.com/) (repo [here](https://github.com/scribeocr/scribeocr)).
+Scribe.js is a library intended for developers.  End users who want to scan documents should see the officially-supported GUI at [scribeocr.com](https://scribeocr.com/) (repo [here](https://github.com/scribeocr/scribeocr)).
+
+# Setup
+Install from `npm` by running the following:
+```sh
+npm i scribe.js-ocr
+```
+
+Scribe.js is written in JavaScript using ESM, so it can be imported directly from browser or Node.js JavaScript code.
+```js
+// Import statement in browser:
+import scribe from './node_modules/scribe.js-ocr/scribe.js';
+// Import statement for Node.js:
+import scribe from 'scribe.js-ocr';
+
+// Basic usage
+scribe.recognizeFiles(['https://tesseract.projectnaptha.com/img/eng_bw.png'])
+	.then((res) => console.log(res))
+```
+
+When using Scribe.js in the browser, all files must be served from the same origin as the file importing Scribe.js.  This means that importing Scribe.js from a CDN will not work.  There is no UMD version.
+
+# Scribe.js vs. Tesseract.js
+Considering whether Scribe.js or Tesseract.js is better for your project?  Read [this article](./docs/scribe_vs_tesseract.md).
+
+# Documentation
+- [Basic Browser Examples](./examples/browser/)
+- [Basic Node.js Examples](./examples/node/)
+- [Scribe.js vs. Tesseract.js Comparison](./docs/scribe_vs_tesseract.md)
+- [API](./docs/API.md)
+
+# Contributing
+To work on a local copy, simply clone with `--recurse-submodules` and install.  Please run the automated tests before making a PR.
+```sh
+## Clone the repo, including recursively cloning submodules
+git clone --recurse-submodules git@github.com:scribeocr/scribe.js.git
+cd scribe.js
+
+## Install dependencies
+npm i
+
+## Make changes
+## [...]
+
+## Run automated tests before making PR
+npm run test
+```
diff --git a/cli/extract.js b/cli/extract.js
@@ -1,6 +1,6 @@
 import fs from 'fs';
 import path from 'path';
-import scribe from '../module.js';
+import scribe from '../scribe.js';
 
 /**
  *

diff --git a/cli/main.js b/cli/main.js
@@ -5,7 +5,7 @@ import fs from 'fs';
 import path from 'path';
 
 import { tmpUnique } from '../js/worker/compareOCRModule.js';
-import scribe from '../module.js';
+import scribe from '../scribe.js';
 
 // When `debugMode` is enabled:
 // (1) Comparison images are saved as .png files.

diff --git a/docs/API.md b/docs/API.md
@@ -0,0 +1,182 @@
+<!-- Generated by documentation.js. Update this documentation by updating the source code. -->
+
+### Table of Contents
+
+*   [init][1]
+    *   [Parameters][2]
+*   [recognizeFiles][3]
+    *   [Parameters][4]
+*   [clear][5]
+*   [terminate][6]
+*   [exportData][7]
+    *   [Parameters][8]
+*   [download][9]
+    *   [Parameters][10]
+*   [SortedInputFiles][11]
+    *   [Properties][12]
+*   [importFiles][13]
+    *   [Parameters][14]
+*   [recognizePage][15]
+    *   [Parameters][16]
+*   [recognize][17]
+    *   [Parameters][18]
+
+## init
+
+Initialize the program and optionally pre-load resources.
+
+### Parameters
+
+*   `params` **[Object][19]?**&#x20;
+
+    *   `params.pdf` **[boolean][20]** Load PDF renderer. (optional, default `false`)
+    *   `params.ocr` **[boolean][20]** Load OCR engine. (optional, default `false`)
+    *   `params.font` **[boolean][20]** Load built-in fonts.
+        The PDF renderer and OCR engine are automatically loaded when needed.
+        Therefore, the only reason to set `pdf` or `ocr` to `true` is to pre-load them. (optional, default `false`)
+
+## recognizeFiles
+
+Helper function for recognizing files with a single function call.
+For more control, use `init`, `importFiles`, `recognize`, and `exportData` separately.
+
+### Parameters
+
+*   `files` &#x20;
+*   `langs` **[Array][21]<[string][22]>**  (optional, default `['eng']`)
+*   `outputFormat`   (optional, default `'txt'`)
+
+## clear
+
+Clears all document-specific data.
+
+## terminate
+
+Terminates the program and releases resources.
+
+## exportData
+
+Export active OCR data to specified format.
+
+### Parameters
+
+*   `format` **(`"pdf"` | `"hocr"` | `"docx"` | `"xlsx"` | `"txt"` | `"text"`)**  (optional, default `'txt'`)
+*   `minValue` **[number][23]**  (optional, default `0`)
+*   `maxValue` **[number][23]**  (optional, default `-1`)
+
+Returns **[Promise][24]<([string][22] | [ArrayBuffer][25])>**&#x20;
+
+## download
+
+Runs `exportData` and saves the result as a download (browser) or local file (Node.js).
+
+### Parameters
+
+*   `format` **(`"pdf"` | `"hocr"` | `"docx"` | `"xlsx"` | `"txt"` | `"text"`)**&#x20;
+*   `fileName` **[string][22]**&#x20;
+*   `minValue` **[number][23]**  (optional, default `0`)
+*   `maxValue` **[number][23]**  (optional, default `-1`)
+
+## SortedInputFiles
+
+An object with this shape can be used to provide input to the `importFiles` function,
+without needing that function to figure out the file types.
+This is required when using ArrayBuffer inputs.
+
+Type: [Object][19]
+
+### Properties
+
+*   `pdfFiles` **([Array][21]\<File> | [Array][21]<[string][22]> | [Array][21]<[ArrayBuffer][25]>)?**&#x20;
+*   `imageFiles` **([Array][21]\<File> | [Array][21]<[string][22]> | [Array][21]<[ArrayBuffer][25]>)?**&#x20;
+*   `ocrFiles` **([Array][21]\<File> | [Array][21]<[string][22]> | [Array][21]<[ArrayBuffer][25]>)?**&#x20;
+
+## importFiles
+
+Import files for processing.
+An object with `pdfFiles`, `imageFiles`, and `ocrFiles` arrays can be provided to import multiple types of files.
+Alternatively, for `File` objects (browser) and file paths (Node.js), a single array can be provided, which is sorted based on extension.
+
+### Parameters
+
+*   `files` **([Array][21]\<File> | FileList | [Array][21]<[string][22]> | [SortedInputFiles][11])**&#x20;
+
+## recognizePage
+
+Recognize a single page in active document.
+Use `recognize` instead to recognize all pages in a document.
+
+### Parameters
+
+*   `n` **[number][23]** Page number to recognize.
+*   `legacy` **[boolean][20]** *
+*   `lstm` **[boolean][20]** *
+*   `areaMode` **[boolean][20]** *
+*   `tessOptions` **[Object][19]<[string][22], [string][22]>** Options to pass to Tesseract.js. (optional, default `{}`)
+*   `debugVis` **[boolean][20]** Generate instructions for debugging visualizations. (optional, default `false`)
+
+## recognize
+
+Recognize all pages in active document.
+Files for recognition should already be imported using `importFiles` before calling this function.
+The results of recognition can be exported by calling `exportData` after this function.
+
+### Parameters
+
+*   `options` **[Object][19]**  (optional, default `{}`)
+
+    *   `options.mode` **(`"speed"` | `"quality"`)** Recognition mode. (optional, default `'quality'`)
+    *   `options.langs` **[Array][21]<[string][22]>** Language(s) in document. (optional, default `['eng']`)
+    *   `options.modeAdv` **(`"lstm"` | `"legacy"` | `"combined"`)** Alternative method of setting recognition mode. (optional, default `'combined'`)
+    *   `options.combineMode` **(`"conf"` | `"data"`)** Method of combining OCR results. Used if OCR data already exists. (optional, default `'data'`)
+    *   `options.vanillaMode` **[boolean][20]** Whether to use the vanilla Tesseract.js model. (optional, default `false`)
+
+[1]: #init
+
+[2]: #parameters
+
+[3]: #recognizefiles
+
+[4]: #parameters-1
+
+[5]: #clear
+
+[6]: #terminate
+
+[7]: #exportdata
+
+[8]: #parameters-2
+
+[9]: #download
+
+[10]: #parameters-3
+
+[11]: #sortedinputfiles
+
+[12]: #properties
+
+[13]: #importfiles
+
+[14]: #parameters-4
+
+[15]: #recognizepage
+
+[16]: #parameters-5
+
+[17]: #recognize
+
+[18]: #parameters-6
+
+[19]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Object
+
+[20]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Boolean
+
+[21]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Array
+
+[22]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/String
+
+[23]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Number
+
+[24]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Promise
+
+[25]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/ArrayBuffer
diff --git a/docs/scribe_vs_tesseract.md b/docs/scribe_vs_tesseract.md
@@ -0,0 +1,39 @@
+# Overview
+Scribe.js and Tesseract.js are both JavaScript packages that allow for running OCR in the browser or Node.js.  As both packages have advantages and disadvantages, this article explains how the packages differ, and should help developers decide which package is right for their project.
+
+## TL;DR
+Tesseract.js is smaller and faster than Scribe.js.  Projects that only need to extract text from `.png` and `.jpeg` images, and are satisfied with "pretty good" accuracy, should use Tesseract.js.  Scribe.js builds on Tesseract.js by providing more accurate OCR results and more features.  Most notably, Scribe.js provides PDF support, including the ability to extract existing text from PDFs, run OCR on PDFs, and add text layers to PDFs.  Developers unsure of the tradeoffs should try both packages using images their application is likely to encounter. 
+
+# Scope
+Tesseract.js and Scribe.js exist as separate packages, despite providing similar features and containing shared code, because the two projects differ in scope.  Tesseract.js has a significantly narrower scope than Scribe.js.
+
+**The goal of Tesseract.js is to bring Tesseract--a popular program we do not maintain--to JavaScript.**  As long as the JavaScript interface is user-friendly and works correctly, and recognition results are similar to Tesseract on desktop, Tesseract.js is working as intended.  All bugs inherited from the main Tesseract codebase are outside of the scope of the Tesseract.js project.  As a result, a large number of Tesseract.js Git Issues, including virtually all accuracy-related issues, are closed as out of scope.
+
+**The goal of Scribe.js is to provide high-quality text extraction in JavaScript.**  Scribe.js was created to build on Tesseract.js and support many valid bug reports and feature requests that are outside of the scope of Tesseract.js.  For example, two of the most common requests from Tesseract.js users are improved OCR accuracy and PDF support.  Scribe.js (optionally) includes a custom OCR model that differs from, and generally outperforms, Tesseract.  When provided a text-native `.pdf`, Scribe.js can bypass OCR entirely and return the raw text.
+
+# Differences
+### PDF Support
+Tesseract.js does not support `.pdf` files.  The only way to extract text from `.pdf` files using Tesseract.js is to render the `.pdf` file into a series of `.png` images using a separate library and then recognize those `.png` images.  In addition to being slow, this process is often unnecessary, as many modern `.pdf` files are already text-native, meaning that no OCR needs to occur.  
+
+Scribe OCR does support `.pdf` files, and can extract text from `.pdf` files in multiple ways.  Scribe OCR can recognize the contents of the `.pdf` file using OCR.  Additionally, for `.pdf` files that are text-native or contain an existing OCR layer, the existing text can be extracted directly.  The latter method is significantly faster compared to rendering the `.pdf` to images and running OCR.
+
+### OCR Quality
+Scribe.js produces results that are generally more accurate than Tesseract.js.  
+1. Particularly for high-quality scans and screenshots, Scribe.js misidentifies fewer words.
+2. Scribe.js often recognizes words that are skipped entirely by Tesseract.
+3. Scribe.js can identify font styles, which Tesseract is incapable of.
+	1. This can be observed by using the GUI at [scribeocr.com](https://scribeocr.com/).
+
+### GUI
+Scribe OCR contains a GUI web application that end users can use to scan documents.  Tesseract.js is intended for use by developers within other applications, so it is unsuitable for end users. 
+
+### File Size
+The additional features added by Scribe.js take up more space.  Enabling PDF support requires loading multiple megabytes of dependencies.  Using the Scribe.js default `quality` OCR model loads more language data than Tesseract.js does by default.
+
+Notably, these resources are only loaded if requested--the PDF resources are only loaded if a PDF file is uploaded or exported, and setting OCR mode to `speed` prevents additional data from being downloaded.  However, if all optional features are disabled, Scribe.js has little to offer over Tesseract.js. 
+
+### Speed
+The Scribe.js default `quality` recognition mode runs additional recognition and checks, which therefore increases runtime.  The amount varies significantly document-to-document, but is often in the range of a 40-90% increase versus the `speed` mode (which provides results similar to Tesseract.js).  For applications where accuracy is not critical, this increase in runtime may not be worth it.
+
+### License
+Tesseract.js is Apache 2.0 licensed.  This is a permissive license that imposes no meaningful restrictions on use.  Scribe.js is AGPL 3.0 licensed, which is a copy-left license.  As a result, to use Scribe.js in your program--whether on the front-end or server-side--you must either (1) publish your program under AGPL 3.0 or a compatible license or (2) obtain a proprietary license (contact admin@scribeocr.com).
diff --git a/examples/browser/recognize-basic.html b/examples/browser/recognize-basic.html
@@ -0,0 +1,9 @@
+<!DOCTYPE HTML>
+<html>
+  <head>
+    <script src="./recognize-basic.js" type="module"></script>
+  </head>
+  <body>
+    <input type="file" id="uploader" multiple>
+  </body>
+</html>
diff --git a/examples/browser/recognize-basic.js b/examples/browser/recognize-basic.js
@@ -0,0 +1,10 @@
+import scribe from '../../scribe.js';
+// Pre-load OCR and font data to avoid delay when user uploads a file.
+await scribe.init({ ocr: true, font: true });
+
+const elm = /** @type {HTMLInputElement} */ (document.getElementById('uploader'));
+elm.addEventListener('change', async () => {
+  if (!elm.files) return;
+  const text = await scribe.recognizeFiles(elm.files);
+  console.log(text);
+});
diff --git a/examples/node/recognize-basic.js b/examples/node/recognize-basic.js
@@ -0,0 +1,11 @@
+#!/usr/bin/env node
+// Run `node examples/node/recognize-basic.js path/to/image.jpg` to recognize text in an image.
+import scribe from '../../scribe.js';
+
+const [,, imagePath] = process.argv;
+
+(async () => {
+  const res = await scribe.recognizeFiles([imagePath]);
+  console.log(res);
+  await scribe.terminate();
+})();
diff --git a/js/export/export.js b/js/export/export.js
@@ -8,12 +8,14 @@ import { renderHOCR } from './exportRenderHOCR.js';
 import { renderText } from './exportRenderText.js';
 
 /**
- * @param {'pdf'|'hocr'|'docx'|'xlsx'|'txt'|'text'} format
+ * Export active OCR data to specified format.
+ * @public
+ * @param {'pdf'|'hocr'|'docx'|'xlsx'|'txt'|'text'} [format='txt']
  * @param {number} [minValue=0]
  * @param {number} [maxValue=-1]
  * @returns {Promise<string|ArrayBuffer>}
  */
-export async function exportData(format, minValue = 0, maxValue = -1) {
+export async function exportData(format = 'txt', minValue = 0, maxValue = -1) {
   if (format === 'text') format = 'txt';
 
   if (maxValue === -1) maxValue = inputData.pageCount - 1;
@@ -184,6 +186,7 @@ export async function exportData(format, minValue = 0, maxValue = -1) {
 
 /**
  * Runs `exportData` and saves the result as a download (browser) or local file (Node.js).
+ * @public
  * @param {'pdf'|'hocr'|'docx'|'xlsx'|'txt'|'text'} format
  * @param {string} fileName
  * @param {number} [minValue=0]

diff --git a/js/fontContainerMain.js b/js/fontContainerMain.js
@@ -80,7 +80,9 @@ export async function loadBuiltInFontsRaw(glyphSet = 'latin') {
   if (!fontAll.active || (!fontAll.active.NimbusSans.normal.opt && !fontAll.active.NimbusRomNo9L.normal.opt)) fontAll.active = fontAll.raw;
 
   if (typeof process === 'undefined') {
-    await gs.schedulerReadyLoadFonts;
+    // This assumes that the scheduler `init` method has at least started.
+    if (gs.schedulerReady === null) console.warn('Failed to load fonts to workers as workers have not been initialized yet.');
+    await gs.schedulerReady;
     await setBuiltInFontsWorker(gs.schedulerInner, true);
   }
 

diff --git a/js/fontEval.js b/js/fontEval.js
@@ -1,7 +1,9 @@
 import { DebugData, fontMetricsObj, pageMetricsArr } from './containers/dataContainer.js';
 import { fontAll } from './containers/fontContainer.js';
 import { ImageCache } from './containers/imageContainer.js';
-import { enableFontOpt, optimizeFontContainerAll, setDefaultFontAuto } from './fontContainerMain.js';
+import {
+  enableFontOpt, optimizeFontContainerAll, setDefaultFontAuto, loadBuiltInFontsRaw,
+} from './fontContainerMain.js';
 import { gs } from './generalWorkerMain.js';
 
 /**
@@ -162,6 +164,8 @@ export async function evaluateFonts(pageArr) {
 export async function runFontOptimization(ocrArr) {
   const browserMode = typeof process === 'undefined';
 
+  await loadBuiltInFontsRaw();
+
   const fontRaw = fontAll.getContainer('raw');
 
   const calculateOpt = fontMetricsObj && Object.keys(fontMetricsObj).length > 0;