Updated font comparison

scribeocr · Sep 2, 2024 · 0705925 · 0705925
1 parent df50f09
commit 0705925
Show file tree

Hide file tree

Showing 4 changed files with 88 additions and 56 deletions.
diff --git a/js/import/convertPageBlocks.js b/js/import/convertPageBlocks.js
@@ -120,6 +120,15 @@ export async function convertPageBlocks({
           wordObj.lang = word.language;
           wordObj.conf = word.confidence;
 
+          // Keep alternative choices if they have higher confidence than the primary choice.
+          // This happens when the original "best choice" is rejected due to an ad-hoc penalty, most frequently because it is a non-dictionary word.
+          if (word.choices.length > 0) {
+            word.choices.sort((a, b) => b.confidence - a.confidence);
+            if (word.choices[0].text !== word.text) {
+              wordObj.textAlt = word.choices[0].text;
+            }
+          }
+
           // The `word` object has a `is_italic` property, but it is always false.
           // Therefore, the font name is checked to determine if the word is italic.
           // See: https://github.com/naptha/tesseract.js/issues/907

diff --git a/js/import/convertPageShared.js b/js/import/convertPageShared.js
@@ -51,16 +51,21 @@ export function pass2(pageObj, rotateAngle) {
       // Word contains multiple capital letters, no lowercase letters, and has character-level data.
       if (!/[a-z]/.test(wordObj.text) && /[A-Z].?[A-Z]/.test(wordObj.text) && wordObj.chars) {
         // Filter to only include letters
-        const filterArr = wordObj.text.split('').map((x) => /[a-z]/i.test(x));
-        const charArrSub = wordObj.chars.filter((x, y) => filterArr[y]);
+        const letterChars = wordObj.chars.filter((x) => /[a-z]/i.test(x.text));
 
-        const firstLetterHeight = charArrSub[0].bbox.bottom - charArrSub[0].bbox.top;
-        const otherLetterHeightArr = charArrSub.slice(1).map((x) => x.bbox.bottom - x.bbox.top);
+        // The letter "Q" is a special case as the capital letter is larger than other capital letters.
+        const firstLetterHeight = letterChars[0].bbox.bottom - letterChars[0].bbox.top;
+        const otherLetters = letterChars.slice(1).filter((x) => !/[q]/i.test(x.text));
+        if (otherLetters.length === 0) continue;
+
+        const otherLetterHeightArr = otherLetters.map((x) => x.bbox.bottom - x.bbox.top);
         const otherLetterHeightMax = Math.max(...otherLetterHeightArr);
         const otherLetterHeightMin = Math.min(...otherLetterHeightArr);
 
+        const firstLetterThresh = ['q', 'Q'].includes(letterChars[0].text) ? 1.3 : 1.1;
+
         // If the first letter is significantly larger than the others, then this word would need to be in title case.
-        if (firstLetterHeight > otherLetterHeightMax * 1.1) {
+        if (firstLetterHeight > otherLetterHeightMax * firstLetterThresh) {
           // If the other letters are all around the same size, then the word is small caps.
           if ((otherLetterHeightMax / otherLetterHeightMin) < 1.15) {
             smallCapsWordArr.push(wordObj);
@@ -69,8 +74,9 @@ export function pass2(pageObj, rotateAngle) {
           }
         } else {
           // Otherwise, all the letters need to be about the same size for this to be small caps.
-          const letterChars = wordObj.chars.filter((x) => /[a-z]/i.test(x.text));
-          const allLetterHeightArr = letterChars.map((x) => x.bbox.bottom - x.bbox.top);
+          const letterCharsAsc = wordObj.chars.filter((x) => /[a-pr-z]/i.test(x.text));
+          if (letterCharsAsc.length < 2) continue;
+          const allLetterHeightArr = letterCharsAsc.map((x) => x.bbox.bottom - x.bbox.top);
           const allLetterHeightMax = Math.max(...allLetterHeightArr);
           const allLetterHeightMin = Math.min(...allLetterHeightArr);
 

diff --git a/js/objects/ocrObjects.js b/js/objects/ocrObjects.js
@@ -101,6 +101,8 @@ export function OcrWord(line, text, bbox, id) {
   this.smallCaps = false;
   /** @type {string} */
   this.text = text;
+  /** @type {?string} */
+  this.textAlt = null;
   /** @type {('normal'|'italic'|'bold')} */
   this.style = 'normal';
   /** @type {?string} */

diff --git a/js/worker/compareOCRModule.js b/js/worker/compareOCRModule.js
@@ -718,72 +718,87 @@ export async function compareOCRPageImp({
                 if (!evalConflicts) {
                   hocrAError = 1;
                 } else if (oneToOne) {
-                  // TODO: Figure out how to compare between small caps/non small-caps words (this is the only relevant style as it is the only style LSTM detects)
-
-                  // Clone hocrAWord and set text content equal to hocrBWord
-                  const wordAClone = ocr.cloneWord(wordA);
-                  wordAClone.text = wordB.text;
-
-                  if (wordB.smallCaps && !wordA.smallCaps) {
-                    wordAClone.smallCaps = true;
-                    wordAClone.size = calcWordFontSize(wordB);
-                  }
+                  // Some common patterns detected by Tesseract Legacy are so implausible that they are automatically rejected.
+                  if (legacyLSTMComb && rejectWordLegacy(wordA.text, wordB.text)) {
+                    hocrAError = 1;
+                  // If the top choice of the Tesseract Legacy classifier (but not of the entire Legacy model) is the same as the Tesseract LSTM choice, use the LSTM choice.
+                  // This condition is common when the Legacy model improperly applies a dictionary "correction" to a word that was already correct.
+                  } else if (legacyLSTMComb && wordA.textAlt && wordA.textAlt === wordB.text) {
+                    hocrAError = 1;
+                  // Otherwise, the words are compared visually.
+                  } else {
+                    // TODO: Figure out how to compare between small caps/non small-caps words (this is the only relevant style as it is the only style LSTM detects)
+                    // Clone hocrAWord and set text content equal to hocrBWord
+                    const wordAClone = ocr.cloneWord(wordA);
+                    wordAClone.text = wordB.text;
+
+                    if (wordB.smallCaps && !wordA.smallCaps) {
+                      wordAClone.smallCaps = true;
+                      wordAClone.size = calcWordFontSize(wordB);
+                    }
 
-                  const evalRes = await evalWords({
-                    wordsA: [wordA], wordsB: [wordAClone], binaryImage: binaryImageBit, angle: imgAngle, imgDims, options: { view: Boolean(debugLabel) },
-                  });
+                    const evalRes = await evalWords({
+                      wordsA: [wordA], wordsB: [wordAClone], binaryImage: binaryImageBit, angle: imgAngle, imgDims, options: { view: Boolean(debugLabel) },
+                    });
 
-                  hocrAError = evalRes.metricA + (await penalizeWord([wordA]));
-                  hocrBError = evalRes.metricB + (await penalizeWord([wordB]));
+                    hocrAError = evalRes.metricA + (await penalizeWord([wordA]));
+                    hocrBError = evalRes.metricB + (await penalizeWord([wordB]));
 
-                  // Reject Tesseract Legacy word if appropriate
-                  if (legacyLSTMComb && rejectWordLegacy(wordA.text, wordB.text)) hocrAError = 1;
+                    // Reject Tesseract Legacy word if appropriate
+                    if (legacyLSTMComb && rejectWordLegacy(wordA.text, wordB.text)) hocrAError = 1;
 
-                  if (evalRes.debug) {
-                    const debugObj = evalRes.debug;
-                    debugObj.errorAdjA = hocrAError;
-                    debugObj.errorAdjB = hocrBError;
+                    if (evalRes.debug) {
+                      const debugObj = evalRes.debug;
+                      debugObj.errorAdjA = hocrAError;
+                      debugObj.errorAdjB = hocrBError;
 
-                    debugImg.push(debugObj);
+                      debugImg.push(debugObj);
+                    }
                   }
                 } else if (twoToOne) {
-                  const evalRes = await evalWords({
-                    wordsA: wordsAArr, wordsB: wordsBArr, binaryImage: binaryImageBit, angle: imgAngle, imgDims, options: { view: Boolean(debugLabel) },
-                  });
-
                   const wordsAText = wordsAArr.map((x) => x.text).join('');
                   const wordsBText = wordsBArr.map((x) => x.text).join('');
 
-                  // The option with more words has a small penalty added, as otherwise words incorrectly split will often score slightly better (due to more precise positioning)
-                  hocrAError = evalRes.metricA + (wordsAArr.length - 1) * 0.025 + (await penalizeWord(wordsAArr));
-                  hocrBError = evalRes.metricB + (wordsBArr.length - 1) * 0.025 + (await penalizeWord(wordsBArr));
-
-                  // An additional penalty is added to the option with more words when (1) the text is the same in both options and (2) at least one word has no letters.
-                  // This has 2 primary motivations:
-                  //  1. Tesseract Legacy often splits numbers into separate words.
-                  //    For example, the "-" in a negative number may be a different word, or the digits before and after the decimal point may be split into separate words.
-                  //    TODO: It may be worth investigating if this issue can be improved in the engine.
-                  //  1. Punctuation characters should not be their own word (e.g. quotes should come before/after alphanumeric characters)
-                  if (wordsAText === wordsBText) {
-                    if (wordsAArr.map((x) => /[a-z]/i.test(x.text)).filter((x) => !x).length > 0 || wordsBArr.map((x) => /[a-z]/i.test(x.text)).filter((x) => !x).length > 0) {
-                      hocrAError += (wordsAArr.length - 1) * 0.05;
-                      hocrBError += (wordsBArr.length - 1) * 0.05;
+                  if (legacyLSTMComb && rejectWordLegacy(wordsAText, wordsBText)) {
+                    hocrAError = 1;
+                  } else {
+                    const evalRes = await evalWords({
+                      wordsA: wordsAArr, wordsB: wordsBArr, binaryImage: binaryImageBit, angle: imgAngle, imgDims, options: { view: Boolean(debugLabel) },
+                    });
+
+                    // The option with more words has a small penalty added, as otherwise words incorrectly split will often score slightly better (due to more precise positioning)
+                    hocrAError = evalRes.metricA + (wordsAArr.length - 1) * 0.025 + (await penalizeWord(wordsAArr));
+                    hocrBError = evalRes.metricB + (wordsBArr.length - 1) * 0.025 + (await penalizeWord(wordsBArr));
+
+                    // An additional penalty is added to the option with more words when (1) the text is the same in both options and (2) at least one word has no letters.
+                    // This has 2 primary motivations:
+                    //  1. Tesseract Legacy often splits numbers into separate words.
+                    //    For example, the "-" in a negative number may be a different word, or the digits before and after the decimal point may be split into separate words.
+                    //    TODO: It may be worth investigating if this issue can be improved in the engine.
+                    //  2. Punctuation characters should not be their own word (e.g. quotes should come before/after alphanumeric characters)
+                    if (wordsAText === wordsBText) {
+                      if (wordsAArr.map((x) => /[a-z]/i.test(x.text)).filter((x) => !x).length > 0 || wordsBArr.map((x) => /[a-z]/i.test(x.text)).filter((x) => !x).length > 0) {
+                        hocrAError += (wordsAArr.length - 1) * 0.05;
+                        hocrBError += (wordsBArr.length - 1) * 0.05;
+                      }
                     }
-                  }
 
-                  // Reject Tesseract Legacy word if appropriate
-                  if (legacyLSTMComb && rejectWordLegacy(wordsAText, wordsBText)) hocrAError = 1;
+                    // Reject Tesseract Legacy word if appropriate
+                    if (legacyLSTMComb && rejectWordLegacy(wordsAText, wordsBText)) hocrAError = 1;
 
-                  if (evalRes.debug) {
-                    const debugObj = evalRes.debug;
-                    debugObj.errorAdjA = hocrAError;
-                    debugObj.errorAdjB = hocrBError;
+                    if (evalRes.debug) {
+                      const debugObj = evalRes.debug;
+                      debugObj.errorAdjA = hocrAError;
+                      debugObj.errorAdjB = hocrBError;
 
-                    debugImg.push(debugObj);
+                      debugImg.push(debugObj);
+                    }
                   }
                 }
 
-                if (hocrBError < hocrAError) {
+                // The LSTM model is known to be more accurate on average.
+                // Therefore, if both metrics are terrible (indicating the word isn't lined up at all), the LSTM word is used.
+                if (hocrBError < hocrAError || (legacyLSTMComb && hocrAError > 0.7)) {
                   const skip = ['eg', 'ie'].includes(wordA.text.replace(/\W/g, ''));
 
                   if (!skip) {