Skip to content

Commit

Permalink
Updated font comparison
Browse files Browse the repository at this point in the history
  • Loading branch information
Balearica committed Sep 2, 2024
1 parent df50f09 commit 0705925
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 56 deletions.
9 changes: 9 additions & 0 deletions js/import/convertPageBlocks.js
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,15 @@ export async function convertPageBlocks({
wordObj.lang = word.language;
wordObj.conf = word.confidence;

// Keep alternative choices if they have higher confidence than the primary choice.
// This happens when the original "best choice" is rejected due to an ad-hoc penalty, most frequently because it is a non-dictionary word.
if (word.choices.length > 0) {
word.choices.sort((a, b) => b.confidence - a.confidence);
if (word.choices[0].text !== word.text) {
wordObj.textAlt = word.choices[0].text;
}
}

// The `word` object has an `is_italic` property, but it is always false.
// Therefore, the font name is checked to determine if the word is italic.
// See: https://github.com/naptha/tesseract.js/issues/907
Expand Down
20 changes: 13 additions & 7 deletions js/import/convertPageShared.js
Original file line number Diff line number Diff line change
Expand Up @@ -51,16 +51,21 @@ export function pass2(pageObj, rotateAngle) {
// Word contains multiple capital letters, no lowercase letters, and has character-level data.
if (!/[a-z]/.test(wordObj.text) && /[A-Z].?[A-Z]/.test(wordObj.text) && wordObj.chars) {
// Filter to only include letters
const filterArr = wordObj.text.split('').map((x) => /[a-z]/i.test(x));
const charArrSub = wordObj.chars.filter((x, y) => filterArr[y]);
const letterChars = wordObj.chars.filter((x) => /[a-z]/i.test(x.text));

const firstLetterHeight = charArrSub[0].bbox.bottom - charArrSub[0].bbox.top;
const otherLetterHeightArr = charArrSub.slice(1).map((x) => x.bbox.bottom - x.bbox.top);
// The letter "Q" is a special case as the capital letter is larger than other capital letters.
const firstLetterHeight = letterChars[0].bbox.bottom - letterChars[0].bbox.top;
const otherLetters = letterChars.slice(1).filter((x) => !/[q]/i.test(x.text));
if (otherLetters.length === 0) continue;

const otherLetterHeightArr = otherLetters.map((x) => x.bbox.bottom - x.bbox.top);
const otherLetterHeightMax = Math.max(...otherLetterHeightArr);
const otherLetterHeightMin = Math.min(...otherLetterHeightArr);

const firstLetterThresh = ['q', 'Q'].includes(letterChars[0].text) ? 1.3 : 1.1;

// If the first letter is significantly larger than the others, then this word would need to be in title case.
if (firstLetterHeight > otherLetterHeightMax * 1.1) {
if (firstLetterHeight > otherLetterHeightMax * firstLetterThresh) {
// If the other letters are all around the same size, then the word is small caps.
if ((otherLetterHeightMax / otherLetterHeightMin) < 1.15) {
smallCapsWordArr.push(wordObj);
Expand All @@ -69,8 +74,9 @@ export function pass2(pageObj, rotateAngle) {
}
} else {
// Otherwise, all the letters need to be about the same size for this to be small caps.
const letterChars = wordObj.chars.filter((x) => /[a-z]/i.test(x.text));
const allLetterHeightArr = letterChars.map((x) => x.bbox.bottom - x.bbox.top);
const letterCharsAsc = wordObj.chars.filter((x) => /[a-pr-z]/i.test(x.text));
if (letterCharsAsc.length < 2) continue;
const allLetterHeightArr = letterCharsAsc.map((x) => x.bbox.bottom - x.bbox.top);
const allLetterHeightMax = Math.max(...allLetterHeightArr);
const allLetterHeightMin = Math.min(...allLetterHeightArr);

Expand Down
2 changes: 2 additions & 0 deletions js/objects/ocrObjects.js
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,8 @@ export function OcrWord(line, text, bbox, id) {
this.smallCaps = false;
/** @type {string} */
this.text = text;
/** @type {?string} */
this.textAlt = null;
/** @type {('normal'|'italic'|'bold')} */
this.style = 'normal';
/** @type {?string} */
Expand Down
113 changes: 64 additions & 49 deletions js/worker/compareOCRModule.js
Original file line number Diff line number Diff line change
Expand Up @@ -718,72 +718,87 @@ export async function compareOCRPageImp({
if (!evalConflicts) {
hocrAError = 1;
} else if (oneToOne) {
// TODO: Figure out how to compare between small caps/non small-caps words (this is the only relevant style as it is the only style LSTM detects)

// Clone hocrAWord and set text content equal to hocrBWord
const wordAClone = ocr.cloneWord(wordA);
wordAClone.text = wordB.text;

if (wordB.smallCaps && !wordA.smallCaps) {
wordAClone.smallCaps = true;
wordAClone.size = calcWordFontSize(wordB);
}
// Some common patterns detected by Tesseract Legacy are so implausible that they are automatically rejected.
if (legacyLSTMComb && rejectWordLegacy(wordA.text, wordB.text)) {
hocrAError = 1;
// If the top choice out of the Tesseract Legacy classifier (but not the entire model) is the same as the Tesseract LSTM choice, use the LSTM choice.
// This condition is common when the Legacy model improperly applies a dictionary "correction" to a word that was already correct.
} else if (legacyLSTMComb && wordA.textAlt && wordA.textAlt === wordB.text) {
hocrAError = 1;
// Otherwise, the words are compared visually.
} else {
// TODO: Figure out how to compare between small caps/non small-caps words (this is the only relevant style as it is the only style LSTM detects)
// Clone hocrAWord and set text content equal to hocrBWord
const wordAClone = ocr.cloneWord(wordA);
wordAClone.text = wordB.text;

if (wordB.smallCaps && !wordA.smallCaps) {
wordAClone.smallCaps = true;
wordAClone.size = calcWordFontSize(wordB);
}

const evalRes = await evalWords({
wordsA: [wordA], wordsB: [wordAClone], binaryImage: binaryImageBit, angle: imgAngle, imgDims, options: { view: Boolean(debugLabel) },
});
const evalRes = await evalWords({
wordsA: [wordA], wordsB: [wordAClone], binaryImage: binaryImageBit, angle: imgAngle, imgDims, options: { view: Boolean(debugLabel) },
});

hocrAError = evalRes.metricA + (await penalizeWord([wordA]));
hocrBError = evalRes.metricB + (await penalizeWord([wordB]));
hocrAError = evalRes.metricA + (await penalizeWord([wordA]));
hocrBError = evalRes.metricB + (await penalizeWord([wordB]));

// Reject Tesseract Legacy word if appropriate
if (legacyLSTMComb && rejectWordLegacy(wordA.text, wordB.text)) hocrAError = 1;
// Reject Tesseract Legacy word if appropriate
if (legacyLSTMComb && rejectWordLegacy(wordA.text, wordB.text)) hocrAError = 1;

if (evalRes.debug) {
const debugObj = evalRes.debug;
debugObj.errorAdjA = hocrAError;
debugObj.errorAdjB = hocrBError;
if (evalRes.debug) {
const debugObj = evalRes.debug;
debugObj.errorAdjA = hocrAError;
debugObj.errorAdjB = hocrBError;

debugImg.push(debugObj);
debugImg.push(debugObj);
}
}
} else if (twoToOne) {
const evalRes = await evalWords({
wordsA: wordsAArr, wordsB: wordsBArr, binaryImage: binaryImageBit, angle: imgAngle, imgDims, options: { view: Boolean(debugLabel) },
});

const wordsAText = wordsAArr.map((x) => x.text).join('');
const wordsBText = wordsBArr.map((x) => x.text).join('');

// The option with more words has a small penalty added, as otherwise words incorrectly split will often score slightly better (due to more precise positioning)
hocrAError = evalRes.metricA + (wordsAArr.length - 1) * 0.025 + (await penalizeWord(wordsAArr));
hocrBError = evalRes.metricB + (wordsBArr.length - 1) * 0.025 + (await penalizeWord(wordsBArr));

// An additional penalty is added to the option with more words when (1) the text is the same in both options and (2) at least one word has no letters.
// This has 2 primary motivations:
// 1. Tesseract Legacy often splits numbers into separate words.
// For example, the "-" in a negative number may be a different word, or the digits before and after the decimal point may be split into separate words.
// TODO: It may be worth investigating if this issue can be improved in the engine.
// 2. Punctuation characters should not be their own word (e.g. quotes should come before/after alphanumeric characters)
if (wordsAText === wordsBText) {
if (wordsAArr.map((x) => /[a-z]/i.test(x.text)).filter((x) => !x).length > 0 || wordsBArr.map((x) => /[a-z]/i.test(x.text)).filter((x) => !x).length > 0) {
hocrAError += (wordsAArr.length - 1) * 0.05;
hocrBError += (wordsBArr.length - 1) * 0.05;
if (legacyLSTMComb && rejectWordLegacy(wordsAText, wordsBText)) {
hocrAError = 1;
} else {
const evalRes = await evalWords({
wordsA: wordsAArr, wordsB: wordsBArr, binaryImage: binaryImageBit, angle: imgAngle, imgDims, options: { view: Boolean(debugLabel) },
});

// The option with more words has a small penalty added, as otherwise words incorrectly split will often score slightly better (due to more precise positioning)
hocrAError = evalRes.metricA + (wordsAArr.length - 1) * 0.025 + (await penalizeWord(wordsAArr));
hocrBError = evalRes.metricB + (wordsBArr.length - 1) * 0.025 + (await penalizeWord(wordsBArr));

// An additional penalty is added to the option with more words when (1) the text is the same in both options and (2) at least one word has no letters.
// This has 2 primary motivations:
// 1. Tesseract Legacy often splits numbers into separate words.
// For example, the "-" in a negative number may be a different word, or the digits before and after the decimal point may be split into separate words.
// TODO: It may be worth investigating if this issue can be improved in the engine.
// 2. Punctuation characters should not be their own word (e.g. quotes should come before/after alphanumeric characters)
if (wordsAText === wordsBText) {
if (wordsAArr.map((x) => /[a-z]/i.test(x.text)).filter((x) => !x).length > 0 || wordsBArr.map((x) => /[a-z]/i.test(x.text)).filter((x) => !x).length > 0) {
hocrAError += (wordsAArr.length - 1) * 0.05;
hocrBError += (wordsBArr.length - 1) * 0.05;
}
}
}

// Reject Tesseract Legacy word if appropriate
if (legacyLSTMComb && rejectWordLegacy(wordsAText, wordsBText)) hocrAError = 1;
// Reject Tesseract Legacy word if appropriate
if (legacyLSTMComb && rejectWordLegacy(wordsAText, wordsBText)) hocrAError = 1;

if (evalRes.debug) {
const debugObj = evalRes.debug;
debugObj.errorAdjA = hocrAError;
debugObj.errorAdjB = hocrBError;
if (evalRes.debug) {
const debugObj = evalRes.debug;
debugObj.errorAdjA = hocrAError;
debugObj.errorAdjB = hocrBError;

debugImg.push(debugObj);
debugImg.push(debugObj);
}
}
}

if (hocrBError < hocrAError) {
// The LSTM model is known to be more accurate on average.
// Therefore, if both metrics are terrible (indicating the word isn't lined up at all), the LSTM word is used.
if (hocrBError < hocrAError || (legacyLSTMComb && hocrAError > 0.7)) {
const skip = ['eg', 'ie'].includes(wordA.text.replace(/\W/g, ''));

if (!skip) {
Expand Down

0 comments on commit 0705925

Please sign in to comment.