Skip to content

Commit

Permalink
Fixed crash when importing PDF with invalid dictionary with calcSuppFontInfo option enabled, added new tests
Browse files Browse the repository at this point in the history
  • Loading branch information
Balearica committed Oct 28, 2024
1 parent bd441e2 commit b9251d2
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 3 deletions.
1 change: 1 addition & 0 deletions js/fontSupp.js
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ const calcSuppFontInfoForWords = async (words) => {
* and we need to determine how large to render the text.
*/
export const calcSuppFontInfo = async (ocrArr) => {
if (!ocrArr) return;
await gs.initTesseract({ anyOk: true, langs: ['eng'] });
// console.time('calcSuppFontInfo');
const calcFonts = new Set();
Expand Down
Binary file not shown.
33 changes: 30 additions & 3 deletions tests/module/importPdfText.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -169,17 +169,44 @@ describe('Check superscripts are detected in PDF imports.', function () {
});
}).timeout(120000);

// Note that font sizes only match the scribeocr.com interface when `calcSuppFontInfo` is enabled, as the interface enables this option
// and this setting scales the font sizes reported by the PDF parser. The tests below cover both the disabled and the enabled case.
/**
 * Checks the font size reported for a known word in `border_patrol_tables.pdf`,
 * both with `scribe.opt.calcSuppFontInfo` disabled (raw size from the PDF parser)
 * and enabled (scaled size, which is what the scribeocr.com interface uses).
 */
describe('Check font size is correctly parsed in PDF imports.', function () {
  this.timeout(10000);

  it('Should correctly parse font sizes (1st doc)', async () => {
    // Set the option explicitly so this test does not depend on the state left behind by earlier tests.
    scribe.opt.calcSuppFontInfo = false;
    await scribe.importFiles([`${ASSETS_PATH_KARMA}/border_patrol_tables.pdf`], { extractPDFTextNative: true, extractPDFTextOCR: true });
    // This word was problematic at one point due to the change in font size between the first and second word.
    assert.strictEqual(scribe.data.ocr.active[0].lines[249].words[1].size, 32.5);
    assert.strictEqual(scribe.data.ocr.active[0].lines[249].words[1].text, 'Agent');
  }).timeout(10000);

  // Note: the version which uses `calcSuppFontInfo` corresponds to the scribeocr.com interface, which enables this option.
  it('Should correctly parse font sizes and scale using calcSuppFontInfo option (1st doc)', async () => {
    scribe.opt.calcSuppFontInfo = true;
    await scribe.importFiles([`${ASSETS_PATH_KARMA}/border_patrol_tables.pdf`], { extractPDFTextNative: true, extractPDFTextOCR: true });
    assert.strictEqual(scribe.data.ocr.active[0].lines[249].words[1].size, 39);
    assert.strictEqual(scribe.data.ocr.active[0].lines[249].words[1].text, 'Agent');
  }).timeout(10000);

  after(async () => {
    // Reset the option inside a hook: statements placed directly in the `describe` callback run once
    // while the suite is being collected (before any test executes), so a bare reset there would not
    // undo the `true` value set by the test above and the option would leak into later suites.
    scribe.opt.calcSuppFontInfo = false;
    await scribe.terminate();
  });
}).timeout(120000);

describe('Check that text-native PDFs with broken encoding dictionaries are detected and skipped.', function () {
this.timeout(10000);
// Note: the version which uses `calcSuppFontInfo` corresponds to the scribeocr.com interface, which enables this option.
it('Should correctly parse font sizes (1st doc)', async () => {
// Set `calcSuppFontInfo` to `true` as this option previously crashed the program with this type of PDFs.
scribe.opt.calcSuppFontInfo = true;

await scribe.importFiles([`${ASSETS_PATH_KARMA}/coca-cola-business-and-sustainability-report-2022.pdf`], { extractPDFTextNative: true, extractPDFTextOCR: true });

assert.strictEqual(scribe.data.ocr.active.length, 0);
}).timeout(10000);

scribe.opt.calcSuppFontInfo = false;

after(async () => {
await scribe.terminate();
});
Expand Down

0 comments on commit b9251d2

Please sign in to comment.