Skip to content

Commit

Permalink
Updated mupdf build to improve line detection
Browse files Browse the repository at this point in the history
  • Loading branch information
Balearica committed Oct 28, 2024
1 parent b9251d2 commit 5944108
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 4 deletions.
Binary file modified mupdf/libmupdf.wasm
Binary file not shown.
24 changes: 20 additions & 4 deletions tests/module/importPdfText.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -175,15 +175,15 @@ describe('Check font size is correctly parsed in PDF imports.', function () {
it('Should correctly parse font sizes (1st doc)', async () => {
await scribe.importFiles([`${ASSETS_PATH_KARMA}/border_patrol_tables.pdf`], { extractPDFTextNative: true, extractPDFTextOCR: true });
// This word was problematic at one point due to the change in font size between the first and second word.
assert.strictEqual(scribe.data.ocr.active[0].lines[249].words[1].size, 32.5);
assert.strictEqual(scribe.data.ocr.active[0].lines[249].words[1].text, 'Agent');
assert.strictEqual(scribe.data.ocr.active[0].lines[253].words[1].size, 32.5);
assert.strictEqual(scribe.data.ocr.active[0].lines[253].words[1].text, 'Agent');
}).timeout(10000);

it('Should correctly parse font sizes and scale using calcSuppFontInfo option (1st doc)', async () => {
scribe.opt.calcSuppFontInfo = true;
await scribe.importFiles([`${ASSETS_PATH_KARMA}/border_patrol_tables.pdf`], { extractPDFTextNative: true, extractPDFTextOCR: true });
assert.strictEqual(scribe.data.ocr.active[0].lines[249].words[1].size, 39);
assert.strictEqual(scribe.data.ocr.active[0].lines[249].words[1].text, 'Agent');
assert.strictEqual(scribe.data.ocr.active[0].lines[253].words[1].size, 39);
assert.strictEqual(scribe.data.ocr.active[0].lines[253].words[1].text, 'Agent');
}).timeout(10000);

scribe.opt.calcSuppFontInfo = false;
Expand Down Expand Up @@ -211,3 +211,19 @@ describe('Check that text-native PDFs with broken encoding dictionaries are dete
await scribe.terminate();
});
}).timeout(120000);

// Regression suite for line splitting in text-native PDF imports: imports
// `border_patrol_tables.pdf` and checks that one specific line contains exactly one word.
describe('Check that PDF imports split lines correctly.', function () {
// Suite-level timeout of 10 seconds (in milliseconds).
this.timeout(10000);
// Note: the version which uses `calcSuppFontInfo` corresponds to the scribeocr.com interface, which enables this option.
// NOTE(review): no test in this suite sets `calcSuppFontInfo`; the note above presumably
// refers to test variants defined elsewhere in this file — confirm or remove.
it('Should correctly parse PDF lines (1st doc)', async () => {
await scribe.importFiles([`${ASSETS_PATH_KARMA}/border_patrol_tables.pdf`], { extractPDFTextNative: true, extractPDFTextOCR: true });

// In a previous version of the build, 5 words across 3 distinct lines (including this one) were combined into a single line.
// The line at index 3 of the first page must therefore contain only the single word 'Apprehensions'.
assert.strictEqual(scribe.data.ocr.active[0].lines[3].words.length, 1);
assert.strictEqual(scribe.data.ocr.active[0].lines[3].words[0].text, 'Apprehensions');
}).timeout(10000);

// Calls `scribe.terminate()` once, after all tests in this suite have run.
after(async () => {
await scribe.terminate();
});
}).timeout(120000);

0 comments on commit 5944108

Please sign in to comment.