Skip to content

Commit

Permalink
Fixed issue when parsing font sizes from some PDF inputs, added new test
Browse files Browse the repository at this point in the history
  • Loading branch information
Balearica committed Oct 27, 2024
1 parent 7fc1508 commit 6383a00
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 3 deletions.
2 changes: 1 addition & 1 deletion js/import/convertPageStext.js
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ export async function convertPageStext({ ocrStr, n }) {
// (2) Runs of small caps that start with lower-case letters, which do not conform to the expectation that runs of small caps start with a capital letter.
const sizePrevRaw = sizeCurrentRaw;
sizeCurrentRaw = parseFloat(fontSizeStrI);
const secondLetter = wordInit && bboxesWordArr.length === 1;
const secondLetter = wordInit && textWordArr.length === 1 && /[A-Z]/.test(textWordArr[0]);
const baselineNextLetter = parseFloat(letterOrFontArr[j + 1]?.[6]) || parseFloat(wordLetterOrFontArr[i + 1]?.[0]?.[6])
|| parseFloat(wordLetterOrFontArr[i + 1]?.[1]?.[6]) || parseFloat(wordLetterOrFontArr[i + 1]?.[2]?.[6]);
const fontSizeMin = Math.min(sizeCurrentRaw, sizePrevRaw);
Expand Down
Binary file added tests/assets/border_patrol_tables.pdf
Binary file not shown.
19 changes: 17 additions & 2 deletions tests/module/importPdfText.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ describe('Check superscripts are detected in PDF imports.', function () {
before(async () => {
await scribe.importFiles([`${ASSETS_PATH_KARMA}/superscript_examples.pdf`], { extractPDFTextNative: true, extractPDFTextOCR: true });
});

// First document
it('Should correctly import trailing superscripts printed using font size adjustments (1st doc)', async () => {
assert.strictEqual(scribe.data.ocr.active[0].lines[25].words[8].sup, true);
Expand Down Expand Up @@ -142,7 +142,6 @@ describe('Check superscripts are detected in PDF imports.', function () {
assert.strictEqual(scribe.data.ocr.active[5].lines[205].words[0].text, 'a');
}).timeout(10000);


// This document breaks when used with `mutool convert` so is not combined with the others.
// Any more tests included in the main stacked document should be inserted above this point.
it('Should correctly parse font size for lines with superscripts (addtl doc)', async () => {
Expand All @@ -169,3 +168,19 @@ describe('Check superscripts are detected in PDF imports.', function () {
await scribe.terminate();
});
}).timeout(120000);

// Note that these font sizes will not match the scribeocr.com interface, as `calcSuppFontInfo` is enabled in the interface but not the tests,
// and this setting scales the font sizes reported by the PDF parser.
describe('Check font size is correctly parsed in PDF imports.', function () {
  // A regular function (not an arrow) is used so `this` refers to the Mocha suite context.
  this.timeout(10000);

  // Regression check: this word was problematic at one point due to the change in font size
  // between the first and second word.
  it('Should correctly parse font sizes (1st doc)', async () => {
    const pdfPaths = [`${ASSETS_PATH_KARMA}/border_patrol_tables.pdf`];
    const importOptions = { extractPDFTextNative: true, extractPDFTextOCR: true };
    await scribe.importFiles(pdfPaths, importOptions);

    // Second word of line 249 on the first page of the imported document.
    const targetWord = scribe.data.ocr.active[0].lines[249].words[1];
    assert.strictEqual(targetWord.size, 32.5);
    assert.strictEqual(targetWord.text, 'Agent');
  }).timeout(10000);

  // Shut scribe down once the tests in this suite have finished.
  after(async () => {
    await scribe.terminate();
  });
}).timeout(120000);

0 comments on commit 6383a00

Please sign in to comment.