Skip to content

Commit

Permalink
Improved italics detection; added new tests
Browse files Browse the repository at this point in the history
  • Loading branch information
Balearica committed Nov 22, 2024
1 parent b9dc89a commit 0a80148
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 1 deletion.
2 changes: 1 addition & 1 deletion js/import/convertPageStext.js
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,7 @@ export async function convertPageStext({ ocrStr, n }) {
smallCapsCurrent = /(small\W?cap)|(sc$)|(caps$)/i.test(charOrFont.name);
smallCapsWord = smallCapsCurrent;

if (/italic/i.test(charOrFont.name) || /-\w*ital/i.test(charOrFont.name) || /oblique/i.test(charOrFont.name)) {
if (/italic/i.test(charOrFont.name) || /-\w*ital/i.test(charOrFont.name) || /-it$/i.test(charOrFont.name) || /oblique/i.test(charOrFont.name)) {
// The word is already initialized, so we need to change the last element of the style array.
// Label as `smallCapsAlt` rather than `smallCaps`, as we confirm the word is all caps before marking as `smallCaps`.
styleCurrent = 'italic';
Expand Down
18 changes: 18 additions & 0 deletions tests/module/importPdfText.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -364,3 +364,21 @@ describe('Check that PDF text types are detected and imported correctly.', funct
await scribe.terminate();
});
}).timeout(120000);

/**
 * Checks that bold and italic word styles are reported after importing a PDF.
 *
 * The fixture is imported once in a `before` hook so that each test can run on
 * its own (e.g. with `it.only` or `--grep`) instead of depending on an earlier
 * test having already populated `scribe.data.ocr.active`.
 */
describe('Check that font style is detected for PDF imports.', function () {
  this.timeout(10000);

  before(async () => {
    // Enable `usePDFText.native.main` before importing.
    // NOTE(review): presumably this makes the PDF's native text the main data
    // source for the import - confirm against the scribe options.
    scribe.opt.usePDFText.native.main = true;
    await scribe.importFiles([`${ASSETS_PATH_KARMA}/superscript_examples.pdf`]);
  });

  it('Bold style is detected', async () => {
    assert.strictEqual(scribe.data.ocr.active[5].lines[26].words[0].style, 'bold');
  }).timeout(10000);

  it('Italic style is detected', async () => {
    assert.strictEqual(scribe.data.ocr.active[5].lines[22].words[4].style, 'italic');
  }).timeout(10000);

  after(async () => {
    await scribe.terminate();
  });
}).timeout(120000);

0 comments on commit 0a80148

Please sign in to comment.