Skip to content

Commit

Permalink
Initial implementation of support for different text orientations
Browse files Browse the repository at this point in the history
  • Loading branch information
Balearica committed Nov 22, 2024
1 parent 96b1f1c commit 30eb7fd
Show file tree
Hide file tree
Showing 6 changed files with 156 additions and 28 deletions.
29 changes: 21 additions & 8 deletions js/export/writePdf.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@ import { opt } from '../containers/app.js';
import { pageMetricsArr } from '../containers/dataContainer.js';
import ocr from '../objects/ocrObjects.js';

/**
 * Formats a number as a string after rounding it to at most 6 decimal places.
 * Rounding happens before stringification, so floating-point noise
 * (e.g. `6.1e-17` from `Math.cos(Math.PI / 2)`) is written as `0`.
 *
 * @param {number} x - Value to format.
 * @returns {string} String form of `x` rounded to the nearest multiple of 1e-6.
 */
const formatNum = (x) => {
  const precision = 1e6;
  const scaled = Math.round(x * precision);
  return String(scaled / precision);
};

// Creates 3 PDF objects necessary to embed font.
// These are (1) the font dictionary, (2) the font descriptor, and (3) the font file,
// which will be located at objects firstObjIndex, firstObjIndex + 1, and firstObjIndex + 2 (respectively).
Expand Down Expand Up @@ -290,8 +295,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
rotateText = false, rotateBackground = false, confThreshHigh = 85, confThreshMed = 75, fontChiSim = null) {
const { lines } = pageObj;

const sinAngle = Math.sin(angle * (Math.PI / 180));
const cosAngle = Math.cos(angle * (Math.PI / 180));
const cosAnglePage = Math.cos(angle * (Math.PI / 180));

// Start 1st object: Text Content
let textContentObjStr = '';
Expand Down Expand Up @@ -372,10 +376,19 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
const lineLeftAdj = wordJ.bbox.left - word0LeftBearing * (tz / 100) + angleAdjLine.x;
const lineTopAdj = lineObj.bbox.bottom + lineObj.baseline[1] + angleAdjLine.y;

if (rotateText) {
textContentObjStr += `${String(cosAngle)} ${String(-sinAngle)} ${String(sinAngle)} ${String(cosAngle)} ${String(lineLeftAdj)} ${String(outputDims.height - lineTopAdj + 1)} Tm\n`;
const lineAngleDeg = Number(rotateText) * angle + 90 * lineObj.orientation;

const sinAngleTm = Math.sin(lineAngleDeg * (Math.PI / 180));
const cosAngleTm = Math.cos(lineAngleDeg * (Math.PI / 180));

if (lineObj.orientation === 1) {
textContentObjStr += `${formatNum(cosAngleTm)} ${formatNum(-sinAngleTm)} ${formatNum(sinAngleTm)} ${formatNum(cosAngleTm)} ${formatNum(outputDims.width - lineTopAdj + 1)} ${formatNum(outputDims.height - lineLeftAdj)} Tm\n`;
} else if (lineObj.orientation === 2) {
textContentObjStr += `${formatNum(cosAngleTm)} ${formatNum(-sinAngleTm)} ${formatNum(sinAngleTm)} ${formatNum(cosAngleTm)} ${formatNum(outputDims.width - lineLeftAdj + 1)} ${formatNum(lineTopAdj)} Tm\n`;
} else if (lineObj.orientation === 3) {
textContentObjStr += `${formatNum(cosAngleTm)} ${formatNum(-sinAngleTm)} ${formatNum(sinAngleTm)} ${formatNum(cosAngleTm)} ${formatNum(lineTopAdj)} ${formatNum(lineLeftAdj)} Tm\n`;
} else {
textContentObjStr += `${String(1)} ${String(0)} ${String(0)} ${String(1)} ${String(lineLeftAdj)} ${String(outputDims.height - lineTopAdj + 1)} Tm\n`;
textContentObjStr += `${formatNum(cosAngleTm)} ${formatNum(-sinAngleTm)} ${formatNum(sinAngleTm)} ${formatNum(cosAngleTm)} ${formatNum(lineLeftAdj)} ${formatNum(outputDims.height - lineTopAdj + 1)} Tm\n`;
}

textContentObjStr += '[ ';
Expand Down Expand Up @@ -452,8 +465,8 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
// const pdfFont = word.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFontFamily][word.style];
const { name: pdfFont, type: pdfFontType } = wordJ.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][wordJ.style];

const wordWidthAdj = (wordJ.bbox.right - wordJ.bbox.left) / cosAngle;
const wordSpaceAdj = (wordJ.bbox.left - wordBoxLast.right) / cosAngle;
const wordWidthAdj = (wordJ.bbox.right - wordJ.bbox.left) / cosAnglePage;
const wordSpaceAdj = (wordJ.bbox.left - wordBoxLast.right) / cosAnglePage;

// Add space character between words
if (j > 0 && !kernSpacing) {
Expand Down Expand Up @@ -531,7 +544,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
if (wordJ.lang === 'chi_sim' && j + 1 < words.length && words[j + 1].lang === 'chi_sim') {
kernSpacing = true;
const wordNext = words[j + 1];
const wordSpaceNextAdj = (wordNext.bbox.left - wordJ.bbox.right) / cosAngle;
const wordSpaceNextAdj = (wordNext.bbox.left - wordJ.bbox.right) / cosAngleTm;
// const wordSpaceNextAdj = wordNext.bbox.left - wordBox.right;

const wordGlyph = wordFontOpentype.charToGlyph(charArr.at(-1));
Expand Down
106 changes: 88 additions & 18 deletions js/import/convertPageStext.js
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,18 @@ export async function convertPageStext({ ocrStr, n }) {
const xmlLinePreChar = xmlLine.match(/^[\s\S]*?(?=<char)/)?.[0];
if (!xmlLinePreChar) return;

const dirStr = xmlLinePreChar.match(/dir=['"]([^'"]*)/)?.[1];
const dirSlopeStr = dirStr?.match(/[-\d.]+$/)?.[0];
const dirSlope = dirSlopeStr ? parseFloat(dirSlopeStr) : null;
const dir = xmlLinePreChar.match(/dir=['"](\s*[\d.-]+)(\s*[\d.-]+)/)?.slice(1, 3).map((x) => parseFloat(x));

// TODO: Orientation is only detected when `dir` is exactly axis-aligned (i.e. the text gradient is 0).
// It should also work for text that has both a non-zero gradient and a non-zero orientation.
let orientation = 0;
if (dir && dir[0] === 0 && dir[1] === 1) {
orientation = 1;
} else if (dir && dir[0] === -1 && dir[1] === 0) {
orientation = 2;
} else if (dir && dir[0] === 0 && dir[1] === -1) {
orientation = 3;
}

const xmlLineFormatting = xmlLinePreChar?.match(/<font[^>]+/)?.[0];
const fontName = xmlLineFormatting?.match(/name=['"]([^'"]*)/)?.[1];
Expand Down Expand Up @@ -168,12 +177,36 @@ export async function convertPageStext({ ocrStr, n }) {
continue;
}

const quad = {
ul: { x: parseFloat(stextMatches[j][2]), y: parseFloat(stextMatches[j][3]) },
ur: { x: parseFloat(stextMatches[j][4]), y: parseFloat(stextMatches[j][5]) },
ll: { x: parseFloat(stextMatches[j][6]), y: parseFloat(stextMatches[j][7]) },
lr: { x: parseFloat(stextMatches[j][8]), y: parseFloat(stextMatches[j][9]) },
};
let quad;
if (orientation === 1) {
quad = {
ul: { x: parseFloat(stextMatches[j][6]), y: parseFloat(stextMatches[j][7]) },
ur: { x: parseFloat(stextMatches[j][2]), y: parseFloat(stextMatches[j][3]) },
ll: { x: parseFloat(stextMatches[j][8]), y: parseFloat(stextMatches[j][9]) },
lr: { x: parseFloat(stextMatches[j][4]), y: parseFloat(stextMatches[j][5]) },
};
} else if (orientation === 2) {
quad = {
ul: { x: parseFloat(stextMatches[j][8]), y: parseFloat(stextMatches[j][9]) },
ur: { x: parseFloat(stextMatches[j][6]), y: parseFloat(stextMatches[j][7]) },
ll: { x: parseFloat(stextMatches[j][4]), y: parseFloat(stextMatches[j][5]) },
lr: { x: parseFloat(stextMatches[j][2]), y: parseFloat(stextMatches[j][3]) },
};
} else if (orientation === 3) {
quad = {
ul: { x: parseFloat(stextMatches[j][4]), y: parseFloat(stextMatches[j][5]) },
ur: { x: parseFloat(stextMatches[j][8]), y: parseFloat(stextMatches[j][9]) },
ll: { x: parseFloat(stextMatches[j][2]), y: parseFloat(stextMatches[j][3]) },
lr: { x: parseFloat(stextMatches[j][6]), y: parseFloat(stextMatches[j][7]) },
};
} else {
quad = {
ul: { x: parseFloat(stextMatches[j][2]), y: parseFloat(stextMatches[j][3]) },
ur: { x: parseFloat(stextMatches[j][4]), y: parseFloat(stextMatches[j][5]) },
ll: { x: parseFloat(stextMatches[j][6]), y: parseFloat(stextMatches[j][7]) },
lr: { x: parseFloat(stextMatches[j][8]), y: parseFloat(stextMatches[j][9]) },
};
}

wordCharOrFontArr[i][j] = {
quad,
Expand Down Expand Up @@ -324,16 +357,51 @@ export async function convertPageStext({ ocrStr, n }) {
wordInit = true;
}

const bbox = {
left: Math.round(charOrFont.origin.x),
top: Math.round(Math.min(charOrFont.quad.ul.y, charOrFont.quad.ur.y)),
right: Math.round(charOrFont.origin.x + (charOrFont.quad.ur.x - charOrFont.quad.ul.x)),
bottom: Math.round(Math.max(charOrFont.quad.ll.y, charOrFont.quad.lr.y)),
};
let bbox;
if (orientation === 1) {
bbox = {
left: Math.round(charOrFont.origin.y),
top: Math.round(pageDims.width - Math.max(charOrFont.quad.ur.x, charOrFont.quad.lr.x)),
right: Math.round(charOrFont.origin.y + (charOrFont.quad.lr.y - charOrFont.quad.ur.y)),
bottom: Math.round(pageDims.width - Math.min(charOrFont.quad.ul.x, charOrFont.quad.ll.x)),
};
} else if (orientation === 2) {
bbox = {
left: Math.round(pageDims.width - charOrFont.origin.x),
top: Math.round(pageDims.height - Math.max(charOrFont.quad.ll.y, charOrFont.quad.lr.y)),
right: Math.round(pageDims.width - (charOrFont.origin.x - (charOrFont.quad.ur.x - charOrFont.quad.ul.x))),
bottom: Math.round(pageDims.height - Math.min(charOrFont.quad.ul.y, charOrFont.quad.ur.y)),
};
} else if (orientation === 3) {
bbox = {
left: Math.round(pageDims.height - charOrFont.origin.y),
top: Math.round(Math.min(charOrFont.quad.ul.x, charOrFont.quad.ll.x)),
right: Math.round(pageDims.height - charOrFont.origin.y + (charOrFont.quad.lr.y - charOrFont.quad.ur.y)),
bottom: Math.round(Math.max(charOrFont.quad.ur.x, charOrFont.quad.lr.x)),
};
} else {
bbox = {
left: Math.round(charOrFont.origin.x),
top: Math.round(Math.min(charOrFont.quad.ul.y, charOrFont.quad.ur.y)),
right: Math.round(charOrFont.origin.x + (charOrFont.quad.ur.x - charOrFont.quad.ul.x)),
bottom: Math.round(Math.max(charOrFont.quad.ll.y, charOrFont.quad.lr.y)),
};
}

if (!superCurrent) {
if (baselineFirst.length === 0) {
baselineFirst.push(bbox.left, charOrFont.origin.y);
let originY;
if (orientation === 1) {
originY = pageDims.width - charOrFont.origin.x;
} else if (orientation === 2) {
originY = pageDims.height - charOrFont.origin.y;
} else if (orientation === 3) {
originY = charOrFont.origin.x;
} else {
originY = charOrFont.origin.y;
}

baselineFirst.push(bbox.left, originY);
}
}

Expand Down Expand Up @@ -373,8 +441,8 @@ export async function convertPageStext({ ocrStr, n }) {
if (bboxes.length === 0) return;

let baselineSlope = 0;
if (dirSlope !== null) {
baselineSlope = dirSlope;
if (dir && dir[1] !== undefined && !Number.isNaN(dir[1])) {
baselineSlope = dir[1];
} else {
console.log('Unable to parse slope.');
}
Expand All @@ -394,6 +462,8 @@ export async function convertPageStext({ ocrStr, n }) {

const lineObj = new ocr.OcrLine(pageObj, lineBbox, baselineOut, letterHeightOut, null);

lineObj.orientation = orientation;

lineObj.raw = xmlLine;

let lettersKept = 0;
Expand Down
5 changes: 4 additions & 1 deletion js/modifyOCR.js
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,10 @@ export function combineOCRPage(pageA, pageB, pageMetricsObj, replaceFontSize = f
for (lineI = 0; lineI < pageB.lines.length; lineI++) {
const line = pageB.lines[lineI];

if (line.words.length === 0) continue;
// Skip lines that are not horizontal for now.
// Horizontal lines should not be combined with vertical lines,
// and combining vertical lines likely brings up some edge cases we have not considered.
if (line.words.length === 0 || line.orientation !== 0) continue;

const lineRot = ocr.cloneLine(line);
if (pageMetricsObj.angle) ocr.rotateLine(lineRot, pageMetricsObj.angle * -1, pageMetricsObj.dims);
Expand Down
2 changes: 2 additions & 0 deletions js/objects/ocrObjects.js
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ export function OcrLine(page, bbox, baseline, ascHeight = null, xHeight = null)
this._angleAdj = null;
/** @type {OcrPar} */
this.par = null;
/** @type {number} */
this.orientation = 0;
}

/**
Expand Down
Binary file not shown.
42 changes: 41 additions & 1 deletion tests/module/importPdfText.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ describe('Check that page angle is calculated correctly.', function () {
}).timeout(10000);

it('Different orientations should not impact page angle.', async () => {
await scribe.importFiles([`${ASSETS_PATH_KARMA}/CSF_Proposed_Budget_Book_June_2024_r8_30.pdf`]);
await scribe.importFiles([`${ASSETS_PATH_KARMA}/CSF_Proposed_Budget_Book_June_2024_r8_30_all_orientations.pdf`]);
assert.strictEqual(scribe.data.pageMetrics[0].angle, 0);
}).timeout(10000);

Expand All @@ -281,6 +281,46 @@ describe('Check that page angle is calculated correctly.', function () {
});
}).timeout(120000);

// Test suite for lines of text printed at 90/180/270 degrees.
// Per the assertions below, the fixture PDF holds the same content on 4 pages:
// page index 1 is used as the non-rotated reference, and indices 0, 2, and 3 are
// expected to have line orientations 3, 1, and 2 (respectively).
describe('Check that text orientation is handled correctly.', function () {
  this.timeout(10000);

  /** Imports the fixture PDF shared by every test in this suite. */
  const importOrientationsPdf = () => scribe.importFiles([`${ASSETS_PATH_KARMA}/CSF_Proposed_Budget_Book_June_2024_r8_30_all_orientations.pdf`]);

  /**
   * Returns the first word of the third line on the requested page.
   * @param {number} pageIndex
   */
  const getSampleWord = (pageIndex) => scribe.data.ocr.active[pageIndex].lines[2].words[0];

  /**
   * Asserts that every side of the sample word's bounding box on page `pageIndex`
   * is within 1 unit of the same word's bounding box on the reference page (index 1).
   * @param {number} pageIndex
   */
  const assertBboxMatchesReference = (pageIndex) => {
    for (const side of ['left', 'right', 'top', 'bottom']) {
      assert.approximately(getSampleWord(pageIndex).bbox[side], getSampleWord(1).bbox[side], 1);
    }
  };

  it('Lines printed at exactly 90/180/270 degrees have orientation detected correctly', async () => {
    await importOrientationsPdf();
    assert.strictEqual(getSampleWord(0).line.orientation, 3);
    assert.strictEqual(getSampleWord(3).line.orientation, 2);
    assert.strictEqual(getSampleWord(2).line.orientation, 1);
  }).timeout(10000);

  // The following tests compare the coordinates of a rotated line to the same line in a non-rotated version of the same document.
  it('Rotating text exactly 90 degrees counterclockwise does not have significant impact on word coordinates after parsing', async () => {
    await importOrientationsPdf();
    assertBboxMatchesReference(0);
  }).timeout(10000);

  it('Rotating text exactly 90 degrees clockwise does not have significant impact on word coordinates after parsing', async () => {
    await importOrientationsPdf();
    assertBboxMatchesReference(2);
  }).timeout(10000);

  it('Rotating text exactly 180 degrees does not have significant impact on word coordinates after parsing', async () => {
    await importOrientationsPdf();
    assertBboxMatchesReference(3);
  }).timeout(10000);

  after(async () => {
    await scribe.terminate();
  });
}).timeout(120000);

describe('Check that PDF text types are detected and imported correctly.', function () {
this.timeout(10000);

Expand Down

0 comments on commit 30eb7fd

Please sign in to comment.