From 30eb7fd4ff3868ec87bc63c2cdb56b33617b6003 Mon Sep 17 00:00:00 2001 From: Balearica Date: Fri, 22 Nov 2024 00:58:47 -0800 Subject: [PATCH] Initial implementation of support for different text orientations --- js/export/writePdf.js | 29 +++-- js/import/convertPageStext.js | 106 +++++++++++++++--- js/modifyOCR.js | 5 +- js/objects/ocrObjects.js | 2 + ...Book_June_2024_r8_30_all_orientations.pdf} | Bin 134754 -> 268809 bytes tests/module/importPdfText.spec.js | 42 ++++++- 6 files changed, 156 insertions(+), 28 deletions(-) rename tests/assets/{CSF_Proposed_Budget_Book_June_2024_r8_30.pdf => CSF_Proposed_Budget_Book_June_2024_r8_30_all_orientations.pdf} (50%) diff --git a/js/export/writePdf.js b/js/export/writePdf.js index 2389079..aff6a0f 100644 --- a/js/export/writePdf.js +++ b/js/export/writePdf.js @@ -11,6 +11,11 @@ import { opt } from '../containers/app.js'; import { pageMetricsArr } from '../containers/dataContainer.js'; import ocr from '../objects/ocrObjects.js'; +/** + * @param {number} x + */ +const formatNum = (x) => String(Math.round(x * 1e6) / 1e6); + // Creates 3 PDF objects necessary to embed font. // These are (1) the font dictionary, (2) the font descriptor, and (3) the font file, // which will be located at objects firstObjIndex, firstObjIndex + 1, and firstObjIndex + 2 (respectively). @@ -290,8 +295,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle rotateText = false, rotateBackground = false, confThreshHigh = 85, confThreshMed = 75, fontChiSim = null) { const { lines } = pageObj; - const sinAngle = Math.sin(angle * (Math.PI / 180)); - const cosAngle = Math.cos(angle * (Math.PI / 180)); + const cosAnglePage = Math.cos(angle * (Math.PI / 180)); // Start 1st object: Text Content let textContentObjStr = ''; @@ -372,10 +376,19 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle const lineLeftAdj = wordJ.bbox.left - word0LeftBearing * (tz / 100) + angleAdjLine.x; const lineTopAdj = lineObj.bbox.bottom + lineObj.baseline[1] + angleAdjLine.y; - if (rotateText) { - textContentObjStr += `${String(cosAngle)} ${String(-sinAngle)} ${String(sinAngle)} ${String(cosAngle)} ${String(lineLeftAdj)} ${String(outputDims.height - lineTopAdj + 1)} Tm\n`; + const lineAngleDeg = Number(rotateText) * angle + 90 * lineObj.orientation; + + const sinAngleTm = Math.sin(lineAngleDeg * (Math.PI / 180)); + const cosAngleTm = Math.cos(lineAngleDeg * (Math.PI / 180)); + + if (lineObj.orientation === 1) { + textContentObjStr += `${formatNum(cosAngleTm)} ${formatNum(-sinAngleTm)} ${formatNum(sinAngleTm)} ${formatNum(cosAngleTm)} ${formatNum(outputDims.width - lineTopAdj + 1)} ${formatNum(outputDims.height - lineLeftAdj)} Tm\n`; + } else if (lineObj.orientation === 2) { + textContentObjStr += `${formatNum(cosAngleTm)} ${formatNum(-sinAngleTm)} ${formatNum(sinAngleTm)} ${formatNum(cosAngleTm)} ${formatNum(outputDims.width - lineLeftAdj + 1)} ${formatNum(lineTopAdj)} Tm\n`; + } else if (lineObj.orientation === 3) { + textContentObjStr += `${formatNum(cosAngleTm)} ${formatNum(-sinAngleTm)} ${formatNum(sinAngleTm)} ${formatNum(cosAngleTm)} ${formatNum(lineTopAdj)} ${formatNum(lineLeftAdj)} Tm\n`; } else { - textContentObjStr += `${String(1)} ${String(0)} ${String(0)} ${String(1)} ${String(lineLeftAdj)} ${String(outputDims.height - lineTopAdj + 1)} Tm\n`; + textContentObjStr += `${formatNum(cosAngleTm)} ${formatNum(-sinAngleTm)} ${formatNum(sinAngleTm)} ${formatNum(cosAngleTm)} ${formatNum(lineLeftAdj)} ${formatNum(outputDims.height - lineTopAdj + 1)} Tm\n`; } textContentObjStr += '[ '; @@ -452,8 +465,8 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle // const pdfFont = word.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFontFamily][word.style]; const { name: pdfFont, type: pdfFontType } = wordJ.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][wordJ.style]; - const wordWidthAdj = (wordJ.bbox.right - wordJ.bbox.left) / cosAngle; - const wordSpaceAdj = (wordJ.bbox.left - wordBoxLast.right) / cosAngle; + const wordWidthAdj = (wordJ.bbox.right - wordJ.bbox.left) / cosAnglePage; + const wordSpaceAdj = (wordJ.bbox.left - wordBoxLast.right) / cosAnglePage; // Add space character between words if (j > 0 && !kernSpacing) { @@ -531,7 +544,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle if (wordJ.lang === 'chi_sim' && j + 1 < words.length && words[j + 1].lang === 'chi_sim') { kernSpacing = true; const wordNext = words[j + 1]; - const wordSpaceNextAdj = (wordNext.bbox.left - wordJ.bbox.right) / cosAngle; + const wordSpaceNextAdj = (wordNext.bbox.left - wordJ.bbox.right) / cosAngleTm; // const wordSpaceNextAdj = wordNext.bbox.left - wordBox.right; const wordGlyph = wordFontOpentype.charToGlyph(charArr.at(-1)); diff --git a/js/import/convertPageStext.js b/js/import/convertPageStext.js index d32599f..592974d 100644 --- a/js/import/convertPageStext.js +++ b/js/import/convertPageStext.js @@ -50,9 +50,18 @@ export async function convertPageStext({ ocrStr, n }) { const xmlLinePreChar = xmlLine.match(/^[\s\S]*?(?= parseFloat(x)); + + // TODO: This only works when the text gradient is 0. + // It should work with text with both a non-zero gradient and a non-zero orientation. + let orientation = 0; + if (dir && dir[0] === 0 && dir[1] === 1) { + orientation = 1; + } else if (dir && dir[0] === -1 && dir[1] === 0) { + orientation = 2; + } else if (dir && dir[0] === 0 && dir[1] === -1) { + orientation = 3; + } const xmlLineFormatting = xmlLinePreChar?.match(/]+/)?.[0]; const fontName = xmlLineFormatting?.match(/name=['"]([^'"]*)/)?.[1]; @@ -168,12 +177,36 @@ export async function convertPageStext({ ocrStr, n }) { continue; } - const quad = { - ul: { x: parseFloat(stextMatches[j][2]), y: parseFloat(stextMatches[j][3]) }, - ur: { x: parseFloat(stextMatches[j][4]), y: parseFloat(stextMatches[j][5]) }, - ll: { x: parseFloat(stextMatches[j][6]), y: parseFloat(stextMatches[j][7]) }, - lr: { x: parseFloat(stextMatches[j][8]), y: parseFloat(stextMatches[j][9]) }, - }; + let quad; + if (orientation === 1) { + quad = { + ul: { x: parseFloat(stextMatches[j][6]), y: parseFloat(stextMatches[j][7]) }, + ur: { x: parseFloat(stextMatches[j][2]), y: parseFloat(stextMatches[j][3]) }, + ll: { x: parseFloat(stextMatches[j][8]), y: parseFloat(stextMatches[j][9]) }, + lr: { x: parseFloat(stextMatches[j][4]), y: parseFloat(stextMatches[j][5]) }, + }; + } else if (orientation === 2) { + quad = { + ul: { x: parseFloat(stextMatches[j][8]), y: parseFloat(stextMatches[j][9]) }, + ur: { x: parseFloat(stextMatches[j][6]), y: parseFloat(stextMatches[j][7]) }, + ll: { x: parseFloat(stextMatches[j][4]), y: parseFloat(stextMatches[j][5]) }, + lr: { x: parseFloat(stextMatches[j][2]), y: parseFloat(stextMatches[j][3]) }, + }; + } else if (orientation === 3) { + quad = { + ul: { x: parseFloat(stextMatches[j][4]), y: parseFloat(stextMatches[j][5]) }, + ur: { x: parseFloat(stextMatches[j][8]), y: parseFloat(stextMatches[j][9]) }, + ll: { x: parseFloat(stextMatches[j][2]), y: parseFloat(stextMatches[j][3]) }, + lr: { x: parseFloat(stextMatches[j][6]), y: parseFloat(stextMatches[j][7]) }, + }; + } else { + quad = { + ul: { x: parseFloat(stextMatches[j][2]), y: parseFloat(stextMatches[j][3]) }, + ur: { x: parseFloat(stextMatches[j][4]), y: parseFloat(stextMatches[j][5]) }, + ll: { x: parseFloat(stextMatches[j][6]), y: parseFloat(stextMatches[j][7]) }, + lr: { x: parseFloat(stextMatches[j][8]), y: parseFloat(stextMatches[j][9]) }, + }; + } wordCharOrFontArr[i][j] = { quad, @@ -324,16 +357,51 @@ export async function convertPageStext({ ocrStr, n }) { wordInit = true; } - const bbox = { - left: Math.round(charOrFont.origin.x), - top: Math.round(Math.min(charOrFont.quad.ul.y, charOrFont.quad.ur.y)), - right: Math.round(charOrFont.origin.x + (charOrFont.quad.ur.x - charOrFont.quad.ul.x)), - bottom: Math.round(Math.max(charOrFont.quad.ll.y, charOrFont.quad.lr.y)), - }; + let bbox; + if (orientation === 1) { + bbox = { + left: Math.round(charOrFont.origin.y), + top: Math.round(pageDims.width - Math.max(charOrFont.quad.ur.x, charOrFont.quad.lr.x)), + right: Math.round(charOrFont.origin.y + (charOrFont.quad.lr.y - charOrFont.quad.ur.y)), + bottom: Math.round(pageDims.width - Math.min(charOrFont.quad.ul.x, charOrFont.quad.ll.x)), + }; + } else if (orientation === 2) { + bbox = { + left: Math.round(pageDims.width - charOrFont.origin.x), + top: Math.round(pageDims.height - Math.max(charOrFont.quad.ll.y, charOrFont.quad.lr.y)), + right: Math.round(pageDims.width - (charOrFont.origin.x - (charOrFont.quad.ur.x - charOrFont.quad.ul.x))), + bottom: Math.round(pageDims.height - Math.min(charOrFont.quad.ul.y, charOrFont.quad.ur.y)), + }; + } else if (orientation === 3) { + bbox = { + left: Math.round(pageDims.height - charOrFont.origin.y), + top: Math.round(Math.min(charOrFont.quad.ul.x, charOrFont.quad.ll.x)), + right: Math.round(pageDims.height - charOrFont.origin.y + (charOrFont.quad.lr.y - charOrFont.quad.ur.y)), + bottom: Math.round(Math.max(charOrFont.quad.ur.x, charOrFont.quad.lr.x)), + }; + } else { + bbox = { + left: Math.round(charOrFont.origin.x), + top: Math.round(Math.min(charOrFont.quad.ul.y, charOrFont.quad.ur.y)), + right: Math.round(charOrFont.origin.x + (charOrFont.quad.ur.x - charOrFont.quad.ul.x)), + bottom: Math.round(Math.max(charOrFont.quad.ll.y, charOrFont.quad.lr.y)), + }; + } if (!superCurrent) { if (baselineFirst.length === 0) { - baselineFirst.push(bbox.left, charOrFont.origin.y); + let originY; + if (orientation === 1) { + originY = pageDims.width - charOrFont.origin.x; + } else if (orientation === 2) { + originY = pageDims.height - charOrFont.origin.y; + } else if (orientation === 3) { + originY = charOrFont.origin.x; + } else { + originY = charOrFont.origin.y; + } + + baselineFirst.push(bbox.left, originY); } } @@ -373,8 +441,8 @@ export async function convertPageStext({ ocrStr, n }) { if (bboxes.length === 0) return; let baselineSlope = 0; - if (dirSlope !== null) { - baselineSlope = dirSlope; + if (dir && dir[1] !== undefined && !Number.isNaN(dir[1])) { + baselineSlope = dir[1]; } else { console.log('Unable to parse slope.'); } @@ -394,6 +462,8 @@ export async function convertPageStext({ ocrStr, n }) { const lineObj = new ocr.OcrLine(pageObj, lineBbox, baselineOut, letterHeightOut, null); + lineObj.orientation = orientation; + lineObj.raw = xmlLine; let lettersKept = 0; diff --git a/js/modifyOCR.js b/js/modifyOCR.js index f99937c..be654df 100644 --- a/js/modifyOCR.js +++ b/js/modifyOCR.js @@ -41,7 +41,10 @@ export function combineOCRPage(pageA, pageB, pageMetricsObj, replaceFontSize = f for (lineI = 0; lineI < pageB.lines.length; lineI++) { const line = pageB.lines[lineI]; - if (line.words.length === 0) continue; + // Skip lines that are not horizontal for now. + // Horizontal lines should not be combined with vertical lines, + // and combining vertical lines likely brings up some edge cases we have not considered. + if (line.words.length === 0 || line.orientation !== 0) continue; const lineRot = ocr.cloneLine(line); if (pageMetricsObj.angle) ocr.rotateLine(lineRot, pageMetricsObj.angle * -1, pageMetricsObj.dims); diff --git a/js/objects/ocrObjects.js b/js/objects/ocrObjects.js index a47a551..88fb764 100644 --- a/js/objects/ocrObjects.js +++ b/js/objects/ocrObjects.js @@ -84,6 +84,8 @@ export function OcrLine(page, bbox, baseline, ascHeight = null, xHeight = null) this._angleAdj = null; /** @type {OcrPar} */ this.par = null; + /** @type {number} */ + this.orientation = 0; } /** diff --git a/tests/assets/CSF_Proposed_Budget_Book_June_2024_r8_30.pdf b/tests/assets/CSF_Proposed_Budget_Book_June_2024_r8_30_all_orientations.pdf similarity index 50% rename from tests/assets/CSF_Proposed_Budget_Book_June_2024_r8_30.pdf rename to tests/assets/CSF_Proposed_Budget_Book_June_2024_r8_30_all_orientations.pdf index 04472ea2a141710aac6a9e74a7ba3d239e8f99b6..0e382bc27af54452cfa66849d79cd5974c1a3ec1 100644 GIT binary patch delta 8986 zcmd^EO^g)B6`o$O3=4yaARvqvwCxV$} zW-yB{lO++3FLt649*0E9HA*a5_L6LrLt-gWE;;y;TMmhG$u&}>Tq5u0z3RW}>SozJ zd!^awc~!68t9sw}URCu!f4b$}M?1g!edE#4wH-b+ZL@XlJLR!4>wM6>-kmcY;`!Fa z+t<2x=L74~h4WSgA1(ylTD{h-jgNP_?VxtE{K@_D&^Agc#5>N(apv@YHq=vpFCE^r z!#K#enDOW}chn|47-jy9J6a=-H#+0Z5>}gKH8LA4UEciRmWlFk(5y#e!xQ@Cq`pnI zZa2FovtDl0J5>(yDO0%NmluAC{*m(jk&#iRjkkhhH)gHb*6lWa&)uD$3!2AfYMsDpHD%L$tsOLP z1hZXBKDE)V1w%Gkb?escRu`U7(rGN<0G;{TOkj2A+JRK=-r3$l@!LTgzXv~m$M{&A z081yp>|_G{_jY}KaO=m$*}=z;jIZ`&oX@nHUF-F#Q>E0N_Nv^qP5ih*OcPS2E;HdX zO*phWCUb;IiEA=KWv{sR8sF0n#X+YXx5|CjbU~XlX1a_Io_T0=`fon_)MiS2bMV85 z#tE#mW(KORL3 zur*nL>$PTMrd1D2LIn&z z6=EL)xO8=I*GRw(;Zr6&dYL##tSRes6SN!6>(;qCEMlX3cl2z+d$eF%iU|ivq?kw@ zbyM#2pWA);Na83AK{7+2AIcC*_jkXcc$tP3aoIn-@7&sRmb_Qyytwb6GFqBDU^Exz zj9zYZ&ghcjurlX3rHqaB|N3mOwxcpnDUP1o&xg|Br1_*@-V;22^(f*&ai)K7kDHEy zHX=;Fx9dRvFC*Vx+b5YqQ=j^a`}Xut99T&QR7ipYS`W7N`_DfiA{sBEViTAGHU zoaJ36OGAxI7#tyb%^7O}&H`Fw=;2=SH_l;Si%&XVp|Azuyr!3Ws>z^PD7*kPKYro# zoT?^hp->_M^O_!MVFcJ^^Rv*%5n*}FF*VNSp)7aQQF`9CgAY-sKe6LV@9@fMX`ytp zG+#Pas+St@IbL$$zbLsS3g313HCo~&uS7}=o*4X|fws}owbCp++w!RnJ$NaA76O`e z=fi{C7*2kMuu!=3A8_r))Vi z{wFXv>U*GKgOZRkzvP2-vUCQlkkSNvGx*^D(=a$Wvh@DW2SyK$CnhjpeB3&{(0%D* zx7H2daP-ne+az-HKLyVOo+VG97jC$q!z8^5+vISK;hC6jgnfeYD4qo65l$+wiHuTK z1*8!MD{oTR45I^9rQ(=w#KsD2kt4Ujr*M*Yl`E)ug>v#rI z*q~%746W^bDML$T9Ew5rnV7QhVc_t<6vYsv zQSd7@h))Cx2(7Gd*qNEKG6CY!&=RqM*eZo%+YUA){4Zr}ZON;m*xKQu%h=kITUu;o z>77Cf6H^wYIJOZ#k+!l7gt3(+A0I&)CDlrgnzG`c*fC|g$46$$Or41>^)?V&^dL;h z@%;?-nSk?rbX78Z%bJALG_xdLIMjKr;1b104{?P4j~;yO=}2qocMm=>3VS0~quvL$ zQMmYsHI{z$QL#o`XJP!2?VKwrfB5X!P@xfwM`DCKiIL?ydyPLATZ!vuXobK*IdW`y z^D*P;VlxPc$jr!g%6OtP_{}}zzlEN~TSPDe(^owE;0MMp3|_Rpab~T`o0X~Q_)bY~ zO%n5*DlUKVf$^1Mm*a5bG_NSjKYaGxhYSB%eSc;7?^pl5y1M$k#k2Dz^Rv?D$A<3} zI7Q^%5sg=wEE|beMD7q{w+1IZHJ*w=vHbF<#*0HEg~O>q-p13Z4DS8bc=7o{l{m<7 zrFM127#ZwbF%A&9)lcfGHB&Q*)tWcEuvt!v6_F%GA4A=-*B+xbTf5Q8=*1DQTIVJw zPr?OT9qxu?2~`$AiSfwbixp!yS%4j@#xui3K_+$_<@BC3%flu6h0#v&+QsP^Vwxpj zad!m*c?K4$$kSLH(mVx=buvWHaj;lNMZtESBVn;pj04Hs91Dwe6t)-8&?~~5JPe13 zmpM5NI9xF*c_2^3;^E?e=LjTk4dT@=2>V5v*lqp9m2ol4^DX;W(f_{bixMW zFpmTJ@y~3=GGSx()?~5(#xe+HG&FH~g5qf+Y2s;<;0z)gx4>Q`Q4&60JS8nOZSfhN zUXm71X@PfSN{b)0^lW$s;;2e+gcWu)8F*2`6fGQ=Fak%Jbfie;V7A7;WFZ(4cpI3N zQR+c8RAgwMGE6^;5M(f(GI?wRnS&>0t)IXP+pLUbmK56vVjzQNXUr)V`i#Mq+-h;=LE-IXjR;1$4kQ6A z(+3n_MPCKM1s?S>P79ae)>D)D_>w%70srxV2+zSCVJJ*w0(afI3@cbx243q}5Q5_l zLF>nvuGD+|en9GAt`uXSk2`NtflFc3quKFf}i4 zqvwkJ(zt?aA)(XxgJy}qfc)6u^ zDZFjgjeu$0L6^EZW~l2x=u=%8e-!4uc7)L5?mB+n2$Ki0Oh}+=s9Chzt~G81?eZ|} zE-yCThWFc!4_BToc+bBL%e8f`IomSX;D6OU}SMczW`*LnqkDlM~Lw uJ>gFJe3Fk(Td%f*w_pqhDhHe~hdbR`yBij5>bjl}<@wN|)32N<5C0!R3wD?Q delta 13 UcmeC&BJgMqM?(wa7N#UV04Vna{{R30 diff --git a/tests/module/importPdfText.spec.js b/tests/module/importPdfText.spec.js index a2354c9..e43ac57 100644 --- a/tests/module/importPdfText.spec.js +++ b/tests/module/importPdfText.spec.js @@ -272,7 +272,7 @@ describe('Check that page angle is calculated correctly.', function () { }).timeout(10000); it('Different orientations should not impact page angle.', async () => { - await scribe.importFiles([`${ASSETS_PATH_KARMA}/CSF_Proposed_Budget_Book_June_2024_r8_30.pdf`]); + await scribe.importFiles([`${ASSETS_PATH_KARMA}/CSF_Proposed_Budget_Book_June_2024_r8_30_all_orientations.pdf`]); assert.strictEqual(scribe.data.pageMetrics[0].angle, 0); }).timeout(10000); @@ -281,6 +281,46 @@ describe('Check that page angle is calculated correctly.', function () { }); }).timeout(120000); +describe('Check that text orientation is handled correctly.', function () { + this.timeout(10000); + + it('Lines printed at exactly 90/180/270 degrees have orientation detected correctly', async () => { + await scribe.importFiles([`${ASSETS_PATH_KARMA}/CSF_Proposed_Budget_Book_June_2024_r8_30_all_orientations.pdf`]); + assert.strictEqual(scribe.data.ocr.active[0].lines[2].words[0].line.orientation, 3); + assert.strictEqual(scribe.data.ocr.active[3].lines[2].words[0].line.orientation, 2); + assert.strictEqual(scribe.data.ocr.active[2].lines[2].words[0].line.orientation, 1); + }).timeout(10000); + + // The following tests compare the coordinates of a rotated line to the same line in a non-rotated version of the same document. + it('Rotating text exactly 90 degrees counterclockwise does not have significant impact on word coordinates after parsing', async () => { + await scribe.importFiles([`${ASSETS_PATH_KARMA}/CSF_Proposed_Budget_Book_June_2024_r8_30_all_orientations.pdf`]); + assert.approximately(scribe.data.ocr.active[0].lines[2].words[0].bbox.left, scribe.data.ocr.active[1].lines[2].words[0].bbox.left, 1); + assert.approximately(scribe.data.ocr.active[0].lines[2].words[0].bbox.right, scribe.data.ocr.active[1].lines[2].words[0].bbox.right, 1); + assert.approximately(scribe.data.ocr.active[0].lines[2].words[0].bbox.top, scribe.data.ocr.active[1].lines[2].words[0].bbox.top, 1); + assert.approximately(scribe.data.ocr.active[0].lines[2].words[0].bbox.bottom, scribe.data.ocr.active[1].lines[2].words[0].bbox.bottom, 1); + }).timeout(10000); + + it('Rotating text exactly 90 degrees clockwise does not have significant impact on word coordinates after parsing', async () => { + await scribe.importFiles([`${ASSETS_PATH_KARMA}/CSF_Proposed_Budget_Book_June_2024_r8_30_all_orientations.pdf`]); + assert.approximately(scribe.data.ocr.active[2].lines[2].words[0].bbox.left, scribe.data.ocr.active[1].lines[2].words[0].bbox.left, 1); + assert.approximately(scribe.data.ocr.active[2].lines[2].words[0].bbox.right, scribe.data.ocr.active[1].lines[2].words[0].bbox.right, 1); + assert.approximately(scribe.data.ocr.active[2].lines[2].words[0].bbox.top, scribe.data.ocr.active[1].lines[2].words[0].bbox.top, 1); + assert.approximately(scribe.data.ocr.active[2].lines[2].words[0].bbox.bottom, scribe.data.ocr.active[1].lines[2].words[0].bbox.bottom, 1); + }).timeout(10000); + + it('Rotating text exactly 180 degrees does not have significant impact on word coordinates after parsing', async () => { + await scribe.importFiles([`${ASSETS_PATH_KARMA}/CSF_Proposed_Budget_Book_June_2024_r8_30_all_orientations.pdf`]); + assert.approximately(scribe.data.ocr.active[3].lines[2].words[0].bbox.left, scribe.data.ocr.active[1].lines[2].words[0].bbox.left, 1); + assert.approximately(scribe.data.ocr.active[3].lines[2].words[0].bbox.right, scribe.data.ocr.active[1].lines[2].words[0].bbox.right, 1); + assert.approximately(scribe.data.ocr.active[3].lines[2].words[0].bbox.top, scribe.data.ocr.active[1].lines[2].words[0].bbox.top, 1); + assert.approximately(scribe.data.ocr.active[3].lines[2].words[0].bbox.bottom, scribe.data.ocr.active[1].lines[2].words[0].bbox.bottom, 1); + }).timeout(10000); + + after(async () => { + await scribe.terminate(); + }); +}).timeout(120000); + describe('Check that PDF text types are detected and imported correctly.', function () { this.timeout(10000);