diff --git a/js/extractTables.js b/js/extractTables.js index 1a39ba9..bc6f1db 100644 --- a/js/extractTables.js +++ b/js/extractTables.js @@ -1,5 +1,5 @@ -import { calcBoxOverlap } from './modifyOCR.js'; import ocr from './objects/ocrObjects.js'; +import { calcBoxOverlap } from './utils/miscUtils.js'; /** * diff --git a/js/import/convertPageHocr.js b/js/import/convertPageHocr.js index 1e05286..26790af 100644 --- a/js/import/convertPageHocr.js +++ b/js/import/convertPageHocr.js @@ -7,6 +7,7 @@ import { import { LayoutDataTablePage } from '../objects/layoutObjects.js'; import { pass2, pass3 } from './convertPageShared.js'; +import { detectTablesInPage, makeTableFromBbox } from '../utils/detectTables.js'; // If enabled, raw strings are saved in OCR objects for debugging purposes. const debugMode = true; @@ -374,7 +375,18 @@ export async function convertPageHocr({ pass2(pageObj, rotateAngle); const langSet = pass3(pageObj); + const autoDetectTables = false; + const dataTablePage = new LayoutDataTablePage(n); + if (autoDetectTables) { + const tableBboxes = detectTablesInPage(pageObj); + tableBboxes.forEach((bbox) => { + const dataTable = makeTableFromBbox(pageObj, bbox); + dataTable.page = dataTablePage; + dataTablePage.tables.push(dataTable); + }); + } + return { - pageObj, dataTables: new LayoutDataTablePage(n), warn, langSet, + pageObj, dataTables: dataTablePage, warn, langSet, }; } diff --git a/js/import/convertPageStext.js b/js/import/convertPageStext.js index 42c6262..d390f49 100644 --- a/js/import/convertPageStext.js +++ b/js/import/convertPageStext.js @@ -2,6 +2,7 @@ import ocr from '../objects/ocrObjects.js'; import { calcBboxUnion, + calcBoxOverlap, calcLang, mean50, quantile, @@ -10,6 +11,8 @@ import { } from '../utils/miscUtils.js'; import { LayoutDataTablePage } from '../objects/layoutObjects.js'; +import { detectTablesInPage, makeTableFromBbox } from '../utils/detectTables.js'; +import { splitLineAgressively } from '../utils/ocrUtils.js'; /** * @param {Object} params @@ -410,6 +413,10 @@ export async function convertPageStext({ ocrStr, n }) { // If there are no letters in the line, drop the entire line element if (lettersKept === 0) return; + // Recalculate the bounding box. + // The bounding boxes reported by mupdf are often significantly larger than the actual text. + ocr.updateLineBbox(lineObj); + pageObj.lines.push(lineObj); parLineArr.push(lineObj); // eslint-disable-next-line consistent-return @@ -449,5 +456,32 @@ export async function convertPageStext({ ocrStr, n }) { pageObj.angle = angleOut; - return { pageObj, dataTables: new LayoutDataTablePage(n), langSet }; + const autoDetectTables = false; + const dataTablePage = new LayoutDataTablePage(n); + if (autoDetectTables) { + const tableBboxes = detectTablesInPage(pageObj); + + for (let i = 0; i < pageObj.lines.length; i++) { + const line = pageObj.lines[i]; + let inTable = false; + for (let j = 0; j < tableBboxes.length; j++) { + if (calcBoxOverlap(line.bbox, tableBboxes[j]) > 0.25) { + inTable = true; + break; + } + } + if (inTable) { + const newLines = splitLineAgressively(line); + pageObj.lines.splice(i, 1, ...newLines); + } + } + + tableBboxes.forEach((bbox) => { + const dataTable = makeTableFromBbox(pageObj, bbox); + dataTable.page = dataTablePage; + dataTablePage.tables.push(dataTable); + }); + } + + return { pageObj, dataTables: dataTablePage, langSet }; } diff --git a/js/modifyOCR.js b/js/modifyOCR.js index 406e2a7..f99937c 100644 --- a/js/modifyOCR.js +++ b/js/modifyOCR.js @@ -1,27 +1,5 @@ import ocr from './objects/ocrObjects.js'; -import { getRandomAlphanum } from './utils/miscUtils.js'; - -/** - * Returns the proportion of boxA's area contained in boxB - * @param {bbox} boxA - * @param {bbox} boxB - */ -export function calcBoxOverlap(boxA, boxB) { - const left = Math.max(boxA.left, boxB.left); - const top = Math.max(boxA.top, boxB.top); - const right = Math.min(boxA.right, boxB.right); - const bottom = Math.min(boxA.bottom, boxB.bottom); - - const width = right - left; - const height = bottom - top; - - if (width < 0 || height < 0) return 0; - - const areaA = (boxA.bottom - boxA.top) * (boxA.right - boxA.left); - const area = width * height; - - return area / areaA; -} +import { calcBoxOverlap, getRandomAlphanum } from './utils/miscUtils.js'; /** * Adds lines from a new page to an existing page. diff --git a/js/utils/detectTables.js b/js/utils/detectTables.js index 57502d9..02f659d 100644 --- a/js/utils/detectTables.js +++ b/js/utils/detectTables.js @@ -1,3 +1,10 @@ +import { LayoutDataColumn, LayoutDataTable } from '../objects/layoutObjects.js'; +import ocr from '../objects/ocrObjects.js'; +import { + calcBboxUnion, calcBoxOverlap, calcHorizontalOverlap, mean50, +} from './miscUtils.js'; +import { splitLineAgressively } from './ocrUtils.js'; + /** * * @param {Array} boundingBoxes @@ -37,5 +44,279 @@ export function calcColumnBounds(boundingBoxes) { } }); + // Expand column bounds so there is no empty space between columns. + for (let i = 0; i < columnBounds.length - 1; i++) { + const boundRight = (columnBounds[i].right + columnBounds[i + 1].left) / 2; + columnBounds[i].right = boundRight; + columnBounds[i + 1].left = boundRight; + } + return columnBounds; } + +/** + * Detects tables in an OcrPage and returns a structured object. + * Each table contains columns, and each column contains rows (lines). + * @param {OcrPage} ocrPage - OcrPage object containing OcrLine objects. + */ +export function detectTablesInPage(ocrPage) { + const lines = ocr.clonePage(ocrPage).lines; + + // Sort lines by the top position of their bounding boxes + lines.sort((a, b) => a.bbox.top - b.bbox.top); + + /** @type {Array<{avgTop: number, items: Array}>} */ + const rows = []; + // TODO: Make this dynamic so it adjusts based on font size. + const rowThreshold = 10; // Threshold for vertical alignment + + // Group lines into rows based on vertical proximity + lines.forEach((item) => { + let addedToRow = false; + + for (const row of rows) { + // Check if the line is vertically aligned with the row + if (Math.abs(item.bbox.top - row.avgTop) <= rowThreshold) { + row.items.push(item); + // Update the average top position of the row + row.avgTop = row.items.reduce((sum, itm) => sum + itm.bbox.top, 0) / row.items.length; + addedToRow = true; + break; + } + } + + if (!addedToRow) { + // Create a new row if the line doesn't fit in existing rows + rows.push({ avgTop: item.bbox.top, items: [item] }); + } + }); + + // Sort the lines within each row by their left position + rows.forEach((row) => { + row.items.sort((a, b) => a.bbox.left - b.bbox.left); + }); + + /** + * + * @param {{avgTop: number, items: Array}} row + */ + const containsNumbers = (row) => { + let wordsNumN = 0; + row.items.forEach((line) => { + line.words.forEach((word) => { + if (/[0-9]/.test(word.text)) wordsNumN++; + }); + }); + + if (wordsNumN < 4) return false; + return true; + }; + + /** + * + * @param {{avgTop: number, items: Array}} row + */ + const splitRowLinesAgressively = (row) => { + const row2 = { avgTop: row.avgTop, items: /** @type {Array} */ ([]) }; + row.items.forEach((line) => { + row2.items.push(...splitLineAgressively(line)); + }); + return row2; + }; + + /** + * + * @param {Array} linesA + * @param {Array} linesB + */ + const hasWordOverlap = (linesA, linesB) => { + for (let i = 0; i < linesA.length; i++) { + const lineI = linesA[i]; + const lineJOverlapArr = []; + for (let j = 0; j < linesB.length; j++) { + const lineJ = linesB[j]; + if (lineI.bbox.right < lineJ.bbox.left) break; + if (calcHorizontalOverlap(lineI.bbox, lineJ.bbox) > 0) { + lineJOverlapArr.push(lineJ); + } + } + if (lineJOverlapArr.length > 1) { + const wordsI = lineI.words; + const wordsJ = lineJOverlapArr.map((line) => line.words).flat(); + + for (const wordI of wordsI) { + let overlapCount = 0; + + for (const wordJ of wordsJ) { + if (calcHorizontalOverlap(wordI.bbox, wordJ.bbox) > 0) { + overlapCount++; + if (overlapCount >= 2) { + return true; + } + } + } + } + } + } + return false; + }; + + /** + * + * @param {Array<{avgTop: number, items: Array}>} tableRows + * @param {{avgTop: number, items: Array}} row + */ + const isCompat = (tableRows, row) => { + if (!tableRows || tableRows.length === 0) return false; + + const expectedColumns = mean50(tableRows.map((x) => x.items.length)); + + // const lastRow = tableRows[tableRows.length - 1]; + + const existingLines = tableRows.map((x) => x.items).flat(); + + if (Math.abs(expectedColumns - row.items.length) <= 1) { + return true; + } + + if (globalThis.testControl) return false; + + if (hasWordOverlap(existingLines, row.items) || hasWordOverlap(row.items, existingLines)) { + return false; + } + + return true; + }; + + const minRows = 4; // Minimum number of rows to consider a table + + /** @type {Array}>>} */ + const tables = []; + /** @type {Array<{avgTop: number, items: Array}>} */ + let currentTable = []; + /** @type {Array<{avgTop: number, items: Array}>} */ + let currentTableCompat = []; + let currentTableStartIndex = 0; + + const rowsSplit = rows.map((row) => splitRowLinesAgressively(row)); + + // Detect tables by finding consecutive rows with similar numbers of items + for (let i = 0; i < rowsSplit.length; i++) { + const rowSplit = rowsSplit[i]; + // const rowSplit = rows[i]; + // let rowSplit = rowsSplit[i]; + // let rowSplit; + + if (containsNumbers(rowsSplit[i])) { + // rowSplit = splitLinesAgressively(row); + if (currentTable.length > 0) { + if (isCompat(currentTableCompat, rowSplit)) { + // Continue the current table + currentTable.push(rowSplit); + currentTableCompat.push(rowSplit); + } else if (currentTable.length >= minRows) { + // TODO: Handle case where the the header row is a table row but is not compatible + // with the rows that come afterwards, which puts us in this block. + // End the current table and start a new one + const headerRows = []; + if (rowsSplit[currentTableStartIndex - 1] && (tables.length === 0 || !tables[tables.length - 1].includes(rowsSplit[currentTableStartIndex - 1])) + && isCompat(currentTableCompat, rowsSplit[currentTableStartIndex - 1])) { + headerRows.push(rowsSplit[currentTableStartIndex - 1]); + if (rowsSplit[currentTableStartIndex - 2] && (tables.length === 0 || !tables[tables.length - 1].includes(rowsSplit[currentTableStartIndex - 2])) + && isCompat(currentTableCompat, rowsSplit[currentTableStartIndex - 2])) { + headerRows.push(rowsSplit[currentTableStartIndex - 2]); + } + } + tables.push([...headerRows, ...currentTable]); + + currentTable = [rowSplit]; + currentTableCompat = [rowSplit]; + currentTableStartIndex = i; + } else { + currentTable = [rowSplit]; + currentTableCompat = [rowSplit]; + currentTableStartIndex = i; + } + } else { + currentTable.push(rowSplit); + currentTableCompat.push(rowSplit); + currentTableStartIndex = i; + } + } else if (currentTable.length > 0) { + // If the current row does not pass the checks, but the next two rows do, it is still included. + const nextRowSplit = rowsSplit[i + 1]; + const nextRowSplit2 = rowsSplit[i + 2]; + if (nextRowSplit && nextRowSplit2 && containsNumbers(nextRowSplit) && containsNumbers(nextRowSplit2) + && isCompat(currentTableCompat, nextRowSplit) && isCompat(currentTableCompat, nextRowSplit2)) { + currentTable.push(rowSplit); + continue; + } + + if (currentTable.length >= minRows) { + const headerRows = []; + if (rowsSplit[currentTableStartIndex - 1] && (tables.length === 0 || !tables[tables.length - 1].includes(rowsSplit[currentTableStartIndex - 1])) + && isCompat(currentTableCompat, rowsSplit[currentTableStartIndex - 1])) { + headerRows.push(rowsSplit[currentTableStartIndex - 1]); + if (rowsSplit[currentTableStartIndex - 2] && (tables.length === 0 || !tables[tables.length - 1].includes(rowsSplit[currentTableStartIndex - 2])) + && isCompat(currentTableCompat, rowsSplit[currentTableStartIndex - 2])) { + headerRows.push(rowsSplit[currentTableStartIndex - 2]); + } + } + tables.push([...headerRows, ...currentTable]); + } + + currentTable = []; + currentTableCompat = []; + } + } + + // Add the last table if it exists + if (currentTable.length >= minRows) { + tables.push(currentTable); + } + + const tableLineBboxes = tables.map((table) => calcBboxUnion(table.map((row) => calcBboxUnion(row.items.map((item) => item.bbox))))); + + return tableLineBboxes; +} + +/** + * + * @param {OcrPage} page + * @param {bbox} bbox + */ +export const makeTableFromBbox = (page, bbox) => { + const lines = page.lines.filter((line) => calcBoxOverlap(line.bbox, bbox) > 0.5); + + let columnBboxArr; + if (lines.length > 0) { + const lineBoxes = lines.map((line) => line.bbox); + const columnBoundArr = calcColumnBounds(lineBoxes); + columnBboxArr = columnBoundArr.map((column) => ({ + left: column.left, + top: bbox.top, + right: column.right, + bottom: bbox.bottom, + })); + + // Expand column bounds so there is no empty space between columns. + columnBboxArr[0].left = bbox.left; + columnBboxArr[columnBboxArr.length - 1].right = bbox.right; + for (let i = 0; i < columnBboxArr.length - 1; i++) { + const boundRight = (columnBboxArr[i].right + columnBboxArr[i + 1].left) / 2; + columnBboxArr[i].right = boundRight; + columnBboxArr[i + 1].left = boundRight; + } + } else { + columnBboxArr = [{ ...bbox }]; + } + + const dataTable = new LayoutDataTable(); + + columnBboxArr.forEach((columnBbox) => { + const layoutBox = new LayoutDataColumn(columnBbox, dataTable); + dataTable.boxes.push(layoutBox); + }); + + return dataTable; +}; diff --git a/js/utils/miscUtils.js b/js/utils/miscUtils.js index 72c5473..ed83136 100644 --- a/js/utils/miscUtils.js +++ b/js/utils/miscUtils.js @@ -18,6 +18,46 @@ export const calcBboxUnion = (bboxArr) => ({ bottom: Math.max(...bboxArr.map((x) => x.bottom)), }); +/** + * Returns the proportion of boxA's area contained in boxB + * @param {bbox} boxA + * @param {bbox} boxB + */ +export function calcBoxOverlap(boxA, boxB) { + const left = Math.max(boxA.left, boxB.left); + const top = Math.max(boxA.top, boxB.top); + const right = Math.min(boxA.right, boxB.right); + const bottom = Math.min(boxA.bottom, boxB.bottom); + + const width = right - left; + const height = bottom - top; + + if (width < 0 || height < 0) return 0; + + const areaA = (boxA.bottom - boxA.top) * (boxA.right - boxA.left); + const area = width * height; + + return area / areaA; +} + +/** + * Returns the proportion of boxA's width contained in boxB + * @param {bbox} boxA + * @param {bbox} boxB + */ +export function calcHorizontalOverlap(boxA, boxB) { + const left = Math.max(boxA.left, boxB.left); + const right = Math.min(boxA.right, boxB.right); + + const widthOverlap = right - left; + + if (widthOverlap < 0) return 0; + + const widthA = boxA.right - boxA.left; + + return widthOverlap / widthA; +} + /** * Generates a random integer. * diff --git a/js/utils/ocrUtils.js b/js/utils/ocrUtils.js index b199690..d3385be 100644 --- a/js/utils/ocrUtils.js +++ b/js/utils/ocrUtils.js @@ -89,3 +89,36 @@ export const checkOcrWordsAdjacent = (words) => { const lastIndex = lineWords.findIndex((x) => x.id === sortedWords[sortedWords.length - 1].id); return lastIndex - firstIndex === sortedWords.length - 1; }; + +/** + * + * @param {OcrLine} line + */ +export const splitLineAgressively = (line) => { + /** @type {Array} */ + const linesOut = []; + const lineHeight = line.bbox.bottom - line.bbox.top; + let wordPrev = line.words[0]; + let lineCurrent = ocr.cloneLine(line); + lineCurrent.words = [line.words[0]]; + for (let i = 1; i < line.words.length; i++) { + const word = ocr.cloneWord(line.words[i]); + if (word.bbox.left - wordPrev.bbox.right > lineHeight) { + linesOut.push(lineCurrent); + lineCurrent = ocr.cloneLine(line); + word.line = lineCurrent; + lineCurrent.words = [word]; + } else { + word.line = lineCurrent; + lineCurrent.words.push(word); + } + wordPrev = word; + } + linesOut.push(lineCurrent); + + linesOut.forEach((x) => { + ocr.updateLineBbox(x); + }); + + return linesOut; +}; diff --git a/mupdf/libmupdf.wasm b/mupdf/libmupdf.wasm index 90219d9..75eb50b 100755 Binary files a/mupdf/libmupdf.wasm and b/mupdf/libmupdf.wasm differ diff --git a/scribe.js b/scribe.js index e11806a..152af00 100644 --- a/scribe.js +++ b/scribe.js @@ -20,7 +20,10 @@ import { extractSingleTableContent } from './js/extractTables.js'; import { enableFontOpt, loadBuiltInFontsRaw } from './js/fontContainerMain.js'; import { gs } from './js/generalWorkerMain.js'; import { importFiles, importFilesSupp } from './js/import/import.js'; -import { calcBoxOverlap, combineOCRPage } from './js/modifyOCR.js'; +import { combineOCRPage } from './js/modifyOCR.js'; +import { + calcBoxOverlap, countSubstringOccurrences, getRandomAlphanum, replaceSmartQuotes, +} from './js/utils/miscUtils.js'; import layout, { calcTableBbox } from './js/objects/layoutObjects.js'; import ocr from './js/objects/ocrObjects.js'; import { @@ -32,13 +35,12 @@ import { } from './js/recognizeConvert.js'; import { calcWordMetrics } from './js/utils/fontUtils.js'; import { getImageBitmap, imageStrToBlob } from './js/utils/imageUtils.js'; -import { countSubstringOccurrences, getRandomAlphanum, replaceSmartQuotes } from './js/utils/miscUtils.js'; import { calcConf, checkOcrWordsAdjacent, mergeOcrWords, splitOcrWord, } from './js/utils/ocrUtils.js'; import { assignParagraphs } from './js/utils/reflowPars.js'; import { writeXlsx } from './js/export/writeTabular.js'; -import { calcColumnBounds } from './js/utils/detectTables.js'; +import { calcColumnBounds, detectTablesInPage, makeTableFromBbox } from './js/utils/detectTables.js'; /** * Initialize the program and optionally pre-load resources. @@ -204,6 +206,10 @@ class utils { static extractSingleTableContent = extractSingleTableContent; + static detectTablesInPage = detectTablesInPage; + + static makeTableFromBbox = makeTableFromBbox; + // Font utils static calcWordMetrics = calcWordMetrics; diff --git a/tests/module/importPdfText.spec.js b/tests/module/importPdfText.spec.js index 9f6c50b..0a5c27b 100644 --- a/tests/module/importPdfText.spec.js +++ b/tests/module/importPdfText.spec.js @@ -100,8 +100,8 @@ describe('Check superscripts are detected in PDF imports.', function () { // Third document it('Should correctly import leading superscripts printed using font size adjustments (3rd doc)', async () => { - assert.strictEqual(scribe.data.ocr.active[2].lines[24].words[0].sup, true); - assert.strictEqual(scribe.data.ocr.active[2].lines[24].words[0].text, '2'); + assert.strictEqual(scribe.data.ocr.active[2].lines[22].words[4].sup, true); + assert.strictEqual(scribe.data.ocr.active[2].lines[22].words[4].text, '2'); }).timeout(10000); it('Should correctly parse font size for lines with superscripts (3rd doc)', async () => { @@ -150,8 +150,8 @@ describe('Check superscripts are detected in PDF imports.', function () { assert.strictEqual(scribe.data.ocr.active[0].lines[96].words[0].sup, true); assert.strictEqual(scribe.data.ocr.active[0].lines[96].words[0].text, '(1)'); - assert.strictEqual(scribe.data.ocr.active[0].lines[104].words[0].sup, true); - assert.strictEqual(scribe.data.ocr.active[0].lines[104].words[0].text, '(3)'); + assert.strictEqual(scribe.data.ocr.active[0].lines[103].words[0].sup, true); + assert.strictEqual(scribe.data.ocr.active[0].lines[103].words[0].text, '(3)'); }).timeout(10000); it('Should correctly parse font size for lines with superscripts (addtl doc 2)', async () => { @@ -223,6 +223,20 @@ describe('Check that PDF imports split lines correctly.', function () { assert.strictEqual(scribe.data.ocr.active[0].lines[3].words[0].text, 'Apprehensions'); }).timeout(10000); + it('Should correctly parse PDF lines (2nd doc)', async () => { + await scribe.importFiles([`${ASSETS_PATH_KARMA}/superscript_examples.pdf`], { extractPDFTextNative: true, extractPDFTextOCR: true }); + + // A previous version of the build split this line into 9 separate lines. + assert.strictEqual(scribe.data.ocr.active[2].lines[58].words.map((x) => x.text).join(' '), 'ment’s (DOE’s) issuance of Accounting and Auditing Enforcement Releases'); + }).timeout(10000); + + it('Should correctly parse PDF lines (3rd doc)', async () => { + await scribe.importFiles([`${ASSETS_PATH_KARMA}/superscript_example_report1.pdf`], { extractPDFTextNative: true, extractPDFTextOCR: true }); + + // A previous version of the build split this line into 2 separate lines, by putting the leading superscript on a separate line. + assert.strictEqual(scribe.data.ocr.active[0].lines[99].words.map((x) => x.text).join(' '), '(2) Beginning with the year ended December 31, 2023, the Company changed the presentation of interest income on forgivable loans on our Consolidated Statement of'); + }).timeout(10000); + after(async () => { await scribe.terminate(); });