From 87fa04fe33fe049e32e0b6b58f4f0709593a9bf8 Mon Sep 17 00:00:00 2001 From: Balearica Date: Wed, 30 Oct 2024 19:47:20 -0700 Subject: [PATCH] Updated table detection code --- js/extractTables.js | 2 +- js/import/convertPageHocr.js | 4 +- js/import/convertPageStext.js | 26 +++- js/modifyOCR.js | 24 +--- js/utils/detectTables.js | 225 +++++++++++++++++++++------------- js/utils/miscUtils.js | 40 ++++++ js/utils/ocrUtils.js | 33 +++++ mupdf/libmupdf.wasm | Bin 4636731 -> 4636780 bytes scribe.js | 6 +- 9 files changed, 245 insertions(+), 115 deletions(-) diff --git a/js/extractTables.js b/js/extractTables.js index 1a39ba9..bc6f1db 100644 --- a/js/extractTables.js +++ b/js/extractTables.js @@ -1,5 +1,5 @@ -import { calcBoxOverlap } from './modifyOCR.js'; import ocr from './objects/ocrObjects.js'; +import { calcBoxOverlap } from './utils/miscUtils.js'; /** * diff --git a/js/import/convertPageHocr.js b/js/import/convertPageHocr.js index 9c70f60..26790af 100644 --- a/js/import/convertPageHocr.js +++ b/js/import/convertPageHocr.js @@ -375,7 +375,7 @@ export async function convertPageHocr({ pass2(pageObj, rotateAngle); const langSet = pass3(pageObj); - const autoDetectTables = true; + const autoDetectTables = false; const dataTablePage = new LayoutDataTablePage(n); if (autoDetectTables) { const tableBboxes = detectTablesInPage(pageObj); @@ -384,7 +384,7 @@ export async function convertPageHocr({ dataTable.page = dataTablePage; dataTablePage.tables.push(dataTable); }); - } + } return { pageObj, dataTables: dataTablePage, warn, langSet, diff --git a/js/import/convertPageStext.js b/js/import/convertPageStext.js index 14a74bd..d390f49 100644 --- a/js/import/convertPageStext.js +++ b/js/import/convertPageStext.js @@ -2,6 +2,7 @@ import ocr from '../objects/ocrObjects.js'; import { calcBboxUnion, + calcBoxOverlap, calcLang, mean50, quantile, @@ -11,6 +12,7 @@ import { import { LayoutDataTablePage } from '../objects/layoutObjects.js'; import { detectTablesInPage, makeTableFromBbox } from '../utils/detectTables.js'; +import { splitLineAgressively } from '../utils/ocrUtils.js'; /** * @param {Object} params @@ -411,6 +413,10 @@ export async function convertPageStext({ ocrStr, n }) { // If there are no letters in the line, drop the entire line element if (lettersKept === 0) return; + // Recalculate the bounding box. + // The bounding boxes reported by mupdf are often significantly larger than the actual text. + ocr.updateLineBbox(lineObj); + pageObj.lines.push(lineObj); parLineArr.push(lineObj); // eslint-disable-next-line consistent-return @@ -450,16 +456,32 @@ export async function convertPageStext({ ocrStr, n }) { pageObj.angle = angleOut; - const autoDetectTables = true; + const autoDetectTables = false; const dataTablePage = new LayoutDataTablePage(n); if (autoDetectTables) { const tableBboxes = detectTablesInPage(pageObj); + + for (let i = 0; i < pageObj.lines.length; i++) { + const line = pageObj.lines[i]; + let inTable = false; + for (let j = 0; j < tableBboxes.length; j++) { + if (calcBoxOverlap(line.bbox, tableBboxes[j]) > 0.25) { + inTable = true; + break; + } + } + if (inTable) { + const newLines = splitLineAgressively(line); + pageObj.lines.splice(i, 1, ...newLines); + } + } + tableBboxes.forEach((bbox) => { const dataTable = makeTableFromBbox(pageObj, bbox); dataTable.page = dataTablePage; dataTablePage.tables.push(dataTable); }); - } + } return { pageObj, dataTables: dataTablePage, langSet }; } diff --git a/js/modifyOCR.js b/js/modifyOCR.js index 406e2a7..f99937c 100644 --- a/js/modifyOCR.js +++ b/js/modifyOCR.js @@ -1,27 +1,5 @@ import ocr from './objects/ocrObjects.js'; -import { getRandomAlphanum } from './utils/miscUtils.js'; - -/** - * Returns the proportion of boxA's area contained in boxB - * @param {bbox} boxA - * @param {bbox} boxB - */ -export function calcBoxOverlap(boxA, boxB) { - const left = Math.max(boxA.left, boxB.left); - const top = Math.max(boxA.top, boxB.top); - const right = Math.min(boxA.right, boxB.right); - const bottom = Math.min(boxA.bottom, boxB.bottom); - - const width = right - left; - const height = bottom - top; - - if (width < 0 || height < 0) return 0; - - const areaA = (boxA.bottom - boxA.top) * (boxA.right - boxA.left); - const area = width * height; - - return area / areaA; -} +import { calcBoxOverlap, getRandomAlphanum } from './utils/miscUtils.js'; /** * Adds lines from a new page to an existing page. diff --git a/js/utils/detectTables.js b/js/utils/detectTables.js index f1b735b..02f659d 100644 --- a/js/utils/detectTables.js +++ b/js/utils/detectTables.js @@ -1,7 +1,9 @@ -import { calcBoxOverlap } from "../modifyOCR.js"; -import { LayoutDataColumn, LayoutDataTable } from "../objects/layoutObjects.js"; -import ocr from "../objects/ocrObjects.js"; -import { calcBboxUnion, mean50 } from "./miscUtils.js"; +import { LayoutDataColumn, LayoutDataTable } from '../objects/layoutObjects.js'; +import ocr from '../objects/ocrObjects.js'; +import { + calcBboxUnion, calcBoxOverlap, calcHorizontalOverlap, mean50, +} from './miscUtils.js'; +import { splitLineAgressively } from './ocrUtils.js'; /** * @@ -42,7 +44,6 @@ export function calcColumnBounds(boundingBoxes) { } }); - // Expand column bounds so there is no empty space between columns. for (let i = 0; i < columnBounds.length - 1; i++) { const boundRight = (columnBounds[i].right + columnBounds[i + 1].left) / 2; @@ -59,22 +60,21 @@ export function calcColumnBounds(boundingBoxes) { * @param {OcrPage} ocrPage - OcrPage object containing OcrLine objects. */ export function detectTablesInPage(ocrPage) { - const lines = ocr.clonePage(ocrPage).lines; // Sort lines by the top position of their bounding boxes lines.sort((a, b) => a.bbox.top - b.bbox.top); - /**@type {Array<{avgTop: number, items: Array}>} */ + /** @type {Array<{avgTop: number, items: Array}>} */ const rows = []; // TODO: Make this dynamic so it adjusts based on font size. const rowThreshold = 10; // Threshold for vertical alignment // Group lines into rows based on vertical proximity - lines.forEach(item => { + lines.forEach((item) => { let addedToRow = false; - for (let row of rows) { + for (const row of rows) { // Check if the line is vertically aligned with the row if (Math.abs(item.bbox.top - row.avgTop) <= rowThreshold) { row.items.push(item); @@ -92,36 +92,79 @@ export function detectTablesInPage(ocrPage) { }); // Sort the lines within each row by their left position - rows.forEach(row => { + rows.forEach((row) => { row.items.sort((a, b) => a.bbox.left - b.bbox.left); }); /** - * - * @param {{avgTop: number, items: Array}} row + * + * @param {{avgTop: number, items: Array}} row */ - const isTableRow = (row) => { - if (row.items.length < 4) return false; - let fewWordsN = 0; - let majorityNumbersN = 0; + const containsNumbers = (row) => { + let wordsNumN = 0; row.items.forEach((line) => { - const totalN = line.words.map((word) => word.text.length).reduce((a, b) => a + b, 0); - const digitN = line.words.map((word) => word.text.split('').filter((char) => /[0-9\W]/.test(char)).length).reduce((a, b) => a + b, 0); - - if (line.words.length <= 2) fewWordsN++; - // if (digitN / totalN > 0.5) majorityNumbersN++; - if (digitN > 0) majorityNumbersN++; + line.words.forEach((word) => { + if (/[0-9]/.test(word.text)) wordsNumN++; + }); }); - if (fewWordsN < row.items.length * 0.75) return false; - if (majorityNumbersN < row.items.length * 0.75) return false; + if (wordsNumN < 4) return false; return true; - } + }; + + /** + * + * @param {{avgTop: number, items: Array}} row + */ + const splitRowLinesAgressively = (row) => { + const row2 = { avgTop: row.avgTop, items: /** @type {Array} */ ([]) }; + row.items.forEach((line) => { + row2.items.push(...splitLineAgressively(line)); + }); + return row2; + }; + + /** + * + * @param {Array} linesA + * @param {Array} linesB + */ + const hasWordOverlap = (linesA, linesB) => { + for (let i = 0; i < linesA.length; i++) { + const lineI = linesA[i]; + const lineJOverlapArr = []; + for (let j = 0; j < linesB.length; j++) { + const lineJ = linesB[j]; + if (lineI.bbox.right < lineJ.bbox.left) break; + if (calcHorizontalOverlap(lineI.bbox, lineJ.bbox) > 0) { + lineJOverlapArr.push(lineJ); + } + } + if (lineJOverlapArr.length > 1) { + const wordsI = lineI.words; + const wordsJ = lineJOverlapArr.map((line) => line.words).flat(); + + for (const wordI of wordsI) { + let overlapCount = 0; + + for (const wordJ of wordsJ) { + if (calcHorizontalOverlap(wordI.bbox, wordJ.bbox) > 0) { + overlapCount++; + if (overlapCount >= 2) { + return true; + } + } + } + } + } + } + return false; + }; /** - * + * * @param {Array<{avgTop: number, items: Array}>} tableRows - * @param {{avgTop: number, items: Array}} row + * @param {{avgTop: number, items: Array}} row */ const isCompat = (tableRows, row) => { if (!tableRows || tableRows.length === 0) return false; @@ -129,87 +172,101 @@ export function detectTablesInPage(ocrPage) { const expectedColumns = mean50(tableRows.map((x) => x.items.length)); // const lastRow = tableRows[tableRows.length - 1]; + + const existingLines = tableRows.map((x) => x.items).flat(); + if (Math.abs(expectedColumns - row.items.length) <= 1) { return true; } - return false; - } + + if (globalThis.testControl) return false; + + if (hasWordOverlap(existingLines, row.items) || hasWordOverlap(row.items, existingLines)) { + return false; + } + + return true; + }; const minRows = 4; // Minimum number of rows to consider a table - /**@type {Array}>>} */ + /** @type {Array}>>} */ const tables = []; - /**@type {Array<{avgTop: number, items: Array}>} */ + /** @type {Array<{avgTop: number, items: Array}>} */ let currentTable = []; - /**@type {Array<{avgTop: number, items: Array}>} */ + /** @type {Array<{avgTop: number, items: Array}>} */ let currentTableCompat = []; - /**@type {?{avgTop: number, items: Array}} */ - let headerRow = null; + let currentTableStartIndex = 0; - // Detect tables by finding consecutive rows with similar numbers of items - for (let i = 0; i < rows.length; i++) { - const row = rows[i]; - - if (isTableRow(row)) { + const rowsSplit = rows.map((row) => splitRowLinesAgressively(row)); + // Detect tables by finding consecutive rows with similar numbers of items + for (let i = 0; i < rowsSplit.length; i++) { + const rowSplit = rowsSplit[i]; + // const rowSplit = rows[i]; + // let rowSplit = rowsSplit[i]; + // let rowSplit; + + if (containsNumbers(rowsSplit[i])) { + // rowSplit = splitLinesAgressively(row); if (currentTable.length > 0) { - - // const prevRow = currentTable[currentTable.length - 1]; - if (isCompat(currentTableCompat, row)) { + if (isCompat(currentTableCompat, rowSplit)) { // Continue the current table - currentTable.push(row); - currentTableCompat.push(row); - } else { + currentTable.push(rowSplit); + currentTableCompat.push(rowSplit); + } else if (currentTable.length >= minRows) { // TODO: Handle case where the the header row is a table row but is not compatible // with the rows that come afterwards, which puts us in this block. // End the current table and start a new one - if (currentTable.length >= minRows) { - if (headerRow && Math.abs(headerRow.items.length - currentTable[0].items.length) <= 1) { - tables.push([headerRow, ...currentTable]); - } else { - tables.push(currentTable); + const headerRows = []; + if (rowsSplit[currentTableStartIndex - 1] && (tables.length === 0 || !tables[tables.length - 1].includes(rowsSplit[currentTableStartIndex - 1])) + && isCompat(currentTableCompat, rowsSplit[currentTableStartIndex - 1])) { + headerRows.push(rowsSplit[currentTableStartIndex - 1]); + if (rowsSplit[currentTableStartIndex - 2] && (tables.length === 0 || !tables[tables.length - 1].includes(rowsSplit[currentTableStartIndex - 2])) + && isCompat(currentTableCompat, rowsSplit[currentTableStartIndex - 2])) { + headerRows.push(rowsSplit[currentTableStartIndex - 2]); } - - } else if (currentTable.length === 1) { - headerRow = currentTable[0]; - currentTable = [row]; - - - } else { - currentTable = [row]; - headerRow = null; - } + tables.push([...headerRows, ...currentTable]); + + currentTable = [rowSplit]; + currentTableCompat = [rowSplit]; + currentTableStartIndex = i; + } else { + currentTable = [rowSplit]; + currentTableCompat = [rowSplit]; + currentTableStartIndex = i; } } else { - currentTable.push(row); - currentTableCompat.push(row); + currentTable.push(rowSplit); + currentTableCompat.push(rowSplit); + currentTableStartIndex = i; } - - - } else { - + } else if (currentTable.length > 0) { // If the current row does not pass the checks, but the next two rows do, it is still included. - const nextRow = rows[i + 1]; - const nextRow2 = rows[i + 2]; - if (nextRow && nextRow2 && isTableRow(nextRow) && isTableRow(nextRow2) - && isCompat(currentTableCompat, nextRow) && isCompat(currentTableCompat, nextRow2)) { - currentTable.push(row); + const nextRowSplit = rowsSplit[i + 1]; + const nextRowSplit2 = rowsSplit[i + 2]; + if (nextRowSplit && nextRowSplit2 && containsNumbers(nextRowSplit) && containsNumbers(nextRowSplit2) + && isCompat(currentTableCompat, nextRowSplit) && isCompat(currentTableCompat, nextRowSplit2)) { + currentTable.push(rowSplit); continue; } - - // Not a table row if (currentTable.length >= minRows) { - if (headerRow && Math.abs(headerRow.items.length - currentTable[0].items.length) <= 1) { - tables.push([headerRow, ...currentTable]); - } else { - tables.push(currentTable); + const headerRows = []; + if (rowsSplit[currentTableStartIndex - 1] && (tables.length === 0 || !tables[tables.length - 1].includes(rowsSplit[currentTableStartIndex - 1])) + && isCompat(currentTableCompat, rowsSplit[currentTableStartIndex - 1])) { + headerRows.push(rowsSplit[currentTableStartIndex - 1]); + if (rowsSplit[currentTableStartIndex - 2] && (tables.length === 0 || !tables[tables.length - 1].includes(rowsSplit[currentTableStartIndex - 2])) + && isCompat(currentTableCompat, rowsSplit[currentTableStartIndex - 2])) { + headerRows.push(rowsSplit[currentTableStartIndex - 2]); + } } - + tables.push([...headerRows, ...currentTable]); } + currentTable = []; - headerRow = row; + currentTableCompat = []; } } @@ -223,11 +280,10 @@ export function detectTablesInPage(ocrPage) { return tableLineBboxes; } - /** - * - * @param {OcrPage} page - * @param {bbox} bbox + * + * @param {OcrPage} page + * @param {bbox} bbox */ export const makeTableFromBbox = (page, bbox) => { const lines = page.lines.filter((line) => calcBoxOverlap(line.bbox, bbox) > 0.5); @@ -263,5 +319,4 @@ export const makeTableFromBbox = (page, bbox) => { }); return dataTable; -} - +}; diff --git a/js/utils/miscUtils.js b/js/utils/miscUtils.js index 72c5473..ed83136 100644 --- a/js/utils/miscUtils.js +++ b/js/utils/miscUtils.js @@ -18,6 +18,46 @@ export const calcBboxUnion = (bboxArr) => ({ bottom: Math.max(...bboxArr.map((x) => x.bottom)), }); +/** + * Returns the proportion of boxA's area contained in boxB + * @param {bbox} boxA + * @param {bbox} boxB + */ +export function calcBoxOverlap(boxA, boxB) { + const left = Math.max(boxA.left, boxB.left); + const top = Math.max(boxA.top, boxB.top); + const right = Math.min(boxA.right, boxB.right); + const bottom = Math.min(boxA.bottom, boxB.bottom); + + const width = right - left; + const height = bottom - top; + + if (width < 0 || height < 0) return 0; + + const areaA = (boxA.bottom - boxA.top) * (boxA.right - boxA.left); + const area = width * height; + + return area / areaA; +} + +/** + * Returns the proportion of boxA's width contained in boxB + * @param {bbox} boxA + * @param {bbox} boxB + */ +export function calcHorizontalOverlap(boxA, boxB) { + const left = Math.max(boxA.left, boxB.left); + const right = Math.min(boxA.right, boxB.right); + + const widthOverlap = right - left; + + if (widthOverlap < 0) return 0; + + const widthA = boxA.right - boxA.left; + + return widthOverlap / widthA; +} + /** * Generates a random integer. * diff --git a/js/utils/ocrUtils.js b/js/utils/ocrUtils.js index b199690..d3385be 100644 --- a/js/utils/ocrUtils.js +++ b/js/utils/ocrUtils.js @@ -89,3 +89,36 @@ export const checkOcrWordsAdjacent = (words) => { const lastIndex = lineWords.findIndex((x) => x.id === sortedWords[sortedWords.length - 1].id); return lastIndex - firstIndex === sortedWords.length - 1; }; + +/** + * + * @param {OcrLine} line + */ +export const splitLineAgressively = (line) => { + /** @type {Array} */ + const linesOut = []; + const lineHeight = line.bbox.bottom - line.bbox.top; + let wordPrev = line.words[0]; + let lineCurrent = ocr.cloneLine(line); + lineCurrent.words = [line.words[0]]; + for (let i = 1; i < line.words.length; i++) { + const word = ocr.cloneWord(line.words[i]); + if (word.bbox.left - wordPrev.bbox.right > lineHeight) { + linesOut.push(lineCurrent); + lineCurrent = ocr.cloneLine(line); + word.line = lineCurrent; + lineCurrent.words = [word]; + } else { + word.line = lineCurrent; + lineCurrent.words.push(word); + } + wordPrev = word; + } + linesOut.push(lineCurrent); + + linesOut.forEach((x) => { + ocr.updateLineBbox(x); + }); + + return linesOut; +}; diff --git a/mupdf/libmupdf.wasm b/mupdf/libmupdf.wasm index 90219d9f20888a76351911e36553d6c0a58fc813..75eb50b780e8ef22a5e1e0ba2e7421bd359b923f 100755 GIT binary patch delta 404 zcmXxaOHKko6op}`yR=m(Tfn!XMT?5>SF7c_TP)mxiDhKy0$2fw0~X;XoVAGqx8Wij zt3&@lJjs`w+(DB{zm8<8n=03h zox&(WjG>4U${5E4CNYI+%wQICm`4Q*ScFCu5tdNHGFGsPI@Yj`4Qyfy+t|S__OOow g#5lwej$v?uQ=H)(7r4X~5~R3B12?c)?3%&TABqWZ<^TWy delta 355 zcmW;HNlpSm0EJHQ@>z zf<0S*dC8Z&?LYtgvVQUXs>Nhv^w?+t3t>pCU==pju#OFEA`%$w{A9&6Nc5dWxT50S z5PSER$MfF&kAhUTV<`zyvSz}S9o3WGcwihvv4t465k~?$NMaW$>|q~i93X=%a>zrW z02hZi!ZC_CK?!A4aEdC