Skip to content

Commit

Permalink
Updated mupdf to improve line detection; added WIP table detection
Browse files Browse the repository at this point in the history
  • Loading branch information
Balearica authored Oct 31, 2024
2 parents 0f16bb5 + 38b369d commit 850152f
Show file tree
Hide file tree
Showing 10 changed files with 431 additions and 33 deletions.
2 changes: 1 addition & 1 deletion js/extractTables.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { calcBoxOverlap } from './modifyOCR.js';
import ocr from './objects/ocrObjects.js';
import { calcBoxOverlap } from './utils/miscUtils.js';

/**
*
Expand Down
14 changes: 13 additions & 1 deletion js/import/convertPageHocr.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import {

import { LayoutDataTablePage } from '../objects/layoutObjects.js';
import { pass2, pass3 } from './convertPageShared.js';
import { detectTablesInPage, makeTableFromBbox } from '../utils/detectTables.js';

// If enabled, raw strings are saved in OCR objects for debugging purposes.
const debugMode = true;
Expand Down Expand Up @@ -374,7 +375,18 @@ export async function convertPageHocr({
pass2(pageObj, rotateAngle);
const langSet = pass3(pageObj);

const autoDetectTables = false;
const dataTablePage = new LayoutDataTablePage(n);
if (autoDetectTables) {
const tableBboxes = detectTablesInPage(pageObj);
tableBboxes.forEach((bbox) => {
const dataTable = makeTableFromBbox(pageObj, bbox);
dataTable.page = dataTablePage;
dataTablePage.tables.push(dataTable);
});
}

return {
pageObj, dataTables: new LayoutDataTablePage(n), warn, langSet,
pageObj, dataTables: dataTablePage, warn, langSet,
};
}
36 changes: 35 additions & 1 deletion js/import/convertPageStext.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import ocr from '../objects/ocrObjects.js';

import {
calcBboxUnion,
calcBoxOverlap,
calcLang,
mean50,
quantile,
Expand All @@ -10,6 +11,8 @@ import {
} from '../utils/miscUtils.js';

import { LayoutDataTablePage } from '../objects/layoutObjects.js';
import { detectTablesInPage, makeTableFromBbox } from '../utils/detectTables.js';
import { splitLineAgressively } from '../utils/ocrUtils.js';

/**
* @param {Object} params
Expand Down Expand Up @@ -410,6 +413,10 @@ export async function convertPageStext({ ocrStr, n }) {
// If there are no letters in the line, drop the entire line element
if (lettersKept === 0) return;

// Recalculate the bounding box.
// The bounding boxes reported by mupdf are often significantly larger than the actual text.
ocr.updateLineBbox(lineObj);

pageObj.lines.push(lineObj);
parLineArr.push(lineObj);
// eslint-disable-next-line consistent-return
Expand Down Expand Up @@ -449,5 +456,32 @@ export async function convertPageStext({ ocrStr, n }) {

pageObj.angle = angleOut;

return { pageObj, dataTables: new LayoutDataTablePage(n), langSet };
const autoDetectTables = false;
const dataTablePage = new LayoutDataTablePage(n);
if (autoDetectTables) {
const tableBboxes = detectTablesInPage(pageObj);

for (let i = 0; i < pageObj.lines.length; i++) {
const line = pageObj.lines[i];
let inTable = false;
for (let j = 0; j < tableBboxes.length; j++) {
if (calcBoxOverlap(line.bbox, tableBboxes[j]) > 0.25) {
inTable = true;
break;
}
}
if (inTable) {
const newLines = splitLineAgressively(line);
pageObj.lines.splice(i, 1, ...newLines);
}
}

tableBboxes.forEach((bbox) => {
const dataTable = makeTableFromBbox(pageObj, bbox);
dataTable.page = dataTablePage;
dataTablePage.tables.push(dataTable);
});
}

return { pageObj, dataTables: dataTablePage, langSet };
}
24 changes: 1 addition & 23 deletions js/modifyOCR.js
Original file line number Diff line number Diff line change
@@ -1,27 +1,5 @@
import ocr from './objects/ocrObjects.js';
import { getRandomAlphanum } from './utils/miscUtils.js';

/**
 * Returns the proportion of boxA's area contained in boxB.
 *
 * The measure is asymmetric: the intersection area is divided by the area of
 * `boxA` only, so swapping the arguments generally gives a different result.
 *
 * @param {bbox} boxA - Box whose covered fraction is measured (reads `left`, `top`, `right`, `bottom`).
 * @param {bbox} boxB - Box that `boxA` is tested against (reads `left`, `top`, `right`, `bottom`).
 * @returns {number} Intersection area divided by the area of `boxA`.
 *    Returns 0 when the boxes do not intersect, and 0 when `boxA` has zero area.
 */
export function calcBoxOverlap(boxA, boxB) {
  // Edges of the intersection rectangle.
  const left = Math.max(boxA.left, boxB.left);
  const top = Math.max(boxA.top, boxB.top);
  const right = Math.min(boxA.right, boxB.right);
  const bottom = Math.min(boxA.bottom, boxB.bottom);

  const width = right - left;
  const height = bottom - top;

  // A negative extent on either axis means the boxes are disjoint.
  if (width < 0 || height < 0) return 0;

  const areaA = (boxA.bottom - boxA.top) * (boxA.right - boxA.left);

  // A degenerate boxA (zero width or height) would otherwise produce 0 / 0 = NaN.
  if (areaA === 0) return 0;

  return (width * height) / areaA;
}
import { calcBoxOverlap, getRandomAlphanum } from './utils/miscUtils.js';

/**
* Adds lines from a new page to an existing page.
Expand Down
Loading

0 comments on commit 850152f

Please sign in to comment.