Skip to content

Commit

Permalink
Updated HOCR parsing and added new tests
Browse files Browse the repository at this point in the history
  • Loading branch information
Balearica committed Nov 24, 2024
1 parent a5d7a02 commit 11a718b
Show file tree
Hide file tree
Showing 4 changed files with 19,639 additions and 0 deletions.
12 changes: 12 additions & 0 deletions js/import/convertPageHocr.js
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,18 @@ export async function convertPageHocr({
// Rewrite `class='ocr_textfloat` and `class='ocr_header` to `class='ocr_line` (case-insensitive),
// keeping the captured `class='` prefix via `$1`.
ocrStr = ocrStr
  .replace(/(class=')ocr_textfloat/ig, '$1ocr_line')
  .replace(/(class=')ocr_header/ig, '$1ocr_line');

// Delete additional elements created by the `lstm_choice_mode` option in Tesseract.
// Both markers are checked up front, before either removal pass modifies `ocrStr`.
const lstmChoiceMode1 = /id='timestep1/.test(ocrStr);
const lstmChoiceMode2 = /id='lstm_choices/.test(ocrStr);

if (lstmChoiceMode1) {
  // Lazily match from an opening `ocr_symbol` span through the first run of three
  // closing `</span>` tags (optional whitespace after each) and drop the whole match.
  ocrStr = ocrStr.replace(/<span class='ocr_symbol'[\s\S]+?(?:<\/span>\s*){3}/ig, '');
}

if (lstmChoiceMode2) {
  // Lazily match from an `ocrx_cinfo` span whose id starts with `lstm_choices` through
  // the first run of two closing `</span>` tags and drop the whole match.
  ocrStr = ocrStr.replace(/<span class='ocrx_cinfo' id='lstm_choices[\s\S]+?(?:<\/span>\s*){2}/ig, '');
}

/**
* @param {string} match
*/
Expand Down
Loading

0 comments on commit 11a718b

Please sign in to comment.