From 8d1cbc4ae1b4cb6609a4007eb0df5f4248101edd Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 27 Aug 2024 11:10:29 +0200 Subject: [PATCH 1/4] adapt to ocrd v3 --- ocrd_segment/config.py | 4 - ocrd_segment/evaluate.py | 96 +++-- ocrd_segment/extract_glyphs.py | 284 ++++++------- ocrd_segment/extract_lines.py | 425 ++++++++++--------- ocrd_segment/extract_pages.py | 491 ++++++++++++---------- ocrd_segment/extract_regions.py | 394 ++++++++--------- ocrd_segment/extract_words.py | 283 ++++++------- ocrd_segment/import_coco_segmentation.py | 290 ++++++------- ocrd_segment/import_image_segmentation.py | 243 +++++------ ocrd_segment/ocrd-tool.json | 111 ++--- ocrd_segment/project.py | 142 +++---- ocrd_segment/repair.py | 285 ++++++------- ocrd_segment/replace_original.py | 141 +++---- ocrd_segment/replace_page.py | 161 +++---- ocrd_segment/replace_text.py | 160 +++---- requirements.txt | 2 +- 16 files changed, 1642 insertions(+), 1870 deletions(-) delete mode 100644 ocrd_segment/config.py diff --git a/ocrd_segment/config.py b/ocrd_segment/config.py deleted file mode 100644 index 01e0b23..0000000 --- a/ocrd_segment/config.py +++ /dev/null @@ -1,4 +0,0 @@ -import json -from pkg_resources import resource_string - -OCRD_TOOL = json.loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8')) diff --git a/ocrd_segment/evaluate.py b/ocrd_segment/evaluate.py index 7df7ac7..a1cdac8 100644 --- a/ocrd_segment/evaluate.py +++ b/ocrd_segment/evaluate.py @@ -2,6 +2,7 @@ import sys import os +from typing import Optional import json from itertools import chain import click @@ -10,17 +11,15 @@ from PIL import Image from shapely.geometry import Polygon -from ocrd import Processor +from ocrd import Workspace, Processor from ocrd_utils import ( getLogger, initLogging, - assert_file_grp_cardinality, xywh_from_polygon, polygon_from_points, coordinates_of_segment, MIMETYPE_PAGE ) -from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import parse as parse_page 
from pycocotools.coco import COCO @@ -31,45 +30,37 @@ area as maskArea ) -from .config import OCRD_TOOL - -TOOL = 'ocrd-segment-evaluate' - class EvaluateSegmentation(Processor): - def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] - kwargs['version'] = OCRD_TOOL['version'] - super(EvaluateSegmentation, self).__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-segment-evaluate' - def process(self): + def process_workspace(self, workspace: Workspace) -> None: """Performs segmentation evaluation with pycocotools on the workspace. - + Open and deserialize PAGE files from the first and second input file group (the first as ground truth, the second as prediction). Then iterate over the element hierarchy down to ``level-of-operation``. Aggregate and convert all pages' segmentation (coordinates and classes) to COCO: + \b - On the region level, unless ``ignore-subtype``, differentiate segment classes by their `@type`, if applicable. - On the region level, unless ``for-categories`` is empty, select only segment classes in that (comma-separated) list. - If ``only-fg``, then use the foreground mask from the binarized image inside each segment for overlap calculations. - + Next, configure and run COCOEval for comparison of all pages. Show the matching pairs (GT segment ID, prediction segment ID, IoU) for every overlap on each page. Also, calculate per-class precision and recall (at the point of maximum recall). Finally, get the typical summary mean average precision / recall (but without restriction on the number of segments). - + Write a JSON report to the output file group. """ - LOG = getLogger('processor.EvaluateSegmentation') - - assert_file_grp_cardinality(self.output_file_grp, 1) - assert_file_grp_cardinality(self.input_file_grp, 2, 'GT and evaluation data') # region or line level? 
level = self.parameter['level-of-operation'] onlyfg = self.parameter['only-fg'] @@ -77,10 +68,13 @@ def process(self): selected = self.parameter['for-categories'] if selected: selected = selected.split(',') + self.workspace = workspace + self.verify() + # FIXME: add configurable error handling as in super().process_workspace() # get input file groups ifgs = self.input_file_grp.split(",") # get input file tuples - ifts = self.zip_input_files(mimetype=MIMETYPE_PAGE) + ifts = self.zip_input_files(mimetype=MIMETYPE_PAGE, require_first=False) # convert to 2 COCO datasets from all page pairs categories = ["bg"] # needed by cocoeval images = [] @@ -89,14 +83,18 @@ def process(self): for ift in ifts: file_gt, file_dt = ift if not file_gt: - LOG.warning("skipping page %s missing from GT", file_gt.pageId) + self.logger.warning("skipping page %s missing from GT", file_gt.pageId) continue if not file_dt: - LOG.warning("skipping page %s missing from prediction", file_gt.pageId) + self.logger.warning("skipping page %s missing from prediction", file_gt.pageId) continue - LOG.info("processing page %s", file_gt.pageId) - pcgts_gt = page_from_file(self.workspace.download_file(file_gt)) - pcgts_dt = page_from_file(self.workspace.download_file(file_dt)) + self.logger.info("processing page %s", file_gt.pageId) + if self.download: + file_gt = self.workspace.download_file(file_gt) + file_dt = self.workspace.download_file(file_dt) + with pushd_popd(self.workspace.directory): + pcgts_gt = page_from_file(file_gt) + pcgts_dt = page_from_file(file_dt) page_gt = pcgts_gt.get_Page() page_dt = pcgts_dt.get_Page() if onlyfg: @@ -115,11 +113,13 @@ def process(self): _add_annotations(annotations_gt, page_gt, imgid, categories, level=level, typed=typed, coords=page_coords if onlyfg else None, - mask=page_mask if onlyfg else None) + mask=page_mask if onlyfg else None, + log=self.logger) _add_annotations(annotations_dt, page_dt, imgid, categories, level=level, typed=typed, coords=page_coords if onlyfg 
else None, - mask=page_mask if onlyfg else None) + mask=page_mask if onlyfg else None, + log=self.logger) if level == 'line': categories.append('textline') @@ -130,17 +130,17 @@ def process(self): _add_ids(annotations_gt, 1) # cocoeval expects annotation IDs starting at 1 _add_ids(annotations_dt, 1) # cocoeval expects annotation IDs starting at 1 - LOG.info(f"found {len(annotations_gt)} GT / {len(annotations_dt)} DT segments" - f" in {len(categories) - 1} categories for {len(images)} images") + self.logger.info(f"found {len(annotations_gt)} GT / {len(annotations_dt)} DT segments" + f" in {len(categories) - 1} categories for {len(images)} images") coco_gt = _create_coco(categories, images, annotations_gt) coco_dt = _create_coco(categories, images, annotations_dt) - stats = evaluate_coco(coco_gt, coco_dt, self.parameter, selected) + stats = evaluate_coco(coco_gt, coco_dt, self.parameter, selected, log=self.logger) # write regions to custom JSON for this page file_id = 'id' + self.output_file_grp + '_report' - self.workspace.add_file( + workspace.add_file( ID=file_id, file_grp=self.output_file_grp, pageId=None, @@ -203,6 +203,7 @@ def standalone_cli(gt_page_filelst, \b Write a JSON report to the output file group. 
""" + initLogging() assert (tabfile is None) == (gt_page_filelst is not None) == (dt_page_filelst is not None), \ "pass file lists either as tab-separated single file or as separate files" if tabfile is None: @@ -238,8 +239,7 @@ def standalone_cli(gt_page_filelst, # standalone entry point def evaluate_files(gt_files, dt_files, img_files=None, level='region', typed=True, selected=None): - initLogging() - LOG = getLogger('processor.EvaluateSegmentation') + log = getLogger('EvaluateSegmentation') categories = ["bg"] # needed by cocoeval images = [] annotations_gt = [] @@ -249,7 +249,7 @@ def evaluate_files(gt_files, dt_files, img_files=None, level='region', typed=Tru pcgts_gt = parse_page(gt_file) pcgts_dt = parse_page(dt_file) page_id = pcgts_gt.pcGtsId or gt_file - LOG.info("processing page %s", page_id) + log.info("processing page %s", page_id) page_gt = pcgts_gt.get_Page() page_dt = pcgts_dt.get_Page() if img_file: @@ -271,11 +271,13 @@ def evaluate_files(gt_files, dt_files, img_files=None, level='region', typed=Tru _add_annotations(annotations_gt, page_gt, imgid, categories, level=level, typed=typed, coords=page_coords if img_file else None, - mask=page_mask if img_file else None) + mask=page_mask if img_file else None, + log=log) _add_annotations(annotations_dt, page_dt, imgid, categories, level=level, typed=typed, coords=page_coords if img_file else None, - mask=page_mask if img_file else None) + mask=page_mask if img_file else None, + log=log) if level == 'line': categories.append('textline') @@ -286,7 +288,7 @@ def evaluate_files(gt_files, dt_files, img_files=None, level='region', typed=Tru _add_ids(annotations_gt, 1) # cocoeval expects annotation IDs starting at 1 _add_ids(annotations_dt, 1) # cocoeval expects annotation IDs starting at 1 - LOG.info(f"found {len(annotations_gt)} GT / {len(annotations_dt)} DT segments" + log.info(f"found {len(annotations_gt)} GT / {len(annotations_dt)} DT segments" f" in {len(categories) - 1} categories for {len(images)} 
images") coco_gt = _create_coco(categories, images, annotations_gt) @@ -299,9 +301,10 @@ def evaluate_files(gt_files, dt_files, img_files=None, level='region', typed=Tru stats = evaluate_coco(coco_gt, coco_dt, parameters, selected) return stats -def evaluate_coco(coco_gt, coco_dt, parameters, catIds=None): - LOG = getLogger('processor.EvaluateSegmentation') - LOG.info("comparing segmentations") +def evaluate_coco(coco_gt, coco_dt, parameters, catIds=None, log=None): + if log is None: + log = getLogger('EvaluateSegmentation') + log.info("comparing segmentations") stats = dict(parameters) coco_eval = COCOeval(coco_gt, coco_dt, 'segm') # bbox if catIds: @@ -553,7 +556,7 @@ def _create_coco(categories, images, annotations): return coco def _add_annotations(annotations, page, imgid, categories, - level='region', typed=True, coords=None, mask=None): + level='region', typed=True, coords=None, mask=None, log=None): for region in page.get_AllRegions(classes=None if level == 'region' else ['Text']): if level == 'region': cat = region.__class__.__name__[:-4] @@ -563,18 +566,19 @@ def _add_annotations(annotations, page, imgid, categories, categories.append(cat) catid = categories.index(cat) _add_annotation(annotations, region, imgid, catid, - coords=coords, mask=mask) + coords=coords, mask=mask, log=log) continue for line in region.get_TextLine(): _add_annotation(annotations, line, imgid, 1, - coords=coords, mask=mask) + coords=coords, mask=mask, log=log) -def _add_annotation(annotations, segment, imgid, catid, coords=None, mask=None): - LOG = getLogger('processor.EvaluateSegmentation') +def _add_annotation(annotations, segment, imgid, catid, coords=None, mask=None, log=None): + if log is None: + log = getLogger('EvaluateSegmentation') score = segment.get_Coords().get_conf() or 1.0 polygon = polygon_from_points(segment.get_Coords().points) if len(polygon) < 3: - LOG.warning('ignoring segment "%s" with only %d points', segment.id, len(polygon)) + log.warning('ignoring segment 
"%s" with only %d points', segment.id, len(polygon)) return xywh = xywh_from_polygon(polygon) if mask is None: diff --git a/ocrd_segment/extract_glyphs.py b/ocrd_segment/extract_glyphs.py index 5f3efa7..1e05c11 100644 --- a/ocrd_segment/extract_glyphs.py +++ b/ocrd_segment/extract_glyphs.py @@ -1,10 +1,11 @@ from __future__ import absolute_import +from typing import Optional import json import itertools from ocrd_utils import ( - getLogger, + config, make_file_id, assert_file_grp_cardinality, coordinates_of_segment, @@ -12,25 +13,22 @@ MIME_TO_EXT ) from ocrd_modelfactory import page_from_file +from ocrd_models.ocrd_file import OcrdFileType from ocrd import Processor -from .config import OCRD_TOOL - -TOOL = 'ocrd-segment-extract-glyphs' class ExtractGlyphs(Processor): - def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] - kwargs['version'] = OCRD_TOOL['version'] - super(ExtractGlyphs, self).__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-segment-extract-glyphs' - def process(self): + def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None: """Extract glyph images and texts from the workspace. - + Open and deserialize PAGE input files and their respective images, then iterate over the element hierarchy down to the glyph level. - + Extract an image for each glyph (which depending on the workflow can already be deskewed, dewarped, binarized etc.), cropped to its minimal bounding box, and masked by the coordinate polygon outline. @@ -39,7 +37,8 @@ def process(self): specific features when retrieving derived images. If ``transparency`` is true, then also add an alpha channel which is fully transparent outside of the mask. - + + \b Create a JSON file with: * the IDs of the glyph and its parents, * the glyph's text content, @@ -54,150 +53,153 @@ def process(self): * the parent textregion's @type, * the page's @type, * the page's DPI value. 
- + Create a plain text file for the text content, too. - + + \b Write all files in the directory of the output file group, named like so: * ID + '.raw.png': glyph image (if the workflow provides raw images) * ID + '.bin.png': glyph image (if the workflow provides binarized images) * ID + '.nrm.png': glyph image (if the workflow provides grayscale-normalized images) * ID + '.json': glyph metadata. * ID + '.gt.txt': glyph text. - + (This is intended for training and evaluation of script detection models.) """ - LOG = getLogger('processor.ExtractGlyph') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - # pylint: disable=attribute-defined-outside-init - for n, input_file in enumerate(self.input_files): - page_id = input_file.pageId or input_file.ID - LOG.info("INPUT FILE %i / %s", n, page_id) - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page = pcgts.get_Page() - page_image, page_coords, page_image_info = self.workspace.image_from_page( - page, page_id, + input_file = input_files[0] + page_id = input_file.pageId + try: + pcgts = page_from_file(input_file) + except ValueError as err: + # not PAGE and not an image to generate PAGE for + self.logger.error(f"non-PAGE input for page {page_id}: {err}") + raise + + page = pcgts.get_Page() + page_image, page_coords, page_image_info = self.workspace.image_from_page( + page, page_id, + feature_filter=self.parameter['feature_filter'], + transparency=self.parameter['transparency']) + if page_image_info.resolution != 1: + dpi = page_image_info.resolution + if page_image_info.resolutionUnit == 'cm': + dpi = round(dpi * 2.54) + else: + dpi = None + ptype = page.get_type() + + regions = itertools.chain.from_iterable( + [page.get_TextRegion()] + + [subregion.get_TextRegion() for subregion in page.get_TableRegion()]) + if not regions: + self.logger.warning("Page '%s' contains no text regions", page_id) + for region in regions: 
+ region_image, region_coords = self.workspace.image_from_segment( + region, page_image, page_coords, feature_filter=self.parameter['feature_filter'], transparency=self.parameter['transparency']) - if page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi = round(dpi * 2.54) - else: - dpi = None - ptype = page.get_type() - - regions = itertools.chain.from_iterable( - [page.get_TextRegion()] + - [subregion.get_TextRegion() for subregion in page.get_TableRegion()]) - if not regions: - LOG.warning("Page '%s' contains no text regions", page_id) - for region in regions: - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, + rtype = region.get_type() + + lines = region.get_TextLine() + if not lines: + self.logger.warning("Region '%s' contains no text lines", region.id) + for line in lines: + line_image, line_coords = self.workspace.image_from_segment( + line, region_image, region_coords, feature_filter=self.parameter['feature_filter'], transparency=self.parameter['transparency']) - rtype = region.get_type() - - lines = region.get_TextLine() - if not lines: - LOG.warning("Region '%s' contains no text lines", region.id) - for line in lines: - line_image, line_coords = self.workspace.image_from_segment( - line, region_image, region_coords, + words = line.get_Word() + if not words: + self.logger.warning("Line '%s' contains no words", line.id) + for word in words: + word_image, word_coords = self.workspace.image_from_segment( + word, line_image, line_coords, feature_filter=self.parameter['feature_filter'], transparency=self.parameter['transparency']) - words = line.get_Word() - if not words: - LOG.warning("Line '%s' contains no words", line.id) - for word in words: - word_image, word_coords = self.workspace.image_from_segment( - word, line_image, line_coords, + glyphs = word.get_Glyph() + if not glyphs: + self.logger.warning("Word '%s' contains no glyphs", word.id) 
+ for glyph in glyphs: + glyph_image, glyph_coords = self.workspace.image_from_segment( + glyph, word_image, word_coords, feature_filter=self.parameter['feature_filter'], transparency=self.parameter['transparency']) - glyphs = word.get_Glyph() - if not glyphs: - LOG.warning("Word '%s' contains no glyphs", word.id) - for glyph in glyphs: - glyph_image, glyph_coords = self.workspace.image_from_segment( - glyph, word_image, word_coords, - feature_filter=self.parameter['feature_filter'], - transparency=self.parameter['transparency']) - lpolygon_rel = coordinates_of_segment( - glyph, glyph_image, glyph_coords).tolist() - lpolygon_abs = polygon_from_points(glyph.get_Coords().points) - ltext = glyph.get_TextEquiv() - if not ltext: - LOG.warning("Glyph '%s' contains no text content", glyph.id) - ltext = '' - else: - ltext = ltext[0].Unicode - lstyle = glyph.get_TextStyle() or word.get_TextStyle() or line.get_TextStyle() or region.get_TextStyle() - if lstyle: - lstyle = { - 'fontFamily': lstyle.fontFamily, - 'fontSize': lstyle.fontSize, - 'xHeight': lstyle.xHeight, - 'kerning': lstyle.kerning, - 'serif': lstyle.serif, - 'monospace': lstyle.monospace, - 'bold': lstyle.bold, - 'italic': lstyle.italic, - 'smallCaps': lstyle.smallCaps, - 'letterSpaced': lstyle.letterSpaced, - 'strikethrough': lstyle.strikethrough, - 'underlined': lstyle.underlined, - 'underlineStyle': lstyle.underlineStyle, - 'subscript': lstyle.subscript, - 'superscript': lstyle.superscript - } - lfeatures = glyph_coords['features'] - description = { 'glyph.ID': glyph.id, - 'text': ltext, - 'style': lstyle, - 'production': ( - glyph.get_production() or - word.get_production() or - line.get_production() or - region.get_production()), - 'script': ( - glyph.get_script() or - word.get_primaryScript() or - line.get_primaryScript() or - region.get_primaryScript() or - page.get_primaryScript()), - 'ligature': glyph.get_ligature(), - 'symbol': glyph.get_symbol(), - 'features': lfeatures, - 'DPI': dpi, - 'coords_rel': 
lpolygon_rel, - 'coords_abs': lpolygon_abs, - 'word.ID': word.id, - 'line.ID': line.id, - 'region.ID': region.id, - 'region.type': rtype, - 'page.ID': page_id, - 'page.type': ptype, - 'file_grp': self.input_file_grp, - 'METS.UID': self.workspace.mets.unique_identifier + lpolygon_rel = coordinates_of_segment( + glyph, glyph_image, glyph_coords).tolist() + lpolygon_abs = polygon_from_points(glyph.get_Coords().points) + ltext = glyph.get_TextEquiv() + if not ltext: + self.logger.warning("Glyph '%s' contains no text content", glyph.id) + ltext = '' + else: + ltext = ltext[0].Unicode + lstyle = glyph.get_TextStyle() or word.get_TextStyle() or line.get_TextStyle() or region.get_TextStyle() + if lstyle: + lstyle = { + 'fontFamily': lstyle.fontFamily, + 'fontSize': lstyle.fontSize, + 'xHeight': lstyle.xHeight, + 'kerning': lstyle.kerning, + 'serif': lstyle.serif, + 'monospace': lstyle.monospace, + 'bold': lstyle.bold, + 'italic': lstyle.italic, + 'smallCaps': lstyle.smallCaps, + 'letterSpaced': lstyle.letterSpaced, + 'strikethrough': lstyle.strikethrough, + 'underlined': lstyle.underlined, + 'underlineStyle': lstyle.underlineStyle, + 'subscript': lstyle.subscript, + 'superscript': lstyle.superscript } - if 'binarized' in lfeatures: - extension = '.bin' - elif 'grayscale_normalized' in lfeatures: - extension = '.nrm' - else: - extension = '.raw' - - file_id = make_file_id(input_file, self.output_file_grp) - file_path = self.workspace.save_image_file( - glyph_image, - file_id + '_' + region.id + '_' + line.id + '_' + word.id + '_' + glyph.id + extension, - self.output_file_grp, - page_id=page_id, - mimetype=self.parameter['mimetype']) - file_path = file_path.replace(extension + MIME_TO_EXT[self.parameter['mimetype']], '.json') - json.dump(description, open(file_path, 'w')) - file_path = file_path.replace('.json', '.gt.txt') - with open(file_path, 'wb') as f: - f.write((ltext + '\n').encode('utf-8')) + lfeatures = glyph_coords['features'] + description = { 'glyph.ID': 
glyph.id, + 'text': ltext, + 'style': lstyle, + 'production': ( + glyph.get_production() or + word.get_production() or + line.get_production() or + region.get_production()), + 'script': ( + glyph.get_script() or + word.get_primaryScript() or + line.get_primaryScript() or + region.get_primaryScript() or + page.get_primaryScript()), + 'ligature': glyph.get_ligature(), + 'symbol': glyph.get_symbol(), + 'features': lfeatures, + 'DPI': dpi, + 'coords_rel': lpolygon_rel, + 'coords_abs': lpolygon_abs, + 'word.ID': word.id, + 'line.ID': line.id, + 'region.ID': region.id, + 'region.type': rtype, + 'page.ID': page_id, + 'page.type': ptype, + 'file_grp': self.input_file_grp, + 'METS.UID': self.workspace.mets.unique_identifier + } + if 'binarized' in lfeatures: + extension = '.bin' + elif 'grayscale_normalized' in lfeatures: + extension = '.nrm' + else: + extension = '.raw' + + file_id = make_file_id(input_file, self.output_file_grp) + file_path = self.workspace.save_image_file( + glyph_image, + file_id + '_' + region.id + '_' + line.id + '_' + word.id + '_' + glyph.id + extension, + self.output_file_grp, + page_id=page_id, + mimetype=self.parameter['mimetype'], + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) + file_path = file_path.replace(extension + MIME_TO_EXT[self.parameter['mimetype']], '.json') + json.dump(description, open(file_path, 'w'), indent=2) + file_path = file_path.replace('.json', '.gt.txt') + with open(file_path, 'wb') as f: + f.write((ltext + '\n').encode('utf-8')) diff --git a/ocrd_segment/extract_lines.py b/ocrd_segment/extract_lines.py index aac2cfd..fa08842 100644 --- a/ocrd_segment/extract_lines.py +++ b/ocrd_segment/extract_lines.py @@ -1,38 +1,35 @@ from __future__ import absolute_import +from typing import Optional import os import json import xlsxwriter from ocrd_utils import ( - getLogger, + config, make_file_id, - assert_file_grp_cardinality, coordinates_of_segment, polygon_from_points, MIME_TO_EXT ) -from ocrd_models.constants import 
NAMESPACES from ocrd_modelfactory import page_from_file +from ocrd_models.constants import NAMESPACES +from ocrd_models.ocrd_file import OcrdFileType from ocrd import Processor -from .config import OCRD_TOOL - -TOOL = 'ocrd-segment-extract-lines' class ExtractLines(Processor): - def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] - kwargs['version'] = OCRD_TOOL['version'] - super(ExtractLines, self).__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-segment-extract-lines' - def process(self): + def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None: """Extract textline images and texts from the workspace. - - Open and deserialize PAGE input files and their respective images, + + Open and deserialize PAGE input file and their respective images, then iterate over the element hierarchy down to the textline level. - + For each textline, if the textline's image size and text length do not satisfy ``min-line-height``, ``min-line-width`` or ``min-line-length``, then skip. @@ -45,7 +42,7 @@ def process(self): specific features when retrieving derived images. If ``transparency`` is true, then also add an alpha channel which is fully transparent outside of the mask. - + \b If ``output-types`` contains `json`, then create a JSON file with: * the IDs of the textline and its parents, @@ -61,12 +58,12 @@ def process(self): * the parent textregion's @type, * the page's @type, * the page's DPI value. - + If ``output-types`` contains `text`, then create a plain text file for the content, too. - + If ``output-types`` contains `xlsx`, then (on the page level) create a spreadsheet which contains all extracted lines and images in different columns for manual editing. 
- + \b Write all output files in the directory of the output file group, named like so: * fileID + regionID + lineID + '.raw.png': line image (if the workflow provides raw images) @@ -75,13 +72,9 @@ def process(self): * fileID + regionID + lineID + '.json': line metadata. * fileID + regionID + lineID + '.gt.txt': line text. * fileID + '.xlsx': spreadsheet file. - + (This is intended for correction, training and evaluation of OCR models.) """ - LOG = getLogger('processor.ExtractLines') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - library_convention = self.parameter['library-convention'] textequiv_index = self.parameter['textequiv-index'] min_line_length = self.parameter['min-line-length'] @@ -89,203 +82,207 @@ def process(self): min_line_height = self.parameter['min-line-height'] out_types = self.parameter['output-types'] - # pylint: disable=attribute-defined-outside-init - for n, input_file in enumerate(self.input_files): - file_id = make_file_id(input_file, self.output_file_grp) - page_id = input_file.pageId or input_file.ID - LOG.info("INPUT FILE %i / %s", n, page_id) - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page = pcgts.get_Page() - page_image, page_coords, page_image_info = self.workspace.image_from_page( - page, page_id, + input_file = input_files[0] + page_id = input_file.pageId + try: + pcgts = page_from_file(input_file) + except ValueError as err: + # not PAGE and not an image to generate PAGE for + self.logger.error(f"non-PAGE input for page {page_id}: {err}") + raise + + page = pcgts.get_Page() + page_image, page_coords, page_image_info = self.workspace.image_from_page( + page, page_id, + feature_filter=self.parameter['feature_filter'], + transparency=self.parameter['transparency']) + if page_image_info.resolution != 1: + dpi = page_image_info.resolution + if page_image_info.resolutionUnit == 'cm': + dpi = round(dpi * 2.54) + else: + 
dpi = None + ptype = page.get_type() + file_id = make_file_id(input_file, self.output_file_grp) + + if 'xlsx' in out_types: + self.logger.info('Writing Excel result file "%s.xlsx" in "%s"', file_id, self.output_file_grp) + excel_path = '%s.xlsx' % os.path.join(self.output_file_grp, file_id) + self.workspace.add_file( + ID=file_id, + mimetype='application/vnd.ms-excel', + pageId=page_id, + local_filename=excel_path, + file_grp=self.output_file_grp, + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) + workbook = xlsxwriter.Workbook(excel_path, + {'strings_to_numbers': False, + 'strings_to_formulas': False, + 'strings_to_urls': False}) + worksheet = workbook.add_worksheet() + bold = workbook.add_format({'bold': True}) + normal = workbook.add_format({'valign': 'top'}) + editable = workbook.add_format({'valign': 'top'}) + editable.set_locked(False) + worksheet.set_default_row(height=40) + worksheet.freeze_panes(1, 0) + worksheet.write('A1', 'ID', bold) + worksheet.write('B1', 'Text', bold) + worksheet.write('C1', 'Status', bold) + worksheet.write('D1', 'Image', bold) + symbols = 'ſ ꝛ aͤ oͤ uͤ æ œ Æ Œ ℳ ç ę ë č ř š ž ě — – - ⸗ = Α α Β β ϐ Γ γ Δ δ Ε ε ϵ Ζ ζ Η η Θ θ ϑ Ι ι ' \ + 'Κ κ ϰ Λ λ Μ μ Ν ν Ξ ξ Ο ο Π π ϖ Ρ ρ ϱ Σ σ ς ϲ Τ τ Υ υ ϒ Φ φ ϕ Χ χ Ψ ψ Ω ω'.split(' ') + for i, s in enumerate(symbols): + col_idx = 4 + i + worksheet.write_string(0, col_idx, s, editable) + worksheet.set_column(col_idx, col_idx, 2) + worksheet.protect('', { + 'objects': True, + 'scenarios': True, + 'format_cells': False, + 'format_columns': False, + 'format_rows': False, + 'insert_columns': False, + 'insert_rows': False, + 'insert_hyperlinks': False, + 'delete_columns': False, + 'delete_rows': False, + 'select_locked_cells': True, + 'sort': True, + 'autofilter': True, + 'pivot_tables': True, + 'select_unlocked_cells': True, + }) + url = self._get_presentation_image(input_file, library_convention) + i = 2 + max_text_length = 0 + regions = page.get_AllRegions(classes=['Text'], 
order='reading-order') + if not regions: + self.logger.warning("Page '%s' contains no text regions", page_id) + for region in regions: + region_image, region_coords = self.workspace.image_from_segment( + region, page_image, page_coords, feature_filter=self.parameter['feature_filter'], transparency=self.parameter['transparency']) - if page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi = round(dpi * 2.54) - else: - dpi = None - ptype = page.get_type() - if not os.path.isdir(self.output_file_grp): - os.mkdir(self.output_file_grp) - - if 'xlsx' in out_types: - LOG.info('Writing Excel result file "%s.xlsx" in "%s"', file_id, self.output_file_grp) - excel_path = '%s.xlsx' % os.path.join(self.output_file_grp, file_id) - workbook = xlsxwriter.Workbook(excel_path, - {'strings_to_numbers': False, - 'strings_to_formulas': False, - 'strings_to_urls': False}) - worksheet = workbook.add_worksheet() - bold = workbook.add_format({'bold': True}) - normal = workbook.add_format({'valign': 'top'}) - editable = workbook.add_format({'valign': 'top'}) - editable.set_locked(False) - worksheet.set_default_row(height=40) - worksheet.freeze_panes(1, 0) - worksheet.write('A1', 'ID', bold) - worksheet.write('B1', 'Text', bold) - worksheet.write('C1', 'Status', bold) - worksheet.write('D1', 'Image', bold) - symbols = 'ſ ꝛ aͤ oͤ uͤ æ œ Æ Œ ℳ ç ę ë č ř š ž ě — – - ⸗ = Α α Β β ϐ Γ γ Δ δ Ε ε ϵ Ζ ζ Η η Θ θ ϑ Ι ι ' \ - 'Κ κ ϰ Λ λ Μ μ Ν ν Ξ ξ Ο ο Π π ϖ Ρ ρ ϱ Σ σ ς ϲ Τ τ Υ υ ϒ Φ φ ϕ Χ χ Ψ ψ Ω ω'.split(' ') - for i, s in enumerate(symbols): - col_idx = 4 + i - worksheet.write_string(0, col_idx, s, editable) - worksheet.set_column(col_idx, col_idx, 2) - worksheet.protect('', { - 'objects': True, - 'scenarios': True, - 'format_cells': False, - 'format_columns': False, - 'format_rows': False, - 'insert_columns': False, - 'insert_rows': False, - 'insert_hyperlinks': False, - 'delete_columns': False, - 'delete_rows': False, - 
'select_locked_cells': True, - 'sort': True, - 'autofilter': True, - 'pivot_tables': True, - 'select_unlocked_cells': True, - }) - self.workspace.add_file( - ID=file_id, - mimetype='application/vnd.ms-excel', - pageId=page_id, - url=excel_path, - file_grp=self.output_file_grp, - ) - url = self._get_presentation_image(input_file, library_convention) - i = 2 - max_text_length = 0 - regions = page.get_AllRegions(classes=['Text'], order='reading-order') - if not regions: - LOG.warning("Page '%s' contains no text regions", page_id) - for region in regions: - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, + rtype = region.get_type() + lines = region.get_TextLine() + if not lines: + self.logger.warning("Region '%s' contains no text lines", region.id) + for line in lines: + line_image, line_coords = self.workspace.image_from_segment( + line, region_image, region_coords, feature_filter=self.parameter['feature_filter'], transparency=self.parameter['transparency']) - rtype = region.get_type() - lines = region.get_TextLine() - if not lines: - LOG.warning("Region '%s' contains no text lines", region.id) - for line in lines: - line_image, line_coords = self.workspace.image_from_segment( - line, region_image, region_coords, - feature_filter=self.parameter['feature_filter'], - transparency=self.parameter['transparency']) - if not line_image.height or not line_image.width: - LOG.error("Page '%s' line '%s' is of zero height/width", page_id, line.id) - continue - ltext = line.get_TextEquiv() - if not ltext: - LOG.warning("Line '%s' contains no text content", line.id) - ltext = '' - elif textequiv_index in ['first', 'last']: - ltext = ltext[{'first': 0, 'last': -1}[textequiv_index]].Unicode - else: - for textequiv in ltext: - if textequiv.index == int(textequiv_index): - ltext = textequiv.Unicode - break - if not isinstance(ltext, str): - LOG.error("Page '%s' line '%s' has no TextEquiv/@index='%s'", page_id, line.id, 
textequiv_index) - continue - if (len(ltext) < min_line_length or - line_image.width < min_line_width or - line_image.height < min_line_height): + if not line_image.height or not line_image.width: + self.logger.error("Page '%s' line '%s' is of zero height/width", page_id, line.id) + continue + ltext = line.get_TextEquiv() + if not ltext: + self.logger.warning("Line '%s' contains no text content", line.id) + ltext = '' + elif textequiv_index in ['first', 'last']: + ltext = ltext[{'first': 0, 'last': -1}[textequiv_index]].Unicode + else: + for textequiv in ltext: + if textequiv.index == int(textequiv_index): + ltext = textequiv.Unicode + break + if not isinstance(ltext, str): + self.logger.error("Page '%s' line '%s' has no TextEquiv/@index='%s'", page_id, line.id, textequiv_index) continue - lpolygon_rel = coordinates_of_segment( - line, line_image, line_coords).tolist() - lpolygon_abs = polygon_from_points(line.get_Coords().points) - lstyle = line.get_TextStyle() or region.get_TextStyle() - if lstyle: - lstyle = { - 'fontFamily': lstyle.fontFamily, - 'fontSize': lstyle.fontSize, - 'xHeight': lstyle.xHeight, - 'kerning': lstyle.kerning, - 'serif': lstyle.serif, - 'monospace': lstyle.monospace, - 'bold': lstyle.bold, - 'italic': lstyle.italic, - 'smallCaps': lstyle.smallCaps, - 'letterSpaced': lstyle.letterSpaced, - 'strikethrough': lstyle.strikethrough, - 'underlined': lstyle.underlined, - 'underlineStyle': lstyle.underlineStyle, - 'subscript': lstyle.subscript, - 'superscript': lstyle.superscript - } - lfeatures = line_coords['features'] - description = { 'line.ID': line.id, - 'text': ltext, - 'style': lstyle, - 'production': ( - line.get_production() or - region.get_production()), - 'readingDirection': ( - line.get_readingDirection() or - region.get_readingDirection() or - page.get_readingDirection()), - 'primaryScript': ( - line.get_primaryScript() or - region.get_primaryScript() or - page.get_primaryScript()), - 'primaryLanguage': ( - line.get_primaryLanguage() 
or - region.get_primaryLanguage() or - page.get_primaryLanguage()), - 'features': lfeatures, - 'DPI': dpi, - 'coords_rel': lpolygon_rel, - 'coords_abs': lpolygon_abs, - 'region.ID': region.id, - 'region.type': rtype, - 'page.ID': page_id, - 'page.type': ptype, - 'file_grp': self.input_file_grp, - 'METS.UID': self.workspace.mets.unique_identifier + if (len(ltext) < min_line_length or + line_image.width < min_line_width or + line_image.height < min_line_height): + continue + lpolygon_rel = coordinates_of_segment( + line, line_image, line_coords).tolist() + lpolygon_abs = polygon_from_points(line.get_Coords().points) + lstyle = line.get_TextStyle() or region.get_TextStyle() + if lstyle: + lstyle = { + 'fontFamily': lstyle.fontFamily, + 'fontSize': lstyle.fontSize, + 'xHeight': lstyle.xHeight, + 'kerning': lstyle.kerning, + 'serif': lstyle.serif, + 'monospace': lstyle.monospace, + 'bold': lstyle.bold, + 'italic': lstyle.italic, + 'smallCaps': lstyle.smallCaps, + 'letterSpaced': lstyle.letterSpaced, + 'strikethrough': lstyle.strikethrough, + 'underlined': lstyle.underlined, + 'underlineStyle': lstyle.underlineStyle, + 'subscript': lstyle.subscript, + 'superscript': lstyle.superscript } - if 'binarized' in lfeatures: - extension = '.bin' - elif 'grayscale_normalized' in lfeatures: - extension = '.nrm' - else: - extension = '.raw' - file_path = self.workspace.save_image_file( - line_image, - file_id + '_' + region.id + '_' + line.id + extension, - self.output_file_grp, - page_id=page_id, - mimetype=self.parameter['mimetype']) - if 'xlsx' in out_types: - scale = 40.0 / line_image.height - worksheet.write('A%d' % i, file_id + '_' + region.id + '_' + line.id, normal) - if len(ltext) > max_text_length: - max_text_length = len(ltext) - worksheet.set_column('B:B', max_text_length) - worksheet.write('B%d' % i, ltext, editable) - worksheet.data_validation('C%d' % i, { - 'validate': 'list', 'source': ['ToDo', 'Done', 'Error']}) - worksheet.write('C%d' % i, 'ToDo', editable) - 
worksheet.insert_image('D%d' % i, file_path, { - 'object_position': 1, 'url': url, 'y_scale': scale, 'x_scale': scale}) - file_path = file_path.replace(extension + MIME_TO_EXT[self.parameter['mimetype']], '.json') - if 'json' in out_types: - json.dump(description, open(file_path, 'w')) - file_path = file_path.replace('.json', '.gt.txt') - if 'text' in out_types: - with open(file_path, 'wb') as f: - f.write((ltext + '\n').encode('utf-8')) - i += 1 - if 'xlsx' in out_types: - workbook.close() + lfeatures = line_coords['features'] + description = { 'line.ID': line.id, + 'text': ltext, + 'style': lstyle, + 'production': ( + line.get_production() or + region.get_production()), + 'readingDirection': ( + line.get_readingDirection() or + region.get_readingDirection() or + page.get_readingDirection()), + 'primaryScript': ( + line.get_primaryScript() or + region.get_primaryScript() or + page.get_primaryScript()), + 'primaryLanguage': ( + line.get_primaryLanguage() or + region.get_primaryLanguage() or + page.get_primaryLanguage()), + 'features': lfeatures, + 'DPI': dpi, + 'coords_rel': lpolygon_rel, + 'coords_abs': lpolygon_abs, + 'region.ID': region.id, + 'region.type': rtype, + 'page.ID': page_id, + 'page.type': ptype, + 'file_grp': self.input_file_grp, + 'METS.UID': self.workspace.mets.unique_identifier + } + if 'binarized' in lfeatures: + extension = '.bin' + elif 'grayscale_normalized' in lfeatures: + extension = '.nrm' + else: + extension = '.raw' + file_path = self.workspace.save_image_file( + line_image, + file_id + '_' + region.id + '_' + line.id + extension, + self.output_file_grp, + page_id=page_id, + mimetype=self.parameter['mimetype'], + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) + if 'xlsx' in out_types: + scale = 40.0 / line_image.height + worksheet.write('A%d' % i, file_id + '_' + region.id + '_' + line.id, normal) + if len(ltext) > max_text_length: + max_text_length = len(ltext) + worksheet.set_column('B:B', max_text_length) + worksheet.write('B%d' 
% i, ltext, editable) + worksheet.data_validation('C%d' % i, { + 'validate': 'list', 'source': ['ToDo', 'Done', 'Error']}) + worksheet.write('C%d' % i, 'ToDo', editable) + worksheet.insert_image('D%d' % i, file_path, { + 'object_position': 1, 'url': url, 'y_scale': scale, 'x_scale': scale}) + file_path = file_path.replace(extension + MIME_TO_EXT[self.parameter['mimetype']], '.json') + if 'json' in out_types: + json.dump(description, open(file_path, 'w'), indent=2) + file_path = file_path.replace('.json', '.gt.txt') + if 'text' in out_types: + with open(file_path, 'wb') as f: + f.write((ltext + '\n').encode('utf-8')) + i += 1 + if 'xlsx' in out_types: + workbook.close() def _get_presentation_image(self, input_file, library_convention): if library_convention == 'slub': diff --git a/ocrd_segment/extract_pages.py b/ocrd_segment/extract_pages.py index fd6ed4e..3c1efa8 100644 --- a/ocrd_segment/extract_pages.py +++ b/ocrd_segment/extract_pages.py @@ -1,35 +1,37 @@ from __future__ import absolute_import +from dataclasses import dataclass +from typing import Optional import json -from collections import namedtuple import os.path + import numpy as np import cv2 from PIL import Image, ImageDraw from shapely.geometry import Polygon from shapely.validation import explain_validity from shapely.prepared import prep -import xlsxwriter from ocrd_utils import ( - getLogger, + config, make_file_id, - assert_file_grp_cardinality, coordinates_of_segment, xywh_from_polygon, polygon_from_bbox, MIME_TO_EXT ) from ocrd_modelfactory import page_from_file +from ocrd_models.ocrd_file import OcrdFileType from ocrd_models.ocrd_page import ( - OrderedGroupType, OrderedGroupIndexedType, - RegionRefType, RegionRefIndexedType, + OcrdPage, + OrderedGroupType, + OrderedGroupIndexedType, + RegionRefType, + RegionRefIndexedType, ) -from ocrd import Processor +from ocrd import Workspace, Processor -from .config import OCRD_TOOL -TOOL = 'ocrd-segment-extract-pages' # region classes and their colours in 
mask (pseg) images: # (from prima-page-viewer/src/org/primaresearch/page/viewer/ui/render/PageContentColors, # but added alpha channel to also discern subtype, if not visually; @@ -98,19 +100,19 @@ } # pragma pylint: enable=bad-whitespace + class ExtractPages(Processor): - def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] - kwargs['version'] = OCRD_TOOL['version'] - super(ExtractPages, self).__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-segment-extract-pages' - def process(self): + def process_workspace(self, workspace: Workspace) -> None: """Extract page images and region descriptions (type and coordinates) from the workspace. - + Open and deserialize PAGE input files and their respective images, then iterate over the element hierarchy down to the region level. - + Get all regions with their types (region element class), sub-types (@type) and coordinates relative to the page (which depending on the workflow could already be cropped, deskewed, dewarped, binarized etc). Extract the image of @@ -120,13 +122,13 @@ def process(self): to skip specific features when retrieving derived images. If ``transparency`` is true, then also add an alpha channel which is fully transparent outside of the mask. - + In addition, create a new (third) image with masks for each segment type in ``plot_segmasks``, color-coded by class according to ``colordict``. - + Create two JSON files with region types and coordinates: one (page-wise) in our custom format and one (global) in MS-COCO. - + \b The output file group may be given as a comma-separated list to separate these 3 kinds of images. If fewer than 3 fileGrps are specified, they will @@ -147,39 +149,37 @@ def process(self): with a blank (white) image - unless ``plot_overlay`` is true, in which case each layer and segment is superimposed (alpha blended) onto the previous one, starting with the above raw image. 
- + \b In addition, write a file for all pages at once: * in the third (or second or only) output file group (directory): - output_file_grp + '.coco.json': region coordinates/classes (MS-COCO format) - output_file_grp + '.colordict.json': the used ``colordict`` - + (This is intended for training and evaluation of region segmentation models.) """ - LOG = getLogger('processor.ExtractPages') - assert_file_grp_cardinality(self.input_file_grp, 1) file_groups = self.output_file_grp.split(',') if len(file_groups) > 3: - raise Exception("at most 3 output file grps allowed (raw, [binarized, [mask]] image)") + raise ValueError("at most 3 output file grps allowed (raw, [binarized, [mask]] image)") if len(file_groups) > 2: - mask_image_grp = file_groups[2] + self.mask_image_grp = file_groups[2] else: - mask_image_grp = file_groups[0] - LOG.info("No output file group for mask images specified, falling back to output filegrp '%s'", mask_image_grp) + self.mask_image_grp = file_groups[0] + self.logger.info("No output file group for mask images specified, falling back to output filegrp '%s'", self.mask_image_grp) if len(file_groups) > 1: - bin_image_grp = file_groups[1] + self.bin_image_grp = file_groups[1] else: - bin_image_grp = file_groups[0] - LOG.info("No output file group for binarized images specified, falling back to output filegrp '%s'", bin_image_grp) + self.bin_image_grp = file_groups[0] + self.logger.info("No output file group for binarized images specified, falling back to output filegrp '%s'", self.bin_image_grp) + # reduce to just a single fileGrp, so core's process_page_file can be reused self.output_file_grp = file_groups[0] - classes = self.parameter['colordict'] # COCO: init data structures - images = list() - annotations = list() - categories = list() - i = 0 - for cat, color in classes.items(): + self.images = [] + self.annotations = [] + self.categories = [] + cat_id = 0 + for cat, color in self.parameter['colordict'].items(): # COCO format does not allow 
alpha channel color = (int(color[0:2], 16), int(color[2:4], 16), @@ -189,206 +189,228 @@ def process(self): except ValueError: name = cat supercat = '' - categories.append( - {'id': i, 'name': name, 'supercategory': supercat, + self.categories.append( + {'id': cat_id, 'name': name, 'supercategory': supercat, 'source': 'PAGE', 'color': color}) - i += 1 + cat_id += 1 - i = 0 - # pylint: disable=attribute-defined-outside-init - for n, input_file in enumerate(self.input_files): - page_id = input_file.pageId or input_file.ID - try: - # separate non-numeric part of page ID to retain the numeric part - num_page_id = int(page_id.strip(page_id.strip("0123456789"))) - except Exception: - num_page_id = n - LOG.info("INPUT FILE %i / %s", n, page_id) - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page = pcgts.get_Page() - ptype = page.get_type() - page_image, page_coords, page_image_info = self.workspace.image_from_page( + # create per-page image and JSON files + self.ann_id = 0 + super().process_workspace(workspace) + + # COCO: write result + file_id = self.mask_image_grp + '.coco.json' + self.logger.info('Writing COCO result file "%s" in "%s"', file_id, self.mask_image_grp) + workspace.add_file( + ID=file_id, + file_grp=self.mask_image_grp, + local_filename=os.path.join(self.mask_image_grp, file_id), + mimetype='application/json', + pageId=None, + content=json.dumps( + {'categories': self.categories, + 'images': self.images, + 'annotations': self.annotations}, + indent=2), + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) + # write inverse colordict (for ocrd-segment-from-masks) + file_id = self.mask_image_grp + '.colordict.json' + self.logger.info('Writing colordict file "%s" in .', file_id) + # FIXME: add to METS as well? 
+ with open(os.path.join(workspace.directory, file_id), 'w') as out: + json.dump(dict((col, name) + for name, col in self.parameter['colordict'].items() + if name), + out, indent=2) + + def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None: + classes = self.parameter['colordict'] + input_file = input_files[0] + page_id = input_file.pageId + try: + # separate non-numeric part of page ID to retain the numeric part + num_page_id = int(page_id.strip(page_id.strip("0123456789"))) + except Exception: + num_page_id = self.workspace.mets.physical_pages.index(page_id) + self.logger.debug(f"parsing file {input_file.ID} for page {page_id}") + try: + pcgts = page_from_file(input_file) + except ValueError as err: + # not PAGE and not an image to generate PAGE for + self.logger.error(f"non-PAGE input for page {page_id}: {err}") + raise + page = pcgts.get_Page() + ptype = page.get_type() + page_image, page_coords, page_image_info = self.workspace.image_from_page( + page, page_id, + feature_filter=self.parameter['feature_filter'], + transparency=self.parameter['transparency']) + if page_image_info.resolution != 1: + dpi = page_image_info.resolution + if page_image_info.resolutionUnit == 'cm': + dpi = round(dpi * 2.54) + else: + dpi = None + + file_id = make_file_id(input_file, self.output_file_grp) + file_path = self.workspace.save_image_file( + page_image, + file_id, + self.output_file_grp, + page_id=page_id, + mimetype=self.parameter['mimetype'], + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) + try: + page_image_bin, _, _ = self.workspace.image_from_page( page, page_id, - feature_filter=self.parameter['feature_filter'], + feature_selector='binarized', transparency=self.parameter['transparency']) - if page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi = round(dpi * 2.54) + self.workspace.save_image_file( + page_image_bin, + file_id + '.bin', + self.bin_image_grp, + page_id=page_id, +
mimetype=self.parameter['mimetype'], + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) + except Exception as err: + if err.args and str(err.args[0]).startswith('Found no AlternativeImage'): + self.logger.warning('Page "%s" has no binarized images, skipping .bin', page_id) else: - dpi = None - file_id = make_file_id(input_file, self.output_file_grp) - file_path = self.workspace.save_image_file(page_image, - file_id, - self.output_file_grp, - page_id=page_id, - mimetype=self.parameter['mimetype']) - try: - page_image_bin, _, _ = self.workspace.image_from_page( - page, page_id, - feature_selector='binarized', - transparency=self.parameter['transparency']) - self.workspace.save_image_file(page_image_bin, - file_id + '.bin', - bin_image_grp, - page_id=page_id) - except Exception as err: - if err.args[0].startswith('Found no AlternativeImage'): - LOG.warning('Page "%s" has no binarized images, skipping .bin', page_id) + raise + # init multi-level mask output + if self.parameter['plot_overlay']: + page_image_segmask = page_image.convert('RGBA') + else: + page_image_segmask = Image.new(mode='RGBA', + size=page_image.size, + color='#FFFFFF00') + neighbors = {} + for level in ['page', 'region', 'line', 'word', 'glyph']: + neighbors[level] = [] + # produce border mask plot, if necessary + if page.get_Border(): + poly = segment_poly(self.logger, page_id, page.get_Border(), page_coords) + else: + poly = Polygon(polygon_from_bbox(0, 0, page_image.width, page_image.height)) + if 'page' in self.parameter['plot_segmasks']: + plot_segment(self.logger, page_id, page.get_Border(), poly, 'Border', classes, + page_image_segmask, [], self.parameter['plot_overlay']) + # get regions and aggregate masks on all hierarchy levels + description = {'angle': page.get_orientation()} + regions = {} + for name in classes: + if not name or not name.endswith('Region'): + # no region subtypes or non-region types here + continue + #regions[name] = getattr(page, 'get_' + name)() + regions[name] =
page.get_AllRegions(classes=name[:-6], order='reading-order') + for rtype, rlist in regions.items(): + for region in rlist: + if rtype in ['TextRegion', 'ChartRegion', 'GraphicRegion']: + subrtype = region.get_type() else: - raise - # init multi-level mask output - if self.parameter['plot_overlay']: - page_image_segmask = page_image.convert('RGBA') - else: - page_image_segmask = Image.new(mode='RGBA', - size=page_image.size, - color='#FFFFFF00') - neighbors = dict() - for level in ['page', 'region', 'line', 'word', 'glyph']: - neighbors[level] = list() - # produce border mask plot, if necessary - if page.get_Border(): - poly = segment_poly(page_id, page.get_Border(), page_coords) - else: - poly = Polygon(polygon_from_bbox(0, 0, page_image.width, page_image.height)) - if 'page' in self.parameter['plot_segmasks']: - plot_segment(page_id, page.get_Border(), poly, 'Border', classes, - page_image_segmask, [], self.parameter['plot_overlay']) - # get regions and aggregate masks on all hierarchy levels - description = {'angle': page.get_orientation()} - regions = dict() - for name in classes.keys(): - if not name or not name.endswith('Region'): - # no region subtypes or non-region types here - continue - #regions[name] = getattr(page, 'get_' + name)() - regions[name] = page.get_AllRegions(classes=name[:-6], order='reading-order') - for rtype, rlist in regions.items(): - for region in rlist: - if rtype in ['TextRegion', 'ChartRegion', 'GraphicRegion']: - subrtype = region.get_type() - else: - subrtype = None - if subrtype: - rtype0 = rtype + ':' + subrtype - else: - rtype0 = rtype - poly = segment_poly(page_id, region, page_coords) - # produce region mask plot, if necessary - if 'region' in self.parameter['plot_segmasks']: - plot_segment(page_id, region, poly, rtype0, classes, - page_image_segmask, neighbors['region'], - self.parameter['plot_overlay']) - if rtype == 'TextRegion': - lines = region.get_TextLine() - for line in lines: + subrtype = None + if subrtype: + rtype0 
= rtype + ':' + subrtype + else: + rtype0 = rtype + poly = segment_poly(self.logger, page_id, region, page_coords) + # produce region mask plot, if necessary + if 'region' in self.parameter['plot_segmasks']: + plot_segment(self.logger, page_id, region, poly, rtype0, classes, + page_image_segmask, neighbors['region'], + self.parameter['plot_overlay']) + if rtype == 'TextRegion': + lines = region.get_TextLine() + for line in lines: + # produce line mask plot, if necessary + if 'line' in self.parameter['plot_segmasks']: + poly2 = segment_poly(self.logger, page_id, line, page_coords) + plot_segment(self.logger, page_id, line, poly2, 'TextLine', classes, + page_image_segmask, neighbors['line'], + self.parameter['plot_overlay']) + words = line.get_Word() + for word in words: # produce line mask plot, if necessary - if 'line' in self.parameter['plot_segmasks']: - poly2 = segment_poly(page_id, line, page_coords) - plot_segment(page_id, line, poly2, 'TextLine', classes, - page_image_segmask, neighbors['line'], + if 'word' in self.parameter['plot_segmasks']: + poly2 = segment_poly(self.logger, page_id, word, page_coords) + plot_segment(self.logger, page_id, word, poly2, 'Word', classes, + page_image_segmask, neighbors['word'], self.parameter['plot_overlay']) - words = line.get_Word() - for word in words: + glyphs = word.get_Glyph() + for glyph in glyphs: # produce line mask plot, if necessary - if 'word' in self.parameter['plot_segmasks']: - poly2 = segment_poly(page_id, word, page_coords) - plot_segment(page_id, word, poly2, 'Word', classes, - page_image_segmask, neighbors['word'], + if 'glyph' in self.parameter['plot_segmasks']: + poly2 = segment_poly(self.logger, page_id, glyph, page_coords) + plot_segment(self.logger, page_id, glyph, poly2, 'Glyph', classes, + page_image_segmask, neighbors['glyph'], self.parameter['plot_overlay']) - glyphs = word.get_Glyph() - for glyph in glyphs: - # produce line mask plot, if necessary - if 'glyph' in self.parameter['plot_segmasks']: - 
poly2 = segment_poly(page_id, glyph, page_coords) - plot_segment(page_id, glyph, poly2, 'Glyph', classes, - page_image_segmask, neighbors['glyph'], - self.parameter['plot_overlay']) - if not poly: - continue - polygon = np.array(poly.exterior.coords, int)[:-1].tolist() - xywh = xywh_from_polygon(polygon) - area = poly.area - description.setdefault('regions', []).append( - { 'type': rtype, - 'subtype': subrtype, - 'coords': polygon, - 'area': area, - 'features': page_coords['features'], - 'DPI': dpi, - 'region.ID': region.id, - 'page.ID': page_id, - 'page.type': ptype, - 'file_grp': self.input_file_grp, - 'METS.UID': self.workspace.mets.unique_identifier - }) - # COCO: add annotations - i += 1 - annotations.append( - {'id': i, 'image_id': num_page_id, - 'category_id': next((cat['id'] for cat in categories if cat['name'] == subrtype), - next((cat['id'] for cat in categories if cat['name'] == rtype))), - 'segmentation': np.array(poly.exterior.coords, int)[:-1].reshape(1, -1).tolist(), - 'area': area, - 'bbox': [xywh['x'], xywh['y'], xywh['w'], xywh['h']], - 'iscrowd': 0}) - - if 'order' in self.parameter['plot_segmasks']: - plot_order(page.get_ReadingOrder(), classes, page_image_segmask, - neighbors['region'], self.parameter['plot_overlay']) - if self.parameter['plot_segmasks']: - self.workspace.save_image_file(page_image_segmask, - file_id + '.pseg', - mask_image_grp, - page_id=page_id, - mimetype=self.parameter['mimetype']) - self.workspace.add_file( - ID=file_id + '.json', - file_grp=mask_image_grp, - pageId=input_file.pageId, - local_filename=file_path.replace(MIME_TO_EXT[self.parameter['mimetype']], '.json'), - mimetype='application/json', - content=json.dumps(description)) + if not poly: + continue + polygon = np.array(poly.exterior.coords, int)[:-1].tolist() + xywh = xywh_from_polygon(polygon) + area = poly.area + description.setdefault('regions', []).append( + { 'type': rtype, + 'subtype': subrtype, + 'coords': polygon, + 'area': area, + 'features': 
page_coords['features'], + 'DPI': dpi, + 'region.ID': region.id, + 'page.ID': page_id, + 'page.type': ptype, + 'file_grp': self.input_file_grp, + 'METS.UID': self.workspace.mets.unique_identifier + }) + # COCO: add annotations + self.ann_id += 1 + self.annotations.append( + {'id': self.ann_id, 'image_id': num_page_id, + 'category_id': next((cat['id'] for cat in self.categories if cat['name'] == subrtype), + next((cat['id'] for cat in self.categories if cat['name'] == rtype))), + 'segmentation': np.array(poly.exterior.coords, int)[:-1].reshape(1, -1).tolist(), + 'area': area, + 'bbox': [xywh['x'], xywh['y'], xywh['w'], xywh['h']], + 'iscrowd': 0}) - # COCO: add image - images.append({ - # COCO does not allow string identifiers: - # -> use numerical part of page_id - 'id': num_page_id, - # all exported coordinates are relative to the cropped page: - # -> use that for reference (instead of original page.imageFilename) - 'file_name': file_path, - # -> use its size (instead of original page.imageWidth/page.imageHeight) - 'width': page_image.width, - 'height': page_image.height}) - - # COCO: write result - file_id = mask_image_grp + '.coco.json' - LOG.info('Writing COCO result file "%s" in "%s"', file_id, mask_image_grp) + if 'order' in self.parameter['plot_segmasks']: + plot_order(self.logger, page.get_ReadingOrder(), classes, page_image_segmask, + neighbors['region'], self.parameter['plot_overlay']) + if self.parameter['plot_segmasks']: + self.workspace.save_image_file( + page_image_segmask, + file_id + '.pseg', + self.mask_image_grp, + page_id=page_id, + mimetype=self.parameter['mimetype'], + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) self.workspace.add_file( - ID=file_id, - file_grp=mask_image_grp, - local_filename=os.path.join(mask_image_grp, file_id), + ID=file_id + '.json', + file_grp=self.mask_image_grp, + pageId=page_id, + local_filename=file_path.replace(MIME_TO_EXT[self.parameter['mimetype']], '.json'), mimetype='application/json', - pageId=None, - 
content=json.dumps( - {'categories': categories, - 'images': images, - 'annotations': annotations})) + content=json.dumps(description), + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) + # COCO: add image + self.images.append({ + # COCO does not allow string identifiers: + # -> use numerical part of page_id + 'id': num_page_id, + # all exported coordinates are relative to the cropped page: + # -> use that for reference (instead of original page.imageFilename) + 'file_name': file_path, + # -> use its size (instead of original page.imageWidth/page.imageHeight) + 'width': page_image.width, + 'height': page_image.height}) - # write inverse colordict (for ocrd-segment-from-masks) - file_id = mask_image_grp + '.colordict.json' - LOG.info('Writing colordict file "%s" in .', file_id) - with open(file_id, 'w') as out: - json.dump(dict((col, name) - for name, col in classes.items() - if name), - out) - -def segment_poly(page_id, segment, coords): - LOG = getLogger('processor.ExtractPages') +def segment_poly(log, page_id, segment, coords): polygon = coordinates_of_segment(segment, None, coords) # validate coordinates try: @@ -408,12 +430,11 @@ def segment_poly(page_id, segment, coords): tag = segment.__class__.__name__.replace('Type', '') if tag != 'Border': tag += ' "%s"' % segment.id - LOG.error('Page "%s" %s %s', page_id, tag, reason) + log.error('Page "%s" %s %s', page_id, tag, reason) return None return poly -def plot_order(readingorder, classes, image, regions, alpha=False): - LOG = getLogger('processor.ExtractPages') +def plot_order(log, readingorder, classes, image, regions, alpha=False): regiondict = dict((region.id, region.poly) for region in regions) def get_points(rogroup, level): points = list() @@ -449,7 +470,7 @@ def get_points(rogroup, level): for p1, p2 in zip(points[:-1], points[1:]): color = 'ReadingOrderLevel%s' % (str(p1[0]) if p1[0] < 2 else 'N') if color not in classes: - LOG.error('mask plots requested, but "colordict" does not contain a "%s" 
mapping', color) + log.error('mask plots requested, but "colordict" does not contain a "%s" mapping', color) return color = classes[color] color = (int(color[0:2], 16), @@ -460,16 +481,13 @@ def get_points(rogroup, level): layer.putalpha(Image.fromarray(255 * np.any(newimg < 255, axis=2).astype(np.uint8), mode='L')) image.alpha_composite(layer) -def plot_segment(page_id, segment, poly, stype, classes, image, neighbors, alpha=False): - LOG = getLogger('processor.ExtractPages') +def plot_segment(log, page_id, segment, poly, stype, classes, image, neighbors, alpha=False): if not poly: return if stype not in classes: - LOG.error('mask plots requested, but "colordict" does not contain a "%s" mapping', stype) + log.error('mask plots requested, but "colordict" does not contain a "%s" mapping', stype) return color = classes[stype] - Neighbor = namedtuple('Neighbor', ['id', 'poly', 'type']) - LOG = getLogger('processor.ExtractPages') # check intersection with neighbours # (which would melt into another in the mask image) if segment and hasattr(segment, 'id') and not alpha: @@ -480,11 +498,11 @@ def plot_segment(page_id, segment, poly, stype, classes, image, neighbors, alpha poly.intersection(neighbor.poly).area > 0): inter = poly.intersection(neighbor.poly).area union = poly.union(neighbor.poly).area - LOG.warning('Page "%s" segment "%s" intersects neighbour "%s" (IoU: %.3f)', + log.warning('Page "%s" segment "%s" intersects neighbour "%s" (IoU: %.3f)', page_id, segment.id, neighbor.id, inter / union) elif (stype != neighbor.type and poly_prep.within(neighbor.poly)): - LOG.warning('Page "%s" segment "%s" within neighbour "%s" (IoU: %.3f)', + log.warning('Page "%s" segment "%s" within neighbour "%s" (IoU: %.3f)', page_id, segment.id, neighbor.id, poly.area / neighbor.poly.area) if segment and hasattr(segment, 'id'): @@ -499,3 +517,10 @@ def plot_segment(page_id, segment, poly, stype, classes, image, neighbors, alpha else: ImageDraw.Draw(image).polygon(list(map(tuple, 
poly.exterior.coords[:-1])), fill='#' + color) + +@dataclass +class Neighbor(): + id : str + poly : Polygon + type : str + """color string (four-byte hexadecimal - RGBA)""" diff --git a/ocrd_segment/extract_regions.py b/ocrd_segment/extract_regions.py index d51b715..8abe772 100644 --- a/ocrd_segment/extract_regions.py +++ b/ocrd_segment/extract_regions.py @@ -1,39 +1,38 @@ from __future__ import absolute_import -import os +from typing import Optional +import os.path import json + import numpy as np from ocrd_utils import ( - getLogger, + config, make_file_id, - assert_file_grp_cardinality, coordinates_of_segment, xywh_from_polygon, polygon_from_points, MIME_TO_EXT ) from ocrd_modelfactory import page_from_file -from ocrd import Processor +from ocrd_models.ocrd_file import OcrdFileType +from ocrd import Workspace, Processor -from .config import OCRD_TOOL from .extract_pages import CLASSES, segment_poly -TOOL = 'ocrd-segment-extract-regions' class ExtractRegions(Processor): - def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] - kwargs['version'] = OCRD_TOOL['version'] - super(ExtractRegions, self).__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-segment-extract-regions' - def process(self): + def process_workspace(self, workspace: Workspace) -> None: """Extract region images from the workspace. - + Open and deserialize PAGE input files and their respective images, then iterate over the element hierarchy down to the region level. - + Extract an image for each region (which depending on the workflow can already be deskewed, dewarped, binarized etc.), cropped to its minimal bounding box, and masked by the coordinate polygon outline. @@ -42,10 +41,10 @@ def process(self): specific features when retrieving derived images. If ``transparency`` is true, then also add an alpha channel which is fully transparent outside of the mask. 
- + Create two JSON files with region types and coordinates: one (page-wise) in our custom format and one (global) in MS-COCO. - + The custom JSON files contain: * the IDs of the region and its parents, * the region's coordinates relative to the region image, @@ -62,7 +61,7 @@ def process(self): * the region's @type, * the page's @type, * the page's DPI value. - + Write all files in the directory of the output file group, named like so: * ID + '.raw.png': region image (if the workflow provides raw images) * ID + '.bin.png': region image (if the workflow provides binarized images) @@ -70,20 +69,17 @@ def process(self): * ID + '.json': region metadata. * output_file_grp + '.coco.json' """ - LOG = getLogger('processor.ExtractRegions') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) classes = dict(CLASSES) - LOG.info("Extracting %s region classes!" % self.parameter["classes"]) + self.logger.info("Extracting %s region classes!", str(self.parameter["classes"])) # extract specific classes only if self.parameter["classes"]: selected_classes = self.parameter["classes"] classes = { region: classes[region] for region in selected_classes } # COCO: init data structures - images = list() - annotations = list() - categories = list() - i = 0 + self.images = [] + self.annotations = [] + self.categories = [] + cat_id = 0 for cat, color in classes.items(): # COCO format does not allow alpha channel color = (int(color[0:2], 16), @@ -94,181 +90,195 @@ def process(self): except ValueError: name = cat supercat = '' - categories.append( - {'id': i, 'name': name, 'supercategory': supercat, + self.categories.append( + {'id': cat_id, 'name': name, 'supercategory': supercat, 'source': 'PAGE', 'color': color}) - i += 1 - i = 0 # subregion count (i.e. annotation id) - j = 0 # region count (i.e. 
image id) - # pylint: disable=attribute-defined-outside-init - for n, input_file in enumerate(self.input_files): - page_id = input_file.pageId or input_file.ID - LOG.info("INPUT FILE %i / %s", n, page_id) - pcgts = page_from_file(self.workspace.download_file(input_file)) - page = pcgts.get_Page() - page_image, page_coords, page_image_info = self.workspace.image_from_page( - page, page_id, - feature_filter=self.parameter['feature_filter'], - transparency=self.parameter['transparency']) - if page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi = round(dpi * 2.54) - else: - dpi = None - ptype = page.get_type() + cat_id += 1 + + self.ann_id = 0 # subregion count (i.e. annotation id) + self.image_id = 0 # region count (i.e. image id) + super().process_workspace(workspace) - regions = dict() - for name in classes: - if not name or not name.endswith("Region"): - # only top-level regions here - continue - regions[name] = getattr(page, 'get_' + name)() - for rtype, rlist in regions.items(): - for region in rlist: - description = {'region.ID': region.id, 'region.type': rtype} - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, - transparency=self.parameter['transparency']) - if not region_image.width or not region_image.height: - LOG.error("ignoring zero-size region '%s'", region.id) - continue - if rtype in ['TextRegion', 'ChartRegion', 'GraphicRegion']: - subrtype = region.get_type() - else: - subrtype = None - j += 1 - description['subtype'] = subrtype - description['coords_rel'] = coordinates_of_segment( - region, region_image, region_coords).tolist() - description['coords_abs'] = polygon_from_points(region.get_Coords().points) - if rtype == 'text': - rtext = region.get_TextEquiv() - if rtext: - description['region.text'] = rtext[0].Unicode - else: - description['region.text'] = '' - rstyle = region.get_TextStyle() or page.get_TextStyle() - if rstyle: - 
description['region.style'] = { - 'fontFamily': rstyle.fontFamily, - 'fontSize': rstyle.fontSize, - 'xHeight': rstyle.xHeight, - 'kerning': rstyle.kerning, - 'serif': rstyle.serif, - 'monospace': rstyle.monospace, - 'bold': rstyle.bold, - 'italic': rstyle.italic, - 'smallCaps': rstyle.smallCaps, - 'letterSpaced': rstyle.letterSpaced, - 'strikethrough': rstyle.strikethrough, - 'underlined': rstyle.underlined, - 'underlineStyle': rstyle.underlineStyle, - 'subscript': rstyle.subscript, - 'superscript': rstyle.superscript - } - description['production'] = region.get_production() - description['readingDirection'] = ( - region.get_readingDirection() or - page.get_readingDirection()) - description['textLineOrder'] = ( - region.get_textLineOrder() or - page.get_textLineOrder()) - description['primaryScript'] = ( - region.get_primaryScript() or - page.get_primaryScript()) - description['primaryLanguage'] = ( - region.get_primaryLanguage() or - page.get_primaryLanguage()) - description['features'] = region_coords['features'] - description['DPI'] = dpi - description['page.ID'] = page_id - description['page.type'] = ptype - description['file_grp'] = self.input_file_grp - description['METS.UID'] = self.workspace.mets.unique_identifier - if 'binarized' in region_coords['features']: - extension = '.bin' - elif 'grayscale_normalized' in region_coords['features']: - extension = '.nrm' - else: - extension = '.raw' - subregions = dict() - for name in classes: - if not name or ':' in name: - # no subtypes here - continue - if not hasattr(region, 'get_' + name): - continue - subregions[name] = getattr(region, 'get_' + name)() - for subrtype, subrlist in subregions.items(): - for subregion in subrlist: - poly = segment_poly(page_id, subregion, region_coords) - if not poly: - continue - polygon = np.array(poly.exterior.coords, int)[:-1].tolist() - xywh = xywh_from_polygon(polygon) - area = poly.area - if subrtype in ['TextRegion', 'ChartRegion', 'GraphicRegion']: - subsubrtype = 
subregion.get_type() - else: - subsubrtype = None - if subsubrtype: - subrtype0 = subrtype + ':' + subsubrtype - else: - subrtype0 = subrtype - description.setdefault('regions', []).append( - { 'type': subrtype, - 'subtype': subsubrtype, - 'coords': polygon, - 'area': area, - 'region.ID': subregion.id - }) - # COCO: add annotations - i += 1 - annotations.append( - {'id': i, 'image_id': j, - 'category_id': next((cat['id'] for cat in categories if cat['name'] == subsubrtype), - next((cat['id'] for cat in categories if cat['name'] == subrtype))), - 'segmentation': np.array(poly.exterior.coords, int)[:-1].reshape(1, -1).tolist(), - 'area': area, - 'bbox': [xywh['x'], xywh['y'], xywh['w'], xywh['h']], - 'iscrowd': 0}) - - - - file_id = make_file_id(input_file, self.output_file_grp) + '_' + region.id + extension - file_path = self.workspace.save_image_file( - region_image, - file_id, - self.output_file_grp, - page_id=input_file.pageId, - mimetype=self.parameter['mimetype']) - self.workspace.add_file( - ID=file_id + '.json', - file_grp=self.output_file_grp, - local_filename=file_path.replace(extension + MIME_TO_EXT[self.parameter['mimetype']], '.json'), - pageId=input_file.pageId, - mimetype='application/json', - content=json.dumps(description)) - # COCO: add image - images.append({ - 'id': j, - # all exported coordinates are relative to the cropped region: - # -> use that for reference - 'file_name': file_path, - # -> use its size - 'width': region_image.width, - 'height': region_image.height}) # COCO: write result file_id = self.output_file_grp + '.coco.json' - LOG.info('Writing COCO result file "%s"', file_id) - self.workspace.add_file( + self.logger.info('Writing COCO result file "%s"', file_id) + workspace.add_file( ID=file_id, file_grp=self.output_file_grp, local_filename=os.path.join(self.output_file_grp, file_id), mimetype='application/json', pageId=None, content=json.dumps( - {'categories': categories, - 'images': images, - 'annotations': annotations})) + 
{'categories': self.categories, + 'images': self.images, + 'annotations': self.annotations}, + indent=2), + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) + + def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None: + input_file = input_files[0] + page_id = input_file.pageId + self.logger.debug(f"parsing file {input_file.ID} for page {page_id}") + try: + pcgts = page_from_file(input_file) + except ValueError as err: + # not PAGE and not an image to generate PAGE for + self.logger.error(f"non-PAGE input for page {page_id}: {err}") + raise + page = pcgts.get_Page() + page_image, page_coords, page_image_info = self.workspace.image_from_page( + page, page_id, + feature_filter=self.parameter['feature_filter'], + transparency=self.parameter['transparency']) + if page_image_info.resolution != 1: + dpi = page_image_info.resolution + if page_image_info.resolutionUnit == 'cm': + dpi = round(dpi * 2.54) + else: + dpi = None + ptype = page.get_type() + + regions = {} + for name in classes: + if not name or not name.endswith("Region"): + # only top-level regions here + continue + regions[name] = getattr(page, 'get_' + name)() + for rtype, rlist in regions.items(): + for region in rlist: + description = {'region.ID': region.id, 'region.type': rtype} + region_image, region_coords = self.workspace.image_from_segment( + region, page_image, page_coords, + transparency=self.parameter['transparency']) + if not region_image.width or not region_image.height: + self.logger.error("ignoring zero-size region '%s'", region.id) + continue + if rtype in ['TextRegion', 'ChartRegion', 'GraphicRegion']: + subrtype = region.get_type() + else: + subrtype = None + self.image_id += 1 + description['subtype'] = subrtype + description['coords_rel'] = coordinates_of_segment( + region, region_image, region_coords).tolist() + description['coords_abs'] = polygon_from_points(region.get_Coords().points) + if rtype == 'text': + rtext = region.get_TextEquiv() + if rtext: + 
description['region.text'] = rtext[0].Unicode + else: + description['region.text'] = '' + rstyle = region.get_TextStyle() or page.get_TextStyle() + if rstyle: + description['region.style'] = { + 'fontFamily': rstyle.fontFamily, + 'fontSize': rstyle.fontSize, + 'xHeight': rstyle.xHeight, + 'kerning': rstyle.kerning, + 'serif': rstyle.serif, + 'monospace': rstyle.monospace, + 'bold': rstyle.bold, + 'italic': rstyle.italic, + 'smallCaps': rstyle.smallCaps, + 'letterSpaced': rstyle.letterSpaced, + 'strikethrough': rstyle.strikethrough, + 'underlined': rstyle.underlined, + 'underlineStyle': rstyle.underlineStyle, + 'subscript': rstyle.subscript, + 'superscript': rstyle.superscript + } + description['production'] = region.get_production() + description['readingDirection'] = ( + region.get_readingDirection() or + page.get_readingDirection()) + description['textLineOrder'] = ( + region.get_textLineOrder() or + page.get_textLineOrder()) + description['primaryScript'] = ( + region.get_primaryScript() or + page.get_primaryScript()) + description['primaryLanguage'] = ( + region.get_primaryLanguage() or + page.get_primaryLanguage()) + description['features'] = region_coords['features'] + description['DPI'] = dpi + description['page.ID'] = page_id + description['page.type'] = ptype + description['file_grp'] = self.input_file_grp + description['METS.UID'] = self.workspace.mets.unique_identifier + if 'binarized' in region_coords['features']: + extension = '.bin' + elif 'grayscale_normalized' in region_coords['features']: + extension = '.nrm' + else: + extension = '.raw' + subregions = dict() + for name in classes: + if not name or ':' in name: + # no subtypes here + continue + if not hasattr(region, 'get_' + name): + continue + subregions[name] = getattr(region, 'get_' + name)() + for subrtype, subrlist in subregions.items(): + for subregion in subrlist: + poly = segment_poly(page_id, subregion, region_coords) + if not poly: + continue + polygon = np.array(poly.exterior.coords, 
int)[:-1].tolist() + xywh = xywh_from_polygon(polygon) + area = poly.area + if subrtype in ['TextRegion', 'ChartRegion', 'GraphicRegion']: + subsubrtype = subregion.get_type() + else: + subsubrtype = None + if subsubrtype: + subrtype0 = subrtype + ':' + subsubrtype + else: + subrtype0 = subrtype + description.setdefault('regions', []).append( + { 'type': subrtype, + 'subtype': subsubrtype, + 'coords': polygon, + 'area': area, + 'region.ID': subregion.id + }) + # COCO: add annotations + self.ann_id += 1 + self.annotations.append( + {'id': self.ann_id, 'image_id': self.image_id, + 'category_id': next((cat['id'] for cat in self.categories if cat['name'] == subsubrtype), + next((cat['id'] for cat in self.categories if cat['name'] == subrtype))), + 'segmentation': np.array(poly.exterior.coords, int)[:-1].reshape(1, -1).tolist(), + 'area': area, + 'bbox': [xywh['x'], xywh['y'], xywh['w'], xywh['h']], + 'iscrowd': 0}) + + file_id = make_file_id(input_file, self.output_file_grp) + '_' + region.id + extension + file_path = self.workspace.save_image_file( + region_image, + file_id, + self.output_file_grp, + page_id=input_file.pageId, + mimetype=self.parameter['mimetype'], + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) + self.workspace.add_file( + ID=file_id + '.json', + file_grp=self.output_file_grp, + local_filename=file_path.replace(extension + MIME_TO_EXT[self.parameter['mimetype']], '.json'), + pageId=page_id, + mimetype='application/json', + content=json.dumps(description, indent=2), + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) + # COCO: add image + self.images.append({ + 'id': self.image_id, + # all exported coordinates are relative to the cropped region: + # -> use that for reference + 'file_name': file_path, + # -> use its size + 'width': region_image.width, + 'height': region_image.height}) diff --git a/ocrd_segment/extract_words.py b/ocrd_segment/extract_words.py index b4b126a..c06fa63 100644 --- a/ocrd_segment/extract_words.py +++ 
b/ocrd_segment/extract_words.py @@ -1,36 +1,33 @@ from __future__ import absolute_import +from typing import Optional import json import itertools from ocrd_utils import ( - getLogger, + config, make_file_id, - assert_file_grp_cardinality, coordinates_of_segment, polygon_from_points, MIME_TO_EXT ) from ocrd_modelfactory import page_from_file +from ocrd_models.ocrd_file import OcrdFileType from ocrd import Processor -from .config import OCRD_TOOL - -TOOL = 'ocrd-segment-extract-words' class ExtractWords(Processor): - def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] - kwargs['version'] = OCRD_TOOL['version'] - super(ExtractWords, self).__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-segment-extract-words' - def process(self): + def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None: """Extract word images and texts from the workspace. - + Open and deserialize PAGE input files and their respective images, then iterate over the element hierarchy down to the word level. - + Extract an image for each word (which depending on the workflow can already be deskewed, dewarped, binarized etc.), cropped to its minimal bounding box, and masked by the coordinate polygon outline. @@ -39,7 +36,8 @@ def process(self): specific features when retrieving derived images. If ``transparency`` is true, then also add an alpha channel which is fully transparent outside of the mask. - + + \b Create a JSON file with: * the IDs of the word and its parents, * the word's text content, @@ -54,147 +52,150 @@ def process(self): * the parent textregion's @type, * the page's @type, * the page's DPI value. - + Create a plain text file for the text content, too. 
- + + \b Write all files in the directory of the output file group, named like so: * ID + '.raw.png': word image (if the workflow provides raw images) * ID + '.bin.png': word image (if the workflow provides binarized images) * ID + '.nrm.png': word image (if the workflow provides grayscale-normalized images) * ID + '.json': word metadata. * ID + '.gt.txt': word text. - + (This is intended for training and evaluation of OCR models.) """ - LOG = getLogger('processor.ExtractWords') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - # pylint: disable=attribute-defined-outside-init - for n, input_file in enumerate(self.input_files): - page_id = input_file.pageId or input_file.ID - LOG.info("INPUT FILE %i / %s", n, page_id) - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page = pcgts.get_Page() - page_image, page_coords, page_image_info = self.workspace.image_from_page( - page, page_id, + input_file = input_files[0] + page_id = input_file.pageId + try: + pcgts = page_from_file(input_file) + except ValueError as err: + # not PAGE and not an image to generate PAGE for + self.logger.error(f"non-PAGE input for page {page_id}: {err}") + raise + + page = pcgts.get_Page() + page_image, page_coords, page_image_info = self.workspace.image_from_page( + page, page_id, + feature_filter=self.parameter['feature_filter'], + transparency=self.parameter['transparency']) + if page_image_info.resolution != 1: + dpi = page_image_info.resolution + if page_image_info.resolutionUnit == 'cm': + dpi = round(dpi * 2.54) + else: + dpi = None + ptype = page.get_type() + + regions = itertools.chain.from_iterable( + [page.get_TextRegion()] + + [subregion.get_TextRegion() for subregion in page.get_TableRegion()]) + if not regions: + self.logger.warning("Page '%s' contains no text regions", page_id) + for region in regions: + region_image, region_coords = self.workspace.image_from_segment( + 
region, page_image, page_coords, feature_filter=self.parameter['feature_filter'], transparency=self.parameter['transparency']) - if page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi = round(dpi * 2.54) - else: - dpi = None - ptype = page.get_type() - - regions = itertools.chain.from_iterable( - [page.get_TextRegion()] + - [subregion.get_TextRegion() for subregion in page.get_TableRegion()]) - if not regions: - LOG.warning("Page '%s' contains no text regions", page_id) - for region in regions: - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, + rtype = region.get_type() + + lines = region.get_TextLine() + if not lines: + self.logger.warning("Region '%s' contains no text lines", region.id) + for line in lines: + line_image, line_coords = self.workspace.image_from_segment( + line, region_image, region_coords, feature_filter=self.parameter['feature_filter'], transparency=self.parameter['transparency']) - rtype = region.get_type() - - lines = region.get_TextLine() - if not lines: - LOG.warning("Region '%s' contains no text lines", region.id) - for line in lines: - line_image, line_coords = self.workspace.image_from_segment( - line, region_image, region_coords, + words = line.get_Word() + if not words: + self.logger.warning("Line '%s' contains no words", line.id) + for word in words: + word_image, word_coords = self.workspace.image_from_segment( + word, line_image, line_coords, feature_filter=self.parameter['feature_filter'], transparency=self.parameter['transparency']) - words = line.get_Word() - if not words: - LOG.warning("Line '%s' contains no words", line.id) - for word in words: - word_image, word_coords = self.workspace.image_from_segment( - word, line_image, line_coords, - feature_filter=self.parameter['feature_filter'], - transparency=self.parameter['transparency']) - lpolygon_rel = coordinates_of_segment( - word, word_image, 
word_coords).tolist() - lpolygon_abs = polygon_from_points(word.get_Coords().points) - ltext = word.get_TextEquiv() - if not ltext: - LOG.warning("Word '%s' contains no text content", word.id) - ltext = '' - else: - ltext = ltext[0].Unicode - lstyle = word.get_TextStyle() or line.get_TextStyle() or region.get_TextStyle() - if lstyle: - lstyle = { - 'fontFamily': lstyle.fontFamily, - 'fontSize': lstyle.fontSize, - 'xHeight': lstyle.xHeight, - 'kerning': lstyle.kerning, - 'serif': lstyle.serif, - 'monospace': lstyle.monospace, - 'bold': lstyle.bold, - 'italic': lstyle.italic, - 'smallCaps': lstyle.smallCaps, - 'letterSpaced': lstyle.letterSpaced, - 'strikethrough': lstyle.strikethrough, - 'underlined': lstyle.underlined, - 'underlineStyle': lstyle.underlineStyle, - 'subscript': lstyle.subscript, - 'superscript': lstyle.superscript - } - lfeatures = word_coords['features'] - description = { 'word.ID': word.id, - 'text': ltext, - 'style': lstyle, - 'production': ( - word.get_production() or - line.get_production() or - region.get_production()), - 'readingDirection': ( - word.get_readingDirection() or - line.get_readingDirection() or - region.get_readingDirection() or - page.get_readingDirection()), - 'primaryScript': ( - word.get_primaryScript() or - line.get_primaryScript() or - region.get_primaryScript() or - page.get_primaryScript()), - 'language': ( - word.get_language() or - line.get_primaryLanguage() or - region.get_primaryLanguage() or - page.get_primaryLanguage()), - 'features': lfeatures, - 'DPI': dpi, - 'coords_rel': lpolygon_rel, - 'coords_abs': lpolygon_abs, - 'line.ID': line.id, - 'region.ID': region.id, - 'region.type': rtype, - 'page.ID': page_id, - 'page.type': ptype, - 'file_grp': self.input_file_grp, - 'METS.UID': self.workspace.mets.unique_identifier + lpolygon_rel = coordinates_of_segment( + word, word_image, word_coords).tolist() + lpolygon_abs = polygon_from_points(word.get_Coords().points) + ltext = word.get_TextEquiv() + if not ltext: + 
self.logger.warning("Word '%s' contains no text content", word.id) + ltext = '' + else: + ltext = ltext[0].Unicode + lstyle = word.get_TextStyle() or line.get_TextStyle() or region.get_TextStyle() + if lstyle: + lstyle = { + 'fontFamily': lstyle.fontFamily, + 'fontSize': lstyle.fontSize, + 'xHeight': lstyle.xHeight, + 'kerning': lstyle.kerning, + 'serif': lstyle.serif, + 'monospace': lstyle.monospace, + 'bold': lstyle.bold, + 'italic': lstyle.italic, + 'smallCaps': lstyle.smallCaps, + 'letterSpaced': lstyle.letterSpaced, + 'strikethrough': lstyle.strikethrough, + 'underlined': lstyle.underlined, + 'underlineStyle': lstyle.underlineStyle, + 'subscript': lstyle.subscript, + 'superscript': lstyle.superscript } - if 'binarized' in lfeatures: - extension = '.bin' - elif 'grayscale_normalized' in lfeatures: - extension = '.nrm' - else: - extension = '.raw' - - file_id = make_file_id(input_file, self.output_file_grp) - file_path = self.workspace.save_image_file( - word_image, - file_id + '_' + region.id + '_' + line.id + '_' + word.id + extension, - self.output_file_grp, - page_id=page_id, - mimetype=self.parameter['mimetype']) - file_path = file_path.replace(extension + MIME_TO_EXT[self.parameter['mimetype']], '.json') - json.dump(description, open(file_path, 'w')) - file_path = file_path.replace('.json', '.gt.txt') - with open(file_path, 'wb') as f: - f.write((ltext + '\n').encode('utf-8')) + lfeatures = word_coords['features'] + description = { 'word.ID': word.id, + 'text': ltext, + 'style': lstyle, + 'production': ( + word.get_production() or + line.get_production() or + region.get_production()), + 'readingDirection': ( + word.get_readingDirection() or + line.get_readingDirection() or + region.get_readingDirection() or + page.get_readingDirection()), + 'primaryScript': ( + word.get_primaryScript() or + line.get_primaryScript() or + region.get_primaryScript() or + page.get_primaryScript()), + 'language': ( + word.get_language() or + line.get_primaryLanguage() or + 
region.get_primaryLanguage() or + page.get_primaryLanguage()), + 'features': lfeatures, + 'DPI': dpi, + 'coords_rel': lpolygon_rel, + 'coords_abs': lpolygon_abs, + 'line.ID': line.id, + 'region.ID': region.id, + 'region.type': rtype, + 'page.ID': page_id, + 'page.type': ptype, + 'file_grp': self.input_file_grp, + 'METS.UID': self.workspace.mets.unique_identifier + } + if 'binarized' in lfeatures: + extension = '.bin' + elif 'grayscale_normalized' in lfeatures: + extension = '.nrm' + else: + extension = '.raw' + + file_id = make_file_id(input_file, self.output_file_grp) + file_path = self.workspace.save_image_file( + word_image, + file_id + '_' + region.id + '_' + line.id + '_' + word.id + extension, + self.output_file_grp, + page_id=page_id, + mimetype=self.parameter['mimetype'], + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) + file_path = file_path.replace(extension + MIME_TO_EXT[self.parameter['mimetype']], '.json') + json.dump(description, open(file_path, 'w'), indent=2) + file_path = file_path.replace('.json', '.gt.txt') + with open(file_path, 'wb') as f: + f.write((ltext + '\n').encode('utf-8')) diff --git a/ocrd_segment/import_coco_segmentation.py b/ocrd_segment/import_coco_segmentation.py index 611d55d..ffe1c19 100644 --- a/ocrd_segment/import_coco_segmentation.py +++ b/ocrd_segment/import_coco_segmentation.py @@ -1,22 +1,21 @@ from __future__ import absolute_import +from typing import Optional import os.path import json import logging + import numpy as np from ocrd_utils import ( - getLogger, - make_file_id, - assert_file_grp_cardinality, points_from_polygon, MIMETYPE_PAGE, membername ) -from ocrd_modelfactory import page_from_file # pragma pylint: disable=unused-import # (region types will be referenced indirectly via globals()) from ocrd_models.ocrd_page import ( + OcrdPage, CoordsType, TextRegionType, ImageRegionType, @@ -41,25 +40,27 @@ ChartTypeSimpleType ) # pragma pylint: enable=unused-import -from ocrd import Processor +from ocrd import 
Workspace, Processor, OcrdPageResult -from .config import OCRD_TOOL +TYPEDICT = { + "TextRegion": TextTypeSimpleType, + "GraphicRegion": GraphicsTypeSimpleType, + "ChartType": ChartTypeSimpleType +} -TOOL = 'ocrd-segment-from-coco' class ImportCOCOSegmentation(Processor): - def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] - kwargs['version'] = OCRD_TOOL['version'] - super(ImportCOCOSegmentation, self).__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-segment-from-coco' - def process(self): + def process_workspace(self, workspace: Workspace) -> None: """Performs region segmentation by reading from COCO annotations. - + Open and deserialize the COCO JSON file from the second input file group. (It lists region categories/subtypes, file names and segmentations for all pages.) - + Open and deserialize each PAGE input file (or generate from image input file) from the first input file group. Now find this page in COCO: - try to match the PAGE ``imageFilename`` or METS file path matches to some @@ -67,171 +68,160 @@ def process(self): - try to match the numeric part of the METS physical page ID to some COCO ``id``, otherwise - skip with an error. - + Then create and add a region for each ``segmentation``, converting its polygon to coordinate points and its COCO category to a region type (and subtype), either for a PubLayNet classification or PAGE classification (as produced by ocrd-segment-extract-pages), as indicated by ``source``. - + Produce a new output file by serialising the resulting hierarchy. - + Afterwards, if there are still COCO images left unaccounted for (i.e. without corresponding input files), then show a warning. 
""" - LOG = getLogger('processor.ImportCOCOSegmentation') - # Load JSON - assert_file_grp_cardinality(self.input_file_grp, 2, 'base and COCO') - # pylint: disable=attribute-defined-outside-init + # overwrite input_file_grp to single to prevent zip_input_files from searching page pairs + #self.verify() self.input_file_grp, coco_grp = self.input_file_grp.split(',') - # pylint: disable=attribute-defined-outside-init - if not self.input_files: - LOG.warning('No input files to process') - return - if coco_grp in self.workspace.mets.file_groups: + # make sure the cardinality requirement is also reduced from 2 to 1 + self.ocrd_tool['input_file_grp_cardinality'] = 1 + # Load JSON + if coco_grp in workspace.mets.file_groups: try: - cocofile = next(f for f in self.workspace.mets.find_files(fileGrp=coco_grp) + cocofile = next(f for f in workspace.mets.find_files(fileGrp=coco_grp) # if f.mimetype == 'application/json' and not f.pageId if not f.pageId) + if self.download: + cocofile = workspace.download_file(cocofile) + cocofile = os.path.join(workspace.directory, cocofile.local_filename) except StopIteration: - raise Exception("no non-page-specific file in second file group (COCO file)", coco_grp) - cocofile = self.workspace.download_file(cocofile).local_filename - elif os.path.isfile(coco_grp): - cocofile = coco_grp + raise Exception("no document-wide file (COCO JSON file) in second file group", coco_grp) + elif os.path.isfile(os.path.join(workspace.directory, coco_grp)): + # passing a path as input fileGrp is not strictly allowed in OCR-D + cocofile = os.path.join(workspace.directory, coco_grp) else: raise Exception("file not found in second file group (COCO file)", coco_grp) - - LOG.info('Loading COCO annotations from "%s" into memory...', cocofile) - with open(cocofile, 'r') as inp: - coco = json.load(inp) - LOG.info('Loaded JSON for %d images with %d regions in %d categories', + + self.logger.info('Loading COCO annotations from "%s" into memory...', cocofile) + with 
open(cocofile, 'r') as cocof: + coco = json.load(cocof) + self.logger.info('Loaded JSON for %d images with %d regions in %d categories', len(coco['images']), len(coco['annotations']), len(coco['categories'])) - coco_source = 'PubLayNet' # Convert to usable dicts # classes: - categories = dict() - subcategories = dict() + self.coco_source = 'PubLayNet' + self.categories = {} + self.subcategories = {} for cat in coco['categories']: if cat['source'] == 'PAGE': - coco_source = 'PAGE' + self.coco_source = 'PAGE' if 'supercategory' in cat and cat['supercategory']: - categories[cat['id']] = cat['supercategory'] - subcategories[cat['id']] = cat['name'] + self.categories[cat['id']] = cat['supercategory'] + self.subcategories[cat['id']] = cat['name'] else: - categories[cat['id']] = cat['name'] + self.categories[cat['id']] = cat['name'] # images and annotations: - images_by_id = dict() - images_by_filename = dict() + self.images_by_id = {} + self.images_by_filename = {} for image in coco['images']: - images_by_id[image['id']] = image - images_by_filename[image['file_name']] = image + self.images_by_id[image['id']] = image + self.images_by_filename[image['file_name']] = image for annotation in coco['annotations']: - image = images_by_id[annotation['image_id']] - regions = image.setdefault('regions', list()) - regions.append(annotation) + image = self.images_by_id[annotation['image_id']] + image.setdefault('regions', []).append(annotation) del coco - - LOG.info('Converting %s annotations into PAGE-XML', coco_source) - for n, input_file in enumerate(self.input_files): - file_id = make_file_id(input_file, self.output_file_grp) - page_id = input_file.pageId or input_file.ID + self.logger.info('Converting %s annotations into PAGE-XML', self.coco_source) + super().process_workspace(workspace) + + # warn of remaining COCO images + if self.images_by_filename and not self.page_id: + self.logger.warning('%d images remain unaccounted for after processing', len(self.images_by_filename)) + 
if self.logger.isEnabledFor(logging.DEBUG): + for filename in self.images_by_filename: + self.logger.debug('not found in workspace: "%s"', filename) + + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: + try: + # separate non-numeric part of page ID to retain the numeric part num_page_id = int(page_id.strip(page_id.strip("0123456789"))) - LOG.info("INPUT FILE %i / %s", n, page_id) - pcgts = page_from_file(self.workspace.download_file(input_file)) - pcgts.set_pcGtsId(file_id) - self.add_metadata(pcgts) - page = pcgts.get_Page() - - # find COCO image - if page.imageFilename in images_by_filename: - image = images_by_filename[page.imageFilename] - elif num_page_id in images_by_id: - image = images_by_id[num_page_id] + except Exception: + num_page_id = self.workspace.mets.physical_pages.index(page_id) + pcgts = input_pcgts[0] + page = pcgts.get_Page() + + # find COCO image + if page.imageFilename in self.images_by_filename: + image = self.images_by_filename[page.imageFilename] + elif num_page_id in self.images_by_id: + image = self.images_by_id[num_page_id] + else: + raise Exception(f'Page "{page_id}" / file "{page.imageFilename}" not found in COCO') + + if image['width'] != page.imageWidth: + self.logger.error('Page "%s" width %d does not match annotated width %d', + page_id, page.imageWidth, image['width']) + if image['height'] != page.imageHeight: + self.logger.error('Page "%s" height %d does not match annotated height %d', + page_id, page.imageHeight, image['height']) + + # todo: remove existing segmentation first? 
+ for region in image['regions']: + assert isinstance(region['segmentation'], list), "importing RLE/mask segmentation not implemented" + polygon = np.array(region['segmentation']) + polygon = np.reshape(polygon, (polygon.shape[1]//2, 2)) + coords = CoordsType(points=points_from_polygon(polygon)) + category = self.categories[region['category_id']] + if region['category_id'] in self.subcategories: + subcategory = self.subcategories[region['category_id']] else: - LOG.error('Page "%s" / file "%s" not found in COCO', - page_id, page.imageFilename) - # todo: maybe we should at least write the (unchanged) output PAGE? - continue - if image['width'] != page.imageWidth: - LOG.error('Page "%s" width %d does not match annotated width %d', - page_id, page.imageWidth, image['width']) - if image['height'] != page.imageHeight: - LOG.error('Page "%s" height %d does not match annotated height %d', - page_id, page.imageHeight, image['height']) - - # todo: remove existing segmentation first? - for region in image['regions']: - assert isinstance(region['segmentation'], list), "importing RLE/mask segmentation not implemented" - polygon = np.array(region['segmentation']) - polygon = np.reshape(polygon, (polygon.shape[1]//2, 2)) - coords = CoordsType(points=points_from_polygon(polygon)) - category = categories[region['category_id']] - if region['category_id'] in subcategories: - subcategory = subcategories[region['category_id']] + subcategory = None + region_id = f"r{region['id']}" + self.logger.info('Adding region %s:%s [area %d]', category, subcategory or '', region['area']) + if self.coco_source == 'PubLayNet': + if category == 'text': + region_obj = TextRegionType(id=region_id, Coords=coords, + type_=TextTypeSimpleType.PARAGRAPH) + page.add_TextRegion(region_obj) + elif category == 'title': + region_obj = TextRegionType(id=region_id, Coords=coords, + type_=TextTypeSimpleType.HEADING) # CAPTION? 
+ page.add_TextRegion(region_obj) + elif category == 'list': + region_obj = TextRegionType(id=region_id, Coords=coords, + type_=TextTypeSimpleType.LISTLABEL) # OTHER? + page.add_TextRegion(region_obj) + elif category == 'table': + region_obj = TableRegionType(id=region_id, Coords=coords) + page.add_TableRegion(region_obj) + elif category == 'figure': + region_obj = ImageRegionType(id=region_id, Coords=coords) + page.add_ImageRegion(region_obj) else: - subcategory = None - region_id = 'r' + str(region['id']) - LOG.info('Adding region %s:%s [area %d]', category, subcategory or '', region['area']) - if coco_source == 'PubLayNet': - if category == 'text': - region_obj = TextRegionType(id=region_id, Coords=coords, - type_=TextTypeSimpleType.PARAGRAPH) - page.add_TextRegion(region_obj) - elif category == 'title': - region_obj = TextRegionType(id=region_id, Coords=coords, - type_=TextTypeSimpleType.HEADING) # CAPTION? - page.add_TextRegion(region_obj) - elif category == 'list': - region_obj = TextRegionType(id=region_id, Coords=coords, - type_=TextTypeSimpleType.LISTLABEL) # OTHER? 
- page.add_TextRegion(region_obj) - elif category == 'table': - region_obj = TableRegionType(id=region_id, Coords=coords) - page.add_TableRegion(region_obj) - elif category == 'figure': - region_obj = ImageRegionType(id=region_id, Coords=coords) - page.add_ImageRegion(region_obj) - else: - raise Exception('unknown region category: %s' % category) - else: # 'PAGE' - args = {'id': region_id, - 'Coords': coords} - if subcategory: - typedict = {"TextRegion": TextTypeSimpleType, - "GraphicRegion": GraphicsTypeSimpleType, - "ChartType": ChartTypeSimpleType} - if category in typedict: - subtype = membername(typedict[category], subcategory) - if subtype == subcategory: - # not predefined in PAGE: use other + custom - args['custom'] = "subtype:%s" % subcategory - args['type_'] = "other" - else: - args['type_'] = subcategory - else: + raise Exception('unknown region category: %s' % category) + else: # 'PAGE' + args = {'id': region_id, + 'Coords': coords} + if subcategory: + if category in TYPEDICT: + subtype = membername(TYPEDICT[category], subcategory) + if subtype == subcategory: + # not predefined in PAGE: use other + custom args['custom'] = "subtype:%s" % subcategory - if category + 'Type' not in globals(): - raise Exception('unknown region category: %s' % category) - region_type = globals()[category + 'Type'] - if region_type is BorderType: - page.set_Border(BorderType(Coords=coords)) + args['type_'] = "other" + else: + args['type_'] = subcategory else: - region_obj = region_type(**args) - getattr(page, 'add_%s' % category)(region_obj) - # remove image from dicts - images_by_id.pop(num_page_id, None) - images_by_filename.pop(page.imageFilename, None) - - self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - mimetype=MIMETYPE_PAGE, - local_filename=os.path.join(self.output_file_grp, file_id + '.xml'), - content=to_xml(pcgts)) + args['custom'] = "subtype:%s" % subcategory + if category + 'Type' not in globals(): + raise 
Exception('unknown region category: %s' % category) + region_type = globals()[category + 'Type'] + if region_type is BorderType: + page.set_Border(BorderType(Coords=coords)) + else: + region_obj = region_type(**args) + getattr(page, 'add_%s' % category)(region_obj) - # warn of remaining COCO images - if images_by_filename and not self.page_id: - LOG.warning('%d images remain unaccounted for after processing', len(images_by_filename)) - if LOG.isEnabledFor(logging.DEBUG): - for filename in images_by_filename: - LOG.debug('not found in workspace: "%s"', filename) + # remove image from dicts (so lookup becomes faster and we know if anything remains unaccounted for) + self.images_by_id.pop(num_page_id, None) + self.images_by_filename.pop(page.imageFilename, None) + + return OcrdPageResult(pcgts) diff --git a/ocrd_segment/import_image_segmentation.py b/ocrd_segment/import_image_segmentation.py index d9705a6..4c9afe7 100644 --- a/ocrd_segment/import_image_segmentation.py +++ b/ocrd_segment/import_image_segmentation.py @@ -1,30 +1,27 @@ from __future__ import absolute_import -import os.path +from typing import Optional from PIL import Image + import numpy as np import cv2 from ocrd_utils import ( - getLogger, - make_file_id, - assert_file_grp_cardinality, points_from_polygon, - MIMETYPE_PAGE, pushd_popd, membername ) -from ocrd_modelfactory import page_from_file # pragma pylint: disable=unused-import # (region types will be referenced indirectly via globals()) from ocrd_models.ocrd_page import ( + OcrdPage, CoordsType, TextRegionType, ImageRegionType, MathsRegionType, SeparatorRegionType, NoiseRegionType, - to_xml) +) from ocrd_models.ocrd_page_generateds import ( BorderType, TableRegionType, @@ -39,151 +36,127 @@ ChartTypeSimpleType ) # pragma pylint: enable=unused-import -from ocrd import Processor +from ocrd import Processor, OcrdPageResult -from .config import OCRD_TOOL +TYPEDICT = { + "TextRegion": TextTypeSimpleType, + "GraphicRegion": GraphicsTypeSimpleType, + 
"ChartType": ChartTypeSimpleType +} -TOOL = 'ocrd-segment-from-masks' class ImportImageSegmentation(Processor): - def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] - kwargs['version'] = OCRD_TOOL['version'] - super(ImportImageSegmentation, self).__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-segment-from-masks' - def process(self): + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Performs region segmentation by reading mask images in pseudo-colour. - - Open and deserialize each PAGE input file (or generate from image input file) + + Open and deserialize PAGE input file (or generate from image input file) from the first input file group, as well as mask image file from the second. - + Then iterate over all connected (equally colored) mask segments and compute convex hull contours for them. Convert them to polygons, and look up their color value in ``colordict`` to instantiate the appropriate region types (optionally with subtype). Instantiate and annotate regions accordingly. - + Produce a new output file by serialising the resulting hierarchy. 
""" - LOG = getLogger('processor.ImportImageSegmentation') - assert_file_grp_cardinality(self.input_file_grp, 2, 'base and mask') - assert_file_grp_cardinality(self.output_file_grp, 1) - colordict = self.parameter['colordict'] - typedict = {"TextRegion": TextTypeSimpleType, - "GraphicRegion": GraphicsTypeSimpleType, - "ChartType": ChartTypeSimpleType} - # collect input file tuples - ifts = self.zip_input_files() # input file tuples - # process input file tuples - for ift in ifts: - input_file, segmentation_file = ift - if segmentation_file is None: - LOG.warning("skipping page %s without segmentation", input_file.pageId) - continue - file_id = make_file_id(input_file, self.output_file_grp) - LOG.info("processing page %s", input_file.pageId) - pcgts = page_from_file(self.workspace.download_file(input_file)) - pcgts.set_pcGtsId(file_id) - self.add_metadata(pcgts) - page = pcgts.get_Page() - # import mask image - segmentation_filename = self.workspace.download_file(segmentation_file).local_filename - with pushd_popd(self.workspace.directory): - segmentation_pil = Image.open(segmentation_filename) - has_alpha = segmentation_pil.mode == 'RGBA' - if has_alpha: - colorformat = "%08X" - else: - colorformat = "%06X" - if segmentation_pil.mode != 'RGB': - segmentation_pil = segmentation_pil.convert('RGB') - # convert to array - segmentation_array = np.array(segmentation_pil) - # collapse 3 color channels - segmentation_array = segmentation_array.dot( - np.array([2**24, 2**16, 2**8, 1], np.uint32)[0 if has_alpha else 1:]) - # partition mapped colors vs background - colors = np.unique(segmentation_array) - bgcolors = [] - for i, color in enumerate(colors): - colorname = colorformat % color - if (colorname not in colordict or - not colordict[colorname]): - #raise Exception("Unknown color %s (not in colordict)" % colorname) - LOG.info("Ignoring background color %s", colorname) - bgcolors.append(i) - background = np.zeros_like(segmentation_array, np.uint8) - if bgcolors: - for i 
in bgcolors: - background += np.array(segmentation_array == colors[i], np.uint8) - colors = np.delete(colors, bgcolors, 0) - # iterate over mask for each mapped color/class - regionno = 0 - for color in colors: - # get region (sub)type - colorname = colorformat % color - classname = colordict[colorname] - regiontype = None - custom = None - if ":" in classname: - classname, regiontype = classname.split(":") - if classname in typedict: - typename = membername(typedict[classname], regiontype) - if typename == regiontype: - # not predefined in PAGE: use other + custom - custom = "subtype:%s" % regiontype - regiontype = "other" - else: + pcgts = input_pcgts[0] + page = pcgts.get_Page() + colordict = self.parameter['colordict'] + # import mask image (for which Processor.process_page_file will have created a pseudo PAGE by now) + segmentation_filename = input_pcgts[1].get_Page().get_imageFilename() + segmentation_pil = Image.open(segmentation_filename) + has_alpha = segmentation_pil.mode == 'RGBA' + if has_alpha: + colorformat = "%08X" + else: + colorformat = "%06X" + if segmentation_pil.mode not in ('RGB', 'RGBA'): + segmentation_pil = segmentation_pil.convert('RGB') + # convert to array + segmentation_array = np.array(segmentation_pil) + # collapse 3 (RGB) or 4 (RGBA) color channels into one integer per pixel + segmentation_array = segmentation_array.dot( + np.array([2**24, 2**16, 2**8, 1], np.uint32)[0 if has_alpha else 1:]) + # partition mapped colors vs background + colors = np.unique(segmentation_array) + bgcolors = [] + for i, color in enumerate(colors): + colorname = colorformat % color + if (colorname not in colordict or + not colordict[colorname]): + #raise Exception("Unknown color %s (not in colordict)" % colorname) + self.logger.info("Ignoring background color %s", colorname) + bgcolors.append(i) + background = np.zeros_like(segmentation_array, np.uint8) + if bgcolors: + for i in bgcolors: + background += np.array(segmentation_array == colors[i], np.uint8) + colors = np.delete(colors, bgcolors, 0) + # iterate over mask for each mapped color/class + 
regionno = 0 + for color in colors: + # get region (sub)type + colorname = colorformat % color + classname = colordict[colorname] + regiontype = None + custom = None + if ":" in classname: + classname, regiontype = classname.split(":") + if classname in TYPEDICT: + typename = membername(TYPEDICT[classname], regiontype) + if typename == regiontype: + # not predefined in PAGE: use other + custom custom = "subtype:%s" % regiontype - if classname + "Type" not in globals(): - raise Exception("Unknown class '%s' for color %s in colordict" % (classname, colorname)) - classtype = globals()[classname + "Type"] - if classtype is BorderType: - # mask from all non-background regions - classmask = 1 - background + regiontype = "other" else: - # mask from current color/class - classmask = np.array(segmentation_array == color, np.uint8) - if not np.count_nonzero(classmask): + custom = "subtype:%s" % regiontype + if classname + "Type" not in globals(): + raise Exception("Unknown class '%s' for color %s in colordict" % (classname, colorname)) + classtype = globals()[classname + "Type"] + if classtype is BorderType: + # mask from all non-background regions + classmask = 1 - background + else: + # mask from current color/class + classmask = np.array(segmentation_array == color, np.uint8) + if not np.count_nonzero(classmask): + continue + # now get the contours and make polygons for them + contours, _ = cv2.findContours(classmask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + for contour in contours: + # (could also just take bounding boxes to avoid islands/inclusions...) 
+ area = cv2.contourArea(contour) + # filter too small regions + area_pct = area / np.prod(segmentation_array.shape) * 100 + if area < 100 and area_pct < 0.1: + self.logger.warning('ignoring contour of only %.1f%% area for %s', + area_pct, classname) continue - # now get the contours and make polygons for them - contours, _ = cv2.findContours(classmask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - for contour in contours: - # (could also just take bounding boxes to avoid islands/inclusions...) - area = cv2.contourArea(contour) - # filter too small regions - area_pct = area / np.prod(segmentation_array.shape) * 100 - if area < 100 and area_pct < 0.1: - LOG.warning('ignoring contour of only %.1f%% area for %s', - area_pct, classname) - continue - LOG.info('found region %s:%s:%s with area %.1f%%', - classname, regiontype or '', custom or '', area_pct) - # simplify shape - poly = cv2.approxPolyDP(contour, 2, False)[:, 0, ::] # already ordered x,y - if len(poly) < 4: - LOG.warning('ignoring contour of only %d points (area %.1f%%) for %s', - len(poly), area_pct, classname) - continue - if classtype is BorderType: - # add Border - page.set_Border(BorderType(Coords=CoordsType(points=points_from_polygon(poly)))) - break - else: - # instantiate region - regionno += 1 - region = classtype(id="region_%d" % regionno, type_=regiontype, custom=custom, - Coords=CoordsType(points=points_from_polygon(poly))) - # add region - getattr(page, 'add_%s' % classname)(region) + self.logger.info('found region %s:%s:%s with area %.1f%%', + classname, regiontype or '', custom or '', area_pct) + # simplify shape + poly = cv2.approxPolyDP(contour, 2, False)[:, 0, ::] # already ordered x,y + if len(poly) < 4: + self.logger.warning('ignoring contour of only %d points (area %.1f%%) for %s', + len(poly), area_pct, classname) + continue + if classtype is BorderType: + # add Border + page.set_Border(BorderType(Coords=CoordsType(points=points_from_polygon(poly)))) + break + else: + # instantiate 
region + regionno += 1 + region = classtype(id="region_%d" % regionno, type_=regiontype, custom=custom, + Coords=CoordsType(points=points_from_polygon(poly))) + # add region + getattr(page, 'add_%s' % classname)(region) - self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - mimetype=MIMETYPE_PAGE, - local_filename=os.path.join(self.output_file_grp, - file_id + '.xml'), - content=to_xml(pcgts)) + return OcrdPageResult(pcgts) diff --git a/ocrd_segment/ocrd-tool.json b/ocrd_segment/ocrd-tool.json index 731faa1..c0cca7c 100644 --- a/ocrd_segment/ocrd-tool.json +++ b/ocrd_segment/ocrd-tool.json @@ -6,13 +6,8 @@ "executable": "ocrd-segment-repair", "categories": ["Layout analysis"], "description": "Analyse and repair region segmentation; at least ensure validity and consistency of coordinates.", - "input_file_grp": [ - "OCR-D-IMG", - "OCR-D-SEG-BLOCK" - ], - "output_file_grp": [ - "OCR-D-SEG-BLOCK" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "steps": ["layout/segmentation/region"], "parameters": { "sanitize": { @@ -65,12 +60,8 @@ "executable": "ocrd-segment-project", "categories": ["Layout analysis"], "description": "Project segment coordinates to their structural parents", - "input_file_grp": [ - "OCR-D-SEG-BLOCK" - ], - "output_file_grp": [ - "OCR-D-SEG-BLOCK" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "steps": ["layout/segmentation"], "parameters": { "level-of-operation": { @@ -92,13 +83,8 @@ "executable": "ocrd-segment-from-masks", "categories": ["Layout analysis"], "description": "Import region segmentation from mask images (segments filled with colors encoding classes). Input fileGrp format is `base,mask` (i.e. 
PAGE or original image files first, mask image files second).", - "input_file_grp": [ - "OCR-D-IMG", - "OCR-D-SEG-PAGE" - ], - "output_file_grp": [ - "OCR-D-SEG-BLOCK" - ], + "input_file_grp_cardinality": 2, + "output_file_grp_cardinality": 1, "steps": ["layout/segmentation/region"], "parameters": { "colordict": { @@ -164,13 +150,8 @@ "executable": "ocrd-segment-from-coco", "categories": ["Layout analysis"], "description": "Import region segmentation from COCO detection format JSON (for all pages). Input fileGrp format is `base,COCO` (i.e. PAGE or original image files first, COCO file second).", - "input_file_grp": [ - "OCR-D-IMG", - "OCR-D-SEG-PAGE" - ], - "output_file_grp": [ - "OCR-D-SEG-BLOCK" - ], + "input_file_grp_cardinality": 2, + "output_file_grp_cardinality": 1, "steps": ["layout/segmentation/region"], "parameters": { } @@ -179,15 +160,8 @@ "executable": "ocrd-segment-extract-pages", "categories": ["Image preprocessing"], "description": "Extract page segmentation as page images (deskewed according to `/Page/@orientation` and cropped+masked along `/Page/Border`) + JSON (including region coordinates/classes and meta-data), as binarized images, and as mask images (segments filled with colors encoding classes) + COCO detection format JSON (for all pages). Output fileGrp format is `raw[,binarized[,mask]]` (i.e. 
fall back to first group).", - "input_file_grp": [ - "OCR-D-SEG-PAGE", - "OCR-D-GT-SEG-PAGE", - "OCR-D-SEG-BLOCK", - "OCR-D-GT-SEG-BLOCK" - ], - "output_file_grp": [ - "OCR-D-IMG-PAGE" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": [1, 3], "steps": ["layout/analysis"], "parameters": { "feature_filter": { @@ -289,13 +263,8 @@ "executable": "ocrd-segment-extract-regions", "categories": ["Image preprocessing"], "description": "Extract region segmentation as region images (deskewed according to `*/@orientation` and cropped+masked along `*/Coords` polygon) + JSON (including region coordinates/classes and meta-data).", - "input_file_grp": [ - "OCR-D-SEG-BLOCK", - "OCR-D-GT-SEG-BLOCK" - ], - "output_file_grp": [ - "OCR-D-IMG-REGION" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "steps": ["layout/analysis"], "parameters": { "feature_filter": { @@ -361,13 +330,8 @@ "executable": "ocrd-segment-extract-lines", "categories": ["Image preprocessing"], "description": "Extract line segmentation as line images + text file + JSON.", - "input_file_grp": [ - "OCR-D-SEG-LINE", - "OCR-D-GT-SEG-LINE" - ], - "output_file_grp": [ - "OCR-D-IMG-LINE" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "steps": ["layout/analysis"], "parameters": { "feature_filter": { @@ -434,13 +398,8 @@ "executable": "ocrd-segment-extract-words", "categories": ["Image preprocessing"], "description": "Extract word segmentation as word images (deskewed according to `*/@orientation` and cropped+masked along `*/Coords` polygon and dewarped as in `*/AlternativeImage`) + text file (according to `*/TextEquiv`) + JSON (including line coordinates and meta-data).", - "input_file_grp": [ - "OCR-D-SEG-WORD", - "OCR-D-GT-SEG-WORD" - ], - "output_file_grp": [ - "OCR-D-IMG-WORD" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "steps": ["layout/analysis"], "parameters": { "feature_filter": { @@ -465,13 +424,8 @@ 
"executable": "ocrd-segment-extract-glyphs", "categories": ["Image preprocessing"], "description": "Extract glyph segmentation as glyph images (deskewed according to `*/@orientation` and cropped+masked along `*/Coords` polygon and dewarped as in `*/AlternativeImage`) + text file (according to `*/TextEquiv`) + JSON (including line coordinates and meta-data).", - "input_file_grp": [ - "OCR-D-SEG-GLYPH", - "OCR-D-GT-SEG-GLYPH" - ], - "output_file_grp": [ - "OCR-D-IMG-GLYPH" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "steps": ["layout/analysis"], "parameters": { "feature_filter": { @@ -496,14 +450,8 @@ "executable": "ocrd-segment-replace-original", "categories": ["Image preprocessing"], "description": "Extract page image (deskewed according to `/Page/@orientation` and cropped+masked along `/Page/Border`) and use it as @imageFilename, adjusting all coordinates", - "input_file_grp": [ - "OCR-D-SEG-LINE", - "OCR-D-GT-SEG-LINE", - "OCR-D-OCR" - ], - "output_file_grp": [ - "OCR-D-SEG-CROP" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "steps": ["layout/analysis"], "parameters": { "feature_selector": { @@ -527,15 +475,8 @@ "executable": "ocrd-segment-replace-page", "categories": ["Image preprocessing"], "description": "Replace everything below page level with another annotation, adjusting all coordinates", - "input_file_grp": [ - "OCR-D-SEG-LINE", - "OCR-D-GT-SEG-LINE", - "OCR-D-OCR" - ], - "output_file_grp": [ - "OCR-D-SEG-LINE", - "OCR-D-OCR" - ], + "input_file_grp_cardinality": 2, + "output_file_grp_cardinality": 1, "steps": ["layout/analysis"], "parameters": { "transform_coordinates": { @@ -549,6 +490,8 @@ "executable": "ocrd-segment-replace-text", "categories": ["Text recognition and optimization"], "description": "Insert text from annotations in single-segment text files", + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "steps": ["recognition/post-correction"], "parameters": { 
"file_glob": { @@ -562,10 +505,8 @@ "executable": "ocrd-segment-evaluate", "categories": ["Layout analysis"], "description": "Compare segmentations", - "input_file_grp": [ - "OCR-D-GT-SEG-BLOCK", - "OCR-D-SEG-BLOCK" - ], + "input_file_grp_cardinality": 2, + "output_file_grp_cardinality": 1, "steps": ["layout/analysis"], "parameters": { "level-of-operation": { diff --git a/ocrd_segment/project.py b/ocrd_segment/project.py index 233067d..1388f7e 100644 --- a/ocrd_segment/project.py +++ b/ocrd_segment/project.py @@ -1,6 +1,6 @@ from __future__ import absolute_import -import os.path +from typing import Optional import itertools import numpy as np from scipy.sparse.csgraph import minimum_spanning_tree @@ -9,38 +9,29 @@ from shapely import set_precision from shapely.ops import unary_union, nearest_points -from ocrd import Processor +from ocrd import Processor, OcrdPageResult from ocrd_utils import ( - getLogger, - make_file_id, - assert_file_grp_cardinality, coordinates_of_segment, polygon_from_points, points_from_polygon, - MIMETYPE_PAGE ) -from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( + OcrdPage, PageType, BorderType, CoordsType, - to_xml ) -from .config import OCRD_TOOL - -TOOL = 'ocrd-segment-project' class ProjectHull(Processor): - def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] - kwargs['version'] = OCRD_TOOL['version'] - super().__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-segment-project' - def process(self): + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Make coordinates become the convex hull of their constituent segments with Shapely. - Open and deserialize PAGE input files and their respective images, + Open and deserialize PAGE input file and its respective images, then iterate over the segment hierarchy down to the requested hierarchy ``level-of-operation``. 
@@ -51,79 +42,60 @@ def process(self): (A change in coordinates will automatically invalidate any AlternativeImage references on the segment. Therefore, you may need to rebinarize etc.) - Finally, produce new output files by serialising the resulting hierarchy. + Finally, produce new output file by serialising the resulting hierarchy. """ - LOG = getLogger('processor.ProjectHull') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - level = self.parameter['level-of-operation'] - - for n, input_file in enumerate(self.input_files): - file_id = make_file_id(input_file, self.output_file_grp) - page_id = input_file.pageId or input_file.ID - LOG.info("INPUT FILE %i / %s", n, page_id) - pcgts = page_from_file(self.workspace.download_file(input_file)) - pcgts.set_pcGtsId(file_id) - self.add_metadata(pcgts) - page = pcgts.get_Page() - if level == 'page': - regions = (page.get_TextRegion() + - page.get_ImageRegion() + - page.get_LineDrawingRegion() + - page.get_GraphicRegion() + - page.get_TableRegion() + - page.get_ChartRegion() + - page.get_MapRegion() + - page.get_SeparatorRegion() + - page.get_MathsRegion() + - page.get_ChemRegion() + - page.get_MusicRegion() + - page.get_AdvertRegion() + - page.get_NoiseRegion() + - page.get_UnknownRegion() + - page.get_CustomRegion()) - if len(regions): - self._process_segment(page, regions, page_id) - elif level == 'table': - for region in page.get_AllRegions(classes=['Table']): - regions = region.get_TextRegion() - if not len(regions): - continue - self._process_segment(region, regions, page_id) - else: - for region in page.get_AllRegions(classes=['Text']): - lines = region.get_TextLine() - if not len(lines): + pcgts = input_pcgts[0] + page = pcgts.get_Page() + level = self.parameter['level-of-operation'] + if level == 'page': + regions = (page.get_TextRegion() + + page.get_ImageRegion() + + page.get_LineDrawingRegion() + + page.get_GraphicRegion() + + page.get_TableRegion() + + page.get_ChartRegion() + + 
page.get_MapRegion() + + page.get_SeparatorRegion() + + page.get_MathsRegion() + + page.get_ChemRegion() + + page.get_MusicRegion() + + page.get_AdvertRegion() + + page.get_NoiseRegion() + + page.get_UnknownRegion() + + page.get_CustomRegion()) + if len(regions): + self._process_segment(page, regions, page_id) + elif level == 'table': + for region in page.get_AllRegions(classes=['Table']): + regions = region.get_TextRegion() + if not len(regions): + continue + self._process_segment(region, regions, page_id) + else: + for region in page.get_AllRegions(classes=['Text']): + lines = region.get_TextLine() + if not len(lines): + continue + if level == 'region': + self._process_segment(region, lines, page_id) + continue + for line in lines: + words = line.get_Word() + if not len(words): continue - if level == 'region': - self._process_segment(region, lines, page_id) + if level == 'line': + self._process_segment(line, words, page_id) continue - for line in lines: - words = line.get_Word() - if not len(words): - continue - if level == 'line': - self._process_segment(line, words, page_id) + for word in words: + glyphs = word.get_Glyph() + if not len(glyphs): continue - for word in words: - glyphs = word.get_Glyph() - if not len(glyphs): - continue - self._process_segment(word, glyphs, page_id) - - self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - mimetype=MIMETYPE_PAGE, - local_filename=os.path.join(self.output_file_grp, - file_id + '.xml'), - content=to_xml(pcgts)) + self._process_segment(word, glyphs, page_id) + return OcrdPageResult(pcgts) def _process_segment(self, segment, constituents, page_id): """Overwrite segment outline to become the minimal convex hull of its constituent segments.""" - LOG = getLogger('processor.ProjectHull') polygons = [make_valid(Polygon(polygon_from_points(constituent.get_Coords().points))) for constituent in constituents] polygon = 
join_polygons(polygons).buffer(self.parameter['padding']).exterior.coords[:-1] @@ -136,12 +108,11 @@ def _process_segment(self, segment, constituents, page_id): parent = segment.parent_object_ polygon = polygon_for_parent(polygon, parent) if polygon is None: - LOG.info('Ignoring extant segment: %s', segment.id) + self.logger.info('Ignoring extant segment: %s', segment.id) else: points = points_from_polygon(polygon) coords = CoordsType(points=points) - LOG.debug('Using new coordinates from %d constituents for segment "%s"', - len(constituents), segment.id) + self.logger.debug(f'Using new coordinates from {len(constituents)} constituents for segment "{segment.id}"') if isinstance(segment, PageType): segment.set_Border(BorderType(Coords=coords)) else: @@ -194,7 +165,7 @@ def join_polygons(polygons, scale=20): def polygon_for_parent(polygon, parent): """Clip polygon to parent polygon range. - + (Should be moved to ocrd_utils.coordinates_for_segment.) """ childp = Polygon(polygon) diff --git a/ocrd_segment/repair.py b/ocrd_segment/repair.py index 47802db..08b141c 100644 --- a/ocrd_segment/repair.py +++ b/ocrd_segment/repair.py @@ -1,6 +1,7 @@ from __future__ import absolute_import -import os.path +from typing import Optional + from skimage import draw from scipy.ndimage import filters, morphology import cv2 @@ -8,24 +9,19 @@ from shapely.geometry import Polygon from shapely.ops import unary_union -from ocrd import Processor +from ocrd import Processor, OcrdPageResult from ocrd_utils import ( - getLogger, - make_file_id, - assert_file_grp_cardinality, coordinates_for_segment, coordinates_of_segment, polygon_from_points, points_from_polygon, xywh_from_polygon, - MIMETYPE_PAGE ) -from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( + OcrdPage, PageType, BorderType, TextRegionType, - to_xml ) from ocrd_models.ocrd_page_generateds import ( RegionRefType, @@ -42,22 +38,23 @@ CoordinateValidityError, PageValidator ) -from .config import OCRD_TOOL 
-from .project import join_polygons, make_valid -TOOL = 'ocrd-segment-repair' +from .project import join_polygons, make_valid class RepairSegmentation(Processor): - def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] - kwargs['version'] = OCRD_TOOL['version'] - super(RepairSegmentation, self).__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-segment-repair' - def process(self): + def setup(self): + # be strict regarding polygon path validity (so this can be repaired) + ocrd_validators.page_validator.POLY_TOLERANCE = 0 + + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Perform generic post-processing of page segmentation with Shapely and OpenCV. - Open and deserialize PAGE input files and their respective images, + Open and deserialize PAGE input file and its respective images, then validate syntax and semantics, checking for invalid or inconsistent segmentation. Fix invalidities by simplifying and/or re-ordering polygon paths. Fix inconsistencies by shrinking segment polygons to their parents. Log @@ -105,108 +102,88 @@ def process(self): region outline, as if extended by ``sanitize_padding``. If ``spread`` is non-zero and ``spread_level=region``, then this still applies to the result.) - Finally, produce new output files by serialising the resulting hierarchy. + Finally, produce new output file by serialising the resulting hierarchy. 
""" - LOG = getLogger('processor.RepairSegmentation') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - # be strict regarding polygon path validity (so this can be repaired) - ocrd_validators.page_validator.POLY_TOLERANCE = 0 - - for n, input_file in enumerate(self.input_files): - file_id = make_file_id(input_file, self.output_file_grp) - page_id = input_file.pageId or input_file.ID - LOG.info("INPUT FILE %i / %s", n, page_id) - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - pcgts.set_pcGtsId(file_id) - page = pcgts.get_Page() - - # shrink/expand text regions to the hull of their text lines - if self.parameter['sanitize']: - page_image, page_coords, _ = self.workspace.image_from_page( - page, page_id, - feature_selector='binarized', - feature_filter='clipped') - shrink_regions(page_image, page_coords, page, page_id, - padding=self.parameter['sanitize_padding']) - # - # validate segmentation (warn of children extending beyond their parents) - # - report = PageValidator.validate(ocrd_page=pcgts, - page_textequiv_consistency='off', - check_baseline=False) - if not report.is_valid: - errors = report.errors - report.errors = [] - for error in errors: - if isinstance(error, (CoordinateConsistencyError,CoordinateValidityError)): - if error.tag == 'Page': - element = page.get_Border() - elif error.tag.endswith('Region'): - element = next((region - for region in page.get_AllRegions() - if region.id == error.ID), None) - elif error.tag == 'TextLine': - element = next((line - for region in page.get_AllRegions(classes=['Text']) - for line in region.get_TextLine() - if line.id == error.ID), None) - elif error.tag == 'Word': - element = next((word - for region in page.get_AllRegions(classes=['Text']) - for line in region.get_TextLine() - for word in line.get_Word() - if word.id == error.ID), None) - elif error.tag == 'Glyph': - element = next((glyph - for region in 
page.get_AllRegions(classes=['Text']) - for line in region.get_TextLine() - for word in line.get_Word() - for glyph in word.get_Glyph() - if glyph.id == error.ID), None) - else: - LOG.error("Unrepairable error for unknown segment type: %s", - str(error)) - report.add_error(error) - continue - if not element: - LOG.error("Unrepairable error for unknown segment element: %s", - str(error)) + pcgts = input_pcgts[0] + page = pcgts.get_Page() + + # shrink/expand text regions to the hull of their text lines + if self.parameter['sanitize']: + page_image, page_coords, _ = self.workspace.image_from_page( + page, page_id, + feature_selector='binarized', + feature_filter='clipped') + shrink_regions(page_image, page_coords, page, page_id, self.logger, + padding=self.parameter['sanitize_padding']) + # + # validate segmentation (warn of children extending beyond their parents) + # + report = PageValidator.validate(ocrd_page=pcgts, + page_textequiv_consistency='off', + check_baseline=False) + if not report.is_valid: + errors = report.errors + report.errors = [] + for error in errors: + if isinstance(error, (CoordinateConsistencyError,CoordinateValidityError)): + if error.tag == 'Page': + element = page.get_Border() + elif error.tag.endswith('Region'): + element = next((region + for region in page.get_AllRegions() + if region.id == error.ID), None) + elif error.tag == 'TextLine': + element = next((line + for region in page.get_AllRegions(classes=['Text']) + for line in region.get_TextLine() + if line.id == error.ID), None) + elif error.tag == 'Word': + element = next((word + for region in page.get_AllRegions(classes=['Text']) + for line in region.get_TextLine() + for word in line.get_Word() + if word.id == error.ID), None) + elif error.tag == 'Glyph': + element = next((glyph + for region in page.get_AllRegions(classes=['Text']) + for line in region.get_TextLine() + for word in line.get_Word() + for glyph in word.get_Glyph() + if glyph.id == error.ID), None) + else: + 
self.logger.error("Unrepairable error for unknown segment type: %s", + str(error)) + report.add_error(error) + continue + if not element: + self.logger.error("Unrepairable error for unknown segment element: %s", + str(error)) + report.add_error(error) + continue + if isinstance(error, CoordinateConsistencyError): + try: + ensure_consistent(element) + except Exception as e: + self.logger.error(str(e)) # exc_info=e report.add_error(error) continue - if isinstance(error, CoordinateConsistencyError): - try: - ensure_consistent(element) - except Exception as e: - LOG.error(str(e)) # exc_info=e - report.add_error(error) - continue - else: - ensure_valid(element) - LOG.warning("Fixed %s for %s '%s'", error.__class__.__name__, - error.tag, error.ID) - # show remaining errors - if not report.is_valid: - LOG.warning(report.to_xml()) - - # simplify - if self.parameter['simplify']: - self.simplify_page(page, page_id) - # delete/merge/split redundant text regions (or its text lines) - if self.parameter['plausibilize']: - self.plausibilize_page(page, page_id) - if self.parameter['spread']: - self.spread_segments(page, page_id) - - self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - mimetype=MIMETYPE_PAGE, - local_filename=os.path.join(self.output_file_grp, - file_id + '.xml'), - content=to_xml(pcgts)) + else: + ensure_valid(element) + self.logger.warning("Fixed %s for %s '%s'", error.__class__.__name__, + error.tag, error.ID) + # show remaining errors + if not report.is_valid: + self.logger.warning(report.to_xml()) + + # simplify + if self.parameter['simplify']: + self.simplify_page(page, page_id) + # delete/merge/split redundant text regions (or its text lines) + if self.parameter['plausibilize']: + self.plausibilize_page(page, page_id) + if self.parameter['spread']: + self.spread_segments(page, page_id) + return OcrdPageResult(pcgts) def simplify_page(self, page, page_id): if page.get_Border() is not None: @@ -257,7 +234,7 @@ 
def plausibilize_page(self, page, page_id): if _compare_segments(region1, region2, regpoly1, regpoly2, marked_for_deletion, marked_for_merging, self.parameter['plausibilize_merge_min_overlap'], - page_id): + page_id, self.logger): # non-trivial overlap: mutually plausibilize lines linepolys1 = sorted([(line, make_valid(Polygon(polygon_from_points(line.get_Coords().points)))) for line in region1.get_TextLine()], @@ -270,7 +247,7 @@ def plausibilize_page(self, page, page_id): if _compare_segments(line1, line2, linepoly1, linepoly2, marked_for_deletion, marked_for_merging, self.parameter['plausibilize_merge_min_overlap'], - page_id): + page_id, self.logger): # non-trivial overlap: check how close to each other if (linepoly1.centroid.within(linepoly2) or linepoly2.centroid.within(linepoly1)): @@ -295,7 +272,8 @@ def plausibilize_page(self, page, page_id): _plausibilize_segments(regpolys, rogroup, marked_for_deletion, marked_for_merging, - marked_for_splitting) + marked_for_splitting, + self.logger) def spread_segments(self, page, page_id): level = self.parameter['spread_level'] @@ -329,7 +307,7 @@ def spread_segments(self, page, page_id): glyphs = word.get_Glyph() spread_segments(glyphs, self.parameter['spread']) -def _compare_segments(seg1, seg2, poly1, poly2, marked_for_deletion, marked_for_merging, min_overlap, page_id): +def _compare_segments(seg1, seg2, poly1, poly2, marked_for_deletion, marked_for_merging, min_overlap, page_id, log): """Determine redundancies in a pair of regions/lines \b @@ -343,29 +321,28 @@ def _compare_segments(seg1, seg2, poly1, poly2, marked_for_deletion, marked_for_ Return whether something else besides deletion must be done about the redundancy, i.e. true iff they overlap, but neither side could be marked for deletion. 
""" - LOG = getLogger('processor.RepairSegmentation') - # LOG.debug('Comparing %s and %s', + # log.debug('Comparing %s and %s', # '%s "%s"' % (_tag_name(seg1), seg1.id), # '%s "%s"' % (_tag_name(seg2), seg2.id)) if poly1.almost_equals(poly2): - LOG.debug('Page "%s" %s is almost equal to %s', page_id, + log.debug('Page "%s" %s is almost equal to %s', page_id, '%s "%s"' % (_tag_name(seg2), seg2.id), '%s "%s"' % (_tag_name(seg1), seg1.id)) marked_for_deletion.append(seg2.id) elif poly1.contains(poly2): - LOG.debug('Page "%s" %s is within %s', page_id, + log.debug('Page "%s" %s is within %s', page_id, '%s "%s"' % (_tag_name(seg2), seg2.id), '%s "%s"' % (_tag_name(seg1), seg1.id)) marked_for_deletion.append(seg2.id) elif poly2.contains(poly1): - LOG.debug('Page "%s" %s is within %s', page_id, + log.debug('Page "%s" %s is within %s', page_id, '%s "%s"' % (_tag_name(seg1), seg1.id), '%s "%s"' % (_tag_name(seg2), seg2.id)) marked_for_deletion.append(seg1.id) elif poly1.overlaps(poly2): inter_poly = poly1.intersection(poly2) union_poly = poly1.union(poly2) - LOG.debug('Page "%s" %s overlaps %s by %.2f/%.2f', page_id, + log.debug('Page "%s" %s overlaps %s by %.2f/%.2f', page_id, '%s "%s"' % (_tag_name(seg1), seg1.id), '%s "%s"' % (_tag_name(seg2), seg2.id), inter_poly.area/poly1.area, inter_poly.area/poly2.area) @@ -373,12 +350,12 @@ def _compare_segments(seg1, seg2, poly1, poly2, marked_for_deletion, marked_for_ # skip this pair -- combined polygon encloses previously free segments return True elif inter_poly.area / poly2.area > min_overlap: - LOG.debug('Page "%s" %s belongs to %s', page_id, + log.debug('Page "%s" %s belongs to %s', page_id, '%s "%s"' % (_tag_name(seg2), seg2.id), '%s "%s"' % (_tag_name(seg1), seg1.id)) marked_for_merging[seg2.id] = seg1 elif inter_poly.area / poly1.area > min_overlap: - LOG.debug('Page "%s" %s belongs to %s', page_id, + log.debug('Page "%s" %s belongs to %s', page_id, '%s "%s"' % (_tag_name(seg1), seg1.id), '%s "%s"' % (_tag_name(seg2), 
seg2.id)) marked_for_merging[seg1.id] = seg2 @@ -387,7 +364,7 @@ def _compare_segments(seg1, seg2, poly1, poly2, marked_for_deletion, marked_for_ return False -def _merge_segments(seg, superseg, poly, superpoly, segpolys, reading_order): +def _merge_segments(seg, superseg, poly, superpoly, segpolys, reading_order, log): """Merge one segment into another and update reading order refs. \b @@ -404,8 +381,7 @@ def _merge_segments(seg, superseg, poly, superpoly, segpolys, reading_order): and elements like TextStyle and TextEquiv, if different between ``seg`` and ``superseg``. """ - LOG = getLogger('processor.RepairSegmentation') - LOG.info('Merging %s "%s" into %s "%s"', + log.info('Merging %s "%s" into %s "%s"', _tag_name(seg), seg.id, _tag_name(superseg), superseg.id) # granularity will necessarily be lost here -- @@ -423,7 +399,7 @@ def _merge_segments(seg, superseg, poly, superpoly, segpolys, reading_order): superseg.get_Coords().set_points(points_from_polygon(superpoly)) # FIXME should we merge/mix attributes and features? 
if hasattr(seg, 'TextLine') and seg.get_TextLine(): - LOG.info('Merging region "{}" with {} text lines into "{}" with {}'.format( + log.info('Merging region "{}" with {} text lines into "{}" with {}'.format( seg.id, len(seg.get_TextLine()), superseg.id, len(superseg.get_TextLine()))) if (seg.id in reading_order and @@ -440,7 +416,7 @@ def _merge_segments(seg, superseg, poly, superpoly, segpolys, reading_order): else: superseg.TextLine = superseg.TextLine + seg.TextLine elif hasattr(seg, 'Word') and seg.get_Word(): - LOG.info('Merging line "{}" with {} words into "{}" with {}'.format( + log.info('Merging line "{}" with {} words into "{}" with {}'.format( seg.id, len(seg.get_Word()), superseg.id, len(superseg.get_Word()))) pos = next(i for i, segpoly in enumerate(segpolys) if segpoly[0] == seg) @@ -451,31 +427,31 @@ def _merge_segments(seg, superseg, poly, superpoly, segpolys, reading_order): else: superseg.Word = superseg.Word + seg.Word if hasattr(seg, 'orientation') and seg.get_orientation() != superseg.get_orientation(): - LOG.warning('Merging "{}" with orientation {} into "{}" with {}'.format( + log.warning('Merging "{}" with orientation {} into "{}" with {}'.format( seg.id, seg.get_orientation(), superseg.id, superseg.get_orientation())) if hasattr(seg, 'type_') and seg.get_type() != superseg.get_type(): - LOG.warning('Merging "{}" with type {} into "{}" with {}'.format( + log.warning('Merging "{}" with type {} into "{}" with {}'.format( seg.id, seg.get_type(), superseg.id, superseg.get_type())) if seg.get_primaryScript() != superseg.get_primaryScript(): - LOG.warning('Merging "{}" with primaryScript {} into "{}" with {}'.format( + log.warning('Merging "{}" with primaryScript {} into "{}" with {}'.format( seg.id, seg.get_primaryScript(), superseg.id, superseg.get_primaryScript())) if seg.get_primaryLanguage() != superseg.get_primaryLanguage(): - LOG.warning('Merging "{}" with primaryLanguage {} into "{}" with {}'.format( + log.warning('Merging "{}" with 
primaryLanguage {} into "{}" with {}'.format( seg.id, seg.get_primaryLanguage(), superseg.id, superseg.get_primaryLanguage())) if seg.get_TextStyle(): - LOG.warning('Merging "{}" with TextStyle {} into "{}" with {}'.format( + log.warning('Merging "{}" with TextStyle {} into "{}" with {}'.format( seg.id, seg.get_TextStyle(), # FIXME needs repr... superseg.id, superseg.get_TextStyle())) # ...to be informative if seg.get_TextEquiv(): - LOG.warning('Merging "{}" with TextEquiv {} into "{}" with {}'.format( + log.warning('Merging "{}" with TextEquiv {} into "{}" with {}'.format( seg.id, seg.get_TextEquiv(), # FIXME needs repr... superseg.id, superseg.get_TextEquiv())) # ...to be informative -def _plausibilize_segments(segpolys, rogroup, marked_for_deletion, marked_for_merging, marked_for_splitting): +def _plausibilize_segments(segpolys, rogroup, marked_for_deletion, marked_for_merging, marked_for_splitting, log): """Remove redundancy among a set of segments by applying deletion/merging/splitting \b @@ -487,16 +463,20 @@ def _plausibilize_segments(segpolys, rogroup, marked_for_deletion, marked_for_me Finally, update the reading order ``rogroup`` accordingly. 
""" - LOG = getLogger('processor.RepairSegmentation') wait_for_deletion = list() reading_order = dict() page_get_reading_order(reading_order, rogroup) for seg, poly in segpolys: if isinstance(seg, TextRegionType): # plausibilize lines first - _plausibilize_segments([(line, make_valid(Polygon(polygon_from_points(line.get_Coords().points)))) - for line in seg.get_TextLine()], None, - marked_for_deletion, marked_for_merging, marked_for_splitting) + linepolys = [(line, make_valid(Polygon(polygon_from_points(line.get_Coords().points)))) + for line in seg.get_TextLine()] + _plausibilize_segments(linepolys, + None, # no reading order on line level + marked_for_deletion, + marked_for_merging, + marked_for_splitting, + log) delete = seg.id in marked_for_deletion merge = seg.id in marked_for_merging split = seg.id in marked_for_splitting @@ -505,13 +485,13 @@ def _plausibilize_segments(segpolys, rogroup, marked_for_deletion, marked_for_me # merge region with super region: superseg = marked_for_merging[seg.id] superpoly = make_valid(Polygon(polygon_from_points(superseg.get_Coords().points))) - _merge_segments(seg, superseg, poly, superpoly, segpolys, reading_order) + _merge_segments(seg, superseg, poly, superpoly, segpolys, reading_order, log) wait_for_deletion.append(seg) if seg.id in reading_order: regionref = reading_order[seg.id] # TODO: re-assign regionref.continuation and regionref.type to other? 
# could be any of the 6 types above: - regionrefs = regionref.parent_object_.__getattribute__(regionref.__class__.__name__.replace('Type', '')) + regionrefs = getattr(regionref.parent_object_, regionref.__class__.__name__.replace('Type', '')) # remove in-place regionrefs.remove(regionref) if hasattr(regionref, 'index'): @@ -519,7 +499,7 @@ def _plausibilize_segments(segpolys, rogroup, marked_for_deletion, marked_for_me regionref.parent_object_.sort_AllIndexed() elif split: otherseg = marked_for_splitting[seg.id] - LOG.info('Shrinking %s "%s" in favour of %s "%s"', + log.info('Shrinking %s "%s" in favour of %s "%s"', _tag_name(seg), seg.id, _tag_name(otherseg), otherseg.id) otherpoly = make_valid(Polygon(polygon_from_points(otherseg.get_Coords().points))) @@ -535,7 +515,7 @@ def _plausibilize_segments(segpolys, rogroup, marked_for_deletion, marked_for_me for seg in wait_for_deletion: if seg.parent_object_: # remove in-place - LOG.info('Deleting %s "%s"', _tag_name(seg), seg.id) + log.info('Deleting %s "%s"', _tag_name(seg), seg.id) getattr(seg.parent_object_, 'get_' + _tag_name(seg))().remove(seg) def page_get_reading_order(ro, rogroup): @@ -562,9 +542,8 @@ def page_get_reading_order(ro, rogroup): # recursive reading order element (un/ordered group): page_get_reading_order(ro, elem) -def shrink_regions(page_image, page_coords, page, page_id, padding=0): +def shrink_regions(page_image, page_coords, page, page_id, log, padding=0): """Shrink each region outline to become the minimal concave hull of its binary foreground.""" - LOG = getLogger('processor.RepairSegmentation') page_array = ~ np.array(page_image.convert('1')) page_polygon = page_poly(page) if page.get_Border(): @@ -579,7 +558,7 @@ def shrink_regions(page_image, page_coords, page, page_id, padding=0): else: scale = 43 for region in page.get_AllRegions(): - #LOG.info('Shrinking region "%s"', region.id) + #log.info('Shrinking region "%s"', region.id) region_mask = np.zeros_like(page_array, dtype=bool) 
region_polygon = coordinates_of_segment(region, page_image, page_coords) region_mask[draw.polygon(region_polygon[:, 1], @@ -594,7 +573,7 @@ def shrink_regions(page_image, page_coords, page, page_id, padding=0): total_area = sum(areas) if not total_area: # ignore if too small - LOG.warning('Zero contour area in region "%s"', region.id) + log.warning('Zero contour area in region "%s"', region.id) continue # pick contour and convert to absolute: region_polygon = join_polygons([make_valid(Polygon(contour[:, 0, ::])) @@ -607,7 +586,7 @@ def shrink_regions(page_image, page_coords, page, page_id, padding=0): if not region_polygon.within(page_polygon): region_polygon = clip_poly(region_polygon, page_polygon) if region_polygon is not None: - LOG.debug('Using new coordinates for region "%s"', region.id) + log.debug('Using new coordinates for region "%s"', region.id) region.get_Coords().set_points(points_from_polygon(region_polygon.exterior.coords[:-1])) def spread_segments(segments, distance=0): diff --git a/ocrd_segment/replace_original.py b/ocrd_segment/replace_original.py index 2d76d51..12253ea 100644 --- a/ocrd_segment/replace_original.py +++ b/ocrd_segment/replace_original.py @@ -1,39 +1,29 @@ from __future__ import absolute_import -import os.path - +from typing import Optional from ocrd_utils import ( - getLogger, concat_padded, coordinates_of_segment, points_from_polygon, - make_file_id, - assert_file_grp_cardinality, - MIMETYPE_PAGE ) from ocrd_models.ocrd_page import ( + OcrdPage, AlternativeImageType, TextRegionType, - to_xml ) -from ocrd_modelfactory import page_from_file -from ocrd import Processor +from ocrd import Processor, OcrdPageResult, OcrdPageResultImage -from .config import OCRD_TOOL from .repair import ensure_valid -TOOL = 'ocrd-segment-replace-original' - class ReplaceOriginal(Processor): - def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] - kwargs['version'] = OCRD_TOOL['version'] - super(ReplaceOriginal, 
self).__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-segment-replace-original' - def process(self): + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Extract page image and replace original with it. - Open and deserialize PAGE input files and their respective images, + Open and deserialize PAGE input file and its respective images, then go to the page hierarchy level. Retrieve the image of the (cropped, deskewed, dewarped) page, preferring @@ -45,79 +35,44 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ - LOG = getLogger('processor.ReplaceOriginal') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - feature_selector = self.parameter['feature_selector'] - feature_filter = self.parameter['feature_filter'] - adapt_coords = self.parameter['transform_coordinates'] - - # pylint: disable=attribute-defined-outside-init - for n, input_file in enumerate(self.input_files): - file_id = make_file_id(input_file, self.output_file_grp) - page_id = input_file.pageId or input_file.ID - LOG.info("INPUT FILE %i / %s", n, page_id) - pcgts = page_from_file(self.workspace.download_file(input_file)) - pcgts.set_pcGtsId(file_id) - self.add_metadata(pcgts) - page = pcgts.get_Page() - page_image, page_coords, page_image_info = self.workspace.image_from_page( - page, page_id, - feature_filter=feature_filter, - feature_selector=feature_selector) - if page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi = round(dpi * 2.54) - else: - dpi = None - # annotate extracted image - file_path = self.workspace.save_image_file(page_image, - file_id + '-IMG', - self.output_file_grp, - page_id=input_file.pageId, - mimetype='image/png') - # replace original image - page.set_imageFilename(file_path) - # remove all coordinate-sensitive page-level 
annotations - page.set_imageWidth(page_image.width) - page.set_imageHeight(page_image.height) - page.set_Border(None) # also removes all derived images - page.set_orientation(None) - # also add image as derived image (in order to preserve image features) - # (but exclude coordinate-sensitive features that have already been applied over the "original") - features = ','.join(filter(lambda f: f not in [ - "cropped", "deskewed", "rotated-90", "rotated-180", "rotated-270"], - page_coords['features'].split(","))) - page.add_AlternativeImage(AlternativeImageType( - filename=file_path, comments=features)) - # adjust all coordinates - if adapt_coords: - for region in page.get_AllRegions(): - region_polygon = coordinates_of_segment(region, page_image, page_coords) - region.get_Coords().set_points(points_from_polygon(region_polygon)) - ensure_valid(region) - if isinstance(region, TextRegionType): - for line in region.get_TextLine(): - line_polygon = coordinates_of_segment(line, page_image, page_coords) - line.get_Coords().set_points(points_from_polygon(line_polygon)) - ensure_valid(line) - for word in line.get_Word(): - word_polygon = coordinates_of_segment(word, page_image, page_coords) - word.get_Coords().set_points(points_from_polygon(word_polygon)) - ensure_valid(word) - for glyph in word.get_Glyph(): - glyph_polygon = coordinates_of_segment(glyph, page_image, page_coords) - glyph.get_Coords().set_points(points_from_polygon(glyph_polygon)) - ensure_valid(glyph) + pcgts = input_pcgts[0] + result = OcrdPageResult(pcgts) + page = pcgts.get_Page() + page_image, page_coords, page_image_info = self.workspace.image_from_page( + page, page_id, + feature_filter=self.parameter['feature_filter'], + feature_selector=self.parameter['feature_selector'], + ) + # annotate extracted image as "new original" + result.images.append(OcrdPageResultImage(page_image, '.IMG', page)) + page.set_Border(None) # also removes all derived images + page.set_orientation(None) + # also add image as 
derived image (in order to preserve image features) + # (but exclude coordinate-sensitive features that have already been applied over the "original") + features = ','.join(filter(lambda f: f not in [ + "cropped", "deskewed", "rotated-90", "rotated-180", "rotated-270"], + page_coords['features'].split(","))) + alt_image = AlternativeImageType(comments=features) + page.add_AlternativeImage(alt_image) + result.images.append(OcrdPageResultImage(page_image, '.IMG-COPY', alt_image)) + # adjust all coordinates + if self.parameter['transform_coordinates']: + for region in page.get_AllRegions(): + region_polygon = coordinates_of_segment(region, page_image, page_coords) + region.get_Coords().set_points(points_from_polygon(region_polygon)) + ensure_valid(region) + if isinstance(region, TextRegionType): + for line in region.get_TextLine(): + line_polygon = coordinates_of_segment(line, page_image, page_coords) + line.get_Coords().set_points(points_from_polygon(line_polygon)) + ensure_valid(line) + for word in line.get_Word(): + word_polygon = coordinates_of_segment(word, page_image, page_coords) + word.get_Coords().set_points(points_from_polygon(word_polygon)) + ensure_valid(word) + for glyph in word.get_Glyph(): + glyph_polygon = coordinates_of_segment(glyph, page_image, page_coords) + glyph.get_Coords().set_points(points_from_polygon(glyph_polygon)) + ensure_valid(glyph) + return result - # update METS (add the PAGE file): - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=os.path.join(self.output_file_grp, file_id + '.xml'), - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - LOG.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) diff --git a/ocrd_segment/replace_page.py b/ocrd_segment/replace_page.py index 5253c76..d91130d 100644 --- a/ocrd_segment/replace_page.py +++ b/ocrd_segment/replace_page.py @@ -1,39 +1,27 @@ from __future__ import 
absolute_import -import os.path +from typing import Optional from ocrd_utils import ( - getLogger, concat_padded, coordinates_for_segment, points_from_polygon, polygon_from_points, - make_file_id, - assert_file_grp_cardinality, - MIMETYPE_PAGE ) -from ocrd_models.ocrd_page import ( - TextRegionType, - to_xml -) -from ocrd_modelfactory import page_from_file -from ocrd import Processor +from ocrd_models.ocrd_page import OcrdPage, TextRegionType +from ocrd import Processor, OcrdPageResult -from .config import OCRD_TOOL from .repair import ensure_consistent -TOOL = 'ocrd-segment-replace-page' - class ReplacePage(Processor): - def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] - kwargs['version'] = OCRD_TOOL['version'] - super(ReplacePage, self).__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-segment-replace-page' - def process(self): + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Replace everything below the page level with another annotation. - Open and deserialize PAGE input files from both input file groups, + Open and deserialize PAGE input file from both input file groups, then go to the page hierarchy level. Replace all regions (and their reading order) from the page of @@ -50,84 +38,57 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. 
""" - LOG = getLogger('processor.ReplacePage') - assert_file_grp_cardinality(self.input_file_grp, 2, 'original, page') - assert_file_grp_cardinality(self.output_file_grp, 1) - adapt_coords = self.parameter['transform_coordinates'] - - # collect input file tuples - ifts = self.zip_input_files() # input file tuples - # process input file tuples - for n, ift in enumerate(ifts): - input_file, page_file = ift - if input_file is None or page_file is None: - continue - file_id = make_file_id(page_file, self.output_file_grp) - page_id = input_file.pageId or input_file.ID - LOG.info("INPUT FILE %i / %s", n, page_id) - pcgts = page_from_file(self.workspace.download_file(input_file)) - pcgts.set_pcGtsId(file_id) - self.add_metadata(pcgts) - page = pcgts.get_Page() - pcgts2 = page_from_file(self.workspace.download_file(page_file)) - page2 = pcgts2.get_Page() + pcgts = input_pcgts[0] + page = pcgts.get_Page() + pcgts2 = input_pcgts[1] + page2 = pcgts2.get_Page() + if self.parameter['transform_coordinates']: # adjust all coordinates (recursively) - if adapt_coords: - try: - _, page_coords, _ = self.workspace.image_from_page(page, page_id) - for region in page2.get_AllRegions(): - region_coords = region.get_Coords() - region_polygon = polygon_from_points(region_coords.points) - region_polygon = coordinates_for_segment(region_polygon, None, page_coords) - region_coords.set_points(points_from_polygon(region_polygon)) - ensure_consistent(region) - if isinstance(region, TextRegionType): - for line in region.get_TextLine(): - line_coords = line.get_Coords() - line_polygon = polygon_from_points(line_coords.points) - line_polygon = coordinates_for_segment(line_polygon, None, page_coords) - line_coords.set_points(points_from_polygon(line_polygon)) - ensure_consistent(line) - for word in line.get_Word(): - word_coords = word.get_Coords() - word_polygon = polygon_from_points(word_coords.points) - word_polygon = coordinates_for_segment(word_polygon, None, page_coords) - 
word_coords.set_points(points_from_polygon(word_polygon)) - ensure_consistent(word) - for glyph in word.get_Glyph(): - glyph_coords = glyph.get_Coords() - glyph_polygon = polygon_from_points(glyph_coords.points) - glyph_polygon = coordinates_for_segment(glyph_polygon, None, page_coords) - glyph_coords.set_points(points_from_polygon(glyph_polygon)) - ensure_consistent(glyph) - except: - LOG.error('invalid coordinates on page %s', page_id) - continue - # replace all regions - page.set_ReadingOrder(page2.get_ReadingOrder()) - page.set_TextRegion(page2.get_TextRegion()) - page.set_ImageRegion(page2.get_ImageRegion()) - page.set_LineDrawingRegion(page2.get_LineDrawingRegion()) - page.set_GraphicRegion(page2.get_GraphicRegion()) - page.set_TableRegion(page2.get_TableRegion()) - page.set_ChartRegion(page2.get_ChartRegion()) - page.set_MapRegion(page2.get_MapRegion()) - page.set_SeparatorRegion(page2.get_SeparatorRegion()) - page.set_MathsRegion(page2.get_MathsRegion()) - page.set_ChemRegion(page2.get_ChemRegion()) - page.set_MusicRegion(page2.get_MusicRegion()) - page.set_AdvertRegion(page2.get_AdvertRegion()) - page.set_NoiseRegion(page2.get_NoiseRegion()) - page.set_UnknownRegion(page2.get_UnknownRegion()) - page.set_CustomRegion(page2.get_CustomRegion()) - - # update METS (add the PAGE file): - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=os.path.join(self.output_file_grp, file_id + '.xml'), - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - LOG.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) + try: + _, page_coords, _ = self.workspace.image_from_page(page, page_id) + for region in page2.get_AllRegions(): + region_coords = region.get_Coords() + region_polygon = polygon_from_points(region_coords.points) + region_polygon = coordinates_for_segment(region_polygon, None, page_coords) + 
region_coords.set_points(points_from_polygon(region_polygon)) + ensure_consistent(region) + if isinstance(region, TextRegionType): + for line in region.get_TextLine(): + line_coords = line.get_Coords() + line_polygon = polygon_from_points(line_coords.points) + line_polygon = coordinates_for_segment(line_polygon, None, page_coords) + line_coords.set_points(points_from_polygon(line_polygon)) + ensure_consistent(line) + for word in line.get_Word(): + word_coords = word.get_Coords() + word_polygon = polygon_from_points(word_coords.points) + word_polygon = coordinates_for_segment(word_polygon, None, page_coords) + word_coords.set_points(points_from_polygon(word_polygon)) + ensure_consistent(word) + for glyph in word.get_Glyph(): + glyph_coords = glyph.get_Coords() + glyph_polygon = polygon_from_points(glyph_coords.points) + glyph_polygon = coordinates_for_segment(glyph_polygon, None, page_coords) + glyph_coords.set_points(points_from_polygon(glyph_polygon)) + ensure_consistent(glyph) + except: + self.logger.error('invalid coordinates on page %s', page_id) + raise + # replace all regions + page.set_ReadingOrder(page2.get_ReadingOrder()) + page.set_TextRegion(page2.get_TextRegion()) + page.set_ImageRegion(page2.get_ImageRegion()) + page.set_LineDrawingRegion(page2.get_LineDrawingRegion()) + page.set_GraphicRegion(page2.get_GraphicRegion()) + page.set_TableRegion(page2.get_TableRegion()) + page.set_ChartRegion(page2.get_ChartRegion()) + page.set_MapRegion(page2.get_MapRegion()) + page.set_SeparatorRegion(page2.get_SeparatorRegion()) + page.set_MathsRegion(page2.get_MathsRegion()) + page.set_ChemRegion(page2.get_ChemRegion()) + page.set_MusicRegion(page2.get_MusicRegion()) + page.set_AdvertRegion(page2.get_AdvertRegion()) + page.set_NoiseRegion(page2.get_NoiseRegion()) + page.set_UnknownRegion(page2.get_UnknownRegion()) + page.set_CustomRegion(page2.get_CustomRegion()) + return OcrdPageResult(pcgts) diff --git a/ocrd_segment/replace_text.py b/ocrd_segment/replace_text.py 
index c9b96dc..98970ac 100644 --- a/ocrd_segment/replace_text.py +++ b/ocrd_segment/replace_text.py @@ -1,37 +1,28 @@ from __future__ import absolute_import import os.path +from typing import Optional from itertools import chain from glob import glob -from ocrd_utils import ( - getLogger, concat_padded, - make_file_id, - assert_file_grp_cardinality, - MIMETYPE_PAGE -) -from ocrd_models.ocrd_page import ( - TextEquivType, - to_xml -) -from ocrd_modelfactory import page_from_file -from ocrd import Processor - -from .config import OCRD_TOOL - -TOOL = 'ocrd-segment-replace-text' +from ocrd_models.ocrd_page import OcrdPage, TextEquivType +from ocrd import Processor, OcrdPageResult class ReplaceText(Processor): - def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] - kwargs['version'] = OCRD_TOOL['version'] - super().__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-segment-replace-text' + + def setup(self): + file_glob = self.parameter['file_glob'] + self.input_text_files = glob(file_glob) + assert len(self.input_text_files), f"file_glob '{file_glob}' does not match any path names" - def process(self): + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Add TextEquiv anywhere below the page level from external text files named by segments. - Open and deserialize PAGE input files. For each page, try to find text files matching + Open and deserialize PAGE input file. Try to find text files matching ``file_glob`` which have both the page ID and some segment ID in their path name. For every match, insert the content of the text file as first TextEquiv of that very @@ -39,77 +30,54 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. 
""" - LOG = getLogger('processor.ReplaceText') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - file_glob = self.parameter['file_glob'] - - input_text_files = glob(file_glob) - assert len(input_text_files), "file_glob '%s' does not match any path names" % file_glob - for n, input_file in enumerate(self.input_files): - file_id = make_file_id(input_file, self.output_file_grp) - page_id = input_file.pageId - LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - pcgts = page_from_file(self.workspace.download_file(input_file)) - pcgts.set_pcGtsId(file_id) - self.add_metadata(pcgts) - page = pcgts.get_Page() - regions = page.get_AllRegions(classes=['Text']) - lines = list(chain.from_iterable( - [region.get_TextLine() for region in regions])) - words = list(chain.from_iterable( - [line.get_Word() for line in lines])) - glyphs = list(chain.from_iterable( - [word.get_Glyph() for word in words])) - segids = { seg.id: seg for seg in glyphs + words + lines + regions } - text_files = ([path for path in input_text_files - if page_id in path] or - [path for path in input_text_files - if input_file.ID in path]) - if not len(text_files): - LOG.warning("no text file input for page %s", page_id) - segments = set() - for text_file in text_files: - basename = os.path.splitext(text_file)[0] - basename2 = os.path.splitext(basename)[0] - segment = None - for id_ in segids: - if basename.endswith(id_) or basename2.endswith(id_): - segment = segids[id_] - break - if not segment: - LOG.error("no segment for text file '%s' on page '%s'", text_file, page_id) - continue - with open(text_file, 'r') as text_fd: - text = text_fd.read().strip() - LOG.debug("adding '%s' to '%s'", text, segment.id) - segment.insert_TextEquiv_at(0, TextEquivType(Unicode=text)) - segments.add(segment) - if not segments.isdisjoint(glyphs): - nonglyphs = segments.difference(glyphs) - LOG.info("updated %d of %d glyphs", len(segments) - 
len(nonglyphs), len(glyphs)) - segments.difference_update(glyphs) - if not segments.isdisjoint(words): - nonwords = segments.difference(words) - LOG.info("updated %d of %d words", len(segments) - len(nonwords), len(words)) - segments.difference_update(words) - if not segments.isdisjoint(lines): - nonlines = segments.difference(lines) - LOG.info("updated %d of %d lines", len(segments) - len(nonlines), len(lines)) - segments.difference_update(lines) - if not segments.isdisjoint(regions): - nonregions = segments.difference(regions) - LOG.info("updated %d of %d regions", len(segments) - len(nonregions), len(regions)) - segments.difference_update(regions) - assert len(segments) == 0 - - # update METS (add the PAGE file): - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=page_id, - local_filename=os.path.join(self.output_file_grp, file_id + '.xml'), - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - LOG.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) + pcgts = input_pcgts[0] + page = pcgts.get_Page() + regions = page.get_AllRegions(classes=['Text']) + lines = list(chain.from_iterable( + [region.get_TextLine() for region in regions])) + words = list(chain.from_iterable( + [line.get_Word() for line in lines])) + glyphs = list(chain.from_iterable( + [word.get_Glyph() for word in words])) + segids = { seg.id: seg for seg in glyphs + words + lines + regions } + text_files = ([path for path in self.input_text_files + if page_id in path] or + [path for path in self.input_text_files + if pcgts.get_pcGtsId() and pcgts.get_pcGtsId() in path]) + if not len(text_files): + self.logger.warning("no text file input for page %s", page_id) + segments = set() + for text_file in text_files: + basename = os.path.splitext(text_file)[0] + basename2 = os.path.splitext(basename)[0] + segment = None + for id_ in segids: + if basename.endswith(id_) or basename2.endswith(id_): + segment = segids[id_] + break + if not segment: +
self.logger.error("no segment for text file '%s' on page '%s'", text_file, page_id) + continue + with open(text_file, 'r') as text_fd: + text = text_fd.read().strip() + self.logger.debug("adding '%s' to '%s'", text, segment.id) + segment.insert_TextEquiv_at(0, TextEquivType(Unicode=text)) + segments.add(segment) + if not segments.isdisjoint(glyphs): + nonglyphs = segments.difference(glyphs) + self.logger.info("updated %d of %d glyphs", len(segments) - len(nonglyphs), len(glyphs)) + segments.difference_update(glyphs) + if not segments.isdisjoint(words): + nonwords = segments.difference(words) + self.logger.info("updated %d of %d words", len(segments) - len(nonwords), len(words)) + segments.difference_update(words) + if not segments.isdisjoint(lines): + nonlines = segments.difference(lines) + self.logger.info("updated %d of %d lines", len(segments) - len(nonlines), len(lines)) + segments.difference_update(lines) + if not segments.isdisjoint(regions): + nonregions = segments.difference(regions) + self.logger.info("updated %d of %d regions", len(segments) - len(nonregions), len(regions)) + segments.difference_update(regions) + assert len(segments) == 0 + return OcrdPageResult(pcgts) diff --git a/requirements.txt b/requirements.txt index bc83f8a..d5a1123 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -ocrd >= 2.20.0 +ocrd >= 3.0.0b1 shapely >= 2.0 scikit-image numpy From 7fd60e6e27ca58a9ca7cf4dc828b2c288db590c0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 5 Sep 2024 01:07:51 +0200 Subject: [PATCH 2/4] from_coco: allow empty pages --- ocrd_segment/import_coco_segmentation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocrd_segment/import_coco_segmentation.py b/ocrd_segment/import_coco_segmentation.py index ffe1c19..4f0023e 100644 --- a/ocrd_segment/import_coco_segmentation.py +++ b/ocrd_segment/import_coco_segmentation.py @@ -61,6 +61,7 @@ def process_workspace(self, workspace: Workspace) -> None: Open and 
deserialize the COCO JSON file from the second input file group. (It lists region categories/subtypes, file names and segmentations for all pages.) + \b Open and deserialize each PAGE input file (or generate from image input file) from the first input file group. Now find this page in COCO: - try to match the PAGE ``imageFilename`` or METS file path matches to some @@ -164,7 +165,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional page_id, page.imageHeight, image['height']) # todo: remove existing segmentation first? - for region in image['regions']: + for region in image.get('regions', []): assert isinstance(region['segmentation'], list), "importing RLE/mask segmentation not implemented" polygon = np.array(region['segmentation']) polygon = np.reshape(polygon, (polygon.shape[1]//2, 2)) From 4563b32396d5bdeabfd8dd8f354e562a60cad632 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 5 Sep 2024 01:08:24 +0200 Subject: [PATCH 3/4] from_coco: allow file_name as just basename --- ocrd_segment/import_coco_segmentation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ocrd_segment/import_coco_segmentation.py b/ocrd_segment/import_coco_segmentation.py index 4f0023e..b04209b 100644 --- a/ocrd_segment/import_coco_segmentation.py +++ b/ocrd_segment/import_coco_segmentation.py @@ -152,6 +152,8 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional # find COCO image if page.imageFilename in self.images_by_filename: image = self.images_by_filename[page.imageFilename] + elif os.path.basename(page.imageFilename) in self.images_by_filename: + image = self.images_by_filename[os.path.basename(page.imageFilename)] elif num_page_id in self.images_by_id: image = self.images_by_id[num_page_id] else: From 547c8d0cc84b58baeb0f8adbf21e6b0f79e329e5 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 5 Sep 2024 01:09:36 +0200 Subject: [PATCH 4/4] from_coco: generalise from fixed PubLayNet mapping to parameter 
categorydict default (keeping special case source=PAGE) --- ocrd_segment/import_coco_segmentation.py | 76 ++++++++++-------------- ocrd_segment/ocrd-tool.json | 11 ++++ 2 files changed, 44 insertions(+), 43 deletions(-) diff --git a/ocrd_segment/import_coco_segmentation.py b/ocrd_segment/import_coco_segmentation.py index b04209b..bc27ac2 100644 --- a/ocrd_segment/import_coco_segmentation.py +++ b/ocrd_segment/import_coco_segmentation.py @@ -109,12 +109,12 @@ def process_workspace(self, workspace: Workspace) -> None: len(coco['images']), len(coco['annotations']), len(coco['categories'])) # Convert to usable dicts # classes: - self.coco_source = 'PubLayNet' + self.coco_source = 'custom' self.categories = {} self.subcategories = {} for cat in coco['categories']: - if cat['source'] == 'PAGE': - self.coco_source = 'PAGE' + if 'source' in cat: + self.coco_source = cat['source'] if 'supercategory' in cat and cat['supercategory']: self.categories[cat['id']] = cat['supercategory'] self.subcategories[cat['id']] = cat['name'] @@ -177,51 +177,41 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional subcategory = self.subcategories[region['category_id']] else: subcategory = None + if subcategory == category: + subcategory = None + mapping = self.parameter['categorydict'] region_id = f"r{region['id']}" self.logger.info('Adding region %s:%s [area %d]', category, subcategory or '', region['area']) - if self.coco_source == 'PubLayNet': - if category == 'text': - region_obj = TextRegionType(id=region_id, Coords=coords, - type_=TextTypeSimpleType.PARAGRAPH) - page.add_TextRegion(region_obj) - elif category == 'title': - region_obj = TextRegionType(id=region_id, Coords=coords, - type_=TextTypeSimpleType.HEADING) # CAPTION? - page.add_TextRegion(region_obj) - elif category == 'list': - region_obj = TextRegionType(id=region_id, Coords=coords, - type_=TextTypeSimpleType.LISTLABEL) # OTHER? 
- page.add_TextRegion(region_obj) - elif category == 'table': - region_obj = TableRegionType(id=region_id, Coords=coords) - page.add_TableRegion(region_obj) - elif category == 'figure': - region_obj = ImageRegionType(id=region_id, Coords=coords) - page.add_ImageRegion(region_obj) - else: - raise Exception('unknown region category: %s' % category) - else: # 'PAGE' - args = {'id': region_id, - 'Coords': coords} + args = {'id': region_id, + 'Coords': coords} + if self.coco_source != 'PAGE': if subcategory: - if category in TYPEDICT: - subtype = membername(TYPEDICT[category], subcategory) - if subtype == subcategory: - # not predefined in PAGE: use other + custom - args['custom'] = "subtype:%s" % subcategory - args['type_'] = "other" - else: - args['type_'] = subcategory - else: + category = mapping[category + ':' + subcategory] + else: + category = mapping[category] + if ':' in category: + category, subcategory = category.split(':') + else: + subcategory = None + if subcategory: + if category in TYPEDICT: + subtype = membername(TYPEDICT[category], subcategory) + if subtype == subcategory: + # not predefined in PAGE: use other + custom args['custom'] = "subtype:%s" % subcategory - if category + 'Type' not in globals(): - raise Exception('unknown region category: %s' % category) - region_type = globals()[category + 'Type'] - if region_type is BorderType: - page.set_Border(BorderType(Coords=coords)) + args['type_'] = "other" + else: + args['type_'] = subcategory else: - region_obj = region_type(**args) - getattr(page, 'add_%s' % category)(region_obj) + args['custom'] = "subtype:%s" % subcategory + if category + 'Type' not in globals(): + raise Exception('unknown region category: %s' % category) + region_type = globals()[category + 'Type'] + if region_type is BorderType: + page.set_Border(BorderType(Coords=coords)) + else: + region_obj = region_type(**args) + getattr(page, 'add_%s' % category)(region_obj) # remove image from dicts (so lookup becomes faster and we know if 
anything remains unaccounted for) self.images_by_id.pop(num_page_id, None) diff --git a/ocrd_segment/ocrd-tool.json b/ocrd_segment/ocrd-tool.json index c0cca7c..9211192 100644 --- a/ocrd_segment/ocrd-tool.json +++ b/ocrd_segment/ocrd-tool.json @@ -154,6 +154,17 @@ "output_file_grp_cardinality": 1, "steps": ["layout/segmentation/region"], "parameters": { + "categorydict": { + "type": "object", + "description": "How to map COCO category names to PAGE segment types (except if 'source=PAGE'). Format is category[:subcategory] to type[:subtype], e.g. \"text:title\": \"TextRegion:heading\". Defaults to recommended PubLayNet rules.", + "default": { + "text": "TextRegion:paragraph", + "title": "TextRegion:heading", + "list": "TextRegion:list-label", + "table": "TableRegion", + "figure": "ImageRegion" + } + } } }, "ocrd-segment-extract-pages": {