diff --git a/ocrd_browser/model/page.py b/ocrd_browser/model/page.py index b02c19f..4607f17 100644 --- a/ocrd_browser/model/page.py +++ b/ocrd_browser/model/page.py @@ -53,12 +53,12 @@ def meta(self) -> MetadataType: return self.pc_gts.get_Metadata() def xpath(self, xpath: str) -> List[Element]: - return cast(List[Element], self.xml_root.xpath(xpath, namespaces=NAMESPACES)) + page_namespace = {'page': ns for ns in self.xml_root.nsmap.values() if ns.startswith('http://schema.primaresearch.org/PAGE/gts/pagecontent/')} + return cast(List[Element], self.xml_root.xpath(xpath, namespaces=dict(NAMESPACES, **page_namespace))) @property def xml_root(self) -> Element: if self.pc_gts.gds_elementtree_node_ is None: - from ocrd_models.constants import NAMESPACES from ocrd_models.ocrd_page_generateds import parsexmlstring_ from io import StringIO sio = StringIO() diff --git a/ocrd_browser/model/page_xml_renderer.py b/ocrd_browser/model/page_xml_renderer.py index c476b65..887c72a 100644 --- a/ocrd_browser/model/page_xml_renderer.py +++ b/ocrd_browser/model/page_xml_renderer.py @@ -127,6 +127,17 @@ def __init__(self, region: RegionWithCoords) -> None: self._prep_poly: Optional[prepared.PreparedGeometry] = None self.warnings: List[str] = [] + @property + def coords_conf(self) -> Optional[float]: + return cast(float, self.region.Coords.conf) if hasattr(self.region, 'Coords') else None + + @property + def text_conf(self) -> Optional[float]: + if isinstance(self.region, (TextRegionType, TextLineType, WordType, GlyphType)): + if self.region.get_TextEquiv() and self.region.get_TextEquiv()[0].conf: + return cast(float, self.region.get_TextEquiv()[0].conf) + return None + @property def poly(self) -> Polygon: return self._poly diff --git a/ocrd_browser/view/page.py b/ocrd_browser/view/page.py index a4c8436..caee336 100644 --- a/ocrd_browser/view/page.py +++ b/ocrd_browser/view/page.py @@ -124,7 +124,8 @@ def set_page(self, page: Page) -> None: versions.append(ImageVersion.from_page(self.document, page)) alts: List[AlternativeImageType] = page.page.get_AlternativeImage() for alt in alts: - versions.append(ImageVersion.from_alternative_image(self.document, alt)) + if self.document.path(alt.filename).exists(): + versions.append(ImageVersion.from_alternative_image(self.document, alt)) with self.version_box.handler_block(self._change_handler): self.versions.clear() @@ -255,6 +256,10 @@ def build(self) -> None: self.add_configurator('scale', ImageZoomSelector(2.0, 0.05, -4.0, 2.0)) self.add_configurator('image_version', ImageVersionSelector()) self.add_configurator('features', PageFeaturesSelector()) + icon = Gtk.Image.new_from_icon_name('camera-photo', Gtk.IconSize.SMALL_TOOLBAR) + button = Gtk.Button(image=icon, visible=True, always_show_image=True, tooltip_text='Saves a screenshot of the current view') + button.connect('clicked', self.open_screenshotdialog) + self.action_bar.pack_start(button) actions = ActionRegistry() actions.create(name='zoom_by', param_type=GLib.VariantType('i'), callback=self._on_zoom_by) @@ -412,6 +417,11 @@ def _query_tooltip(self, _image: Gtk.Image, x: int, y: int, _keyboard_mode: bool if region: content += '\n{}\n\n{}\n'.format(str(region), escape(region.text)) + if region.text_conf: + content += '\n@text.conf={}'.format(region.text_conf) + + if region.coords_conf: + content += '\n@coords.conf={}'.format(region.coords_conf) if region.region_subtype: content += '\n@type: {}'.format(region.region_subtype) for attribute in [ @@ -530,3 +540,30 @@ def update_transformation(self) -> None: self.page_image.height ) self.highlight.queue_draw() + + def open_screenshotdialog(self, button: Gtk.Button) -> None: + if self.page_image is None: + return + + dialog = Gtk.FileChooserDialog(title="Save image under...", + parent=self.window, + action=Gtk.FileChooserAction.SAVE) + dialog.add_buttons(Gtk.STOCK_CANCEL, + Gtk.ResponseType.CANCEL, + Gtk.STOCK_SAVE, + Gtk.ResponseType.OK) + filter_png = Gtk.FileFilter() + filter_png.set_name("PNG image files") + filter_png.add_mime_type("image/png") + dialog.add_filter(filter_png) + dialog.set_current_name("untitled.png") + + response = dialog.run() + if response == Gtk.ResponseType.OK: + filename = dialog.get_filename() + else: + filename = '' + + dialog.destroy() + if filename: + self.page_image.save(filename) diff --git a/setup.cfg b/setup.cfg index b6caca3..a4a5d0c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,6 @@ [flake8] ignore=E501 +exclude=tests/assets/__init__.py [mypy] warn_return_any = True diff --git a/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-IMG-BIN/PAGE_2017.tif b/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-IMG-BIN/PAGE_2017.tif new file mode 100644 index 0000000..cc1fbc5 Binary files /dev/null and b/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-IMG-BIN/PAGE_2017.tif differ diff --git a/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-IMG-BIN/PAGE_2018.tif b/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-IMG-BIN/PAGE_2018.tif new file mode 100644 index 0000000..cc1fbc5 Binary files /dev/null and b/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-IMG-BIN/PAGE_2018.tif differ diff --git a/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-IMG-BIN/PAGE_2019.tif b/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-IMG-BIN/PAGE_2019.tif new file mode 100644 index 0000000..cc1fbc5 Binary files /dev/null and b/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-IMG-BIN/PAGE_2019.tif differ diff --git a/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-IMG/PAGE_2017.jpg b/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-IMG/PAGE_2017.jpg new file mode 100644 index 0000000..7281633 Binary files /dev/null and b/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-IMG/PAGE_2017.jpg differ diff --git a/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-IMG/PAGE_2018.jpg b/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-IMG/PAGE_2018.jpg new file mode 100644 index 0000000..7281633 Binary files /dev/null and b/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-IMG/PAGE_2018.jpg differ diff --git a/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-IMG/PAGE_2019.jpg b/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-IMG/PAGE_2019.jpg new file mode 100644 index 0000000..7281633 Binary files /dev/null and b/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-IMG/PAGE_2019.jpg differ diff --git a/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-PAGE/PAGE_2017.xml b/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-PAGE/PAGE_2017.xml new file mode 100644 index 0000000..4c80371 --- /dev/null +++ b/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-PAGE/PAGE_2017.xml @@ -0,0 +1,3252 @@ + + + + PRImA Research Lab + 2015-07-17T15:27:13 + 2017-07-14T10:03:33 + Example Page + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A + + + + l + + + + e + + + + t + + + + h + + + + e + + + + i + + + + a + + Aletheia + + + + + + D + + + + o + + + + c + + + + u + + + + m + + + + e + + + + n + + + + t + + Document + + + + + + A + + + + n + + + + a + + + + l + + + + y + + + + s + + + + i + + + + s + + Analysis + + + + + + S + + + + y + + + + s + + + + t + + + + e + + + + m + + System + + Aletheia Document Analysis System + + Aletheia Document Analysis System + + + + + + + + + + + + O + + + + v + + + + e + + + + r + + + + v + + + + i + + + + e + + + + w + + + + : + + Overview: + + + + + + A + + + + l + + + + e + + + + t + + + + h + + + + e + + + + i + + + + a + + Aletheia + + + + + + i + + + + s + + is + + + + + + a + + + + n + + an + + + + + + a + + + + d + + + + - + + ad- + + Overview: Aletheia is an ad- + + + + + + vanced + + + + system + + + + for + + + + accurate + + + + and + + + + yet + + vanced system for accurate and yet + + + + + + cost-effective + + + + ground + + + + truthing + + + + of + + cost-effective ground truthing of + + + + + + large + + + + amounts + + + + of + + + + documents. + + + + It + + + + aids + + large amounts of documents. It aids + + + + + + the + + + + user + + + + with + + + + a + + + + number + + + + of + + + + automated + + the user with a number of automated + + + + + + and + + + + semi-automated + + + + tools + + + + which + + and semi-automated tools which + + + + + + were + + + + partly + + + + developed + + + + and + + + + improved + + were partly developed and improved + + + + + + based + + + + on + + + + feedback + + + + from + + + + major + + + + librar- + + based on feedback from major librar- + + + + + + ies + + + + across + + + + Europe + + + + and + + + + from + + + + their + + + + digit- + + ies across Europe and from their digit- + + + + + + isation + + + + service + + + + providers + + + + which + + + + are + + + + us- + + isation service providers which are us- + + + + + + ing + + + + the + + + + tool + + + + in + + + + a + + + + production + + + + environ- + + ing the tool in a production environ- + + + + + + ment. + + ment. + + Overview: Aletheia is an ad- +vanced system for accurate and yet +cost-effective ground truthing of +large amounts of documents. It aids +the user with a number of automated +and semi-automated tools which +were partly developed and improved +based on feedback from major librar- +ies across Europe and from their digit- +isation service providers which are us- +ing the tool in a production environ- +ment. + + + + + + + + Novel + + + + features + + + + are, + + + + among + + + + others, + + + + the + + Novel features are, among others, the + + + + + + support + + + + of + + + + top-down + + + + ground + + + + truthing + + support of top-down ground truthing + + + + + + with + + + + sophisticated + + + + split + + + + and + + + + shrink + + + + tools + + with sophisticated split and shrink tools + + + + + + as + + + + well + + + + as + + + + bottom-up + + + + ground + + + + truthing + + as well as bottom-up ground truthing + + + + + + supporting + + + + the + + + + aggregation + + + + of + + + + lower-level + + supporting the aggregation of lower-level + + + + + + elements + + + + to + + + + more + + + + complex + + + + structures. + + elements to more complex structures. + + + + + + Special + + + + features + + + + have + + + + been + + + + developed + + + + to + + Special features have been developed to + + + + + + support + + + + working + + + + with + + + + the + + + + complexities + + + + of + + support working with the complexities of + + + + + + historical + + + + documents. + + + + The + + + + integrated + + + + vali- + + historical documents. The integrated vali- + + + + + + dator, + + + + in + + + + combination + + + + with + + + + powerful + + + + cor- + + dator, in combination with powerful cor- + + + + + + rection + + + + tools, + + + + enable + + + + efficient + + + + production + + rection tools, enable efficient production + + + + + + of + + + + highly + + + + accurate + + + + ground + + + + truth. + + of highly accurate ground truth. + + Novel features are, among others, the +support of top-down ground truthing +with sophisticated split and shrink tools +as well as bottom-up ground truthing +supporting the aggregation of lower-level +elements to more complex structures. +Special features have been developed to +support working with the complexities of +historical documents. The integrated vali- +dator, in combination with powerful cor- +rection tools, enable efficient production +of highly accurate ground truth. + + + + + + + + Aletheia + + + + uses + + + + the + + + + PAGE + + + + (Page + + + + Analysis + + Aletheia uses the PAGE (Page Analysis + + + + + + and + + + + Ground + + + + truth + + + + Elements) + + + + XML + + + + format + + and Ground truth Elements) XML format + + + + + + framework + + + + which + + + + incorporates + + + + several + + framework which incorporates several + + + + + + XML + + + + schemas + + + + representing + + + + the + + + + whole + + XML schemas representing the whole + + + + + + workflow + + + + of + + + + document + + + + analysis. + + + + See + + + + also + + workflow of document analysis. See also + + + + + + the + + + + dedicated + + + + infobox. + + the dedicated infobox. + + Aletheia uses the PAGE (Page Analysis +and Ground truth Elements) XML format +framework which incorporates several +XML schemas representing the whole +workflow of document analysis. See also +the dedicated infobox. + + + + + + + + + + Layers + + + + and + + + + reading + + + + order + + Layers and reading order + + Layers and reading order + + + + + + + + + + Screenshot + + + + of + + + + Aletheia + + + + showing + + + + regions + + + + and + + + + properties + + Screenshot of Aletheia showing regions and properties + + Screenshot of Aletheia showing regions and properties + + + + + + + + + + The + + + + PAGE + + + + (Page + + + + Analysis + + + + and + + + + Ground + + The PAGE (Page Analysis and Ground + + + + + + truth + + + + Elements) + + + + format + + + + framework + + + + incorpo- + + truth Elements) format framework incorpo- + + + + + + rates + + + + several + + + + XML + + + + schemas + + + + representing + + + + the + + rates several XML schemas representing the + + + + + + whole + + + + workflow + + + + of + + + + document + + + + analysis, + + + + includ- + + whole workflow of document analysis, includ- + + + + + + ing + + + + image + + + + enhancement, + + + + binarisation, + + + + geo- + + ing image enhancement, binarisation, geo- + + + + + + metrical + + + + correction, + + + + layout + + + + analysis, + + + + layout + + metrical correction, layout analysis, layout + + + + + + evaluation + + + + and + + + + OCR. + + + + The + + + + here + + + + used + + + + schema + + evaluation and OCR. The here used schema + + + + + + for + + + + document + + + + layouts + + + + allows + + + + for + + + + polygonal + + for document layouts allows for polygonal + + + + + + regions + + + + with + + + + various + + + + attributes + + + + (including + + + + text + + regions with various attributes (including text + + + + + + content), + + + + reading + + + + order, + + + + layers + + + + and + + + + more. + + content), reading order, layers and more. + + The PAGE (Page Analysis and Ground +truth Elements) format framework incorpo- +rates several XML schemas representing the +whole workflow of document analysis, includ- +ing image enhancement, binarisation, geo- +metrical correction, layout analysis, layout +evaluation and OCR. The here used schema +for document layouts allows for polygonal +regions with various attributes (including text +content), reading order, layers and more. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From + + + + Scratch, + + + + Top-Down + + From Scratch, Top-Down + + From Scratch, Top-Down + + + + + + + + • + + + + Marking + + + + regions + + + + using + + + + man- + + • Marking regions using man- + + + + + + ual + + + + or + + + + semi-automated + + + + tools + + ual or semi-automated tools + + • Marking regions using man- +ual or semi-automated tools + + + + + + + + • + + + + Marking + + + + text + + + + kines + + + + with + + + + easy- + + • Marking text kines with easy- + + + + + + to-use + + + + split + + + + tools + + to-use split tools + + • Marking text kines with easy- +to-use split tools + + + + + + + + • + + + + Marking + + + + words + + + + with + + + + assistive + + • Marking words with assistive + + + + + + tools + + tools + + • Marking words with assistive +tools + + + + + + + + • + + + + Marking + + + + glyphs + + + + (characters) + + • Marking glyphs (characters) + + • Marking glyphs (characters) + + + + + + + + • + + + + Text + + + + transcription + + + + and + + + + propa- + + • Text transcription and propa- + + + + + + gation + + + + to + + + + any + + + + required + + + + level + + gation to any required level + + • Text transcription and propa- +gation to any required level + + + + + + + + • + + + + Reading + + + + order + + + + definition + + • Reading order definition + + • Reading order definition + + + + + + + + • + + + + Validation + + + + to + + + + reduce + + + + risk + + • Validation to reduce risk + + + + + + of + + + + mistakes + + of mistakes + + • Validation to reduce risk +of mistakes + + + + + + + + • + + + + Correcting + + + + text + + + + content + + • Correcting text content + + + + + + using + + + + rendered + + + + text + + + + over- + + using rendered text over- + + + + + + lay + + lay + + • Correcting text content +using rendered text over- +lay + + + + + + + + • + + + + Correcting + + + + layout + + + + using + + • Correcting layout using + + + + + + convenient + + + + tools + + + + such + + + + as + + convenient tools such as + + + + + + merge + + + + and + + + + split + + merge and split + + • Correcting layout using +convenient tools such as +merge and split + + + + + + + + • + + + + Automated + + + + page + + + + analysis + + • Automated page analysis + + + + + + with + + + + integrated + + + + Tesseract + + with integrated Tesseract + + + + + + OCR + + + + or + + + + opening + + + + externally + + OCR or opening externally + + + + + + generated + + + + result + + generated result + + • Automated page analysis +with integrated Tesseract +OCR or opening externally +generated result + + + + + + + + + + + + + + + + + + + + + + T + + + + y + + + + p + + + + i + + + + c + + + + l + + + + a + + Typical + + + + + + W + + + + o + + + + r + + + + k + + + + fl + + + + o + + + + s + + + + w + + Workflows + + Typical Workflows + + Typical Workflows + + + + + + + + Preproduction + + + + + + + + + Correction + + Preproduction + Correction + + Preproduction + Correction + + + + + + + + + + O + + + + t + + + + h + + + + e + + + + r + + Other + + + + + + S + + + + o + + + + f + + + + t + + + + w + + + + a + + + + r + + + + e + + Software + + + + + + T + + + + o + + + + o + + + + l + + + + s + + Tools + + + + + + b + + + + y + + by + + + + + + P + + + + R + + + + I + + + + A + + + + m + + PRImA + + Other Software Tools by PRImA + + Other Software Tools by PRImA + + + + + + + + + + + + + + + + + + + + Pattern + + + + Recognition + + + + and + + + + Image + + + + Analysis + + + + Research + + + + Lab, + + + + School + + + + of + + + + Computing, + + + + Science + + + + and + + + + Engineering, + + Pattern Recognition and Image Analysis Research Lab, School of Computing, Science and Engineering, + + + + + + University + + + + of + + + + Salford, + + + + Greater + + + + Manchester, + + + + United + + + + Kingdom, + + + + www.primaresearch.org + + University of Salford, Greater Manchester, United Kingdom, www.primaresearch.org + + Pattern Recognition and Image Analysis Research Lab, School of Computing, Science and Engineering, +University of Salford, Greater Manchester, United Kingdom, www.primaresearch.org + + + + + + + + WebAletheia + + + + Webapp + + WebAletheia Webapp + + WebAletheia Webapp + + + + + + + + Tesseract + + + + OCR + + + + to + + + + PAGE + + + + For + + + + Windows + + Tesseract OCR to PAGE For Windows + + Tesseract OCR to PAGE For Windows + + + + + + + + PAGE + + + + Libraries + + + + For + + + + Java + + + + and + + + + C++ + + PAGE Libraries For Java and C++ + + PAGE Libraries For Java and C++ + + + + + + + + Layout + + + + Evaluation + + + + Performance + + + + Analysis + + + + System + + Layout Evaluation Performance Analysis System + + Layout Evaluation Performance Analysis System + + + + + + + + A + + + + lightweight + + + + web-based + + + + version + + + + of + + + + the + + + + Aletheia + + A lightweight web-based version of the Aletheia + + + + + + ground + + + + truthing + + + + system. + + + + Ideal + + + + for + + + + customised + + ground truthing system. Ideal for customised + + + + + + workflows + + + + and + + + + crowdsourcing + + + + applications. + + + + Go + + + + to + + workflows and crowdsourcing applications. Go to + + + + + + the + + + + PRImA + + + + website + + + + to + + + + try + + + + it + + + + yourself. + + the PRImA website to try it yourself. + + A lightweight web-based version of the Aletheia +ground truthing system. Ideal for customised +workflows and crowdsourcing applications. Go to +the PRImA website to try it yourself. + + + + + + + + A + + + + command + + + + line + + + + tool + + + + to + + + + analyse + + + + document + + + + page + + A command line tool to analyse document page + + + + + + images + + + + using + + + + the + + + + open + + + + source + + + + OCR + + + + engine + + + + Tesser- + + images using the open source OCR engine Tesser- + + + + + + act + + + + and + + + + save + + + + the + + + + results + + + + to + + + + PAGE + + + + XML + + + + format. + + act and save the results to PAGE XML format. + + + + + + Version + + + + 1.3 + + + + is + + + + based + + + + on + + + + the + + + + latest + + + + release + + + + of + + + + Tesser- + + Version 1.3 is based on the latest release of Tesser- + + + + + + act + + + + (3.03). + + act (3.03). + + A command line tool to analyse document page +images using the open source OCR engine Tesser- +act and save the results to PAGE XML format. +Version 1.3 is based on the latest release of Tesser- +act (3.03). + + + + + + + + Platform + + + + independent + + + + libraries + + + + to + + + + create + + + + valid + + + + lay- + + Platform independent libraries to create valid lay- + + + + + + out + + + + descriptions + + + + in + + + + PAGE + + + + XML + + + + format. + + + + The + + + + libraries + + out descriptions in PAGE XML format. The libraries + + + + + + can + + + + be + + + + easily + + + + integrated + + + + in + + + + other + + + + software + + + + projects + + can be easily integrated in other software projects + + + + + + such + + + + as + + + + page + + + + segmentation + + + + methods + + + + for + + + + ICDAR + + such as page segmentation methods for ICDAR + + + + + + competitions. + + competitions. + + Platform independent libraries to create valid lay- +out descriptions in PAGE XML format. The libraries +can be easily integrated in other software projects +such as page segmentation methods for ICDAR +competitions. + + + + + + + + This + + + + tool + + + + is + + + + part + + + + of + + + + a + + + + framework + + + + for + + + + evaluating + + + + the + + This tool is part of a framework for evaluating the + + + + + + performance + + + + of + + + + layout + + + + analysis + + + + methods. + + + + It + + + + com- + + performance of layout analysis methods. It com- + + + + + + bines + + + + efficiency + + + + and + + + + accuracy + + + + by + + + + using + + + + a + + + + special + + bines efficiency and accuracy by using a special + + + + + + interval + + + + based + + + + geometric + + + + representation + + + + of + + + + regions. + + interval based geometric representation of regions. + + + + + + A + + + + wide + + + + range + + + + of + + + + sophisticated + + + + evaluation + + + + measures + + A wide range of sophisticated evaluation measures + + + + + + provide + + + + the + + + + means + + + + for + + + + a + + + + deep + + + + insight + + + + into + + + + the + + provide the means for a deep insight into the + + + + + + analysed + + + + systems, + + analysed systems, + + + + + + which + + + + goes + + + + far + + which goes far + + + + + + beyond + + + + simple + + beyond simple + + + + + + benchmarking. + + + + The + + benchmarking. The + + + + + + support + + + + of + + + + user- + + support of user- + + + + + + defined + + + + profiles + + defined profiles + + + + + + allows + + + + the + + + + tuning + + allows the tuning + + + + + + for + + + + any + + + + kind + + + + of + + for any kind of + + + + + + evaluation + + + + scenario + + evaluation scenario + + + + + + related + + + + to + + + + real + + related to real + + + + + + world + + + + applications. + + world applications. + + This tool is part of a framework for evaluating the +performance of layout analysis methods. It com- +bines efficiency and accuracy by using a special +interval based geometric representation of regions. +A wide range of sophisticated evaluation measures +provide the means for a deep insight into the +analysed systems, +which goes far +beyond simple +benchmarking. The +support of user- +defined profiles +allows the tuning +for any kind of +evaluation scenario +related to real +world applications. + + diff --git a/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-PAGE/PAGE_2018.xml b/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-PAGE/PAGE_2018.xml new file mode 100644 index 0000000..0bdcf8c --- /dev/null +++ b/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-PAGE/PAGE_2018.xml @@ -0,0 +1,3252 @@ + + + + PRImA Research Lab + 2015-07-17T15:27:13 + 2018-07-19T07:29:57 + Example Page + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A + + + + l + + + + e + + + + t + + + + h + + + + e + + + + i + + + + a + + Aletheia + + + + + + D + + + + o + + + + c + + + + u + + + + m + + + + e + + + + n + + + + t + + Document + + + + + + A + + + + n + + + + a + + + + l + + + + y + + + + s + + + + i + + + + s + + Analysis + + + + + + S + + + + y + + + + s + + + + t + + + + e + + + + m + + System + + Aletheia Document Analysis System + + Aletheia Document Analysis System + + + + + + + + + + + + O + + + + v + + + + e + + + + r + + + + v + + + + i + + + + e + + + + w + + + + : + + Overview: + + + + + + A + + + + l + + + + e + + + + t + + + + h + + + + e + + + + i + + + + a + + Aletheia + + + + + + i + + + + s + + is + + + + + + a + + + + n + + an + + + + + + a + + + + d + + + + - + + ad- + + Overview: Aletheia is an ad- + + + + + + vanced + + + + system + + + + for + + + + accurate + + + + and + + + + yet + + vanced system for accurate and yet + + + + + + cost-effective + + + + ground + + + + truthing + + + + of + + cost-effective ground truthing of + + + + + + large + + + + amounts + + + + of + + + + documents. + + + + It + + + + aids + + large amounts of documents. It aids + + + + + + the + + + + user + + + + with + + + + a + + + + number + + + + of + + + + automated + + the user with a number of automated + + + + + + and + + + + semi-automated + + + + tools + + + + which + + and semi-automated tools which + + + + + + were + + + + partly + + + + developed + + + + and + + + + improved + + were partly developed and improved + + + + + + based + + + + on + + + + feedback + + + + from + + + + major + + + + librar- + + based on feedback from major librar- + + + + + + ies + + + + across + + + + Europe + + + + and + + + + from + + + + their + + + + digit- + + ies across Europe and from their digit- + + + + + + isation + + + + service + + + + providers + + + + which + + + + are + + + + us- + + isation service providers which are us- + + + + + + ing + + + + the + + + + tool + + + + in + + + + a + + + + production + + + + environ- + + ing the tool in a production environ- + + + + + + ment. + + ment. + + Overview: Aletheia is an ad- +vanced system for accurate and yet +cost-effective ground truthing of +large amounts of documents. It aids +the user with a number of automated +and semi-automated tools which +were partly developed and improved +based on feedback from major librar- +ies across Europe and from their digit- +isation service providers which are us- +ing the tool in a production environ- +ment. + + + + + + + + Novel + + + + features + + + + are, + + + + among + + + + others, + + + + the + + Novel features are, among others, the + + + + + + support + + + + of + + + + top-down + + + + ground + + + + truthing + + support of top-down ground truthing + + + + + + with + + + + sophisticated + + + + split + + + + and + + + + shrink + + + + tools + + with sophisticated split and shrink tools + + + + + + as + + + + well + + + + as + + + + bottom-up + + + + ground + + + + truthing + + as well as bottom-up ground truthing + + + + + + supporting + + + + the + + + + aggregation + + + + of + + + + lower-level + + supporting the aggregation of lower-level + + + + + + elements + + + + to + + + + more + + + + complex + + + + structures. + + elements to more complex structures. + + + + + + Special + + + + features + + + + have + + + + been + + + + developed + + + + to + + Special features have been developed to + + + + + + support + + + + working + + + + with + + + + the + + + + complexities + + + + of + + support working with the complexities of + + + + + + historical + + + + documents. + + + + The + + + + integrated + + + + vali- + + historical documents. The integrated vali- + + + + + + dator, + + + + in + + + + combination + + + + with + + + + powerful + + + + cor- + + dator, in combination with powerful cor- + + + + + + rection + + + + tools, + + + + enable + + + + efficient + + + + production + + rection tools, enable efficient production + + + + + + of + + + + highly + + + + accurate + + + + ground + + + + truth. + + of highly accurate ground truth. + + Novel features are, among others, the +support of top-down ground truthing +with sophisticated split and shrink tools +as well as bottom-up ground truthing +supporting the aggregation of lower-level +elements to more complex structures. +Special features have been developed to +support working with the complexities of +historical documents. The integrated vali- +dator, in combination with powerful cor- +rection tools, enable efficient production +of highly accurate ground truth. + + + + + + + + Aletheia + + + + uses + + + + the + + + + PAGE + + + + (Page + + + + Analysis + + Aletheia uses the PAGE (Page Analysis + + + + + + and + + + + Ground + + + + truth + + + + Elements) + + + + XML + + + + format + + and Ground truth Elements) XML format + + + + + + framework + + + + which + + + + incorporates + + + + several + + framework which incorporates several + + + + + + XML + + + + schemas + + + + representing + + + + the + + + + whole + + XML schemas representing the whole + + + + + + workflow + + + + of + + + + document + + + + analysis. + + + + See + + + + also + + workflow of document analysis. See also + + + + + + the + + + + dedicated + + + + infobox. + + the dedicated infobox. + + Aletheia uses the PAGE (Page Analysis +and Ground truth Elements) XML format +framework which incorporates several +XML schemas representing the whole +workflow of document analysis. See also +the dedicated infobox. + + + + + + + + + + Layers + + + + and + + + + reading + + + + order + + Layers and reading order + + Layers and reading order + + + + + + + + + + Screenshot + + + + of + + + + Aletheia + + + + showing + + + + regions + + + + and + + + + properties + + Screenshot of Aletheia showing regions and properties + + Screenshot of Aletheia showing regions and properties + + + + + + + + + + The + + + + PAGE + + + + (Page + + + + Analysis + + + + and + + + + Ground + + The PAGE (Page Analysis and Ground + + + + + + truth + + + + Elements) + + + + format + + + + framework + + + + incorpo- + + truth Elements) format framework incorpo- + + + + + + rates + + + + several + + + + XML + + + + schemas + + + + representing + + + + the + + rates several XML schemas representing the + + + + + + whole + + + + workflow + + + + of + + + + document + + + + analysis, + + + + includ- + + whole workflow of document analysis, includ- + + + + + + ing + + + + image + + + + enhancement, + + + + binarisation, + + + + geo- + + ing image enhancement, binarisation, geo- + + + + + + metrical + + + + correction, + + + + layout + + + + analysis, + + + + layout + + metrical correction, layout analysis, layout + + + + + + evaluation + + + + and + + + + OCR. + + + + The + + + + here + + + + used + + + + schema + + evaluation and OCR. The here used schema + + + + + + for + + + + document + + + + layouts + + + + allows + + + + for + + + + polygonal + + for document layouts allows for polygonal + + + + + + regions + + + + with + + + + various + + + + attributes + + + + (including + + + + text + + regions with various attributes (including text + + + + + + content), + + + + reading + + + + order, + + + + layers + + + + and + + + + more. + + content), reading order, layers and more. + + The PAGE (Page Analysis and Ground +truth Elements) format framework incorpo- +rates several XML schemas representing the +whole workflow of document analysis, includ- +ing image enhancement, binarisation, geo- +metrical correction, layout analysis, layout +evaluation and OCR. The here used schema +for document layouts allows for polygonal +regions with various attributes (including text +content), reading order, layers and more. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From + + + + Scratch, + + + + Top-Down + + From Scratch, Top-Down + + From Scratch, Top-Down + + + + + + + + • + + + + Marking + + + + regions + + + + using + + + + man- + + • Marking regions using man- + + + + + + ual + + + + or + + + + semi-automated + + + + tools + + ual or semi-automated tools + + • Marking regions using man- +ual or semi-automated tools + + + + + + + + • + + + + Marking + + + + text + + + + kines + + + + with + + + + easy- + + • Marking text kines with easy- + + + + + + to-use + + + + split + + + + tools + + to-use split tools + + • Marking text kines with easy- +to-use split tools + + + + + + + + • + + + + Marking + + + + words + + + + with + + + + assistive + + • Marking words with assistive + + + + + + tools + + tools + + • Marking words with assistive +tools + + + + + + + + • + + + + Marking + + + + glyphs + + + + (characters) + + • Marking glyphs (characters) + + • Marking glyphs (characters) + + + + + + + + • + + + + Text + + + + transcription + + + + and + + + + propa- + + • Text transcription and propa- + + + + + + gation + + + + to + + + + any + + + + required + + + + level + + gation to any required level + + • Text transcription and propa- +gation to any required level + + + + + + + + • + + + + Reading + + + + order + + + + definition + + • Reading order definition + + • Reading order definition + + + + + + + + • + + + + Validation + + + + to + + + + reduce + + + + risk + + • Validation to reduce risk + + + + + + of + + + + mistakes + + of mistakes + + • Validation to reduce risk +of mistakes + + + + + + + + • + + + + Correcting + + + + text + + + + content + + • Correcting text content + + + + + + using + + + + rendered + + + + text + + + + over- + + using rendered text over- + + + + + + lay + + lay + + • Correcting text content +using rendered text over- +lay + + + + + + + + • + + + + Correcting + + + + layout + + + + using + + • Correcting layout using + + + + + + convenient + + + + tools + + + + such + + + + as + + convenient tools such as + + + + + + merge + + + + and + + + + split + + merge and split + + • Correcting layout using +convenient tools such as +merge and split + + + + + + + + • + + + + Automated + + + + page + + + + analysis + + • Automated page analysis + + + + + + with + + + + integrated + + + + Tesseract + + with integrated Tesseract + + + + + + OCR + + + + or + + + + opening + + + + externally + + OCR or opening externally + + + + + + generated + + + + result + + generated result + + • Automated page analysis +with integrated Tesseract +OCR or opening externally +generated result + + + + + + + + + + + + + + + + + + + + + + T + + + + y + + + + p + + + + i + + + + c + + + + l + + + + a + + Typical + + + + + + W + + + + o + + + + r + + + + k + + + + fl + + + + o + + + + s + + + + w + + Workflows + + Typical Workflows + + Typical Workflows + + + + + + + + Preproduction + + + + + + + + + Correction + + Preproduction + Correction + + Preproduction + Correction + + + + + + + + + + O + + + + t + + + + h + + + + e + + + + r + + Other + + + + + + S + + + + o + + + + f + + + + t + + + + w + + + + a + + + + r + + + + e + + Software + + + + + + T + + + + o + + + + o + + + + l + + + + s + + Tools + + + + + + b + + + + y + + by + + + + + + P + + + + R + + + + I + + + + A + + + + m + + PRImA + + Other Software Tools by PRImA + + Other Software Tools by PRImA + + + + + + + + + + + + + + + + + + + + Pattern + + + + Recognition + + + + and + + + + Image + + + + Analysis + + + + Research + + + + Lab, + + + + School + + + + of + + + + Computing, + + + + Science + + + + and + + + + Engineering, + + Pattern Recognition and Image Analysis Research Lab, School of Computing, Science and Engineering, + + + + + + University + + + + of + + + + Salford, + + + + Greater + + + + Manchester, + + + + United + + + + Kingdom, + + + + www.primaresearch.org + + University of Salford, Greater Manchester, United Kingdom, www.primaresearch.org + + Pattern Recognition and Image Analysis Research Lab, School of Computing, Science and Engineering, +University of Salford, Greater Manchester, United Kingdom, www.primaresearch.org + + + + + + + + WebAletheia + + + + Webapp + + WebAletheia Webapp + + WebAletheia Webapp + + + + + + + + Tesseract + + + + OCR + + + + to + + + + PAGE + + + + For + + + + Windows + + Tesseract OCR to PAGE For Windows + + Tesseract OCR to PAGE For Windows + + + + + + + + PAGE + + + + Libraries + + + + For + + + + Java + + + + and + + + + C++ + + PAGE Libraries For Java and C++ + + PAGE Libraries For Java and C++ + + + + + + + + Layout + + + + Evaluation + + + + Performance + + + + Analysis + + + + System + + Layout Evaluation Performance Analysis System + + Layout Evaluation Performance Analysis System + + + + + + + + A + + + + lightweight + + + + web-based + + + + version + + + + of + + + + the + + + + Aletheia + + A lightweight web-based version of the Aletheia + + + + + + ground + + + + truthing + + + + system. + + + + Ideal + + + + for + + + + customised + + ground truthing system. Ideal for customised + + + + + + workflows + + + + and + + + + crowdsourcing + + + + applications. + + + + Go + + + + to + + workflows and crowdsourcing applications. Go to + + + + + + the + + + + PRImA + + + + website + + + + to + + + + try + + + + it + + + + yourself. + + the PRImA website to try it yourself. + + A lightweight web-based version of the Aletheia +ground truthing system. Ideal for customised +workflows and crowdsourcing applications. Go to +the PRImA website to try it yourself. + + + + + + + + A + + + + command + + + + line + + + + tool + + + + to + + + + analyse + + + + document + + + + page + + A command line tool to analyse document page + + + + + + images + + + + using + + + + the + + + + open + + + + source + + + + OCR + + + + engine + + + + Tesser- + + images using the open source OCR engine Tesser- + + + + + + act + + + + and + + + + save + + + + the + + + + results + + + + to + + + + PAGE + + + + XML + + + + format. + + act and save the results to PAGE XML format. + + + + + + Version + + + + 1.3 + + + + is + + + + based + + + + on + + + + the + + + + latest + + + + release + + + + of + + + + Tesser- + + Version 1.3 is based on the latest release of Tesser- + + + + + + act + + + + (3.03). + + act (3.03). + + A command line tool to analyse document page +images using the open source OCR engine Tesser- +act and save the results to PAGE XML format. +Version 1.3 is based on the latest release of Tesser- +act (3.03). + + + + + + + + Platform + + + + independent + + + + libraries + + + + to + + + + create + + + + valid + + + + lay- + + Platform independent libraries to create valid lay- + + + + + + out + + + + descriptions + + + + in + + + + PAGE + + + + XML + + + + format. + + + + The + + + + libraries + + out descriptions in PAGE XML format. The libraries + + + + + + can + + + + be + + + + easily + + + + integrated + + + + in + + + + other + + + + software + + + + projects + + can be easily integrated in other software projects + + + + + + such + + + + as + + + + page + + + + segmentation + + + + methods + + + + for + + + + ICDAR + + such as page segmentation methods for ICDAR + + + + + + competitions. + + competitions. + + Platform independent libraries to create valid lay- +out descriptions in PAGE XML format. The libraries +can be easily integrated in other software projects +such as page segmentation methods for ICDAR +competitions. + + + + + + + + This + + + + tool + + + + is + + + + part + + + + of + + + + a + + + + framework + + + + for + + + + evaluating + + + + the + + This tool is part of a framework for evaluating the + + + + + + performance + + + + of + + + + layout + + + + analysis + + + + methods. + + + + It + + + + com- + + performance of layout analysis methods. It com- + + + + + + bines + + + + efficiency + + + + and + + + + accuracy + + + + by + + + + using + + + + a + + + + special + + bines efficiency and accuracy by using a special + + + + + + interval + + + + based + + + + geometric + + + + representation + + + + of + + + + regions. + + interval based geometric representation of regions. + + + + + + A + + + + wide + + + + range + + + + of + + + + sophisticated + + + + evaluation + + + + measures + + A wide range of sophisticated evaluation measures + + + + + + provide + + + + the + + + + means + + + + for + + + + a + + + + deep + + + + insight + + + + into + + + + the + + provide the means for a deep insight into the + + + + + + analysed + + + + systems, + + analysed systems, + + + + + + which + + + + goes + + + + far + + which goes far + + + + + + beyond + + + + simple + + beyond simple + + + + + + benchmarking. + + + + The + + benchmarking. The + + + + + + support + + + + of + + + + user- + + support of user- + + + + + + defined + + + + profiles + + defined profiles + + + + + + allows + + + + the + + + + tuning + + allows the tuning + + + + + + for + + + + any + + + + kind + + + + of + + for any kind of + + + + + + evaluation + + + + scenario + + evaluation scenario + + + + + + related + + + + to + + + + real + + related to real + + + + + + world + + + + applications. + + world applications. + + This tool is part of a framework for evaluating the +performance of layout analysis methods. It com- +bines efficiency and accuracy by using a special +interval based geometric representation of regions. +A wide range of sophisticated evaluation measures +provide the means for a deep insight into the +analysed systems, +which goes far +beyond simple +benchmarking. The +support of user- +defined profiles +allows the tuning +for any kind of +evaluation scenario +related to real +world applications. + + diff --git a/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-PAGE/PAGE_2019.xml b/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-PAGE/PAGE_2019.xml new file mode 100644 index 0000000..531349b --- /dev/null +++ b/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-PAGE/PAGE_2019.xml @@ -0,0 +1,3252 @@ + + + + PRImA Research Lab + 2015-07-17T15:27:13 + 2018-07-19T07:29:57 + Example Page + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A + + + + l + + + + e + + + + t + + + + h + + + + e + + + + i + + + + a + + Aletheia + + + + + + D + + + + o + + + + c + + + + u + + + + m + + + + e + + + + n + + + + t + + Document + + + + + + A + + + + n + + + + a + + + + l + + + + y + + + + s + + + + i + + + + s + + Analysis + + + + + + S + + + + y + + + + s + + + + t + + + + e + + + + m + + System + + Aletheia Document Analysis System + + Aletheia Document Analysis System + + + + + + + + + + + + O + + + + v + + + + e + + + + r + + + + v + + + + i + + + + e + + + + w + + + + : + + Overview: + + + + + + A + + + + l + + + + e + + + + t + + + + h + + + + e + + + + i + + + + a + + Aletheia + + + + + + i + + + + s + + is + + + + + + a + + + + n + + an + + + + + + a + + + + d + + + + - + + ad- + + Overview: Aletheia is an ad- + + + + + + vanced + + + + system + + + + for + + + + accurate + + + + and + + + + yet + + vanced system for accurate and yet + + + + + + cost-effective + + + + ground + + + + truthing + + + + of + + cost-effective ground truthing of + + + + + + large + + + + amounts + + + + of + + + + documents. + + + + It + + + + aids + + large amounts of documents. It aids + + + + + + the + + + + user + + + + with + + + + a + + + + number + + + + of + + + + automated + + the user with a number of automated + + + + + + and + + + + semi-automated + + + + tools + + + + which + + and semi-automated tools which + + + + + + were + + + + partly + + + + developed + + + + and + + + + improved + + were partly developed and improved + + + + + + based + + + + on + + + + feedback + + + + from + + + + major + + + + librar- + + based on feedback from major librar- + + + + + + ies + + + + across + + + + Europe + + + + and + + + + from + + + + their + + + + digit- + + ies across Europe and from their digit- + + + + + + isation + + + + service + + + + providers + + + + which + + + + are + + + + us- + + isation service providers which are us- + + + + + + ing + + + + the + + + + tool + + + + in + + + + a + + + + production + + + + environ- + + ing the tool in a production environ- + + + + + + ment. + + ment. + + Overview: Aletheia is an ad- +vanced system for accurate and yet +cost-effective ground truthing of +large amounts of documents. It aids +the user with a number of automated +and semi-automated tools which +were partly developed and improved +based on feedback from major librar- +ies across Europe and from their digit- +isation service providers which are us- +ing the tool in a production environ- +ment. + + + + + + + + Novel + + + + features + + + + are, + + + + among + + + + others, + + + + the + + Novel features are, among others, the + + + + + + support + + + + of + + + + top-down + + + + ground + + + + truthing + + support of top-down ground truthing + + + + + + with + + + + sophisticated + + + + split + + + + and + + + + shrink + + + + tools + + with sophisticated split and shrink tools + + + + + + as + + + + well + + + + as + + + + bottom-up + + + + ground + + + + truthing + + as well as bottom-up ground truthing + + + + + + supporting + + + + the + + + + aggregation + + + + of + + + + lower-level + + supporting the aggregation of lower-level + + + + + + elements + + + + to + + + + more + + + + complex + + + + structures. + + elements to more complex structures. + + + + + + Special + + + + features + + + + have + + + + been + + + + developed + + + + to + + Special features have been developed to + + + + + + support + + + + working + + + + with + + + + the + + + + complexities + + + + of + + support working with the complexities of + + + + + + historical + + + + documents. + + + + The + + + + integrated + + + + vali- + + historical documents. The integrated vali- + + + + + + dator, + + + + in + + + + combination + + + + with + + + + powerful + + + + cor- + + dator, in combination with powerful cor- + + + + + + rection + + + + tools, + + + + enable + + + + efficient + + + + production + + rection tools, enable efficient production + + + + + + of + + + + highly + + + + accurate + + + + ground + + + + truth. + + of highly accurate ground truth. + + Novel features are, among others, the +support of top-down ground truthing +with sophisticated split and shrink tools +as well as bottom-up ground truthing +supporting the aggregation of lower-level +elements to more complex structures. +Special features have been developed to +support working with the complexities of +historical documents. The integrated vali- +dator, in combination with powerful cor- +rection tools, enable efficient production +of highly accurate ground truth. + + + + + + + + Aletheia + + + + uses + + + + the + + + + PAGE + + + + (Page + + + + Analysis + + Aletheia uses the PAGE (Page Analysis + + + + + + and + + + + Ground + + + + truth + + + + Elements) + + + + XML + + + + format + + and Ground truth Elements) XML format + + + + + + framework + + + + which + + + + incorporates + + + + several + + framework which incorporates several + + + + + + XML + + + + schemas + + + + representing + + + + the + + + + whole + + XML schemas representing the whole + + + + + + workflow + + + + of + + + + document + + + + analysis. + + + + See + + + + also + + workflow of document analysis. See also + + + + + + the + + + + dedicated + + + + infobox. + + the dedicated infobox. + + Aletheia uses the PAGE (Page Analysis +and Ground truth Elements) XML format +framework which incorporates several +XML schemas representing the whole +workflow of document analysis. See also +the dedicated infobox. + + + + + + + + + + Layers + + + + and + + + + reading + + + + order + + Layers and reading order + + Layers and reading order + + + + + + + + + + Screenshot + + + + of + + + + Aletheia + + + + showing + + + + regions + + + + and + + + + properties + + Screenshot of Aletheia showing regions and properties + + Screenshot of Aletheia showing regions and properties + + + + + + + + + + The + + + + PAGE + + + + (Page + + + + Analysis + + + + and + + + + Ground + + The PAGE (Page Analysis and Ground + + + + + + truth + + + + Elements) + + + + format + + + + framework + + + + incorpo- + + truth Elements) format framework incorpo- + + + + + + rates + + + + several + + + + XML + + + + schemas + + + + representing + + + + the + + rates several XML schemas representing the + + + + + + whole + + + + workflow + + + + of + + + + document + + + + analysis, + + + + includ- + + whole workflow of document analysis, includ- + + + + + + ing + + + + image + + + + enhancement, + + + + binarisation, + + + + geo- + + ing image enhancement, binarisation, geo- + + + + + + metrical + + + + correction, + + + + layout + + + + analysis, + + + + layout + + metrical correction, layout analysis, layout + + + + + + evaluation + + + + and + + + + OCR. + + + + The + + + + here + + + + used + + + + schema + + evaluation and OCR. The here used schema + + + + + + for + + + + document + + + + layouts + + + + allows + + + + for + + + + polygonal + + for document layouts allows for polygonal + + + + + + regions + + + + with + + + + various + + + + attributes + + + + (including + + + + text + + regions with various attributes (including text + + + + + + content), + + + + reading + + + + order, + + + + layers + + + + and + + + + more. + + content), reading order, layers and more. + + The PAGE (Page Analysis and Ground +truth Elements) format framework incorpo- +rates several XML schemas representing the +whole workflow of document analysis, includ- +ing image enhancement, binarisation, geo- +metrical correction, layout analysis, layout +evaluation and OCR. The here used schema +for document layouts allows for polygonal +regions with various attributes (including text +content), reading order, layers and more. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From + + + + Scratch, + + + + Top-Down + + From Scratch, Top-Down + + From Scratch, Top-Down + + + + + + + + • + + + + Marking + + + + regions + + + + using + + + + man- + + • Marking regions using man- + + + + + + ual + + + + or + + + + semi-automated + + + + tools + + ual or semi-automated tools + + • Marking regions using man- +ual or semi-automated tools + + + + + + + + • + + + + Marking + + + + text + + + + kines + + + + with + + + + easy- + + • Marking text kines with easy- + + + + + + to-use + + + + split + + + + tools + + to-use split tools + + • Marking text kines with easy- +to-use split tools + + + + + + + + • + + + + Marking + + + + words + + + + with + + + + assistive + + • Marking words with assistive + + + + + + tools + + tools + + • Marking words with assistive +tools + + + + + + + + • + + + + Marking + + + + glyphs + + + + (characters) + + • Marking glyphs (characters) + + • Marking glyphs (characters) + + + + + + + + • + + + + Text + + + + transcription + + + + and + + + + propa- + + • Text transcription and propa- + + + + + + gation + + + + to + + + + any + + + + required + + + + level + + gation to any required level + + • Text transcription and propa- +gation to any required level + + + + + + + + • + + + + Reading + + + + order + + + + definition + + • Reading order definition + + • Reading order definition + + + + + + + + • + + + + Validation + + + + to + + + + reduce + + + + risk + + • Validation to reduce risk + + + + + + of + + + + mistakes + + of mistakes + + • Validation to reduce risk +of mistakes + + + + + + + + • + + + + Correcting + + + + text + + + + content + + • Correcting text content + + + + + + using + + + + rendered + + + + text + + + + over- + + using rendered text over- + + + + + + lay + + lay + + • Correcting text content +using rendered text over- +lay + + + + + + + + • + + + + Correcting + + + + layout + + + + using + + • Correcting layout using + + + + + + convenient + + + + tools + + + + such + + + + as + + convenient tools such as + + + + + + merge + + + + and + + + + split + + merge and split + + • Correcting layout using +convenient tools such as +merge and split + + + + + + + + • + + + + Automated + + + + page + + + + analysis + + • Automated page analysis + + + + + + with + + + + integrated + + + + Tesseract + + with integrated Tesseract + + + + + + OCR + + + + or + + + + opening + + + + externally + + OCR or opening externally + + + + + + generated + + + + result + + generated result + + • Automated page analysis +with integrated Tesseract +OCR or opening externally +generated result + + + + + + + + + + + + + + + + + + + + + + T + + + + y + + + + p + + + + i + + + + c + + + + l + + + + a + + Typical + + + + + + W + + + + o + + + + r + + + + k + + + + fl + + + + o + + + + s + + + + w + + Workflows + + Typical Workflows + + Typical Workflows + + + + + + + + Preproduction + + + + + + + + + Correction + + Preproduction + Correction + + Preproduction + Correction + + + + + + + + + + O + + + + t + + + + h + + + + e + + + + r + + Other + + + + + + S + + + + o + + + + f + + + + t + + + + w + + + + a + + + + r + + + + e + + Software + + + + + + T + + + + o + + + + o + + + + l + + + + s + + Tools + + + + + + b + + + + y + + by + + + + + + P + + + + R + + + + I + + + + A + + + + m + + PRImA + + Other Software Tools by PRImA + + Other Software Tools by PRImA + + + + + + + + + + + + + + + + + + + + Pattern + + + + Recognition + + + + and + + + + Image + + + + Analysis + + + + Research + + + + Lab, + + + + School + + + + of + + + + Computing, + + + + Science + + + + and + + + + Engineering, + + Pattern Recognition and Image Analysis Research Lab, School of Computing, Science and Engineering, + + + + + + University + + + + of + + + + Salford, + + + + Greater + + + + Manchester, + + + + United + + + + Kingdom, + + + + www.primaresearch.org + + University of Salford, Greater Manchester, United Kingdom, www.primaresearch.org + + Pattern Recognition and Image Analysis Research Lab, School of Computing, Science and Engineering, +University of Salford, Greater Manchester, United Kingdom, www.primaresearch.org + + + + + + + + WebAletheia + + + + Webapp + + WebAletheia Webapp + + WebAletheia Webapp + + + + + + + + Tesseract + + + + OCR + + + + to + + + + PAGE + + + + For + + + + Windows + + Tesseract OCR to PAGE For Windows + + Tesseract OCR to PAGE For Windows + + + + + + + + PAGE + + + + Libraries + + + + For + + + + Java + + + + and + + + + C++ + + PAGE Libraries For Java and C++ + + PAGE Libraries For Java and C++ + + + + + + + + Layout + + + + Evaluation + + + + Performance + + + + Analysis + + + + System + + Layout Evaluation Performance Analysis System + + Layout Evaluation Performance Analysis System + + + + + + + + A + + + + lightweight + + + + web-based + + + + version + + + + of + + + + the + + + + Aletheia + + A lightweight web-based version of the Aletheia + + + + + + ground + + + + truthing + + + + system. + + + + Ideal + + + + for + + + + customised + + ground truthing system. Ideal for customised + + + + + + workflows + + + + and + + + + crowdsourcing + + + + applications. + + + + Go + + + + to + + workflows and crowdsourcing applications. Go to + + + + + + the + + + + PRImA + + + + website + + + + to + + + + try + + + + it + + + + yourself. + + the PRImA website to try it yourself. + + A lightweight web-based version of the Aletheia +ground truthing system. Ideal for customised +workflows and crowdsourcing applications. Go to +the PRImA website to try it yourself. + + + + + + + + A + + + + command + + + + line + + + + tool + + + + to + + + + analyse + + + + document + + + + page + + A command line tool to analyse document page + + + + + + images + + + + using + + + + the + + + + open + + + + source + + + + OCR + + + + engine + + + + Tesser- + + images using the open source OCR engine Tesser- + + + + + + act + + + + and + + + + save + + + + the + + + + results + + + + to + + + + PAGE + + + + XML + + + + format. + + act and save the results to PAGE XML format. + + + + + + Version + + + + 1.3 + + + + is + + + + based + + + + on + + + + the + + + + latest + + + + release + + + + of + + + + Tesser- + + Version 1.3 is based on the latest release of Tesser- + + + + + + act + + + + (3.03). + + act (3.03). + + A command line tool to analyse document page +images using the open source OCR engine Tesser- +act and save the results to PAGE XML format. +Version 1.3 is based on the latest release of Tesser- +act (3.03). + + + + + + + + Platform + + + + independent + + + + libraries + + + + to + + + + create + + + + valid + + + + lay- + + Platform independent libraries to create valid lay- + + + + + + out + + + + descriptions + + + + in + + + + PAGE + + + + XML + + + + format. + + + + The + + + + libraries + + out descriptions in PAGE XML format. The libraries + + + + + + can + + + + be + + + + easily + + + + integrated + + + + in + + + + other + + + + software + + + + projects + + can be easily integrated in other software projects + + + + + + such + + + + as + + + + page + + + + segmentation + + + + methods + + + + for + + + + ICDAR + + such as page segmentation methods for ICDAR + + + + + + competitions. + + competitions. + + Platform independent libraries to create valid lay- +out descriptions in PAGE XML format. The libraries +can be easily integrated in other software projects +such as page segmentation methods for ICDAR +competitions. + + + + + + + + This + + + + tool + + + + is + + + + part + + + + of + + + + a + + + + framework + + + + for + + + + evaluating + + + + the + + This tool is part of a framework for evaluating the + + + + + + performance + + + + of + + + + layout + + + + analysis + + + + methods. + + + + It + + + + com- + + performance of layout analysis methods. It com- + + + + + + bines + + + + efficiency + + + + and + + + + accuracy + + + + by + + + + using + + + + a + + + + special + + bines efficiency and accuracy by using a special + + + + + + interval + + + + based + + + + geometric + + + + representation + + + + of + + + + regions. + + interval based geometric representation of regions. + + + + + + A + + + + wide + + + + range + + + + of + + + + sophisticated + + + + evaluation + + + + measures + + A wide range of sophisticated evaluation measures + + + + + + provide + + + + the + + + + means + + + + for + + + + a + + + + deep + + + + insight + + + + into + + + + the + + provide the means for a deep insight into the + + + + + + analysed + + + + systems, + + analysed systems, + + + + + + which + + + + goes + + + + far + + which goes far + + + + + + beyond + + + + simple + + beyond simple + + + + + + benchmarking. + + + + The + + benchmarking. The + + + + + + support + + + + of + + + + user- + + support of user- + + + + + + defined + + + + profiles + + defined profiles + + + + + + allows + + + + the + + + + tuning + + allows the tuning + + + + + + for + + + + any + + + + kind + + + + of + + for any kind of + + + + + + evaluation + + + + scenario + + evaluation scenario + + + + + + related + + + + to + + + + real + + related to real + + + + + + world + + + + applications. + + world applications. + + This tool is part of a framework for evaluating the +performance of layout analysis methods. It com- +bines efficiency and accuracy by using a special +interval based geometric representation of regions. +A wide range of sophisticated evaluation measures +provide the means for a deep insight into the +analysed systems, +which goes far +beyond simple +benchmarking. The +support of user- +defined profiles +allows the tuning +for any kind of +evaluation scenario +related to real +world applications. + + diff --git a/tests/example/workspaces/aletheiaexamplepage/mets.xml b/tests/example/workspaces/aletheiaexamplepage/mets.xml new file mode 100644 index 0000000..c0fa26a --- /dev/null +++ b/tests/example/workspaces/aletheiaexamplepage/mets.xml @@ -0,0 +1,73 @@ + + + + + ocrd/core v2.26.1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/model/test_page.py b/tests/model/test_page.py new file mode 100644 index 0000000..2b2ab85 --- /dev/null +++ b/tests/model/test_page.py @@ -0,0 +1,14 @@ +from tests import TestCase, TEST_BASE_PATH +from ocrd_browser.model import Document + + +class PageTestCase(TestCase): + + def setUp(self): + self.doc = Document.load(TEST_BASE_PATH / 'example/workspaces/aletheiaexamplepage/mets.xml') + + def test_xpath_works_with_different_namespaces(self): + for page_id in ['PAGE_2017', 'PAGE_2018', 'PAGE_2019']: + page = self.doc.page_for_id(page_id, 'OCR-D-GT-PAGE') + xpath_result = page.xpath('/page:PcGts/page:Page/@imageFilename') + self.assertGreater(len(xpath_result), 0) diff --git a/tests/util/test_image.py b/tests/util/test_image.py index 3cd2ed5..cda0ab0 100644 --- a/tests/util/test_image.py +++ b/tests/util/test_image.py @@ -31,7 +31,7 @@ def _image_modes(): class ImageUtilTestCase(TestCase): def test_pil_to_pixbuf_is_faster_via_opencv(self): - # self.skipTest('Slow test') + self.skipTest('Slow test') files = [ ASSETS_PATH / 'kant_aufklaerung_1784-binarized/data/OCR-D-IMG/OCR-D-IMG_0017.tif', ASSETS_PATH / 'kant_aufklaerung_1784-binarized/data/OCR-D-IMG-1BIT/OCR-D-IMG-1BIT_0017.png',