diff --git a/ocrd_browser/model/page.py b/ocrd_browser/model/page.py
index b02c19f..4607f17 100644
--- a/ocrd_browser/model/page.py
+++ b/ocrd_browser/model/page.py
@@ -53,12 +53,12 @@ def meta(self) -> MetadataType:
return self.pc_gts.get_Metadata()
def xpath(self, xpath: str) -> List[Element]:
- return cast(List[Element], self.xml_root.xpath(xpath, namespaces=NAMESPACES))
+ page_namespace = {'page': ns for ns in self.xml_root.nsmap.values() if ns.startswith('http://schema.primaresearch.org/PAGE/gts/pagecontent/')}
+ return cast(List[Element], self.xml_root.xpath(xpath, namespaces=dict(NAMESPACES, **page_namespace)))
@property
def xml_root(self) -> Element:
if self.pc_gts.gds_elementtree_node_ is None:
- from ocrd_models.constants import NAMESPACES
from ocrd_models.ocrd_page_generateds import parsexmlstring_
from io import StringIO
sio = StringIO()
diff --git a/ocrd_browser/model/page_xml_renderer.py b/ocrd_browser/model/page_xml_renderer.py
index c476b65..887c72a 100644
--- a/ocrd_browser/model/page_xml_renderer.py
+++ b/ocrd_browser/model/page_xml_renderer.py
@@ -127,6 +127,34 @@ def __init__(self, region: RegionWithCoords) -> None:
self._prep_poly: Optional[prepared.PreparedGeometry] = None
self.warnings: List[str] = []
+    @property
+    def coords_conf(self) -> Optional[float]:
+        """
+        Confidence of the region outline, read from the ``conf`` attribute of its ``Coords``.
+
+        Returns ``None`` when the region has no ``Coords`` (attribute missing or unset).
+        """
+        # hasattr() is not sufficient here: the attribute can exist and still be None,
+        # in which case ``.conf`` would raise an AttributeError.
+        coords = getattr(self.region, 'Coords', None)
+        return cast(Optional[float], coords.conf) if coords is not None else None
+
+    @property
+    def text_conf(self) -> Optional[float]:
+        """
+        Confidence of the first ``TextEquiv`` of a text-bearing region.
+
+        Returns ``None`` for region types without text, when there is no ``TextEquiv``
+        or when the first ``TextEquiv`` has no ``conf``. A confidence of ``0.0`` is a
+        valid value and is returned as such.
+        """
+        if isinstance(self.region, (TextRegionType, TextLineType, WordType, GlyphType)):
+            text_equivs = self.region.get_TextEquiv()
+            # compare against None so that a confidence of 0.0 is not mistaken for "missing"
+            if text_equivs and text_equivs[0].conf is not None:
+                return cast(float, text_equivs[0].conf)
+        return None
+
@property
def poly(self) -> Polygon:
return self._poly
diff --git a/ocrd_browser/view/page.py b/ocrd_browser/view/page.py
index a4c8436..caee336 100644
--- a/ocrd_browser/view/page.py
+++ b/ocrd_browser/view/page.py
@@ -124,7 +124,8 @@ def set_page(self, page: Page) -> None:
versions.append(ImageVersion.from_page(self.document, page))
alts: List[AlternativeImageType] = page.page.get_AlternativeImage()
for alt in alts:
- versions.append(ImageVersion.from_alternative_image(self.document, alt))
+ if self.document.path(alt.filename).exists():
+ versions.append(ImageVersion.from_alternative_image(self.document, alt))
with self.version_box.handler_block(self._change_handler):
self.versions.clear()
@@ -255,6 +256,10 @@ def build(self) -> None:
self.add_configurator('scale', ImageZoomSelector(2.0, 0.05, -4.0, 2.0))
self.add_configurator('image_version', ImageVersionSelector())
self.add_configurator('features', PageFeaturesSelector())
+ icon = Gtk.Image.new_from_icon_name('camera-photo', Gtk.IconSize.SMALL_TOOLBAR)
+ button = Gtk.Button(image=icon, visible=True, always_show_image=True, tooltip_text='Saves a screenshot of the current view')
+ button.connect('clicked', self.open_screenshotdialog)
+ self.action_bar.pack_start(button)
actions = ActionRegistry()
actions.create(name='zoom_by', param_type=GLib.VariantType('i'), callback=self._on_zoom_by)
@@ -412,6 +417,11 @@ def _query_tooltip(self, _image: Gtk.Image, x: int, y: int, _keyboard_mode: bool
if region:
content += '\n{}\n\n{}\n'.format(str(region), escape(region.text))
+ if region.text_conf:
+ content += '\n@text.conf={}'.format(region.text_conf)
+
+ if region.coords_conf:
+ content += '\n@coords.conf={}'.format(region.coords_conf)
if region.region_subtype:
content += '\n@type: {}'.format(region.region_subtype)
for attribute in [
@@ -530,3 +540,46 @@ def update_transformation(self) -> None:
self.page_image.height
)
self.highlight.queue_draw()
+
+    def open_screenshotdialog(self, button: Gtk.Button) -> None:
+        """
+        Ask for a target file and save the current page image to it.
+
+        Does nothing when no page image is loaded or when the dialog is cancelled.
+
+        :param button: the button whose ``clicked`` signal triggered the call (unused)
+        """
+        if self.page_image is None:
+            return
+
+        dialog = Gtk.FileChooserDialog(title="Save image under...",
+                                       parent=self.window,
+                                       action=Gtk.FileChooserAction.SAVE)
+        dialog.add_buttons(Gtk.STOCK_CANCEL,
+                           Gtk.ResponseType.CANCEL,
+                           Gtk.STOCK_SAVE,
+                           Gtk.ResponseType.OK)
+        # let GTK ask for confirmation before an existing file gets replaced
+        dialog.set_do_overwrite_confirmation(True)
+        filter_png = Gtk.FileFilter()
+        filter_png.set_name("PNG image files")
+        filter_png.add_mime_type("image/png")
+        dialog.add_filter(filter_png)
+        dialog.set_current_name("untitled.png")
+
+        try:
+            filename = dialog.get_filename() if dialog.run() == Gtk.ResponseType.OK else None
+        finally:
+            # destroy the dialog even when run() or get_filename() raises
+            dialog.destroy()
+
+        if not filename:
+            return
+
+        from os.path import splitext
+        if splitext(filename)[1]:
+            self.page_image.save(filename)
+        else:
+            # NOTE(review): assumes page_image is a PIL image, whose save() derives the format
+            # from the file extension and fails without one - TODO confirm
+            self.page_image.save(filename, format='PNG')
diff --git a/setup.cfg b/setup.cfg
index b6caca3..a4a5d0c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,6 @@
[flake8]
ignore=E501
+exclude=tests/assets/__init__.py
[mypy]
warn_return_any = True
diff --git a/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-IMG-BIN/PAGE_2017.tif b/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-IMG-BIN/PAGE_2017.tif
new file mode 100644
index 0000000..cc1fbc5
Binary files /dev/null and b/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-IMG-BIN/PAGE_2017.tif differ
diff --git a/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-IMG-BIN/PAGE_2018.tif b/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-IMG-BIN/PAGE_2018.tif
new file mode 100644
index 0000000..cc1fbc5
Binary files /dev/null and b/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-IMG-BIN/PAGE_2018.tif differ
diff --git a/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-IMG-BIN/PAGE_2019.tif b/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-IMG-BIN/PAGE_2019.tif
new file mode 100644
index 0000000..cc1fbc5
Binary files /dev/null and b/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-IMG-BIN/PAGE_2019.tif differ
diff --git a/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-IMG/PAGE_2017.jpg b/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-IMG/PAGE_2017.jpg
new file mode 100644
index 0000000..7281633
Binary files /dev/null and b/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-IMG/PAGE_2017.jpg differ
diff --git a/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-IMG/PAGE_2018.jpg b/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-IMG/PAGE_2018.jpg
new file mode 100644
index 0000000..7281633
Binary files /dev/null and b/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-IMG/PAGE_2018.jpg differ
diff --git a/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-IMG/PAGE_2019.jpg b/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-IMG/PAGE_2019.jpg
new file mode 100644
index 0000000..7281633
Binary files /dev/null and b/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-IMG/PAGE_2019.jpg differ
diff --git a/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-PAGE/PAGE_2017.xml b/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-PAGE/PAGE_2017.xml
new file mode 100644
index 0000000..4c80371
--- /dev/null
+++ b/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-PAGE/PAGE_2017.xml
@@ -0,0 +1,3252 @@
+
+
+
+ PRImA Research Lab
+ 2015-07-17T15:27:13
+ 2017-07-14T10:03:33
+ Example Page
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A
+
+
+
+ l
+
+
+
+ e
+
+
+
+ t
+
+
+
+ h
+
+
+
+ e
+
+
+
+ i
+
+
+
+ a
+
+ Aletheia
+
+
+
+
+
+ D
+
+
+
+ o
+
+
+
+ c
+
+
+
+ u
+
+
+
+ m
+
+
+
+ e
+
+
+
+ n
+
+
+
+ t
+
+ Document
+
+
+
+
+
+ A
+
+
+
+ n
+
+
+
+ a
+
+
+
+ l
+
+
+
+ y
+
+
+
+ s
+
+
+
+ i
+
+
+
+ s
+
+ Analysis
+
+
+
+
+
+ S
+
+
+
+ y
+
+
+
+ s
+
+
+
+ t
+
+
+
+ e
+
+
+
+ m
+
+ System
+
+ Aletheia Document Analysis System
+
+ Aletheia Document Analysis System
+
+
+
+
+
+
+
+
+
+
+
+ O
+
+
+
+ v
+
+
+
+ e
+
+
+
+ r
+
+
+
+ v
+
+
+
+ i
+
+
+
+ e
+
+
+
+ w
+
+
+
+ :
+
+ Overview:
+
+
+
+
+
+ A
+
+
+
+ l
+
+
+
+ e
+
+
+
+ t
+
+
+
+ h
+
+
+
+ e
+
+
+
+ i
+
+
+
+ a
+
+ Aletheia
+
+
+
+
+
+ i
+
+
+
+ s
+
+ is
+
+
+
+
+
+ a
+
+
+
+ n
+
+ an
+
+
+
+
+
+ a
+
+
+
+ d
+
+
+
+ -
+
+ ad-
+
+ Overview: Aletheia is an ad-
+
+
+
+
+
+ vanced
+
+
+
+ system
+
+
+
+ for
+
+
+
+ accurate
+
+
+
+ and
+
+
+
+ yet
+
+ vanced system for accurate and yet
+
+
+
+
+
+ cost-effective
+
+
+
+ ground
+
+
+
+ truthing
+
+
+
+ of
+
+ cost-effective ground truthing of
+
+
+
+
+
+ large
+
+
+
+ amounts
+
+
+
+ of
+
+
+
+ documents.
+
+
+
+ It
+
+
+
+ aids
+
+ large amounts of documents. It aids
+
+
+
+
+
+ the
+
+
+
+ user
+
+
+
+ with
+
+
+
+ a
+
+
+
+ number
+
+
+
+ of
+
+
+
+ automated
+
+ the user with a number of automated
+
+
+
+
+
+ and
+
+
+
+ semi-automated
+
+
+
+ tools
+
+
+
+ which
+
+ and semi-automated tools which
+
+
+
+
+
+ were
+
+
+
+ partly
+
+
+
+ developed
+
+
+
+ and
+
+
+
+ improved
+
+ were partly developed and improved
+
+
+
+
+
+ based
+
+
+
+ on
+
+
+
+ feedback
+
+
+
+ from
+
+
+
+ major
+
+
+
+ librar-
+
+ based on feedback from major librar-
+
+
+
+
+
+ ies
+
+
+
+ across
+
+
+
+ Europe
+
+
+
+ and
+
+
+
+ from
+
+
+
+ their
+
+
+
+ digit-
+
+ ies across Europe and from their digit-
+
+
+
+
+
+ isation
+
+
+
+ service
+
+
+
+ providers
+
+
+
+ which
+
+
+
+ are
+
+
+
+ us-
+
+ isation service providers which are us-
+
+
+
+
+
+ ing
+
+
+
+ the
+
+
+
+ tool
+
+
+
+ in
+
+
+
+ a
+
+
+
+ production
+
+
+
+ environ-
+
+ ing the tool in a production environ-
+
+
+
+
+
+ ment.
+
+ ment.
+
+ Overview: Aletheia is an ad-
+vanced system for accurate and yet
+cost-effective ground truthing of
+large amounts of documents. It aids
+the user with a number of automated
+and semi-automated tools which
+were partly developed and improved
+based on feedback from major librar-
+ies across Europe and from their digit-
+isation service providers which are us-
+ing the tool in a production environ-
+ment.
+
+
+
+
+
+
+
+ Novel
+
+
+
+ features
+
+
+
+ are,
+
+
+
+ among
+
+
+
+ others,
+
+
+
+ the
+
+ Novel features are, among others, the
+
+
+
+
+
+ support
+
+
+
+ of
+
+
+
+ top-down
+
+
+
+ ground
+
+
+
+ truthing
+
+ support of top-down ground truthing
+
+
+
+
+
+ with
+
+
+
+ sophisticated
+
+
+
+ split
+
+
+
+ and
+
+
+
+ shrink
+
+
+
+ tools
+
+ with sophisticated split and shrink tools
+
+
+
+
+
+ as
+
+
+
+ well
+
+
+
+ as
+
+
+
+ bottom-up
+
+
+
+ ground
+
+
+
+ truthing
+
+ as well as bottom-up ground truthing
+
+
+
+
+
+ supporting
+
+
+
+ the
+
+
+
+ aggregation
+
+
+
+ of
+
+
+
+ lower-level
+
+ supporting the aggregation of lower-level
+
+
+
+
+
+ elements
+
+
+
+ to
+
+
+
+ more
+
+
+
+ complex
+
+
+
+ structures.
+
+ elements to more complex structures.
+
+
+
+
+
+ Special
+
+
+
+ features
+
+
+
+ have
+
+
+
+ been
+
+
+
+ developed
+
+
+
+ to
+
+ Special features have been developed to
+
+
+
+
+
+ support
+
+
+
+ working
+
+
+
+ with
+
+
+
+ the
+
+
+
+ complexities
+
+
+
+ of
+
+ support working with the complexities of
+
+
+
+
+
+ historical
+
+
+
+ documents.
+
+
+
+ The
+
+
+
+ integrated
+
+
+
+ vali-
+
+ historical documents. The integrated vali-
+
+
+
+
+
+ dator,
+
+
+
+ in
+
+
+
+ combination
+
+
+
+ with
+
+
+
+ powerful
+
+
+
+ cor-
+
+ dator, in combination with powerful cor-
+
+
+
+
+
+ rection
+
+
+
+ tools,
+
+
+
+ enable
+
+
+
+ efficient
+
+
+
+ production
+
+ rection tools, enable efficient production
+
+
+
+
+
+ of
+
+
+
+ highly
+
+
+
+ accurate
+
+
+
+ ground
+
+
+
+ truth.
+
+ of highly accurate ground truth.
+
+ Novel features are, among others, the
+support of top-down ground truthing
+with sophisticated split and shrink tools
+as well as bottom-up ground truthing
+supporting the aggregation of lower-level
+elements to more complex structures.
+Special features have been developed to
+support working with the complexities of
+historical documents. The integrated vali-
+dator, in combination with powerful cor-
+rection tools, enable efficient production
+of highly accurate ground truth.
+
+
+
+
+
+
+
+ Aletheia
+
+
+
+ uses
+
+
+
+ the
+
+
+
+ PAGE
+
+
+
+ (Page
+
+
+
+ Analysis
+
+ Aletheia uses the PAGE (Page Analysis
+
+
+
+
+
+ and
+
+
+
+ Ground
+
+
+
+ truth
+
+
+
+ Elements)
+
+
+
+ XML
+
+
+
+ format
+
+ and Ground truth Elements) XML format
+
+
+
+
+
+ framework
+
+
+
+ which
+
+
+
+ incorporates
+
+
+
+ several
+
+ framework which incorporates several
+
+
+
+
+
+ XML
+
+
+
+ schemas
+
+
+
+ representing
+
+
+
+ the
+
+
+
+ whole
+
+ XML schemas representing the whole
+
+
+
+
+
+ workflow
+
+
+
+ of
+
+
+
+ document
+
+
+
+ analysis.
+
+
+
+ See
+
+
+
+ also
+
+ workflow of document analysis. See also
+
+
+
+
+
+ the
+
+
+
+ dedicated
+
+
+
+ infobox.
+
+ the dedicated infobox.
+
+ Aletheia uses the PAGE (Page Analysis
+and Ground truth Elements) XML format
+framework which incorporates several
+XML schemas representing the whole
+workflow of document analysis. See also
+the dedicated infobox.
+
+
+
+
+
+
+
+
+
+ Layers
+
+
+
+ and
+
+
+
+ reading
+
+
+
+ order
+
+ Layers and reading order
+
+ Layers and reading order
+
+
+
+
+
+
+
+
+
+ Screenshot
+
+
+
+ of
+
+
+
+ Aletheia
+
+
+
+ showing
+
+
+
+ regions
+
+
+
+ and
+
+
+
+ properties
+
+ Screenshot of Aletheia showing regions and properties
+
+ Screenshot of Aletheia showing regions and properties
+
+
+
+
+
+
+
+
+
+ The
+
+
+
+ PAGE
+
+
+
+ (Page
+
+
+
+ Analysis
+
+
+
+ and
+
+
+
+ Ground
+
+ The PAGE (Page Analysis and Ground
+
+
+
+
+
+ truth
+
+
+
+ Elements)
+
+
+
+ format
+
+
+
+ framework
+
+
+
+ incorpo-
+
+ truth Elements) format framework incorpo-
+
+
+
+
+
+ rates
+
+
+
+ several
+
+
+
+ XML
+
+
+
+ schemas
+
+
+
+ representing
+
+
+
+ the
+
+ rates several XML schemas representing the
+
+
+
+
+
+ whole
+
+
+
+ workflow
+
+
+
+ of
+
+
+
+ document
+
+
+
+ analysis,
+
+
+
+ includ-
+
+ whole workflow of document analysis, includ-
+
+
+
+
+
+ ing
+
+
+
+ image
+
+
+
+ enhancement,
+
+
+
+ binarisation,
+
+
+
+ geo-
+
+ ing image enhancement, binarisation, geo-
+
+
+
+
+
+ metrical
+
+
+
+ correction,
+
+
+
+ layout
+
+
+
+ analysis,
+
+
+
+ layout
+
+ metrical correction, layout analysis, layout
+
+
+
+
+
+ evaluation
+
+
+
+ and
+
+
+
+ OCR.
+
+
+
+ The
+
+
+
+ here
+
+
+
+ used
+
+
+
+ schema
+
+ evaluation and OCR. The here used schema
+
+
+
+
+
+ for
+
+
+
+ document
+
+
+
+ layouts
+
+
+
+ allows
+
+
+
+ for
+
+
+
+ polygonal
+
+ for document layouts allows for polygonal
+
+
+
+
+
+ regions
+
+
+
+ with
+
+
+
+ various
+
+
+
+ attributes
+
+
+
+ (including
+
+
+
+ text
+
+ regions with various attributes (including text
+
+
+
+
+
+ content),
+
+
+
+ reading
+
+
+
+ order,
+
+
+
+ layers
+
+
+
+ and
+
+
+
+ more.
+
+ content), reading order, layers and more.
+
+ The PAGE (Page Analysis and Ground
+truth Elements) format framework incorpo-
+rates several XML schemas representing the
+whole workflow of document analysis, includ-
+ing image enhancement, binarisation, geo-
+metrical correction, layout analysis, layout
+evaluation and OCR. The here used schema
+for document layouts allows for polygonal
+regions with various attributes (including text
+content), reading order, layers and more.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ From
+
+
+
+ Scratch,
+
+
+
+ Top-Down
+
+ From Scratch, Top-Down
+
+ From Scratch, Top-Down
+
+
+
+
+
+
+
+ •
+
+
+
+ Marking
+
+
+
+ regions
+
+
+
+ using
+
+
+
+ man-
+
+ • Marking regions using man-
+
+
+
+
+
+ ual
+
+
+
+ or
+
+
+
+ semi-automated
+
+
+
+ tools
+
+ ual or semi-automated tools
+
+ • Marking regions using man-
+ual or semi-automated tools
+
+
+
+
+
+
+
+ •
+
+
+
+ Marking
+
+
+
+ text
+
+
+
+ kines
+
+
+
+ with
+
+
+
+ easy-
+
+ • Marking text kines with easy-
+
+
+
+
+
+ to-use
+
+
+
+ split
+
+
+
+ tools
+
+ to-use split tools
+
+ • Marking text kines with easy-
+to-use split tools
+
+
+
+
+
+
+
+ •
+
+
+
+ Marking
+
+
+
+ words
+
+
+
+ with
+
+
+
+ assistive
+
+ • Marking words with assistive
+
+
+
+
+
+ tools
+
+ tools
+
+ • Marking words with assistive
+tools
+
+
+
+
+
+
+
+ •
+
+
+
+ Marking
+
+
+
+ glyphs
+
+
+
+ (characters)
+
+ • Marking glyphs (characters)
+
+ • Marking glyphs (characters)
+
+
+
+
+
+
+
+ •
+
+
+
+ Text
+
+
+
+ transcription
+
+
+
+ and
+
+
+
+ propa-
+
+ • Text transcription and propa-
+
+
+
+
+
+ gation
+
+
+
+ to
+
+
+
+ any
+
+
+
+ required
+
+
+
+ level
+
+ gation to any required level
+
+ • Text transcription and propa-
+gation to any required level
+
+
+
+
+
+
+
+ •
+
+
+
+ Reading
+
+
+
+ order
+
+
+
+ definition
+
+ • Reading order definition
+
+ • Reading order definition
+
+
+
+
+
+
+
+ •
+
+
+
+ Validation
+
+
+
+ to
+
+
+
+ reduce
+
+
+
+ risk
+
+ • Validation to reduce risk
+
+
+
+
+
+ of
+
+
+
+ mistakes
+
+ of mistakes
+
+ • Validation to reduce risk
+of mistakes
+
+
+
+
+
+
+
+ •
+
+
+
+ Correcting
+
+
+
+ text
+
+
+
+ content
+
+ • Correcting text content
+
+
+
+
+
+ using
+
+
+
+ rendered
+
+
+
+ text
+
+
+
+ over-
+
+ using rendered text over-
+
+
+
+
+
+ lay
+
+ lay
+
+ • Correcting text content
+using rendered text over-
+lay
+
+
+
+
+
+
+
+ •
+
+
+
+ Correcting
+
+
+
+ layout
+
+
+
+ using
+
+ • Correcting layout using
+
+
+
+
+
+ convenient
+
+
+
+ tools
+
+
+
+ such
+
+
+
+ as
+
+ convenient tools such as
+
+
+
+
+
+ merge
+
+
+
+ and
+
+
+
+ split
+
+ merge and split
+
+ • Correcting layout using
+convenient tools such as
+merge and split
+
+
+
+
+
+
+
+ •
+
+
+
+ Automated
+
+
+
+ page
+
+
+
+ analysis
+
+ • Automated page analysis
+
+
+
+
+
+ with
+
+
+
+ integrated
+
+
+
+ Tesseract
+
+ with integrated Tesseract
+
+
+
+
+
+ OCR
+
+
+
+ or
+
+
+
+ opening
+
+
+
+ externally
+
+ OCR or opening externally
+
+
+
+
+
+ generated
+
+
+
+ result
+
+ generated result
+
+ • Automated page analysis
+with integrated Tesseract
+OCR or opening externally
+generated result
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ T
+
+
+
+ y
+
+
+
+ p
+
+
+
+ i
+
+
+
+ c
+
+
+
+ l
+
+
+
+ a
+
+ Typical
+
+
+
+
+
+ W
+
+
+
+ o
+
+
+
+ r
+
+
+
+ k
+
+
+
+ fl
+
+
+
+ o
+
+
+
+ s
+
+
+
+ w
+
+ Workflows
+
+ Typical Workflows
+
+ Typical Workflows
+
+
+
+
+
+
+
+ Preproduction
+
+
+
+ +
+
+
+
+ Correction
+
+ Preproduction + Correction
+
+ Preproduction + Correction
+
+
+
+
+
+
+
+
+
+ O
+
+
+
+ t
+
+
+
+ h
+
+
+
+ e
+
+
+
+ r
+
+ Other
+
+
+
+
+
+ S
+
+
+
+ o
+
+
+
+ f
+
+
+
+ t
+
+
+
+ w
+
+
+
+ a
+
+
+
+ r
+
+
+
+ e
+
+ Software
+
+
+
+
+
+ T
+
+
+
+ o
+
+
+
+ o
+
+
+
+ l
+
+
+
+ s
+
+ Tools
+
+
+
+
+
+ b
+
+
+
+ y
+
+ by
+
+
+
+
+
+ P
+
+
+
+ R
+
+
+
+ I
+
+
+
+ A
+
+
+
+ m
+
+ PRImA
+
+ Other Software Tools by PRImA
+
+ Other Software Tools by PRImA
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Pattern
+
+
+
+ Recognition
+
+
+
+ and
+
+
+
+ Image
+
+
+
+ Analysis
+
+
+
+ Research
+
+
+
+ Lab,
+
+
+
+ School
+
+
+
+ of
+
+
+
+ Computing,
+
+
+
+ Science
+
+
+
+ and
+
+
+
+ Engineering,
+
+ Pattern Recognition and Image Analysis Research Lab, School of Computing, Science and Engineering,
+
+
+
+
+
+ University
+
+
+
+ of
+
+
+
+ Salford,
+
+
+
+ Greater
+
+
+
+ Manchester,
+
+
+
+ United
+
+
+
+ Kingdom,
+
+
+
+ www.primaresearch.org
+
+ University of Salford, Greater Manchester, United Kingdom, www.primaresearch.org
+
+ Pattern Recognition and Image Analysis Research Lab, School of Computing, Science and Engineering,
+University of Salford, Greater Manchester, United Kingdom, www.primaresearch.org
+
+
+
+
+
+
+
+ WebAletheia
+
+
+
+ Webapp
+
+ WebAletheia Webapp
+
+ WebAletheia Webapp
+
+
+
+
+
+
+
+ Tesseract
+
+
+
+ OCR
+
+
+
+ to
+
+
+
+ PAGE
+
+
+
+ For
+
+
+
+ Windows
+
+ Tesseract OCR to PAGE For Windows
+
+ Tesseract OCR to PAGE For Windows
+
+
+
+
+
+
+
+ PAGE
+
+
+
+ Libraries
+
+
+
+ For
+
+
+
+ Java
+
+
+
+ and
+
+
+
+ C++
+
+ PAGE Libraries For Java and C++
+
+ PAGE Libraries For Java and C++
+
+
+
+
+
+
+
+ Layout
+
+
+
+ Evaluation
+
+
+
+ Performance
+
+
+
+ Analysis
+
+
+
+ System
+
+ Layout Evaluation Performance Analysis System
+
+ Layout Evaluation Performance Analysis System
+
+
+
+
+
+
+
+ A
+
+
+
+ lightweight
+
+
+
+ web-based
+
+
+
+ version
+
+
+
+ of
+
+
+
+ the
+
+
+
+ Aletheia
+
+ A lightweight web-based version of the Aletheia
+
+
+
+
+
+ ground
+
+
+
+ truthing
+
+
+
+ system.
+
+
+
+ Ideal
+
+
+
+ for
+
+
+
+ customised
+
+ ground truthing system. Ideal for customised
+
+
+
+
+
+ workflows
+
+
+
+ and
+
+
+
+ crowdsourcing
+
+
+
+ applications.
+
+
+
+ Go
+
+
+
+ to
+
+ workflows and crowdsourcing applications. Go to
+
+
+
+
+
+ the
+
+
+
+ PRImA
+
+
+
+ website
+
+
+
+ to
+
+
+
+ try
+
+
+
+ it
+
+
+
+ yourself.
+
+ the PRImA website to try it yourself.
+
+ A lightweight web-based version of the Aletheia
+ground truthing system. Ideal for customised
+workflows and crowdsourcing applications. Go to
+the PRImA website to try it yourself.
+
+
+
+
+
+
+
+ A
+
+
+
+ command
+
+
+
+ line
+
+
+
+ tool
+
+
+
+ to
+
+
+
+ analyse
+
+
+
+ document
+
+
+
+ page
+
+ A command line tool to analyse document page
+
+
+
+
+
+ images
+
+
+
+ using
+
+
+
+ the
+
+
+
+ open
+
+
+
+ source
+
+
+
+ OCR
+
+
+
+ engine
+
+
+
+ Tesser-
+
+ images using the open source OCR engine Tesser-
+
+
+
+
+
+ act
+
+
+
+ and
+
+
+
+ save
+
+
+
+ the
+
+
+
+ results
+
+
+
+ to
+
+
+
+ PAGE
+
+
+
+ XML
+
+
+
+ format.
+
+ act and save the results to PAGE XML format.
+
+
+
+
+
+ Version
+
+
+
+ 1.3
+
+
+
+ is
+
+
+
+ based
+
+
+
+ on
+
+
+
+ the
+
+
+
+ latest
+
+
+
+ release
+
+
+
+ of
+
+
+
+ Tesser-
+
+ Version 1.3 is based on the latest release of Tesser-
+
+
+
+
+
+ act
+
+
+
+ (3.03).
+
+ act (3.03).
+
+ A command line tool to analyse document page
+images using the open source OCR engine Tesser-
+act and save the results to PAGE XML format.
+Version 1.3 is based on the latest release of Tesser-
+act (3.03).
+
+
+
+
+
+
+
+ Platform
+
+
+
+ independent
+
+
+
+ libraries
+
+
+
+ to
+
+
+
+ create
+
+
+
+ valid
+
+
+
+ lay-
+
+ Platform independent libraries to create valid lay-
+
+
+
+
+
+ out
+
+
+
+ descriptions
+
+
+
+ in
+
+
+
+ PAGE
+
+
+
+ XML
+
+
+
+ format.
+
+
+
+ The
+
+
+
+ libraries
+
+ out descriptions in PAGE XML format. The libraries
+
+
+
+
+
+ can
+
+
+
+ be
+
+
+
+ easily
+
+
+
+ integrated
+
+
+
+ in
+
+
+
+ other
+
+
+
+ software
+
+
+
+ projects
+
+ can be easily integrated in other software projects
+
+
+
+
+
+ such
+
+
+
+ as
+
+
+
+ page
+
+
+
+ segmentation
+
+
+
+ methods
+
+
+
+ for
+
+
+
+ ICDAR
+
+ such as page segmentation methods for ICDAR
+
+
+
+
+
+ competitions.
+
+ competitions.
+
+ Platform independent libraries to create valid lay-
+out descriptions in PAGE XML format. The libraries
+can be easily integrated in other software projects
+such as page segmentation methods for ICDAR
+competitions.
+
+
+
+
+
+
+
+ This
+
+
+
+ tool
+
+
+
+ is
+
+
+
+ part
+
+
+
+ of
+
+
+
+ a
+
+
+
+ framework
+
+
+
+ for
+
+
+
+ evaluating
+
+
+
+ the
+
+ This tool is part of a framework for evaluating the
+
+
+
+
+
+ performance
+
+
+
+ of
+
+
+
+ layout
+
+
+
+ analysis
+
+
+
+ methods.
+
+
+
+ It
+
+
+
+ com-
+
+ performance of layout analysis methods. It com-
+
+
+
+
+
+ bines
+
+
+
+ efficiency
+
+
+
+ and
+
+
+
+ accuracy
+
+
+
+ by
+
+
+
+ using
+
+
+
+ a
+
+
+
+ special
+
+ bines efficiency and accuracy by using a special
+
+
+
+
+
+ interval
+
+
+
+ based
+
+
+
+ geometric
+
+
+
+ representation
+
+
+
+ of
+
+
+
+ regions.
+
+ interval based geometric representation of regions.
+
+
+
+
+
+ A
+
+
+
+ wide
+
+
+
+ range
+
+
+
+ of
+
+
+
+ sophisticated
+
+
+
+ evaluation
+
+
+
+ measures
+
+ A wide range of sophisticated evaluation measures
+
+
+
+
+
+ provide
+
+
+
+ the
+
+
+
+ means
+
+
+
+ for
+
+
+
+ a
+
+
+
+ deep
+
+
+
+ insight
+
+
+
+ into
+
+
+
+ the
+
+ provide the means for a deep insight into the
+
+
+
+
+
+ analysed
+
+
+
+ systems,
+
+ analysed systems,
+
+
+
+
+
+ which
+
+
+
+ goes
+
+
+
+ far
+
+ which goes far
+
+
+
+
+
+ beyond
+
+
+
+ simple
+
+ beyond simple
+
+
+
+
+
+ benchmarking.
+
+
+
+ The
+
+ benchmarking. The
+
+
+
+
+
+ support
+
+
+
+ of
+
+
+
+ user-
+
+ support of user-
+
+
+
+
+
+ defined
+
+
+
+ profiles
+
+ defined profiles
+
+
+
+
+
+ allows
+
+
+
+ the
+
+
+
+ tuning
+
+ allows the tuning
+
+
+
+
+
+ for
+
+
+
+ any
+
+
+
+ kind
+
+
+
+ of
+
+ for any kind of
+
+
+
+
+
+ evaluation
+
+
+
+ scenario
+
+ evaluation scenario
+
+
+
+
+
+ related
+
+
+
+ to
+
+
+
+ real
+
+ related to real
+
+
+
+
+
+ world
+
+
+
+ applications.
+
+ world applications.
+
+ This tool is part of a framework for evaluating the
+performance of layout analysis methods. It com-
+bines efficiency and accuracy by using a special
+interval based geometric representation of regions.
+A wide range of sophisticated evaluation measures
+provide the means for a deep insight into the
+analysed systems,
+which goes far
+beyond simple
+benchmarking. The
+support of user-
+defined profiles
+allows the tuning
+for any kind of
+evaluation scenario
+related to real
+world applications.
+
+
diff --git a/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-PAGE/PAGE_2018.xml b/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-PAGE/PAGE_2018.xml
new file mode 100644
index 0000000..0bdcf8c
--- /dev/null
+++ b/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-PAGE/PAGE_2018.xml
@@ -0,0 +1,3252 @@
+
+
+
+ PRImA Research Lab
+ 2015-07-17T15:27:13
+ 2018-07-19T07:29:57
+ Example Page
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A
+
+
+
+ l
+
+
+
+ e
+
+
+
+ t
+
+
+
+ h
+
+
+
+ e
+
+
+
+ i
+
+
+
+ a
+
+ Aletheia
+
+
+
+
+
+ D
+
+
+
+ o
+
+
+
+ c
+
+
+
+ u
+
+
+
+ m
+
+
+
+ e
+
+
+
+ n
+
+
+
+ t
+
+ Document
+
+
+
+
+
+ A
+
+
+
+ n
+
+
+
+ a
+
+
+
+ l
+
+
+
+ y
+
+
+
+ s
+
+
+
+ i
+
+
+
+ s
+
+ Analysis
+
+
+
+
+
+ S
+
+
+
+ y
+
+
+
+ s
+
+
+
+ t
+
+
+
+ e
+
+
+
+ m
+
+ System
+
+ Aletheia Document Analysis System
+
+ Aletheia Document Analysis System
+
+
+
+
+
+
+
+
+
+
+
+ O
+
+
+
+ v
+
+
+
+ e
+
+
+
+ r
+
+
+
+ v
+
+
+
+ i
+
+
+
+ e
+
+
+
+ w
+
+
+
+ :
+
+ Overview:
+
+
+
+
+
+ A
+
+
+
+ l
+
+
+
+ e
+
+
+
+ t
+
+
+
+ h
+
+
+
+ e
+
+
+
+ i
+
+
+
+ a
+
+ Aletheia
+
+
+
+
+
+ i
+
+
+
+ s
+
+ is
+
+
+
+
+
+ a
+
+
+
+ n
+
+ an
+
+
+
+
+
+ a
+
+
+
+ d
+
+
+
+ -
+
+ ad-
+
+ Overview: Aletheia is an ad-
+
+
+
+
+
+ vanced
+
+
+
+ system
+
+
+
+ for
+
+
+
+ accurate
+
+
+
+ and
+
+
+
+ yet
+
+ vanced system for accurate and yet
+
+
+
+
+
+ cost-effective
+
+
+
+ ground
+
+
+
+ truthing
+
+
+
+ of
+
+ cost-effective ground truthing of
+
+
+
+
+
+ large
+
+
+
+ amounts
+
+
+
+ of
+
+
+
+ documents.
+
+
+
+ It
+
+
+
+ aids
+
+ large amounts of documents. It aids
+
+
+
+
+
+ the
+
+
+
+ user
+
+
+
+ with
+
+
+
+ a
+
+
+
+ number
+
+
+
+ of
+
+
+
+ automated
+
+ the user with a number of automated
+
+
+
+
+
+ and
+
+
+
+ semi-automated
+
+
+
+ tools
+
+
+
+ which
+
+ and semi-automated tools which
+
+
+
+
+
+ were
+
+
+
+ partly
+
+
+
+ developed
+
+
+
+ and
+
+
+
+ improved
+
+ were partly developed and improved
+
+
+
+
+
+ based
+
+
+
+ on
+
+
+
+ feedback
+
+
+
+ from
+
+
+
+ major
+
+
+
+ librar-
+
+ based on feedback from major librar-
+
+
+
+
+
+ ies
+
+
+
+ across
+
+
+
+ Europe
+
+
+
+ and
+
+
+
+ from
+
+
+
+ their
+
+
+
+ digit-
+
+ ies across Europe and from their digit-
+
+
+
+
+
+ isation
+
+
+
+ service
+
+
+
+ providers
+
+
+
+ which
+
+
+
+ are
+
+
+
+ us-
+
+ isation service providers which are us-
+
+
+
+
+
+ ing
+
+
+
+ the
+
+
+
+ tool
+
+
+
+ in
+
+
+
+ a
+
+
+
+ production
+
+
+
+ environ-
+
+ ing the tool in a production environ-
+
+
+
+
+
+ ment.
+
+ ment.
+
+ Overview: Aletheia is an ad-
+vanced system for accurate and yet
+cost-effective ground truthing of
+large amounts of documents. It aids
+the user with a number of automated
+and semi-automated tools which
+were partly developed and improved
+based on feedback from major librar-
+ies across Europe and from their digit-
+isation service providers which are us-
+ing the tool in a production environ-
+ment.
+
+
+
+
+
+
+
+ Novel
+
+
+
+ features
+
+
+
+ are,
+
+
+
+ among
+
+
+
+ others,
+
+
+
+ the
+
+ Novel features are, among others, the
+
+
+
+
+
+ support
+
+
+
+ of
+
+
+
+ top-down
+
+
+
+ ground
+
+
+
+ truthing
+
+ support of top-down ground truthing
+
+
+
+
+
+ with
+
+
+
+ sophisticated
+
+
+
+ split
+
+
+
+ and
+
+
+
+ shrink
+
+
+
+ tools
+
+ with sophisticated split and shrink tools
+
+
+
+
+
+ as
+
+
+
+ well
+
+
+
+ as
+
+
+
+ bottom-up
+
+
+
+ ground
+
+
+
+ truthing
+
+ as well as bottom-up ground truthing
+
+
+
+
+
+ supporting
+
+
+
+ the
+
+
+
+ aggregation
+
+
+
+ of
+
+
+
+ lower-level
+
+ supporting the aggregation of lower-level
+
+
+
+
+
+ elements
+
+
+
+ to
+
+
+
+ more
+
+
+
+ complex
+
+
+
+ structures.
+
+ elements to more complex structures.
+
+
+
+
+
+ Special
+
+
+
+ features
+
+
+
+ have
+
+
+
+ been
+
+
+
+ developed
+
+
+
+ to
+
+ Special features have been developed to
+
+
+
+
+
+ support
+
+
+
+ working
+
+
+
+ with
+
+
+
+ the
+
+
+
+ complexities
+
+
+
+ of
+
+ support working with the complexities of
+
+
+
+
+
+ historical
+
+
+
+ documents.
+
+
+
+ The
+
+
+
+ integrated
+
+
+
+ vali-
+
+ historical documents. The integrated vali-
+
+
+
+
+
+ dator,
+
+
+
+ in
+
+
+
+ combination
+
+
+
+ with
+
+
+
+ powerful
+
+
+
+ cor-
+
+ dator, in combination with powerful cor-
+
+
+
+
+
+ rection
+
+
+
+ tools,
+
+
+
+ enable
+
+
+
+ efficient
+
+
+
+ production
+
+ rection tools, enable efficient production
+
+
+
+
+
+ of
+
+
+
+ highly
+
+
+
+ accurate
+
+
+
+ ground
+
+
+
+ truth.
+
+ of highly accurate ground truth.
+
+ Novel features are, among others, the
+support of top-down ground truthing
+with sophisticated split and shrink tools
+as well as bottom-up ground truthing
+supporting the aggregation of lower-level
+elements to more complex structures.
+Special features have been developed to
+support working with the complexities of
+historical documents. The integrated vali-
+dator, in combination with powerful cor-
+rection tools, enable efficient production
+of highly accurate ground truth.
+
+
+
+
+
+
+
+ Aletheia
+
+
+
+ uses
+
+
+
+ the
+
+
+
+ PAGE
+
+
+
+ (Page
+
+
+
+ Analysis
+
+ Aletheia uses the PAGE (Page Analysis
+
+
+
+
+
+ and
+
+
+
+ Ground
+
+
+
+ truth
+
+
+
+ Elements)
+
+
+
+ XML
+
+
+
+ format
+
+ and Ground truth Elements) XML format
+
+
+
+
+
+ framework
+
+
+
+ which
+
+
+
+ incorporates
+
+
+
+ several
+
+ framework which incorporates several
+
+
+
+
+
+ XML
+
+
+
+ schemas
+
+
+
+ representing
+
+
+
+ the
+
+
+
+ whole
+
+ XML schemas representing the whole
+
+
+
+
+
+ workflow
+
+
+
+ of
+
+
+
+ document
+
+
+
+ analysis.
+
+
+
+ See
+
+
+
+ also
+
+ workflow of document analysis. See also
+
+
+
+
+
+ the
+
+
+
+ dedicated
+
+
+
+ infobox.
+
+ the dedicated infobox.
+
+ Aletheia uses the PAGE (Page Analysis
+and Ground truth Elements) XML format
+framework which incorporates several
+XML schemas representing the whole
+workflow of document analysis. See also
+the dedicated infobox.
+
+
+
+
+
+
+
+
+
+ Layers
+
+
+
+ and
+
+
+
+ reading
+
+
+
+ order
+
+ Layers and reading order
+
+ Layers and reading order
+
+
+
+
+
+
+
+
+
+ Screenshot
+
+
+
+ of
+
+
+
+ Aletheia
+
+
+
+ showing
+
+
+
+ regions
+
+
+
+ and
+
+
+
+ properties
+
+ Screenshot of Aletheia showing regions and properties
+
+ Screenshot of Aletheia showing regions and properties
+
+
+
+
+
+
+
+
+
+ The
+
+
+
+ PAGE
+
+
+
+ (Page
+
+
+
+ Analysis
+
+
+
+ and
+
+
+
+ Ground
+
+ The PAGE (Page Analysis and Ground
+
+
+
+
+
+ truth
+
+
+
+ Elements)
+
+
+
+ format
+
+
+
+ framework
+
+
+
+ incorpo-
+
+ truth Elements) format framework incorpo-
+
+
+
+
+
+ rates
+
+
+
+ several
+
+
+
+ XML
+
+
+
+ schemas
+
+
+
+ representing
+
+
+
+ the
+
+ rates several XML schemas representing the
+
+
+
+
+
+ whole
+
+
+
+ workflow
+
+
+
+ of
+
+
+
+ document
+
+
+
+ analysis,
+
+
+
+ includ-
+
+ whole workflow of document analysis, includ-
+
+
+
+
+
+ ing
+
+
+
+ image
+
+
+
+ enhancement,
+
+
+
+ binarisation,
+
+
+
+ geo-
+
+ ing image enhancement, binarisation, geo-
+
+
+
+
+
+ metrical
+
+
+
+ correction,
+
+
+
+ layout
+
+
+
+ analysis,
+
+
+
+ layout
+
+ metrical correction, layout analysis, layout
+
+
+
+
+
+ evaluation
+
+
+
+ and
+
+
+
+ OCR.
+
+
+
+ The
+
+
+
+ here
+
+
+
+ used
+
+
+
+ schema
+
+ evaluation and OCR. The here used schema
+
+
+
+
+
+ for
+
+
+
+ document
+
+
+
+ layouts
+
+
+
+ allows
+
+
+
+ for
+
+
+
+ polygonal
+
+ for document layouts allows for polygonal
+
+
+
+
+
+ regions
+
+
+
+ with
+
+
+
+ various
+
+
+
+ attributes
+
+
+
+ (including
+
+
+
+ text
+
+ regions with various attributes (including text
+
+
+
+
+
+ content),
+
+
+
+ reading
+
+
+
+ order,
+
+
+
+ layers
+
+
+
+ and
+
+
+
+ more.
+
+ content), reading order, layers and more.
+
+ The PAGE (Page Analysis and Ground
+truth Elements) format framework incorpo-
+rates several XML schemas representing the
+whole workflow of document analysis, includ-
+ing image enhancement, binarisation, geo-
+metrical correction, layout analysis, layout
+evaluation and OCR. The here used schema
+for document layouts allows for polygonal
+regions with various attributes (including text
+content), reading order, layers and more.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ From
+
+
+
+ Scratch,
+
+
+
+ Top-Down
+
+ From Scratch, Top-Down
+
+ From Scratch, Top-Down
+
+
+
+
+
+
+
+ •
+
+
+
+ Marking
+
+
+
+ regions
+
+
+
+ using
+
+
+
+ man-
+
+ • Marking regions using man-
+
+
+
+
+
+ ual
+
+
+
+ or
+
+
+
+ semi-automated
+
+
+
+ tools
+
+ ual or semi-automated tools
+
+ • Marking regions using man-
+ual or semi-automated tools
+
+
+
+
+
+
+
+ •
+
+
+
+ Marking
+
+
+
+ text
+
+
+
+ kines
+
+
+
+ with
+
+
+
+ easy-
+
+ • Marking text kines with easy-
+
+
+
+
+
+ to-use
+
+
+
+ split
+
+
+
+ tools
+
+ to-use split tools
+
+ • Marking text kines with easy-
+to-use split tools
+
+
+
+
+
+
+
+ •
+
+
+
+ Marking
+
+
+
+ words
+
+
+
+ with
+
+
+
+ assistive
+
+ • Marking words with assistive
+
+
+
+
+
+ tools
+
+ tools
+
+ • Marking words with assistive
+tools
+
+
+
+
+
+
+
+ •
+
+
+
+ Marking
+
+
+
+ glyphs
+
+
+
+ (characters)
+
+ • Marking glyphs (characters)
+
+ • Marking glyphs (characters)
+
+
+
+
+
+
+
+ •
+
+
+
+ Text
+
+
+
+ transcription
+
+
+
+ and
+
+
+
+ propa-
+
+ • Text transcription and propa-
+
+
+
+
+
+ gation
+
+
+
+ to
+
+
+
+ any
+
+
+
+ required
+
+
+
+ level
+
+ gation to any required level
+
+ • Text transcription and propa-
+gation to any required level
+
+
+
+
+
+
+
+ •
+
+
+
+ Reading
+
+
+
+ order
+
+
+
+ definition
+
+ • Reading order definition
+
+ • Reading order definition
+
+
+
+
+
+
+
+ •
+
+
+
+ Validation
+
+
+
+ to
+
+
+
+ reduce
+
+
+
+ risk
+
+ • Validation to reduce risk
+
+
+
+
+
+ of
+
+
+
+ mistakes
+
+ of mistakes
+
+ • Validation to reduce risk
+of mistakes
+
+
+
+
+
+
+
+ •
+
+
+
+ Correcting
+
+
+
+ text
+
+
+
+ content
+
+ • Correcting text content
+
+
+
+
+
+ using
+
+
+
+ rendered
+
+
+
+ text
+
+
+
+ over-
+
+ using rendered text over-
+
+
+
+
+
+ lay
+
+ lay
+
+ • Correcting text content
+using rendered text over-
+lay
+
+
+
+
+
+
+
+ •
+
+
+
+ Correcting
+
+
+
+ layout
+
+
+
+ using
+
+ • Correcting layout using
+
+
+
+
+
+ convenient
+
+
+
+ tools
+
+
+
+ such
+
+
+
+ as
+
+ convenient tools such as
+
+
+
+
+
+ merge
+
+
+
+ and
+
+
+
+ split
+
+ merge and split
+
+ • Correcting layout using
+convenient tools such as
+merge and split
+
+
+
+
+
+
+
+ •
+
+
+
+ Automated
+
+
+
+ page
+
+
+
+ analysis
+
+ • Automated page analysis
+
+
+
+
+
+ with
+
+
+
+ integrated
+
+
+
+ Tesseract
+
+ with integrated Tesseract
+
+
+
+
+
+ OCR
+
+
+
+ or
+
+
+
+ opening
+
+
+
+ externally
+
+ OCR or opening externally
+
+
+
+
+
+ generated
+
+
+
+ result
+
+ generated result
+
+ • Automated page analysis
+with integrated Tesseract
+OCR or opening externally
+generated result
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ T
+
+
+
+ y
+
+
+
+ p
+
+
+
+ i
+
+
+
+ c
+
+
+
+ l
+
+
+
+ a
+
+ Typical
+
+
+
+
+
+ W
+
+
+
+ o
+
+
+
+ r
+
+
+
+ k
+
+
+
+ fl
+
+
+
+ o
+
+
+
+ s
+
+
+
+ w
+
+ Workflows
+
+ Typical Workflows
+
+ Typical Workflows
+
+
+
+
+
+
+
+ Preproduction
+
+
+
+ +
+
+
+
+ Correction
+
+ Preproduction + Correction
+
+ Preproduction + Correction
+
+
+
+
+
+
+
+
+
+ O
+
+
+
+ t
+
+
+
+ h
+
+
+
+ e
+
+
+
+ r
+
+ Other
+
+
+
+
+
+ S
+
+
+
+ o
+
+
+
+ f
+
+
+
+ t
+
+
+
+ w
+
+
+
+ a
+
+
+
+ r
+
+
+
+ e
+
+ Software
+
+
+
+
+
+ T
+
+
+
+ o
+
+
+
+ o
+
+
+
+ l
+
+
+
+ s
+
+ Tools
+
+
+
+
+
+ b
+
+
+
+ y
+
+ by
+
+
+
+
+
+ P
+
+
+
+ R
+
+
+
+ I
+
+
+
+ A
+
+
+
+ m
+
+ PRImA
+
+ Other Software Tools by PRImA
+
+ Other Software Tools by PRImA
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Pattern
+
+
+
+ Recognition
+
+
+
+ and
+
+
+
+ Image
+
+
+
+ Analysis
+
+
+
+ Research
+
+
+
+ Lab,
+
+
+
+ School
+
+
+
+ of
+
+
+
+ Computing,
+
+
+
+ Science
+
+
+
+ and
+
+
+
+ Engineering,
+
+ Pattern Recognition and Image Analysis Research Lab, School of Computing, Science and Engineering,
+
+
+
+
+
+ University
+
+
+
+ of
+
+
+
+ Salford,
+
+
+
+ Greater
+
+
+
+ Manchester,
+
+
+
+ United
+
+
+
+ Kingdom,
+
+
+
+ www.primaresearch.org
+
+ University of Salford, Greater Manchester, United Kingdom, www.primaresearch.org
+
+ Pattern Recognition and Image Analysis Research Lab, School of Computing, Science and Engineering,
+University of Salford, Greater Manchester, United Kingdom, www.primaresearch.org
+
+
+
+
+
+
+
+ WebAletheia
+
+
+
+ Webapp
+
+ WebAletheia Webapp
+
+ WebAletheia Webapp
+
+
+
+
+
+
+
+ Tesseract
+
+
+
+ OCR
+
+
+
+ to
+
+
+
+ PAGE
+
+
+
+ For
+
+
+
+ Windows
+
+ Tesseract OCR to PAGE For Windows
+
+ Tesseract OCR to PAGE For Windows
+
+
+
+
+
+
+
+ PAGE
+
+
+
+ Libraries
+
+
+
+ For
+
+
+
+ Java
+
+
+
+ and
+
+
+
+ C++
+
+ PAGE Libraries For Java and C++
+
+ PAGE Libraries For Java and C++
+
+
+
+
+
+
+
+ Layout
+
+
+
+ Evaluation
+
+
+
+ Performance
+
+
+
+ Analysis
+
+
+
+ System
+
+ Layout Evaluation Performance Analysis System
+
+ Layout Evaluation Performance Analysis System
+
+
+
+
+
+
+
+ A
+
+
+
+ lightweight
+
+
+
+ web-based
+
+
+
+ version
+
+
+
+ of
+
+
+
+ the
+
+
+
+ Aletheia
+
+ A lightweight web-based version of the Aletheia
+
+
+
+
+
+ ground
+
+
+
+ truthing
+
+
+
+ system.
+
+
+
+ Ideal
+
+
+
+ for
+
+
+
+ customised
+
+ ground truthing system. Ideal for customised
+
+
+
+
+
+ workflows
+
+
+
+ and
+
+
+
+ crowdsourcing
+
+
+
+ applications.
+
+
+
+ Go
+
+
+
+ to
+
+ workflows and crowdsourcing applications. Go to
+
+
+
+
+
+ the
+
+
+
+ PRImA
+
+
+
+ website
+
+
+
+ to
+
+
+
+ try
+
+
+
+ it
+
+
+
+ yourself.
+
+ the PRImA website to try it yourself.
+
+ A lightweight web-based version of the Aletheia
+ground truthing system. Ideal for customised
+workflows and crowdsourcing applications. Go to
+the PRImA website to try it yourself.
+
+
+
+
+
+
+
+ A
+
+
+
+ command
+
+
+
+ line
+
+
+
+ tool
+
+
+
+ to
+
+
+
+ analyse
+
+
+
+ document
+
+
+
+ page
+
+ A command line tool to analyse document page
+
+
+
+
+
+ images
+
+
+
+ using
+
+
+
+ the
+
+
+
+ open
+
+
+
+ source
+
+
+
+ OCR
+
+
+
+ engine
+
+
+
+ Tesser-
+
+ images using the open source OCR engine Tesser-
+
+
+
+
+
+ act
+
+
+
+ and
+
+
+
+ save
+
+
+
+ the
+
+
+
+ results
+
+
+
+ to
+
+
+
+ PAGE
+
+
+
+ XML
+
+
+
+ format.
+
+ act and save the results to PAGE XML format.
+
+
+
+
+
+ Version
+
+
+
+ 1.3
+
+
+
+ is
+
+
+
+ based
+
+
+
+ on
+
+
+
+ the
+
+
+
+ latest
+
+
+
+ release
+
+
+
+ of
+
+
+
+ Tesser-
+
+ Version 1.3 is based on the latest release of Tesser-
+
+
+
+
+
+ act
+
+
+
+ (3.03).
+
+ act (3.03).
+
+ A command line tool to analyse document page
+images using the open source OCR engine Tesser-
+act and save the results to PAGE XML format.
+Version 1.3 is based on the latest release of Tesser-
+act (3.03).
+
+
+
+
+
+
+
+ Platform
+
+
+
+ independent
+
+
+
+ libraries
+
+
+
+ to
+
+
+
+ create
+
+
+
+ valid
+
+
+
+ lay-
+
+ Platform independent libraries to create valid lay-
+
+
+
+
+
+ out
+
+
+
+ descriptions
+
+
+
+ in
+
+
+
+ PAGE
+
+
+
+ XML
+
+
+
+ format.
+
+
+
+ The
+
+
+
+ libraries
+
+ out descriptions in PAGE XML format. The libraries
+
+
+
+
+
+ can
+
+
+
+ be
+
+
+
+ easily
+
+
+
+ integrated
+
+
+
+ in
+
+
+
+ other
+
+
+
+ software
+
+
+
+ projects
+
+ can be easily integrated in other software projects
+
+
+
+
+
+ such
+
+
+
+ as
+
+
+
+ page
+
+
+
+ segmentation
+
+
+
+ methods
+
+
+
+ for
+
+
+
+ ICDAR
+
+ such as page segmentation methods for ICDAR
+
+
+
+
+
+ competitions.
+
+ competitions.
+
+ Platform independent libraries to create valid lay-
+out descriptions in PAGE XML format. The libraries
+can be easily integrated in other software projects
+such as page segmentation methods for ICDAR
+competitions.
+
+
+
+
+
+
+
+ This
+
+
+
+ tool
+
+
+
+ is
+
+
+
+ part
+
+
+
+ of
+
+
+
+ a
+
+
+
+ framework
+
+
+
+ for
+
+
+
+ evaluating
+
+
+
+ the
+
+ This tool is part of a framework for evaluating the
+
+
+
+
+
+ performance
+
+
+
+ of
+
+
+
+ layout
+
+
+
+ analysis
+
+
+
+ methods.
+
+
+
+ It
+
+
+
+ com-
+
+ performance of layout analysis methods. It com-
+
+
+
+
+
+ bines
+
+
+
+ efficiency
+
+
+
+ and
+
+
+
+ accuracy
+
+
+
+ by
+
+
+
+ using
+
+
+
+ a
+
+
+
+ special
+
+ bines efficiency and accuracy by using a special
+
+
+
+
+
+ interval
+
+
+
+ based
+
+
+
+ geometric
+
+
+
+ representation
+
+
+
+ of
+
+
+
+ regions.
+
+ interval based geometric representation of regions.
+
+
+
+
+
+ A
+
+
+
+ wide
+
+
+
+ range
+
+
+
+ of
+
+
+
+ sophisticated
+
+
+
+ evaluation
+
+
+
+ measures
+
+ A wide range of sophisticated evaluation measures
+
+
+
+
+
+ provide
+
+
+
+ the
+
+
+
+ means
+
+
+
+ for
+
+
+
+ a
+
+
+
+ deep
+
+
+
+ insight
+
+
+
+ into
+
+
+
+ the
+
+ provide the means for a deep insight into the
+
+
+
+
+
+ analysed
+
+
+
+ systems,
+
+ analysed systems,
+
+
+
+
+
+ which
+
+
+
+ goes
+
+
+
+ far
+
+ which goes far
+
+
+
+
+
+ beyond
+
+
+
+ simple
+
+ beyond simple
+
+
+
+
+
+ benchmarking.
+
+
+
+ The
+
+ benchmarking. The
+
+
+
+
+
+ support
+
+
+
+ of
+
+
+
+ user-
+
+ support of user-
+
+
+
+
+
+ defined
+
+
+
+ profiles
+
+ defined profiles
+
+
+
+
+
+ allows
+
+
+
+ the
+
+
+
+ tuning
+
+ allows the tuning
+
+
+
+
+
+ for
+
+
+
+ any
+
+
+
+ kind
+
+
+
+ of
+
+ for any kind of
+
+
+
+
+
+ evaluation
+
+
+
+ scenario
+
+ evaluation scenario
+
+
+
+
+
+ related
+
+
+
+ to
+
+
+
+ real
+
+ related to real
+
+
+
+
+
+ world
+
+
+
+ applications.
+
+ world applications.
+
+ This tool is part of a framework for evaluating the
+performance of layout analysis methods. It com-
+bines efficiency and accuracy by using a special
+interval based geometric representation of regions.
+A wide range of sophisticated evaluation measures
+provide the means for a deep insight into the
+analysed systems,
+which goes far
+beyond simple
+benchmarking. The
+support of user-
+defined profiles
+allows the tuning
+for any kind of
+evaluation scenario
+related to real
+world applications.
+
+
diff --git a/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-PAGE/PAGE_2019.xml b/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-PAGE/PAGE_2019.xml
new file mode 100644
index 0000000..531349b
--- /dev/null
+++ b/tests/example/workspaces/aletheiaexamplepage/OCR-D-GT-PAGE/PAGE_2019.xml
@@ -0,0 +1,3252 @@
+
+
+
+ PRImA Research Lab
+ 2015-07-17T15:27:13
+ 2018-07-19T07:29:57
+ Example Page
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A
+
+
+
+ l
+
+
+
+ e
+
+
+
+ t
+
+
+
+ h
+
+
+
+ e
+
+
+
+ i
+
+
+
+ a
+
+ Aletheia
+
+
+
+
+
+ D
+
+
+
+ o
+
+
+
+ c
+
+
+
+ u
+
+
+
+ m
+
+
+
+ e
+
+
+
+ n
+
+
+
+ t
+
+ Document
+
+
+
+
+
+ A
+
+
+
+ n
+
+
+
+ a
+
+
+
+ l
+
+
+
+ y
+
+
+
+ s
+
+
+
+ i
+
+
+
+ s
+
+ Analysis
+
+
+
+
+
+ S
+
+
+
+ y
+
+
+
+ s
+
+
+
+ t
+
+
+
+ e
+
+
+
+ m
+
+ System
+
+ Aletheia Document Analysis System
+
+ Aletheia Document Analysis System
+
+
+
+
+
+
+
+
+
+
+
+ O
+
+
+
+ v
+
+
+
+ e
+
+
+
+ r
+
+
+
+ v
+
+
+
+ i
+
+
+
+ e
+
+
+
+ w
+
+
+
+ :
+
+ Overview:
+
+
+
+
+
+ A
+
+
+
+ l
+
+
+
+ e
+
+
+
+ t
+
+
+
+ h
+
+
+
+ e
+
+
+
+ i
+
+
+
+ a
+
+ Aletheia
+
+
+
+
+
+ i
+
+
+
+ s
+
+ is
+
+
+
+
+
+ a
+
+
+
+ n
+
+ an
+
+
+
+
+
+ a
+
+
+
+ d
+
+
+
+ -
+
+ ad-
+
+ Overview: Aletheia is an ad-
+
+
+
+
+
+ vanced
+
+
+
+ system
+
+
+
+ for
+
+
+
+ accurate
+
+
+
+ and
+
+
+
+ yet
+
+ vanced system for accurate and yet
+
+
+
+
+
+ cost-effective
+
+
+
+ ground
+
+
+
+ truthing
+
+
+
+ of
+
+ cost-effective ground truthing of
+
+
+
+
+
+ large
+
+
+
+ amounts
+
+
+
+ of
+
+
+
+ documents.
+
+
+
+ It
+
+
+
+ aids
+
+ large amounts of documents. It aids
+
+
+
+
+
+ the
+
+
+
+ user
+
+
+
+ with
+
+
+
+ a
+
+
+
+ number
+
+
+
+ of
+
+
+
+ automated
+
+ the user with a number of automated
+
+
+
+
+
+ and
+
+
+
+ semi-automated
+
+
+
+ tools
+
+
+
+ which
+
+ and semi-automated tools which
+
+
+
+
+
+ were
+
+
+
+ partly
+
+
+
+ developed
+
+
+
+ and
+
+
+
+ improved
+
+ were partly developed and improved
+
+
+
+
+
+ based
+
+
+
+ on
+
+
+
+ feedback
+
+
+
+ from
+
+
+
+ major
+
+
+
+ librar-
+
+ based on feedback from major librar-
+
+
+
+
+
+ ies
+
+
+
+ across
+
+
+
+ Europe
+
+
+
+ and
+
+
+
+ from
+
+
+
+ their
+
+
+
+ digit-
+
+ ies across Europe and from their digit-
+
+
+
+
+
+ isation
+
+
+
+ service
+
+
+
+ providers
+
+
+
+ which
+
+
+
+ are
+
+
+
+ us-
+
+ isation service providers which are us-
+
+
+
+
+
+ ing
+
+
+
+ the
+
+
+
+ tool
+
+
+
+ in
+
+
+
+ a
+
+
+
+ production
+
+
+
+ environ-
+
+ ing the tool in a production environ-
+
+
+
+
+
+ ment.
+
+ ment.
+
+ Overview: Aletheia is an ad-
+vanced system for accurate and yet
+cost-effective ground truthing of
+large amounts of documents. It aids
+the user with a number of automated
+and semi-automated tools which
+were partly developed and improved
+based on feedback from major librar-
+ies across Europe and from their digit-
+isation service providers which are us-
+ing the tool in a production environ-
+ment.
+
+
+
+
+
+
+
+ Novel
+
+
+
+ features
+
+
+
+ are,
+
+
+
+ among
+
+
+
+ others,
+
+
+
+ the
+
+ Novel features are, among others, the
+
+
+
+
+
+ support
+
+
+
+ of
+
+
+
+ top-down
+
+
+
+ ground
+
+
+
+ truthing
+
+ support of top-down ground truthing
+
+
+
+
+
+ with
+
+
+
+ sophisticated
+
+
+
+ split
+
+
+
+ and
+
+
+
+ shrink
+
+
+
+ tools
+
+ with sophisticated split and shrink tools
+
+
+
+
+
+ as
+
+
+
+ well
+
+
+
+ as
+
+
+
+ bottom-up
+
+
+
+ ground
+
+
+
+ truthing
+
+ as well as bottom-up ground truthing
+
+
+
+
+
+ supporting
+
+
+
+ the
+
+
+
+ aggregation
+
+
+
+ of
+
+
+
+ lower-level
+
+ supporting the aggregation of lower-level
+
+
+
+
+
+ elements
+
+
+
+ to
+
+
+
+ more
+
+
+
+ complex
+
+
+
+ structures.
+
+ elements to more complex structures.
+
+
+
+
+
+ Special
+
+
+
+ features
+
+
+
+ have
+
+
+
+ been
+
+
+
+ developed
+
+
+
+ to
+
+ Special features have been developed to
+
+
+
+
+
+ support
+
+
+
+ working
+
+
+
+ with
+
+
+
+ the
+
+
+
+ complexities
+
+
+
+ of
+
+ support working with the complexities of
+
+
+
+
+
+ historical
+
+
+
+ documents.
+
+
+
+ The
+
+
+
+ integrated
+
+
+
+ vali-
+
+ historical documents. The integrated vali-
+
+
+
+
+
+ dator,
+
+
+
+ in
+
+
+
+ combination
+
+
+
+ with
+
+
+
+ powerful
+
+
+
+ cor-
+
+ dator, in combination with powerful cor-
+
+
+
+
+
+ rection
+
+
+
+ tools,
+
+
+
+ enable
+
+
+
+ efficient
+
+
+
+ production
+
+ rection tools, enable efficient production
+
+
+
+
+
+ of
+
+
+
+ highly
+
+
+
+ accurate
+
+
+
+ ground
+
+
+
+ truth.
+
+ of highly accurate ground truth.
+
+ Novel features are, among others, the
+support of top-down ground truthing
+with sophisticated split and shrink tools
+as well as bottom-up ground truthing
+supporting the aggregation of lower-level
+elements to more complex structures.
+Special features have been developed to
+support working with the complexities of
+historical documents. The integrated vali-
+dator, in combination with powerful cor-
+rection tools, enable efficient production
+of highly accurate ground truth.
+
+
+
+
+
+
+
+ Aletheia
+
+
+
+ uses
+
+
+
+ the
+
+
+
+ PAGE
+
+
+
+ (Page
+
+
+
+ Analysis
+
+ Aletheia uses the PAGE (Page Analysis
+
+
+
+
+
+ and
+
+
+
+ Ground
+
+
+
+ truth
+
+
+
+ Elements)
+
+
+
+ XML
+
+
+
+ format
+
+ and Ground truth Elements) XML format
+
+
+
+
+
+ framework
+
+
+
+ which
+
+
+
+ incorporates
+
+
+
+ several
+
+ framework which incorporates several
+
+
+
+
+
+ XML
+
+
+
+ schemas
+
+
+
+ representing
+
+
+
+ the
+
+
+
+ whole
+
+ XML schemas representing the whole
+
+
+
+
+
+ workflow
+
+
+
+ of
+
+
+
+ document
+
+
+
+ analysis.
+
+
+
+ See
+
+
+
+ also
+
+ workflow of document analysis. See also
+
+
+
+
+
+ the
+
+
+
+ dedicated
+
+
+
+ infobox.
+
+ the dedicated infobox.
+
+ Aletheia uses the PAGE (Page Analysis
+and Ground truth Elements) XML format
+framework which incorporates several
+XML schemas representing the whole
+workflow of document analysis. See also
+the dedicated infobox.
+
+
+
+
+
+
+
+
+
+ Layers
+
+
+
+ and
+
+
+
+ reading
+
+
+
+ order
+
+ Layers and reading order
+
+ Layers and reading order
+
+
+
+
+
+
+
+
+
+ Screenshot
+
+
+
+ of
+
+
+
+ Aletheia
+
+
+
+ showing
+
+
+
+ regions
+
+
+
+ and
+
+
+
+ properties
+
+ Screenshot of Aletheia showing regions and properties
+
+ Screenshot of Aletheia showing regions and properties
+
+
+
+
+
+
+
+
+
+ The
+
+
+
+ PAGE
+
+
+
+ (Page
+
+
+
+ Analysis
+
+
+
+ and
+
+
+
+ Ground
+
+ The PAGE (Page Analysis and Ground
+
+
+
+
+
+ truth
+
+
+
+ Elements)
+
+
+
+ format
+
+
+
+ framework
+
+
+
+ incorpo-
+
+ truth Elements) format framework incorpo-
+
+
+
+
+
+ rates
+
+
+
+ several
+
+
+
+ XML
+
+
+
+ schemas
+
+
+
+ representing
+
+
+
+ the
+
+ rates several XML schemas representing the
+
+
+
+
+
+ whole
+
+
+
+ workflow
+
+
+
+ of
+
+
+
+ document
+
+
+
+ analysis,
+
+
+
+ includ-
+
+ whole workflow of document analysis, includ-
+
+
+
+
+
+ ing
+
+
+
+ image
+
+
+
+ enhancement,
+
+
+
+ binarisation,
+
+
+
+ geo-
+
+ ing image enhancement, binarisation, geo-
+
+
+
+
+
+ metrical
+
+
+
+ correction,
+
+
+
+ layout
+
+
+
+ analysis,
+
+
+
+ layout
+
+ metrical correction, layout analysis, layout
+
+
+
+
+
+ evaluation
+
+
+
+ and
+
+
+
+ OCR.
+
+
+
+ The
+
+
+
+ here
+
+
+
+ used
+
+
+
+ schema
+
+ evaluation and OCR. The here used schema
+
+
+
+
+
+ for
+
+
+
+ document
+
+
+
+ layouts
+
+
+
+ allows
+
+
+
+ for
+
+
+
+ polygonal
+
+ for document layouts allows for polygonal
+
+
+
+
+
+ regions
+
+
+
+ with
+
+
+
+ various
+
+
+
+ attributes
+
+
+
+ (including
+
+
+
+ text
+
+ regions with various attributes (including text
+
+
+
+
+
+ content),
+
+
+
+ reading
+
+
+
+ order,
+
+
+
+ layers
+
+
+
+ and
+
+
+
+ more.
+
+ content), reading order, layers and more.
+
+ The PAGE (Page Analysis and Ground
+truth Elements) format framework incorpo-
+rates several XML schemas representing the
+whole workflow of document analysis, includ-
+ing image enhancement, binarisation, geo-
+metrical correction, layout analysis, layout
+evaluation and OCR. The here used schema
+for document layouts allows for polygonal
+regions with various attributes (including text
+content), reading order, layers and more.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ From
+
+
+
+ Scratch,
+
+
+
+ Top-Down
+
+ From Scratch, Top-Down
+
+ From Scratch, Top-Down
+
+
+
+
+
+
+
+ •
+
+
+
+ Marking
+
+
+
+ regions
+
+
+
+ using
+
+
+
+ man-
+
+ • Marking regions using man-
+
+
+
+
+
+ ual
+
+
+
+ or
+
+
+
+ semi-automated
+
+
+
+ tools
+
+ ual or semi-automated tools
+
+ • Marking regions using man-
+ual or semi-automated tools
+
+
+
+
+
+
+
+ •
+
+
+
+ Marking
+
+
+
+ text
+
+
+
+ kines
+
+
+
+ with
+
+
+
+ easy-
+
+ • Marking text kines with easy-
+
+
+
+
+
+ to-use
+
+
+
+ split
+
+
+
+ tools
+
+ to-use split tools
+
+ • Marking text kines with easy-
+to-use split tools
+
+
+
+
+
+
+
+ •
+
+
+
+ Marking
+
+
+
+ words
+
+
+
+ with
+
+
+
+ assistive
+
+ • Marking words with assistive
+
+
+
+
+
+ tools
+
+ tools
+
+ • Marking words with assistive
+tools
+
+
+
+
+
+
+
+ •
+
+
+
+ Marking
+
+
+
+ glyphs
+
+
+
+ (characters)
+
+ • Marking glyphs (characters)
+
+ • Marking glyphs (characters)
+
+
+
+
+
+
+
+ •
+
+
+
+ Text
+
+
+
+ transcription
+
+
+
+ and
+
+
+
+ propa-
+
+ • Text transcription and propa-
+
+
+
+
+
+ gation
+
+
+
+ to
+
+
+
+ any
+
+
+
+ required
+
+
+
+ level
+
+ gation to any required level
+
+ • Text transcription and propa-
+gation to any required level
+
+
+
+
+
+
+
+ •
+
+
+
+ Reading
+
+
+
+ order
+
+
+
+ definition
+
+ • Reading order definition
+
+ • Reading order definition
+
+
+
+
+
+
+
+ •
+
+
+
+ Validation
+
+
+
+ to
+
+
+
+ reduce
+
+
+
+ risk
+
+ • Validation to reduce risk
+
+
+
+
+
+ of
+
+
+
+ mistakes
+
+ of mistakes
+
+ • Validation to reduce risk
+of mistakes
+
+
+
+
+
+
+
+ •
+
+
+
+ Correcting
+
+
+
+ text
+
+
+
+ content
+
+ • Correcting text content
+
+
+
+
+
+ using
+
+
+
+ rendered
+
+
+
+ text
+
+
+
+ over-
+
+ using rendered text over-
+
+
+
+
+
+ lay
+
+ lay
+
+ • Correcting text content
+using rendered text over-
+lay
+
+
+
+
+
+
+
+ •
+
+
+
+ Correcting
+
+
+
+ layout
+
+
+
+ using
+
+ • Correcting layout using
+
+
+
+
+
+ convenient
+
+
+
+ tools
+
+
+
+ such
+
+
+
+ as
+
+ convenient tools such as
+
+
+
+
+
+ merge
+
+
+
+ and
+
+
+
+ split
+
+ merge and split
+
+ • Correcting layout using
+convenient tools such as
+merge and split
+
+
+
+
+
+
+
+ •
+
+
+
+ Automated
+
+
+
+ page
+
+
+
+ analysis
+
+ • Automated page analysis
+
+
+
+
+
+ with
+
+
+
+ integrated
+
+
+
+ Tesseract
+
+ with integrated Tesseract
+
+
+
+
+
+ OCR
+
+
+
+ or
+
+
+
+ opening
+
+
+
+ externally
+
+ OCR or opening externally
+
+
+
+
+
+ generated
+
+
+
+ result
+
+ generated result
+
+ • Automated page analysis
+with integrated Tesseract
+OCR or opening externally
+generated result
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ T
+
+
+
+ y
+
+
+
+ p
+
+
+
+ i
+
+
+
+ c
+
+
+
+ l
+
+
+
+ a
+
+ Typical
+
+
+
+
+
+ W
+
+
+
+ o
+
+
+
+ r
+
+
+
+ k
+
+
+
+ fl
+
+
+
+ o
+
+
+
+ s
+
+
+
+ w
+
+ Workflows
+
+ Typical Workflows
+
+ Typical Workflows
+
+
+
+
+
+
+
+ Preproduction
+
+
+
+ +
+
+
+
+ Correction
+
+ Preproduction + Correction
+
+ Preproduction + Correction
+
+
+
+
+
+
+
+
+
+ O
+
+
+
+ t
+
+
+
+ h
+
+
+
+ e
+
+
+
+ r
+
+ Other
+
+
+
+
+
+ S
+
+
+
+ o
+
+
+
+ f
+
+
+
+ t
+
+
+
+ w
+
+
+
+ a
+
+
+
+ r
+
+
+
+ e
+
+ Software
+
+
+
+
+
+ T
+
+
+
+ o
+
+
+
+ o
+
+
+
+ l
+
+
+
+ s
+
+ Tools
+
+
+
+
+
+ b
+
+
+
+ y
+
+ by
+
+
+
+
+
+ P
+
+
+
+ R
+
+
+
+ I
+
+
+
+ A
+
+
+
+ m
+
+ PRImA
+
+ Other Software Tools by PRImA
+
+ Other Software Tools by PRImA
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Pattern
+
+
+
+ Recognition
+
+
+
+ and
+
+
+
+ Image
+
+
+
+ Analysis
+
+
+
+ Research
+
+
+
+ Lab,
+
+
+
+ School
+
+
+
+ of
+
+
+
+ Computing,
+
+
+
+ Science
+
+
+
+ and
+
+
+
+ Engineering,
+
+ Pattern Recognition and Image Analysis Research Lab, School of Computing, Science and Engineering,
+
+
+
+
+
+ University
+
+
+
+ of
+
+
+
+ Salford,
+
+
+
+ Greater
+
+
+
+ Manchester,
+
+
+
+ United
+
+
+
+ Kingdom,
+
+
+
+ www.primaresearch.org
+
+ University of Salford, Greater Manchester, United Kingdom, www.primaresearch.org
+
+ Pattern Recognition and Image Analysis Research Lab, School of Computing, Science and Engineering,
+University of Salford, Greater Manchester, United Kingdom, www.primaresearch.org
+
+
+
+
+
+
+
+ WebAletheia
+
+
+
+ Webapp
+
+ WebAletheia Webapp
+
+ WebAletheia Webapp
+
+
+
+
+
+
+
+ Tesseract
+
+
+
+ OCR
+
+
+
+ to
+
+
+
+ PAGE
+
+
+
+ For
+
+
+
+ Windows
+
+ Tesseract OCR to PAGE For Windows
+
+ Tesseract OCR to PAGE For Windows
+
+
+
+
+
+
+
+ PAGE
+
+
+
+ Libraries
+
+
+
+ For
+
+
+
+ Java
+
+
+
+ and
+
+
+
+ C++
+
+ PAGE Libraries For Java and C++
+
+ PAGE Libraries For Java and C++
+
+
+
+
+
+
+
+ Layout
+
+
+
+ Evaluation
+
+
+
+ Performance
+
+
+
+ Analysis
+
+
+
+ System
+
+ Layout Evaluation Performance Analysis System
+
+ Layout Evaluation Performance Analysis System
+
+
+
+
+
+
+
+ A
+
+
+
+ lightweight
+
+
+
+ web-based
+
+
+
+ version
+
+
+
+ of
+
+
+
+ the
+
+
+
+ Aletheia
+
+ A lightweight web-based version of the Aletheia
+
+
+
+
+
+ ground
+
+
+
+ truthing
+
+
+
+ system.
+
+
+
+ Ideal
+
+
+
+ for
+
+
+
+ customised
+
+ ground truthing system. Ideal for customised
+
+
+
+
+
+ workflows
+
+
+
+ and
+
+
+
+ crowdsourcing
+
+
+
+ applications.
+
+
+
+ Go
+
+
+
+ to
+
+ workflows and crowdsourcing applications. Go to
+
+
+
+
+
+ the
+
+
+
+ PRImA
+
+
+
+ website
+
+
+
+ to
+
+
+
+ try
+
+
+
+ it
+
+
+
+ yourself.
+
+ the PRImA website to try it yourself.
+
+ A lightweight web-based version of the Aletheia
+ground truthing system. Ideal for customised
+workflows and crowdsourcing applications. Go to
+the PRImA website to try it yourself.
+
+
+
+
+
+
+
+ A
+
+
+
+ command
+
+
+
+ line
+
+
+
+ tool
+
+
+
+ to
+
+
+
+ analyse
+
+
+
+ document
+
+
+
+ page
+
+ A command line tool to analyse document page
+
+
+
+
+
+ images
+
+
+
+ using
+
+
+
+ the
+
+
+
+ open
+
+
+
+ source
+
+
+
+ OCR
+
+
+
+ engine
+
+
+
+ Tesser-
+
+ images using the open source OCR engine Tesser-
+
+
+
+
+
+ act
+
+
+
+ and
+
+
+
+ save
+
+
+
+ the
+
+
+
+ results
+
+
+
+ to
+
+
+
+ PAGE
+
+
+
+ XML
+
+
+
+ format.
+
+ act and save the results to PAGE XML format.
+
+
+
+
+
+ Version
+
+
+
+ 1.3
+
+
+
+ is
+
+
+
+ based
+
+
+
+ on
+
+
+
+ the
+
+
+
+ latest
+
+
+
+ release
+
+
+
+ of
+
+
+
+ Tesser-
+
+ Version 1.3 is based on the latest release of Tesser-
+
+
+
+
+
+ act
+
+
+
+ (3.03).
+
+ act (3.03).
+
+ A command line tool to analyse document page
+images using the open source OCR engine Tesser-
+act and save the results to PAGE XML format.
+Version 1.3 is based on the latest release of Tesser-
+act (3.03).
+
+
+
+
+
+
+
+ Platform
+
+
+
+ independent
+
+
+
+ libraries
+
+
+
+ to
+
+
+
+ create
+
+
+
+ valid
+
+
+
+ lay-
+
+ Platform independent libraries to create valid lay-
+
+
+
+
+
+ out
+
+
+
+ descriptions
+
+
+
+ in
+
+
+
+ PAGE
+
+
+
+ XML
+
+
+
+ format.
+
+
+
+ The
+
+
+
+ libraries
+
+ out descriptions in PAGE XML format. The libraries
+
+
+
+
+
+ can
+
+
+
+ be
+
+
+
+ easily
+
+
+
+ integrated
+
+
+
+ in
+
+
+
+ other
+
+
+
+ software
+
+
+
+ projects
+
+ can be easily integrated in other software projects
+
+
+
+
+
+ such
+
+
+
+ as
+
+
+
+ page
+
+
+
+ segmentation
+
+
+
+ methods
+
+
+
+ for
+
+
+
+ ICDAR
+
+ such as page segmentation methods for ICDAR
+
+
+
+
+
+ competitions.
+
+ competitions.
+
+ Platform independent libraries to create valid lay-
+out descriptions in PAGE XML format. The libraries
+can be easily integrated in other software projects
+such as page segmentation methods for ICDAR
+competitions.
+
+
+
+
+
+
+
+ This
+
+
+
+ tool
+
+
+
+ is
+
+
+
+ part
+
+
+
+ of
+
+
+
+ a
+
+
+
+ framework
+
+
+
+ for
+
+
+
+ evaluating
+
+
+
+ the
+
+ This tool is part of a framework for evaluating the
+
+
+
+
+
+ performance
+
+
+
+ of
+
+
+
+ layout
+
+
+
+ analysis
+
+
+
+ methods.
+
+
+
+ It
+
+
+
+ com-
+
+ performance of layout analysis methods. It com-
+
+
+
+
+
+ bines
+
+
+
+ efficiency
+
+
+
+ and
+
+
+
+ accuracy
+
+
+
+ by
+
+
+
+ using
+
+
+
+ a
+
+
+
+ special
+
+ bines efficiency and accuracy by using a special
+
+
+
+
+
+ interval
+
+
+
+ based
+
+
+
+ geometric
+
+
+
+ representation
+
+
+
+ of
+
+
+
+ regions.
+
+ interval based geometric representation of regions.
+
+
+
+
+
+ A
+
+
+
+ wide
+
+
+
+ range
+
+
+
+ of
+
+
+
+ sophisticated
+
+
+
+ evaluation
+
+
+
+ measures
+
+ A wide range of sophisticated evaluation measures
+
+
+
+
+
+ provide
+
+
+
+ the
+
+
+
+ means
+
+
+
+ for
+
+
+
+ a
+
+
+
+ deep
+
+
+
+ insight
+
+
+
+ into
+
+
+
+ the
+
+ provide the means for a deep insight into the
+
+
+
+
+
+ analysed
+
+
+
+ systems,
+
+ analysed systems,
+
+
+
+
+
+ which
+
+
+
+ goes
+
+
+
+ far
+
+ which goes far
+
+
+
+
+
+ beyond
+
+
+
+ simple
+
+ beyond simple
+
+
+
+
+
+ benchmarking.
+
+
+
+ The
+
+ benchmarking. The
+
+
+
+
+
+ support
+
+
+
+ of
+
+
+
+ user-
+
+ support of user-
+
+
+
+
+
+ defined
+
+
+
+ profiles
+
+ defined profiles
+
+
+
+
+
+ allows
+
+
+
+ the
+
+
+
+ tuning
+
+ allows the tuning
+
+
+
+
+
+ for
+
+
+
+ any
+
+
+
+ kind
+
+
+
+ of
+
+ for any kind of
+
+
+
+
+
+ evaluation
+
+
+
+ scenario
+
+ evaluation scenario
+
+
+
+
+
+ related
+
+
+
+ to
+
+
+
+ real
+
+ related to real
+
+
+
+
+
+ world
+
+
+
+ applications.
+
+ world applications.
+
+ This tool is part of a framework for evaluating the
+performance of layout analysis methods. It com-
+bines efficiency and accuracy by using a special
+interval based geometric representation of regions.
+A wide range of sophisticated evaluation measures
+provide the means for a deep insight into the
+analysed systems,
+which goes far
+beyond simple
+benchmarking. The
+support of user-
+defined profiles
+allows the tuning
+for any kind of
+evaluation scenario
+related to real
+world applications.
+
+
diff --git a/tests/example/workspaces/aletheiaexamplepage/mets.xml b/tests/example/workspaces/aletheiaexamplepage/mets.xml
new file mode 100644
index 0000000..c0fa26a
--- /dev/null
+++ b/tests/example/workspaces/aletheiaexamplepage/mets.xml
@@ -0,0 +1,73 @@
+
+
+
+
+ ocrd/core v2.26.1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/tests/model/test_page.py b/tests/model/test_page.py
new file mode 100644
index 0000000..2b2ab85
--- /dev/null
+++ b/tests/model/test_page.py
@@ -0,0 +1,14 @@
+from tests import TestCase, TEST_BASE_PATH
+from ocrd_browser.model import Document
+
+
+class PageTestCase(TestCase):  # Exercises xpath() on pages of the 'aletheiaexamplepage' example workspace.
+
+ def setUp(self):  # Runs before each test: load the example workspace document from its mets.xml.
+ self.doc = Document.load(TEST_BASE_PATH / 'example/workspaces/aletheiaexamplepage/mets.xml')
+
+ def test_xpath_works_with_different_namespaces(self):  # A 'page:'-prefixed query must return results for every listed page.
+ for page_id in ['PAGE_2017', 'PAGE_2018', 'PAGE_2019']:  # presumably one fixture per PAGE schema version/namespace - confirm against the XML files
+ page = self.doc.page_for_id(page_id, 'OCR-D-GT-PAGE')  # look the page up by id within the 'OCR-D-GT-PAGE' file group
+ xpath_result = page.xpath('/page:PcGts/page:Page/@imageFilename')  # absolute query for the imageFilename attribute of page:Page
+ self.assertGreater(len(xpath_result), 0)  # fails when the query matched nothing for this page
diff --git a/tests/util/test_image.py b/tests/util/test_image.py
index 3cd2ed5..cda0ab0 100644
--- a/tests/util/test_image.py
+++ b/tests/util/test_image.py
@@ -31,7 +31,7 @@ def _image_modes():
class ImageUtilTestCase(TestCase):
def test_pil_to_pixbuf_is_faster_via_opencv(self):
- # self.skipTest('Slow test')
+ self.skipTest('Slow test')
files = [
ASSETS_PATH / 'kant_aufklaerung_1784-binarized/data/OCR-D-IMG/OCR-D-IMG_0017.tif',
ASSETS_PATH / 'kant_aufklaerung_1784-binarized/data/OCR-D-IMG-1BIT/OCR-D-IMG-1BIT_0017.png',