Skip to content
Permalink

Comparing changes

Choose two branches to see what’s changed or to start a new pull request. If you need to, you can also compare across forks or learn more about diff comparisons.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks. Learn more about diff comparisons here.
base repository: OCR-D/ocrd_segment
Failed to load repositories. Confirm that selected base ref is valid, then try again.
Loading
base: v0.1.17
Choose a base ref
...
head repository: OCR-D/ocrd_segment
Failed to load repositories. Confirm that selected head ref is valid, then try again.
Loading
compare: v0.1.18
Choose a head ref
  • 4 commits
  • 3 files changed
  • 1 contributor

Commits on Mar 29, 2022

  1. Copy the full SHA
    c523b8c View commit details

Commits on Mar 30, 2022

  1. extract-lines: fix typo

    bertsky committed Mar 30, 2022
    Copy the full SHA
    f6c7f1c View commit details
  2. Copy the full SHA
    a52f684 View commit details
  3. 📦 0.1.18

    bertsky committed Mar 30, 2022
    Copy the full SHA
    a197002 View commit details
Showing with 45 additions and 38 deletions.
  1. +7 −0 CHANGELOG.md
  2. +7 −7 ocrd_segment/extract_lines.py
  3. +31 −31 ocrd_segment/ocrd-tool.json
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -4,6 +4,11 @@ Versioned according to [Semantic Versioning](http://semver.org/).

## Unreleased

## [0.1.18] - 2022-03-30

* extract-lines/words: move extra parameters where they belong
* extract-lines: fix regressions in v0.1.15

## [0.1.17] - 2022-03-12

Changed:
@@ -163,6 +168,8 @@ Changed:
* further improve README

<!-- link-labels -->
[0.1.18]: ../../compare/v0.1.17...v0.1.18
[0.1.17]: ../../compare/v0.1.16...v0.1.17
[0.1.16]: ../../compare/v0.1.15...v0.1.16
[0.1.15]: ../../compare/v0.1.14...v0.1.15
[0.1.14]: ../../compare/v0.1.13...v0.1.14
14 changes: 7 additions & 7 deletions ocrd_segment/extract_lines.py
Original file line number Diff line number Diff line change
@@ -74,7 +74,7 @@ def process(self):
* fileID + regionID + lineID + '.nrm.png': line image (if the workflow provides grayscale-normalized images)
* fileID + regionID + lineID + '.json': line metadata.
* fileID + regionID + lineID + '.gt.txt': line text.
* fileID + '.xslx': spreadsheet file.
* fileID + '.xlsx': spreadsheet file.
(This is intended for correction, training and evaluation of OCR models.)
"""
@@ -110,7 +110,7 @@ def process(self):
if not os.path.isdir(self.output_file_grp):
os.mkdir(self.output_file_grp)

if 'xslx' in out_types:
if 'xlsx' in out_types:
LOG.info('Writing Excel result file "%s.xlsx" in "%s"', file_id, self.output_file_grp)
excel_path = '%s.xlsx' % os.path.join(self.output_file_grp, file_id)
workbook = xlsxwriter.Workbook(excel_path,
@@ -130,10 +130,10 @@ def process(self):
worksheet.write('D1', 'Image', bold)
symbols = 'ſ ꝛ aͤ oͤ uͤ æ œ Æ Œ ℳ ç ę ë č ř š ž ě — – - ⸗ = Α α Β β ϐ Γ γ Δ δ Ε ε ϵ Ζ ζ Η η Θ θ ϑ Ι ι ' \
'Κ κ ϰ Λ λ Μ μ Ν ν Ξ ξ Ο ο Π π ϖ Ρ ρ ϱ Σ σ ς ϲ Τ τ Υ υ ϒ Φ φ ϕ Χ χ Ψ ψ Ω ω'.split(' ')
for i, s in enumerate(symbols):
col_idx = 4 + i
worksheet.write_string(0, col_idx, s, editable)
worksheet.set_column(col_idx, col_idx, 2)
for i, s in enumerate(symbols):
col_idx = 4 + i
worksheet.write_string(0, col_idx, s, editable)
worksheet.set_column(col_idx, col_idx, 2)
worksheet.protect('', {
'objects': True,
'scenarios': True,
@@ -255,7 +255,7 @@ def process(self):
self.output_file_grp,
page_id=page_id,
mimetype=self.parameter['mimetype'])
if 'xslx' in out_types:
if 'xlsx' in out_types:
scale = 40.0 / line_image.height
worksheet.write('A%d' % i, file_id + '_' + region.id + '_' + line.id, normal)
if len(ltext) > max_text_length:
62 changes: 31 additions & 31 deletions ocrd_segment/ocrd-tool.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"version": "0.1.17",
"version": "0.1.18",
"git_url": "https://github.com/OCR-D/ocrd_segment",
"tools": {
"ocrd-segment-repair": {
@@ -315,6 +315,36 @@
"type": "boolean",
"default": true,
"description": "Add alpha channels with segment masks to the images"
},
"output-types": {
"type": "array",
"default": ["text", "json", "xlsx"],
"items": {
"type": "string",
"enum": ["text", "json", "xlsx"]
},
"description": "What kind of files to extract besides the line image itself (text/json files for each line, xlsx per page)."
},
"library-convention": {
"type": "string",
"enum": ["slub", "sbb", "none"],
"default": "none",
"description": "For xlsx extraction, to make line images hyperlinked, use this scheme in reconstructing presentation URLs of original pages. Libraries have different conventions in their METS files. Set to none to disable."
},
"min-line-length": {
"type": "number",
"default": 5,
"description": "Only extract lines with at least this many characters."
},
"min-line-width": {
"type": "number",
"default": 200,
"description": "Only extract lines that are at least this wide (in px)."
},
"min-line-height": {
"type": "number",
"default": 30,
"description": "Only extract lines that are at least this high (in px)."
}
}
},
@@ -346,36 +376,6 @@
"type": "boolean",
"default": true,
"description": "Add alpha channels with segment masks to the images"
},
"output-types": {
"type": "array",
"default": ["text", "json", "xslx"],
"items": {
"type": "string",
"enum": ["text", "json", "xslx"]
},
"description": "What kind of files to extract besides the line image itself (text/json files for each line, xlsx per page)."
},
"library-convention": {
"type": "string",
"enum": ["slub", "sbb", "none"],
"default": "none",
"description": "For xlsx extraction, to make line images hyperlinked, use this scheme in reconstructing presentation URLs of original pages. Libraries have different conventions in their METS files. Set to none to disable."
},
"min-line-length": {
"type": "number",
"default": 5,
"description": "Only extract lines with at least this many characters."
},
"min-line-width": {
"type": "number",
"default": 200,
"description": "Only extract lines that are at least this wide (in px)."
},
"min-line-height": {
"type": "number",
"default": 30,
"description": "Only extract lines that are at least this high (in px)."
}
}
},