OCR-D · Mar 29, 2022 · Mar 30, 2022 · Mar 30, 2022 · Mar 30, 2022
Showing with 45 additions and 38 deletions.

+7 −0 CHANGELOG.md

+7 −7 ocrd_segment/extract_lines.py

+31 −31 ocrd_segment/ocrd-tool.json
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,11 @@ Versioned according to [Semantic Versioning](http://semver.org/).
 
 ## Unreleased
 
+## [0.1.18] - 2022-03-30
+
+ * extract-lines/words: move extra parameters where they belong
+ * extract-lines: fix regressions introduced in v0.1.15 (`xslx` typo for `xlsx` output type, mis-indented symbol column loop)
+
 ## [0.1.17] - 2022-03-12
 
 Changed:
@@ -163,6 +168,8 @@ Changed:
   * further improve README
 
 <!-- link-labels -->
+[0.1.18]: ../../compare/v0.1.17...v0.1.18
+[0.1.17]: ../../compare/v0.1.16...v0.1.17
 [0.1.16]: ../../compare/v0.1.15...v0.1.16
 [0.1.15]: ../../compare/v0.1.14...v0.1.15
 [0.1.14]: ../../compare/v0.1.13...v0.1.14

diff --git a/ocrd_segment/extract_lines.py b/ocrd_segment/extract_lines.py
@@ -74,7 +74,7 @@ def process(self):
         * fileID + regionID + lineID + '.nrm.png': line image (if the workflow provides grayscale-normalized images)
         * fileID + regionID + lineID + '.json': line metadata.
         * fileID + regionID + lineID + '.gt.txt': line text.
-        * fileID + '.xslx': spreadsheet file.
+        * fileID + '.xlsx': spreadsheet file.
         
         (This is intended for correction, training and evaluation of OCR models.)
         """
@@ -110,7 +110,7 @@ def process(self):
             if not os.path.isdir(self.output_file_grp):
                 os.mkdir(self.output_file_grp)
 
-            if 'xslx' in out_types:
+            if 'xlsx' in out_types:
                 LOG.info('Writing Excel result file "%s.xlsx" in "%s"', file_id, self.output_file_grp)
                 excel_path = '%s.xlsx' % os.path.join(self.output_file_grp, file_id)
                 workbook = xlsxwriter.Workbook(excel_path,
@@ -130,10 +130,10 @@ def process(self):
                 worksheet.write('D1', 'Image', bold)
                 symbols = 'ſ ꝛ aͤ oͤ uͤ æ œ Æ Œ ℳ  ç ę ë č ř š ž ě — – - ⸗ = Α α Β β ϐ Γ γ Δ δ Ε ε ϵ Ζ ζ Η η Θ θ ϑ Ι ι ' \
                     'Κ κ ϰ Λ λ Μ μ Ν ν Ξ ξ Ο ο Π π ϖ Ρ ρ ϱ Σ σ ς ϲ Τ τ Υ υ ϒ Φ φ ϕ Χ χ Ψ ψ Ω ω'.split(' ')
-            for i, s in enumerate(symbols):
-                col_idx = 4 + i
-                worksheet.write_string(0, col_idx, s, editable)
-                worksheet.set_column(col_idx, col_idx, 2)
+                for i, s in enumerate(symbols):
+                    col_idx = 4 + i
+                    worksheet.write_string(0, col_idx, s, editable)
+                    worksheet.set_column(col_idx, col_idx, 2)
                 worksheet.protect('', {
                     'objects':               True,
                     'scenarios':             True,
@@ -255,7 +255,7 @@ def process(self):
                         self.output_file_grp,
                         page_id=page_id,
                         mimetype=self.parameter['mimetype'])
-                    if 'xslx' in out_types:
+                    if 'xlsx' in out_types:
                         scale = 40.0 / line_image.height
                         worksheet.write('A%d' % i, file_id + '_' + region.id + '_' + line.id, normal)
                         if len(ltext) > max_text_length:

diff --git a/ocrd_segment/ocrd-tool.json b/ocrd_segment/ocrd-tool.json
@@ -1,5 +1,5 @@
 {
-  "version": "0.1.17",
+  "version": "0.1.18",
   "git_url": "https://github.com/OCR-D/ocrd_segment",
   "tools": {
     "ocrd-segment-repair": {
@@ -315,6 +315,36 @@
           "type": "boolean",
           "default": true,
           "description": "Add alpha channels with segment masks to the images"
+        },
+        "output-types": {
+          "type": "array",
+          "default": ["text", "json", "xlsx"],
+          "items": {
+            "type": "string",
+            "enum": ["text", "json", "xlsx"]
+          },
+          "description": "What kind of files to extract besides the line image itself (text/json files for  each line, xlsx per page)."
+        },
+        "library-convention": {
+          "type": "string",
+          "enum": ["slub", "sbb", "none"],
+          "default": "none",
+          "description": "For xlsx extraction, to make line images hyperlinked, use this scheme in reconstructing presentation URLs of original pages. Libraries have different conventions in their METS files. Set to none to disable."
+        },
+        "min-line-length": {
+          "type": "number",
+          "default": 5,
+          "description": "Only extract lines with at least this many characters."
+        },
+        "min-line-width": {
+          "type": "number",
+          "default": 200,
+          "description": "Only extract lines that are at least this wide (in px)."
+        },
+        "min-line-height": {
+          "type": "number",
+          "default": 30,
+          "description": "Only extract lines that are at least this high (in px)."
         }
       }
     },
@@ -346,36 +376,6 @@
           "type": "boolean",
           "default": true,
           "description": "Add alpha channels with segment masks to the images"
-        },
-        "output-types": {
-          "type": "array",
-          "default": ["text", "json", "xslx"],
-          "items": {
-            "type": "string",
-            "enum": ["text", "json", "xslx"]
-          },
-          "description": "What kind of files to extract besides the line image itself (text/json files for  each line, xlsx per page)."
-        },
-        "library-convention": {
-          "type": "string",
-          "enum": ["slub", "sbb", "none"],
-          "default": "none",
-          "description": "For xlsx extraction, to make line images hyperlinked, use this scheme in reconstructing presentation URLs of original pages. Libraries have different conventions in their METS files. Set to none to disable."
-        },
-        "min-line-length": {
-          "type": "number",
-          "default": 5,
-          "description": "Only extract lines with at least this many characters."
-        },
-        "min-line-width": {
-          "type": "number",
-          "default": 200,
-          "description": "Only extract lines that are at least this wide (in px)."
-        },
-        "min-line-height": {
-          "type": "number",
-          "default": 30,
-          "description": "Only extract lines that are at least this high (in px)."
         }
       }
     },