Skip to content

Commit

Permalink
Merge pull request #101 from OCR-D/use-ruff-config-like-dinglehoppers
Browse files Browse the repository at this point in the history
Use ruff config like dinglehopper's
  • Loading branch information
mikegerber authored Oct 19, 2023
2 parents 8188219 + a1aa6ab commit 6a158f0
Show file tree
Hide file tree
Showing 10 changed files with 132 additions and 80 deletions.
29 changes: 29 additions & 0 deletions .editorconfig
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
root = true

[*]
charset = utf-8
end_of_line = lf
indent_size = 4
indent_style = space
insert_final_newline = true
trim_trailing_whitespace = true
max_line_length = 88
tab_width = 4

[{*.cfg,*.ini,*.html,*.yaml,*.yml}]
indent_size = 2

[*.json]
indent_size = 2
insert_final_newline = true

# trailing spaces in markdown indicate word wrap
[*.md]
trim_trailing_whitespace = false

[*.py]
multi_line_output = 3
include_trailing_comma = True
force_grid_wrap = 0
use_parentheses = True
ensure_newline_before_comments = True
44 changes: 23 additions & 21 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,30 +1,32 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v3.2.0
hooks:
- hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-json
- id: check-toml
- id: check-yaml
- id: check-added-large-files
- id: check-ast

- repo: https://github.com/psf/black
rev: 22.10.0
hooks:
repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.5.0
- hooks:
- id: black

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.0.280
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix]

- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.4.1
hooks:
- id: mypy
additional_dependencies: ['types-setuptools']
repo: https://github.com/psf/black
rev: 23.10.0
- hooks:
- args:
- --fix
- --exit-non-zero-on-fix
id: ruff
repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.1.0
- hooks:
- additional_dependencies:
- types-setuptools
id: mypy
repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.6.1
- hooks:
- id: pre-commit-update
repo: https://gitlab.com/vojko.pribudic/pre-commit-update
rev: v0.1.0
2 changes: 1 addition & 1 deletion ocrd_calamari/cli.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import click

from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor

from ocrd_calamari.recognize import CalamariRecognize


Expand Down
1 change: 1 addition & 0 deletions ocrd_calamari/config.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json

from pkg_resources import resource_string

OCRD_TOOL = json.loads(resource_string(__name__, "ocrd-tool.json").decode("utf8"))
9 changes: 5 additions & 4 deletions ocrd_calamari/fix_calamari1_model.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import re
import json
import click
from glob import glob
import re
from copy import deepcopy
from glob import glob

import click

from ocrd_calamari.util import working_directory

Expand All @@ -23,7 +24,7 @@ def fix_calamari1_model(checkpoint_dir):
old_j = deepcopy(j)

for v in j["model"].values():
if type(v) != dict:
if not isinstance(v, dict):
continue
for child in v.get("children", []):
for replacement in child.get("replacements", []):
Expand Down
86 changes: 49 additions & 37 deletions ocrd_calamari/recognize.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,45 @@
from __future__ import absolute_import

import os
import itertools
import os
from glob import glob

import numpy as np
from ocrd import Processor
from ocrd_modelfactory import page_from_file
from ocrd_models.ocrd_page import (
CoordsType,
GlyphType,
TextEquivType,
WordType,
to_xml,
)
from ocrd_utils import (
MIMETYPE_PAGE,
assert_file_grp_cardinality,
coordinates_for_segment,
getLogger,
make_file_id,
points_from_polygon,
polygon_from_x0y0x1y1,
MIMETYPE_PAGE,
tf_disable_interactive_logs,
)

# Disable tensorflow/keras logging via print before importing calamari
# (and disable ruff's import checks and sorting here)
# ruff: noqa: E402
# ruff: isort: off
tf_disable_interactive_logs()

from tensorflow import __version__ as tensorflow_version
from calamari_ocr import __version__ as calamari_version
from calamari_ocr.ocr import MultiPredictor
from calamari_ocr.ocr.voting import voter_from_proto
from calamari_ocr.proto import VoterParams
from ocrd import Processor
from ocrd_modelfactory import page_from_file
from ocrd_models.ocrd_page import (
TextEquivType,
WordType,
GlyphType,
CoordsType,
to_xml,
)

from ocrd_calamari.config import OCRD_TOOL
# ruff: isort: on

from tensorflow import __version__ as tensorflow_version
from ocrd_calamari.config import OCRD_TOOL

TOOL = "ocrd-calamari-recognize"

Expand Down Expand Up @@ -64,8 +68,14 @@ def setup(self):
self.network_input_channels = self.predictor.predictors[
0
].network.input_channels
# self.network_input_channels = self.predictor.predictors[0].network_params.channels # not used!
# binarization = self.predictor.predictors[0].model_params.data_preprocessor.binarization # not used!

# not used:
# self.network_input_channels = \
# self.predictor.predictors[0].network_params.channels
# not used:
# binarization = \
# self.predictor.predictors[0].model_params\
# .data_preprocessor.binarization
# self.features = ('' if self.network_input_channels != 1 else
# 'binarized' if binarization != 'GRAY' else
# 'grayscale_normalized')
Expand All @@ -79,16 +89,17 @@ def process(self):
"""
Perform text recognition with Calamari on the workspace.
If ``texequiv_level`` is ``word`` or ``glyph``, then additionally create word / glyph level segments by
splitting at white space characters / glyph boundaries. In the case of ``glyph``, add all alternative character
hypotheses down to ``glyph_conf_cutoff`` confidence threshold.
If ``textequiv_level`` is ``word`` or ``glyph``, then additionally create word /
glyph level segments by splitting at white space characters / glyph boundaries.
In the case of ``glyph``, add all alternative character hypotheses down to
``glyph_conf_cutoff`` confidence threshold.
"""
log = getLogger("processor.CalamariRecognize")

assert_file_grp_cardinality(self.input_file_grp, 1)
assert_file_grp_cardinality(self.output_file_grp, 1)

for (n, input_file) in enumerate(self.input_files):
for n, input_file in enumerate(self.input_files):
page_id = input_file.pageId or input_file.ID
log.info("INPUT FILE %i / %s", n, page_id)
pcgts = page_from_file(self.workspace.download_file(input_file))
Expand Down Expand Up @@ -162,7 +173,6 @@ def process(self):
for line, line_coords, raw_results in zip(
textlines, line_coordss, raw_results_all
):

for i, p in enumerate(raw_results):
p.prediction.id = "fold_{}".format(i)

Expand All @@ -171,10 +181,12 @@ def process(self):

# Build line text on our own
#
# Calamari does whitespace post-processing on prediction.sentence, while it does not do the same
# on prediction.positions. Do it on our own to have consistency.
# Calamari does whitespace post-processing on prediction.sentence,
# while it does not do the same on prediction.positions. Do it on
# our own to have consistency.
#
# XXX Check Calamari's built-in post-processing on prediction.sentence
# XXX Check Calamari's built-in post-processing on
# prediction.sentence

def _sort_chars(p):
"""Filter and sort chars of prediction p"""
Expand Down Expand Up @@ -223,9 +235,8 @@ def _drop_double_spaces_generator(positions):
line_text = "".join(_sort_chars(p)[0].char for p in positions)
if line_text != prediction.sentence:
log.warning(
"Our own line text is not the same as Calamari's: '%s' != '%s'",
line_text,
prediction.sentence,
f"Our own line text is not the same as Calamari's: "
f"'{line_text}' != '{prediction.sentence}'"
)

# Delete existing results
Expand All @@ -246,8 +257,9 @@ def _drop_double_spaces_generator(positions):

# Save word results
#
# Calamari OCR does not provide word positions, so we infer word positions from a. text segmentation
# and b. the glyph positions. This is necessary because the PAGE XML format enforces a strict
# Calamari OCR does not provide word positions, so we infer word
# positions from a. text segmentation and b. the glyph positions.
# This is necessary because the PAGE XML format enforces a strict
# hierarchy of lines > words > glyphs.

def _words(s):
Expand Down Expand Up @@ -316,7 +328,9 @@ def _words(s):
)

# Add predictions (= TextEquivs)
char_index_start = 1 # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs
char_index_start = 1
# Index must start with 1, see
# https://ocr-d.github.io/page#multiple-textequivs
for char_index, char in enumerate(
_sort_chars(p), start=char_index_start
):
Expand Down Expand Up @@ -351,13 +365,14 @@ def _words(s):
)


# TODO: This is a copy of ocrd_tesserocr's function, and should probably be moved to a ocrd lib
# TODO: This is a copy of ocrd_tesserocr's function, and should probably be moved to
# an ocrd lib
def _page_update_higher_textequiv_levels(level, pcgts):
"""Update the TextEquivs of all PAGE-XML hierarchy levels above `level` for consistency.
"""Update the TextEquivs of all higher PAGE-XML hierarchy levels for consistency.
Starting with the hierarchy level chosen for processing,
join all first TextEquiv (by the rules governing the respective level)
into TextEquiv of the next higher level, replacing them.
Starting with the hierarchy level `level` chosen for processing, join all first
TextEquiv (by the rules governing the respective level) into TextEquiv of the next
higher level, replacing them.
"""
regions = pcgts.get_Page().get_TextRegion()
if level != "region":
Expand Down Expand Up @@ -390,6 +405,3 @@ def _page_update_higher_textequiv_levels(level, pcgts):
for line in lines
)
region.set_TextEquiv([TextEquivType(Unicode=region_unicode)]) # remove old


# vim:tw=120:
2 changes: 2 additions & 0 deletions ruff.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# TODO: This should go to pyproject.toml once we have one
select = ["E", "F", "I"]
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# -*- coding: utf-8 -*-
from pathlib import Path
import json
from pathlib import Path

from setuptools import setup, find_packages
from setuptools import find_packages, setup

with open("./ocrd-tool.json", "r") as f:
version = json.load(f)["version"]
Expand Down
2 changes: 1 addition & 1 deletion test/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

import os
import sys

from test.assets import assets

from ocrd_utils import initLogging

PWD = os.path.dirname(os.path.realpath(__file__))
Expand Down
Loading

0 comments on commit 6a158f0

Please sign in to comment.