Skip to content

Commit

Permalink
Merge pull request #101 from OCR-D/use-ruff-config-like-dinglehoppers
Browse files Browse the repository at this point in the history
Use ruff config like dinglehopper's
  • Loading branch information
mikegerber authored Oct 19, 2023
2 parents 8188219 + a1aa6ab commit 6a158f0
Show file tree
Hide file tree
Showing 10 changed files with 132 additions and 80 deletions.
29 changes: 29 additions & 0 deletions .editorconfig
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
root = true

[*]
charset = utf-8
end_of_line = lf
indent_size = 4
indent_style = space
insert_final_newline = true
trim_trailing_whitespace = true
max_line_length = 88
tab_width = 4

[{*.cfg,*.ini,*.html,*.yaml,*.yml}]
indent_size = 2

[*.json]
indent_size = 2
insert_final_newline = true

# trailing spaces in markdown indicate word wrap
[*.md]
trim_trailing_whitespace = false

[*.py]
multi_line_output = 3
include_trailing_comma = True
force_grid_wrap = 0
use_parentheses = True
ensure_newline_before_comments = True
44 changes: 23 additions & 21 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,30 +1,32 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v3.2.0
hooks:
- hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-json
- id: check-toml
- id: check-yaml
- id: check-added-large-files
- id: check-ast

- repo: https://github.com/psf/black
rev: 22.10.0
hooks:
repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.5.0
- hooks:
- id: black

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.0.280
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix]

- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.4.1
hooks:
- id: mypy
additional_dependencies: ['types-setuptools']
repo: https://github.com/psf/black
rev: 23.10.0
- hooks:
- args:
- --fix
- --exit-non-zero-on-fix
id: ruff
repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.1.0
- hooks:
- additional_dependencies:
- types-setuptools
id: mypy
repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.6.1
- hooks:
- id: pre-commit-update
repo: https://gitlab.com/vojko.pribudic/pre-commit-update
rev: v0.1.0
2 changes: 1 addition & 1 deletion ocrd_calamari/cli.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import click

from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor

from ocrd_calamari.recognize import CalamariRecognize


Expand Down
1 change: 1 addition & 0 deletions ocrd_calamari/config.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json

from pkg_resources import resource_string

OCRD_TOOL = json.loads(resource_string(__name__, "ocrd-tool.json").decode("utf8"))
9 changes: 5 additions & 4 deletions ocrd_calamari/fix_calamari1_model.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import re
import json
import click
from glob import glob
import re
from copy import deepcopy
from glob import glob

import click

from ocrd_calamari.util import working_directory

Expand All @@ -23,7 +24,7 @@ def fix_calamari1_model(checkpoint_dir):
old_j = deepcopy(j)

for v in j["model"].values():
if type(v) != dict:
if not isinstance(v, dict):
continue
for child in v.get("children", []):
for replacement in child.get("replacements", []):
Expand Down
86 changes: 49 additions & 37 deletions ocrd_calamari/recognize.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,45 @@
from __future__ import absolute_import

import os
import itertools
import os
from glob import glob

import numpy as np
from ocrd import Processor
from ocrd_modelfactory import page_from_file
from ocrd_models.ocrd_page import (
CoordsType,
GlyphType,
TextEquivType,
WordType,
to_xml,
)
from ocrd_utils import (
MIMETYPE_PAGE,
assert_file_grp_cardinality,
coordinates_for_segment,
getLogger,
make_file_id,
points_from_polygon,
polygon_from_x0y0x1y1,
MIMETYPE_PAGE,
tf_disable_interactive_logs,
)

# Disable tensorflow/keras logging via print before importing calamari
# (and disable ruff's import checks and sorting here)
# ruff: noqa: E402
# ruff: isort: off
tf_disable_interactive_logs()

from tensorflow import __version__ as tensorflow_version
from calamari_ocr import __version__ as calamari_version
from calamari_ocr.ocr import MultiPredictor
from calamari_ocr.ocr.voting import voter_from_proto
from calamari_ocr.proto import VoterParams
from ocrd import Processor
from ocrd_modelfactory import page_from_file
from ocrd_models.ocrd_page import (
TextEquivType,
WordType,
GlyphType,
CoordsType,
to_xml,
)

from ocrd_calamari.config import OCRD_TOOL
# ruff: isort: on

from tensorflow import __version__ as tensorflow_version
from ocrd_calamari.config import OCRD_TOOL

TOOL = "ocrd-calamari-recognize"

Expand Down Expand Up @@ -64,8 +68,14 @@ def setup(self):
self.network_input_channels = self.predictor.predictors[
0
].network.input_channels
# self.network_input_channels = self.predictor.predictors[0].network_params.channels # not used!
# binarization = self.predictor.predictors[0].model_params.data_preprocessor.binarization # not used!

# not used:
# self.network_input_channels = \
# self.predictor.predictors[0].network_params.channels
# not used:
# binarization = \
# self.predictor.predictors[0].model_params\
# .data_preprocessor.binarization
# self.features = ('' if self.network_input_channels != 1 else
# 'binarized' if binarization != 'GRAY' else
# 'grayscale_normalized')
Expand All @@ -79,16 +89,17 @@ def process(self):
"""
Perform text recognition with Calamari on the workspace.
If ``texequiv_level`` is ``word`` or ``glyph``, then additionally create word / glyph level segments by
splitting at white space characters / glyph boundaries. In the case of ``glyph``, add all alternative character
hypotheses down to ``glyph_conf_cutoff`` confidence threshold.
If ``textequiv_level`` is ``word`` or ``glyph``, then additionally create word /
glyph level segments by splitting at white space characters / glyph boundaries.
In the case of ``glyph``, add all alternative character hypotheses down to
``glyph_conf_cutoff`` confidence threshold.
"""
log = getLogger("processor.CalamariRecognize")

assert_file_grp_cardinality(self.input_file_grp, 1)
assert_file_grp_cardinality(self.output_file_grp, 1)

for (n, input_file) in enumerate(self.input_files):
for n, input_file in enumerate(self.input_files):
page_id = input_file.pageId or input_file.ID
log.info("INPUT FILE %i / %s", n, page_id)
pcgts = page_from_file(self.workspace.download_file(input_file))
Expand Down Expand Up @@ -162,7 +173,6 @@ def process(self):
for line, line_coords, raw_results in zip(
textlines, line_coordss, raw_results_all
):

for i, p in enumerate(raw_results):
p.prediction.id = "fold_{}".format(i)

Expand All @@ -171,10 +181,12 @@ def process(self):

# Build line text on our own
#
# Calamari does whitespace post-processing on prediction.sentence, while it does not do the same
# on prediction.positions. Do it on our own to have consistency.
# Calamari does whitespace post-processing on prediction.sentence,
# while it does not do the same on prediction.positions. Do it on
# our own to have consistency.
#
# XXX Check Calamari's built-in post-processing on prediction.sentence
# XXX Check Calamari's built-in post-processing on
# prediction.sentence

def _sort_chars(p):
"""Filter and sort chars of prediction p"""
Expand Down Expand Up @@ -223,9 +235,8 @@ def _drop_double_spaces_generator(positions):
line_text = "".join(_sort_chars(p)[0].char for p in positions)
if line_text != prediction.sentence:
log.warning(
"Our own line text is not the same as Calamari's: '%s' != '%s'",
line_text,
prediction.sentence,
f"Our own line text is not the same as Calamari's: "
f"'{line_text}' != '{prediction.sentence}'"
)

# Delete existing results
Expand All @@ -246,8 +257,9 @@ def _drop_double_spaces_generator(positions):

# Save word results
#
# Calamari OCR does not provide word positions, so we infer word positions from a. text segmentation
# and b. the glyph positions. This is necessary because the PAGE XML format enforces a strict
# Calamari OCR does not provide word positions, so we infer word
# positions from a. text segmentation and b. the glyph positions.
# This is necessary because the PAGE XML format enforces a strict
# hierarchy of lines > words > glyphs.

def _words(s):
Expand Down Expand Up @@ -316,7 +328,9 @@ def _words(s):
)

# Add predictions (= TextEquivs)
char_index_start = 1 # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs
char_index_start = 1
# Index must start with 1, see
# https://ocr-d.github.io/page#multiple-textequivs
for char_index, char in enumerate(
_sort_chars(p), start=char_index_start
):
Expand Down Expand Up @@ -351,13 +365,14 @@ def _words(s):
)


# TODO: This is a copy of ocrd_tesserocr's function, and should probably be moved to a ocrd lib
# TODO: This is a copy of ocrd_tesserocr's function, and should probably be moved to
# an ocrd lib
def _page_update_higher_textequiv_levels(level, pcgts):
"""Update the TextEquivs of all PAGE-XML hierarchy levels above `level` for consistency.
"""Update the TextEquivs of all higher PAGE-XML hierarchy levels for consistency.
Starting with the hierarchy level chosen for processing,
join all first TextEquiv (by the rules governing the respective level)
into TextEquiv of the next higher level, replacing them.
Starting with the hierarchy level `level` chosen for processing, join all first
TextEquiv (by the rules governing the respective level) into TextEquiv of the next
higher level, replacing them.
"""
regions = pcgts.get_Page().get_TextRegion()
if level != "region":
Expand Down Expand Up @@ -390,6 +405,3 @@ def _page_update_higher_textequiv_levels(level, pcgts):
for line in lines
)
region.set_TextEquiv([TextEquivType(Unicode=region_unicode)]) # remove old


# vim:tw=120:
2 changes: 2 additions & 0 deletions ruff.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# TODO: This should go to pyproject.toml once we have one
select = ["E", "F", "I"]
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# -*- coding: utf-8 -*-
from pathlib import Path
import json
from pathlib import Path

from setuptools import setup, find_packages
from setuptools import find_packages, setup

with open("./ocrd-tool.json", "r") as f:
version = json.load(f)["version"]
Expand Down
2 changes: 1 addition & 1 deletion test/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

import os
import sys

from test.assets import assets

from ocrd_utils import initLogging

PWD = os.path.dirname(os.path.realpath(__file__))
Expand Down
Loading

0 comments on commit 6a158f0

Please sign in to comment.