Skip to content

Commit

Permalink
fix handling input fileGrps where PAGE is mixed with derived images: (#68)
Browse files Browse the repository at this point in the history

* fix handling of input fileGrps where PAGE is mixed with derived images:

- aggregate a list of PAGE pages
- aggregate a list of multi-image pages
- when iterating through files, check whether the current page
  has a PAGE file:
  - if so, skip every file of that page that is not the PAGE file
  - otherwise, raise an error if the page has multiple images

* require up-to-date pip and ocrd
  • Loading branch information
bertsky authored Aug 21, 2020
1 parent 50ecb0b commit 8706818
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 6 deletions.
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,8 @@ deps: #deps-ubuntu
test -x $(BINDIR)/scribo-cli && \
$(BINDIR)/scribo-cli sauvola --help >/dev/null 2>&1 || \
$(MAKE) build-olena
ocrd ocrd-tool --help >/dev/null 2>&1 || \
$(PIP) install ocrd # needed for ocrd CLI (and bashlib)
$(PIP) install -U pip
$(PIP) install "ocrd>=2.13" # needed for ocrd CLI (and bashlib)

# Install
install: deps
Expand Down
28 changes: 25 additions & 3 deletions ocrd-olena-binarize
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,7 @@ function main {
# shellcheck source=../core/ocrd/bashlib/lib.bash
source $(ocrd bashlib filename)
ocrd__wrap "$SHAREDIR/ocrd-tool.json" "ocrd-olena-binarize" "$@"
ocrd__minversion 2.11.0
ocrd__minversion 2.13.0

scribo_options=(--enable-negate-output)
case ${params[impl]} in
Expand Down Expand Up @@ -393,6 +393,21 @@ function main {
-k mimetype \
-k pageId \
--download))
page_pages=($(ocrd workspace find \
-G $in_file_grp \
-m $MIMETYPE_PAGE \
-k pageId))
multi_pages=($(ocrd workspace find \
-G $in_file_grp \
-m //image/.* \
-k pageId | sort | uniq -d))
declare -A is_page is_multi
for page in "${page_pages[@]}"; do
is_page[$page]=1
done
for page in "${multi_pages[@]}"; do
is_multi[$page]=1
done
local IFS=$' \t\n'
local n=0 zeros=0000
for csv in "${files[@]}"; do
Expand All @@ -410,8 +425,15 @@ function main {
if ! test -f "${in_fpath#file://}"; then
ocrd log error "input file ID=${in_id} (pageId=${in_pageId} MIME=${in_mimetype}) is not on disk"
continue
fi

fi
# fileGrps may contain PAGE and (derived) images
# so if this pageId has a PAGE file, ignore all others
# and otherwise if it has multiple images, raise an error
if ((${is_page[$in_pageId]:-0})); then
test x$in_mimetype != x$MIMETYPE_PAGE && continue
elif ((${is_multi[$in_pageId]:-0})); then
ocrd_raise "No PAGE-XML for page $in_pageId in fileGrp '$in_file_grp' but multiple images."
fi
local out_id="${in_id//$in_file_grp/$out_file_grp}"
if [ "x$out_id" = "x$in_id" ]; then
out_id=${out_file_grp}_${zeros:0:$((4-${#n}))}$n
Expand Down
2 changes: 1 addition & 1 deletion ocrd-tool.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"version": "1.2.0",
"version": "1.2.1",
"git_url": "https://github.com/OCR-D/ocrd_olena",
"tools": {
"ocrd-olena-binarize": {
Expand Down

0 comments on commit 8706818

Please sign in to comment.