From 326cf1c087fe4fd6275276e3e7e6023e9489f3a2 Mon Sep 17 00:00:00 2001 From: eikek Date: Fri, 13 Aug 2021 16:44:56 +0200 Subject: [PATCH] Use different Japanese trained data files for tesseract They seem to work better, as suggested here: https://github.com/tesseract-ocr/tessdata/issues/119 Refs: #973 --- docker/dockerfiles/joex.dockerfile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docker/dockerfiles/joex.dockerfile b/docker/dockerfiles/joex.dockerfile index 3fdebd6fd2..0d105e9818 100644 --- a/docker/dockerfiles/joex.dockerfile +++ b/docker/dockerfiles/joex.dockerfile @@ -63,6 +63,12 @@ RUN wget ${joex_url:-https://github.com/eikek/docspell/releases/download/v$versi rm docspell-joex-*.zip && \ ln -snf docspell-joex-* docspell-joex +# Using these data files for japanese, because they work better. See #973 +RUN \ + wget https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/master/jpn_vert.traineddata && \ + wget https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/master/jpn.traineddata && \ + mv jpn*.traineddata /usr/share/tessdata + COPY joex-entrypoint.sh /opt/joex-entrypoint.sh ENTRYPOINT ["/opt/joex-entrypoint.sh", "-J-XX:+UseG1GC"]