From 369aa78967b4c0356957ac1e657aaa61a9f134d7 Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT
Date: Sun, 17 Dec 2023 20:09:41 +0100
Subject: [PATCH] PDF Renderer: allow specifying an alternate image or resolution programmatically

Support the new rendering_dpi api parameter.
Add pdf renderer tests.
Install the pdf font file from the cmake build.

resolves #210
resolves #3798
---
 CMakeLists.txt                |   7 ++
 Makefile.am                   |   5 ++
 include/tesseract/renderer.h  |  34 +++++++++
 src/api/pdfrenderer.cpp       |  13 +++-
 src/api/renderer.cpp          |  62 +++++++++++++++
 src/ccmain/tesseractclass.cpp |   1 +
 src/ccmain/tesseractclass.h   |   1 +
 unittest/pdfrenderer_test.cc  | 139 ++++++++++++++++++++++++++++++++++
 8 files changed, 259 insertions(+), 3 deletions(-)
 create mode 100644 unittest/pdfrenderer_test.cc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 85af2df010..e96ec96eff 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -99,6 +99,7 @@ option(DISABLE_TIFF "Disable build with libtiff (if available)" OFF)
 option(DISABLE_ARCHIVE "Disable build with libarchive (if available)" OFF)
 option(DISABLE_CURL "Disable build with libcurl (if available)" OFF)
 option(INSTALL_CONFIGS "Install tesseract configs" ON)
+option(INSTALL_PDF_TTF "Install pdf font file" ON)
 
 if(NOT ${CMAKE_VERSION} VERSION_LESS "3.15.0")
   if(WIN32 AND MSVC)
@@ -555,6 +556,8 @@ message(STATUS "Build tests [BUILD_TESTS]: ${BUILD_TESTS}")
 message(STATUS "Use system ICU Library [USE_SYSTEM_ICU]: ${USE_SYSTEM_ICU}")
 message(
   STATUS "Install tesseract configs [INSTALL_CONFIGS]: ${INSTALL_CONFIGS}")
+message(
+  STATUS "Install tesseract pdf font [INSTALL_PDF_TTF]: ${INSTALL_PDF_TTF}")
 message(STATUS "--------------------------------------------------------")
 message(STATUS)
 
@@ -962,6 +965,10 @@ if(INSTALL_CONFIGS)
   install(FILES ${TESSERACT_TESSCONFIGS}
           DESTINATION ${CMAKE_INSTALL_PREFIX}/share/tessdata/tessconfigs)
 endif()
+if (INSTALL_PDF_TTF)
+  install(FILES tessdata/pdf.ttf
+          DESTINATION ${CMAKE_INSTALL_PREFIX}/share/tessdata)
+endif ()
 
 #
##############################################################################
 # uninstall target
diff --git a/Makefile.am b/Makefile.am
index c07567ec25..a157f7b45d 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1241,6 +1241,7 @@ check_PROGRAMS += paragraphs_test
 if !DISABLED_LEGACY_ENGINE
 check_PROGRAMS += params_model_test
 endif # !DISABLED_LEGACY_ENGINE
+check_PROGRAMS += pdfrenderer_test
 check_PROGRAMS += progress_test
 check_PROGRAMS += qrsequence_test
 check_PROGRAMS += recodebeam_test
@@ -1469,6 +1470,10 @@ progress_test_CPPFLAGS = $(unittest_CPPFLAGS)
 progress_test_LDFLAGS = $(LEPTONICA_LIBS)
 progress_test_LDADD = $(GTEST_LIBS) $(GMOCK_LIBS) $(TESS_LIBS) $(LEPTONICA_LIBS)
 
+pdfrenderer_test_SOURCES = unittest/pdfrenderer_test.cc
+pdfrenderer_test_CPPFLAGS = $(unittest_CPPFLAGS)
+pdfrenderer_test_LDADD = $(TESS_LIBS) $(TRAINING_LIBS)
+
 qrsequence_test_SOURCES = unittest/qrsequence_test.cc
 qrsequence_test_CPPFLAGS = $(unittest_CPPFLAGS)
 qrsequence_test_LDADD = $(TESS_LIBS)
diff --git a/include/tesseract/renderer.h b/include/tesseract/renderer.h
index a8745a09ee..4a9517a94d 100644
--- a/include/tesseract/renderer.h
+++ b/include/tesseract/renderer.h
@@ -106,6 +106,23 @@ class TESS_API TessResultRenderer {
     return imagenum_;
   }
 
+  /**
+   * Specifies an alternate image to render instead of the input image. The
+   * pointer is only stored here. Call after BeginDocument, before AddImage.
+   */
+  void SetRenderingImage(Pix *rendering_image) {
+    rendering_image_ = rendering_image;
+  }
+
+  /**
+   * Specifies the rendering resolution in dpi; 0 (the default) means not set.
+   * When not set, the rendering_dpi api parameter is used if it is positive,
+   * otherwise the source image resolution.
+   */
+  void SetRenderingResolution(int rendering_dpi) {
+    rendering_dpi_ = rendering_dpi;
+  }
+
 protected:
   /**
    * Called by concrete classes.
@@ -139,6 +156,21 @@ class TESS_API TessResultRenderer {
   // This method will grow the output buffer if needed.
   void AppendData(const char *s, int len);
 
+  // Renderers can call this to get the actual image to render with extracted
+  // text. This method returns, in order of precedence:
+  // - the rendering image set by the caller, or
+  // - the input image scaled to the rendering resolution if it differs, or
+  // - the input image from the api otherwise.
+  Pix *GetRenderingImage(TessBaseAPI *api);
+
+  // Resolution of the rendering image: the value set manually by the caller,
+  // else a positive rendering_dpi api parameter, else the source resolution.
+  int GetRenderingResolution(TessBaseAPI *api);
+
+  // Restores the rendering image and dpi to the given previous values. An image
+  // that differs from rendering_image_prev is destroyed before being replaced.
+  void ResetRenderingState(Pix *rendering_image_prev, int rendering_dpi_prev);
+
   template
   auto AppendData(T &&d) {
     AppendData(d.data(), d.size());
@@ -151,6 +183,8 @@ class TESS_API TessResultRenderer {
   const char *file_extension_; // standard extension for generated output
   std::string title_;          // title of document being rendered
   int imagenum_;               // index of last image added
+  Pix *rendering_image_;       // Image to render with the extracted text (nullptr if none)
+  int rendering_dpi_;          // Resolution of the rendering image (0 if not set)
   bool happy_;                 // I get grumpy when the disk fills up, etc.
 };
 
diff --git a/src/api/pdfrenderer.cpp b/src/api/pdfrenderer.cpp
index e84b063a64..818b57acdc 100644
--- a/src/api/pdfrenderer.cpp
+++ b/src/api/pdfrenderer.cpp
@@ -329,7 +329,12 @@ static bool CodepointToUtf16be(int code, char utf16[kMaxBytesPerCodepoint]) {
 }
 
 char *TessPDFRenderer::GetPDFTextObjects(TessBaseAPI *api, double width, double height) {
-  double ppi = api->GetSourceYResolution();
+  double input_image_ppi = api->GetSourceYResolution();
+  double ppi = GetRenderingResolution(api);
+  double scale = 1; // rendering/source resolution ratio; stays 1 if source ppi <= 0
+  if (input_image_ppi > 0) {
+    scale = ppi / input_image_ppi;
+  }
 
   // These initial conditions are all arbitrary and will be overwritten
   double old_x = 0.0, old_y = 0.0;
@@ -379,6 +384,7 @@ char *TessPDFRenderer::GetPDFTextObjects(TessBaseAPI *api, double width, double
     if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
       int x1, y1, x2, y2;
       res_it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2);
+      x1 *= scale; y1 *= scale; x2 *= scale; y2 *= scale;
       ClipBaseline(ppi, x1, y1, x2, y2, &line_x1, &line_y1, &line_x2, &line_y2);
     }
 
@@ -413,6 +419,7 @@ char *TessPDFRenderer::GetPDFTextObjects(TessBaseAPI *api, double width, double
     {
       int word_x1, word_y1, word_x2, word_y2;
       res_it->Baseline(RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2);
+      word_x1 *= scale; word_y1 *= scale; word_x2 *= scale; word_y2 *= scale;
       GetWordBaseline(writing_direction, ppi, height, word_x1, word_y1, word_x2, word_y2, line_x1,
                       line_y1, line_x2, line_y2, &x, &y, &word_length);
     }
@@ -828,9 +835,9 @@ bool TessPDFRenderer::imageToPDFObj(Pix *pix, const char *filename, long int obj
 }
 
 bool TessPDFRenderer::AddImageHandler(TessBaseAPI *api) {
-  Pix *pix = api->GetInputImage();
+  Pix *pix = GetRenderingImage(api);
   const char *filename = api->GetInputName();
-  int ppi = api->GetSourceYResolution();
+  int ppi = GetRenderingResolution(api);
   if (!pix || ppi <= 0) {
     return false;
   }
diff --git a/src/api/renderer.cpp b/src/api/renderer.cpp
index 8d4f1adc1b..d54a55a95d 100644
--- a/src/api/renderer.cpp
+++
b/src/api/renderer.cpp
@@ -18,12 +18,14 @@
 #ifdef HAVE_CONFIG_H
 #  include "config_auto.h"
 #endif
+#include
 #include
 #include
 #include
 #include // std::unique_ptr
 #include // std::string
 #include "serialis.h" // Serialize
+#include "tprintf.h"
 
 namespace tesseract {
 
@@ -36,6 +38,8 @@ TessResultRenderer::TessResultRenderer(const char *outputbase, const char *exten
     , file_extension_(extension)
     , title_("")
     , imagenum_(-1)
+    , rendering_image_(nullptr)
+    , rendering_dpi_(0)
     , happy_(true) {
   if (strcmp(outputbase, "-") && strcmp(outputbase, "stdout")) {
     std::string outfile = std::string(outputbase) + "." + extension;
@@ -90,13 +94,71 @@ bool TessResultRenderer::AddImage(TessBaseAPI *api) {
     return false;
   }
   ++imagenum_;
+  Pix *rendering_image_prev = rendering_image_; // snapshot, restored after the handler
+  int rendering_dpi_prev = rendering_dpi_;
   bool ok = AddImageHandler(api);
+  ResetRenderingState(rendering_image_prev, rendering_dpi_prev);
   if (next_) {
     ok = next_->AddImage(api) && ok;
   }
   return ok;
 }
 
+void TessResultRenderer::ResetRenderingState(Pix *rendering_image_prev,
+                                             int rendering_dpi_prev) {
+  if (rendering_image_ != rendering_image_prev) {
+    pixDestroy(&rendering_image_); // drop the image that replaced the previous one
+    rendering_image_ = rendering_image_prev;
+  }
+  if (rendering_dpi_ != rendering_dpi_prev) {
+    rendering_dpi_ = rendering_dpi_prev;
+  }
+}
+
+Pix *TessResultRenderer::GetRenderingImage(TessBaseAPI *api) {
+  if (!rendering_image_) { // nothing set or cached yet: derive from the input image
+    Pix *source_image = api->GetInputImage();
+    int source_dpi = api->GetSourceYResolution();
+    if (!source_image || source_dpi <= 0) {
+      happy_ = false; // no usable input image or resolution
+      return nullptr;
+    }
+
+    int rendering_dpi = GetRenderingResolution(api);
+    if (rendering_dpi != source_dpi) {
+      float scale = (float)rendering_dpi / (float)source_dpi;
+
+      rendering_image_ = pixScale(source_image, scale, scale); // kept in the member
+    } else {
+      return source_image;
+    }
+  }
+  return rendering_image_;
+}
+
+int TessResultRenderer::GetRenderingResolution(tesseract::TessBaseAPI *api) {
+  if (rendering_dpi_) { // set by the caller or cached below
+    return rendering_dpi_;
+  }
+  int source_dpi = api->GetSourceYResolution();
+  int rendering_dpi;
+  if (api->GetIntVariable("rendering_dpi", &rendering_dpi) &&
+      rendering_dpi > 0 && rendering_dpi != source_dpi) {
+    if (rendering_dpi < kMinCredibleResolution ||
+        rendering_dpi > kMaxCredibleResolution) {
+#if !defined(NDEBUG)
+      tprintf(
+          "Warning: User defined rendering dpi %d is outside of expected range "
+          "(%d - %d)!\n",
+          rendering_dpi, kMinCredibleResolution, kMaxCredibleResolution);
+#endif
+    }
+    rendering_dpi_ = rendering_dpi; // cache the api parameter value in the member
+    return rendering_dpi_;
+  }
+  return source_dpi;
+}
+
 bool TessResultRenderer::EndDocument() {
   if (!happy_) {
     return false;
diff --git a/src/ccmain/tesseractclass.cpp b/src/ccmain/tesseractclass.cpp
index bb645aba82..34c4892dbc 100644
--- a/src/ccmain/tesseractclass.cpp
+++ b/src/ccmain/tesseractclass.cpp
@@ -352,6 +352,7 @@ Tesseract::Tesseract()
     , BOOL_MEMBER(textonly_pdf, false, "Create PDF with only one invisible text layer",
                   this->params())
     , INT_MEMBER(jpg_quality, 85, "Set JPEG quality level", this->params())
+    , INT_MEMBER(rendering_dpi, 0, "Scaled input image resolution before rendering", this->params())
     , INT_MEMBER(user_defined_dpi, 0, "Specify DPI for input image", this->params())
     , INT_MEMBER(min_characters_to_try, 50, "Specify minimum characters to try during OSD",
                  this->params())
diff --git a/src/ccmain/tesseractclass.h b/src/ccmain/tesseractclass.h
index c03e045742..5154020455 100644
--- a/src/ccmain/tesseractclass.h
+++ b/src/ccmain/tesseractclass.h
@@ -906,6 +906,7 @@ class TESS_API Tesseract : public Wordrec {
   BOOL_VAR_H(tessedit_create_pdf);
   BOOL_VAR_H(textonly_pdf);
   INT_VAR_H(jpg_quality);
+  INT_VAR_H(rendering_dpi);
   INT_VAR_H(user_defined_dpi);
   INT_VAR_H(min_characters_to_try);
   STRING_VAR_H(unrecognised_char);
diff --git a/unittest/pdfrenderer_test.cc b/unittest/pdfrenderer_test.cc
new file mode 100644
index 0000000000..d8427413c8
--- /dev/null
+++ b/unittest/pdfrenderer_test.cc
@@ -0,0 +1,139 @@
+// (C) Copyright 2023, Tesseract Contributors.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include
+#include
+#include
+#include
+
+#include "include_gunit.h"
+
+namespace tesseract {
+
+static std::map userdefined_dpi_variables = {
+    {"user_defined_dpi", "300"}};
+
+class TessPDFRendererTest : public testing::Test {
+protected:
+  static std::string TestDataNameToPath(const std::string &name) {
+    return file::JoinPath(TESTING_DIR, name);
+  }
+  static std::string TessdataPath() {
+    return TESSDATA_DIR;
+  }
+  static std::string TestPDFName(const std::string &suffix) {
+    return "/tmp/tesseract_pdf_renderer_test_phottest" + suffix; // NOTE(review): hard-coded /tmp; presumably POSIX-only - confirm
+  }
+
+  static void AssertPDFSizeLT(const std::string &filename, int size) {
+    std::filesystem::path p = filename + ".pdf";
+    ASSERT_LT(std::filesystem::file_size(p), size); // strict upper bound in bytes
+  }
+
+  static void AssertPDFRemove(const std::string &filename) {
+    ASSERT_EQ(std::remove((filename + ".pdf").c_str()), 0);
+  }
+
+  static bool initializeAPI(
+      TessBaseAPI &api, const std::map &variables) {
+    EXPECT_EQ(api.Init(TESSDATA_DIR, "eng", OEM_LSTM_ONLY), 0);
+    for (const auto &[name, value] : variables) {
+      api.SetVariable(name.c_str(), value.c_str());
+    }
+    return true; // always true; an Init failure is only reported through EXPECT_EQ
+  }
+
+  static bool ProcessAndRenderPages(
+      const std::string &input_filename, TessPDFRenderer *pdf_renderer,
+      const std::map &variables) {
+    TessBaseAPI api;
+    initializeAPI(api, variables);
+    auto testdata_input_filename = TestDataNameToPath(input_filename);
+    EXPECT_TRUE(api.ProcessPages(testdata_input_filename.c_str(), TESSDATA_DIR,
+                                 1000, pdf_renderer));
+    api.End();
+    return pdf_renderer->happy();
+  }
+
+  static void RenderPDFAndAssertSize(
+      const std::string &image_file, const std::string &pdf_suffix,
+      bool text_only, int max_file_size,
+      const std::map &variables = {}) {
+    auto pdf_name = TestPDFName(pdf_suffix);
+    auto pdf_renderer = std::make_unique(
+        pdf_name.c_str(), "tessdata", text_only);
+    ASSERT_TRUE(
+        ProcessAndRenderPages(image_file, pdf_renderer.get(), variables));
+    AssertPDFSizeLT(pdf_name, max_file_size);
+    AssertPDFRemove(pdf_name);
+  }
+};
+
+// Test basic pdf rendering: the generated file must stay below the size bound
+TEST_F(TessPDFRendererTest, TestPDFRenderBasicTest) {
+  RenderPDFAndAssertSize("phototest_2.tif", "", false, 113000);
+}
+
+// Test pdf rendering with lower jpeg quality (jpg_quality = 40)
+TEST_F(TessPDFRendererTest, TestPDFRenderJPEGQualityTest) {
+  static std::map variables = {{"jpg_quality", "40"}};
+  RenderPDFAndAssertSize("phototest_2.tif", "jpg_quality", false, 66000,
+                         variables);
+}
+
+// Test text-only pdf rendering (text_only = true)
+TEST_F(TessPDFRendererTest, TestPDFRenderTextOnlyTest) {
+  RenderPDFAndAssertSize("phototest_2.tif", "text_only", true, 3500);
+}
+
+// Test that SetRenderingResolution(110) shrinks the pdf export (300 dpi input)
+TEST_F(TessPDFRendererTest, TestPDFRenderLowerResolutionTest) {
+  std::string pdf_name = TestPDFName("lower_resolution");
+  auto pdf_renderer =
+      std::make_unique(pdf_name.c_str(), "tessdata", false);
+  pdf_renderer->SetRenderingResolution(110);
+  CHECK_OK(ProcessAndRenderPages("phototest_2.tif", pdf_renderer.get(),
+                                 userdefined_dpi_variables));
+  AssertPDFSizeLT(pdf_name, 35000);
+  AssertPDFRemove(pdf_name);
+}
+
+// Test that the rendering_dpi variable shrinks the pdf export the same way,
+// without calling SetRenderingResolution
+TEST_F(TessPDFRendererTest, TestPDFLowerResolutionVariableTest) {
+  std::string pdf_name = TestPDFName("lower_resolution_variable");
+  static std::map variables = {
+      {"rendering_dpi", "110"}};
+  variables.insert(begin(userdefined_dpi_variables), end(userdefined_dpi_variables));
+  auto pdf_renderer =
+      std::make_unique(pdf_name.c_str(), "tessdata", false);
+  CHECK_OK(
+      ProcessAndRenderPages("phototest_2.tif", pdf_renderer.get(), variables));
+  AssertPDFSizeLT(pdf_name, 35000);
+  AssertPDFRemove(pdf_name);
+}
+
+// Test that pdf renderer uses the alternate image set with SetRenderingImage
+TEST_F(TessPDFRendererTest, TestPDFAlternateImageTest) {
+  std::string pdf_name = TestPDFName("alternate_image");
+  auto pdf_renderer =
+      std::make_unique(pdf_name.c_str(), "tessdata", false);
+  auto alternate_image = pixRead(TestDataNameToPath("phototest.tif").c_str());
+  pdf_renderer->SetRenderingImage(alternate_image);
+  CHECK_OK(ProcessAndRenderPages("phototest_2.tif", pdf_renderer.get(),
+                                 std::map()));
+  pixDestroy(&alternate_image); // the test destroys the image it created itself
+  AssertPDFSizeLT(pdf_name, 8000);
+  AssertPDFRemove(pdf_name);
+}
+
+} // namespace tesseract
\ No newline at end of file