From b9a0d262d300e3db693da6928fbfd32e0feb7fa7 Mon Sep 17 00:00:00 2001 From: Pablo Duboue Date: Wed, 13 Dec 2023 11:49:52 -0800 Subject: [PATCH 1/3] Add language information to the TSV output (fixes #1861) Also make the font_info flag work on TSV output. --- include/tesseract/baseapi.h | 3 +- include/tesseract/renderer.h | 2 ++ src/api/baseapi.cpp | 67 +++++++++++++++++++++++++++++++---- src/api/renderer.cpp | 19 ++++++++-- src/ccmain/tesseractclass.cpp | 1 + src/ccmain/tesseractclass.h | 1 + src/tesseract.cpp | 4 ++- tessdata/configs/tsv | 1 + 8 files changed, 88 insertions(+), 10 deletions(-) diff --git a/include/tesseract/baseapi.h b/include/tesseract/baseapi.h index 103ca7b1c9..b9979e04b7 100644 --- a/include/tesseract/baseapi.h +++ b/include/tesseract/baseapi.h @@ -564,7 +564,8 @@ class TESS_API TessBaseAPI { * page_number is 0-based but will appear in the output as 1-based. * Returned string must be freed with the delete [] operator. */ - char *GetTSVText(int page_number); + char *GetTSVText(int page_number, bool font_info=false, + bool lang_info=false); /** * Make a box file for LSTM training from the internal data structures. diff --git a/include/tesseract/renderer.h b/include/tesseract/renderer.h index 6f40523335..ec37c62f71 100644 --- a/include/tesseract/renderer.h +++ b/include/tesseract/renderer.h @@ -197,6 +197,7 @@ class TESS_API TessAltoRenderer : public TessResultRenderer { */ class TESS_API TessTsvRenderer : public TessResultRenderer { public: + TessTsvRenderer(const char *outputbase, bool font_info, bool lang_info); explicit TessTsvRenderer(const char *outputbase, bool font_info); explicit TessTsvRenderer(const char *outputbase); @@ -207,6 +208,7 @@ class TESS_API TessTsvRenderer : public TessResultRenderer { private: bool font_info_; // whether to print font information + bool lang_info_; // whether to print language information }; /** diff --git a/src/api/baseapi.cpp b/src/api/baseapi.cpp index a21798429a..bf7fe1beda 100644 --- a/src/api/baseapi.cpp +++ b/src/api/baseapi.cpp @@ -1421,7 +1421,7 @@ static void AddBoxToTSV(const PageIterator *it, PageIteratorLevel level, std::st * page_number is 0-based but will appear in the output as 1-based. * Returned string must be freed with the delete [] operator. */ -char *TessBaseAPI::GetTSVText(int page_number) { +char *TessBaseAPI::GetTSVText(int page_number, bool font_info, bool lang_info) { if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0)) { return nullptr; } @@ -1434,6 +1434,9 @@ char *TessBaseAPI::GetTSVText(int page_number) { int par_num = 0; int line_num = 0; int word_num = 0; + std::string x_font; + int x_fsize = 0; + std::string lang; std::string tsv_str; tsv_str += "1\t" + std::to_string(page_num); // level 1 - page @@ -1445,7 +1448,15 @@ char *TessBaseAPI::GetTSVText(int page_number) { tsv_str += "\t" + std::to_string(rect_top_); tsv_str += "\t" + std::to_string(rect_width_); tsv_str += "\t" + std::to_string(rect_height_); - tsv_str += "\t-1\t\n"; + tsv_str += "\t-1"; + if (font_info) { + tsv_str += "\t" + x_font; + tsv_str += "\t" + x_fsize; + } + if (lang_info) { + tsv_str += "\t" + lang; + } + tsv_str += "\t\n"; const std::unique_ptr res_it(GetIterator()); while (!res_it->Empty(RIL_BLOCK)) { @@ -1466,9 +1477,19 @@ char *TessBaseAPI::GetTSVText(int page_number) { tsv_str += "\t" + std::to_string(line_num); tsv_str += "\t" + std::to_string(word_num); AddBoxToTSV(res_it.get(), RIL_BLOCK, tsv_str); - tsv_str += "\t-1\t\n"; // end of row for block + tsv_str += "\t-1"; + if (font_info) { + tsv_str += "\t\t"; + } + if (lang_info) { + tsv_str += "\t"; + } + tsv_str += "\t\n"; // end of row for block } if (res_it->IsAtBeginningOf(RIL_PARA)) { + if (lang_info) { + lang = res_it->WordRecognitionLanguage(); + } par_num++; line_num = 0; word_num = 0; @@ -1478,7 +1499,14 @@ char *TessBaseAPI::GetTSVText(int page_number) { tsv_str += "\t" + std::to_string(line_num); tsv_str += "\t" + std::to_string(word_num); AddBoxToTSV(res_it.get(), RIL_PARA, tsv_str); - tsv_str += "\t-1\t\n"; // end of row for para + tsv_str += "\t-1"; + if (font_info) { + tsv_str += "\t\t"; + } + if (lang_info) { + tsv_str += "\t" + lang; + } + tsv_str += "\t\n"; // end of row for para } if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { line_num++; @@ -1489,7 +1517,14 @@ char *TessBaseAPI::GetTSVText(int page_number) { tsv_str += "\t" + std::to_string(line_num); tsv_str += "\t" + std::to_string(word_num); AddBoxToTSV(res_it.get(), RIL_TEXTLINE, tsv_str); - tsv_str += "\t-1\t\n"; // end of row for line + tsv_str += "\t-1"; + if (font_info) { + tsv_str += "\t\t"; + } + if (lang_info) { + tsv_str += "\t"; + } + tsv_str += "\t\n"; // end of row for line } // Now, process the word... @@ -1506,9 +1541,29 @@ char *TessBaseAPI::GetTSVText(int page_number) { tsv_str += "\t" + std::to_string(right - left); tsv_str += "\t" + std::to_string(bottom - top); tsv_str += "\t" + std::to_string(res_it->Confidence(RIL_WORD)); - tsv_str += "\t"; + + if (font_info) { + bool bold, italic, underlined, monospace, serif, smallcaps; + int pointsize, font_id; + const char *font_name = + res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, + &serif, &smallcaps, &pointsize, &font_id); + tsv_str += "\t"; + if (font_name) { + tsv_str += HOcrEscape(font_name); + } + tsv_str += "\t" + std::to_string(pointsize); + } + if (lang_info) { + const char *word_lang = res_it->WordRecognitionLanguage(); + tsv_str += "\t"; + if (word_lang) { + tsv_str += word_lang; + } + } // Increment counts if at end of block/paragraph/textline. + tsv_str += "\t"; if (res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD)) { lcnt++; } diff --git a/src/api/renderer.cpp b/src/api/renderer.cpp index 8d4f1adc1b..d3579cf763 100644 --- a/src/api/renderer.cpp +++ b/src/api/renderer.cpp @@ -157,18 +157,33 @@ bool TessTextRenderer::AddImageHandler(TessBaseAPI *api) { **********************************************************************/ TessTsvRenderer::TessTsvRenderer(const char *outputbase) : TessResultRenderer(outputbase, "tsv") { font_info_ = false; + lang_info_ = false; } TessTsvRenderer::TessTsvRenderer(const char *outputbase, bool font_info) : TessResultRenderer(outputbase, "tsv") { font_info_ = font_info; + lang_info_ = false; +} + +TessTsvRenderer::TessTsvRenderer(const char *outputbase, bool font_info, bool lang_info) + : TessResultRenderer(outputbase, "tsv") { + font_info_ = font_info; + lang_info_ = lang_info; } bool TessTsvRenderer::BeginDocumentHandler() { // Output TSV column headings AppendString( "level\tpage_num\tblock_num\tpar_num\tline_num\tword_" - "num\tleft\ttop\twidth\theight\tconf\ttext\n"); + "num\tleft\ttop\twidth\theight\tconf\t"); + if (font_info_) { + AppendString("x_font\tx_fsize\t"); + } + if (lang_info_) { + AppendString("lang\t"); + } + AppendString("text\n"); return true; } @@ -177,7 +192,7 @@ bool TessTsvRenderer::EndDocumentHandler() { } bool TessTsvRenderer::AddImageHandler(TessBaseAPI *api) { - const std::unique_ptr tsv(api->GetTSVText(imagenum())); + const std::unique_ptr tsv(api->GetTSVText(imagenum(), font_info_, lang_info_)); if (tsv == nullptr) { return false; } diff --git a/src/ccmain/tesseractclass.cpp b/src/ccmain/tesseractclass.cpp index fd58ac8746..a34041fc4a 100644 --- a/src/ccmain/tesseractclass.cpp +++ b/src/ccmain/tesseractclass.cpp @@ -399,6 +399,7 @@ Tesseract::Tesseract() this->params()) , BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding", this->params()) , BOOL_MEMBER(textord_use_cjk_fp_model, false, "Use CJK fixed pitch model", this->params()) + , BOOL_MEMBER(tsv_lang_info, false, "Include language info in the .tsv output file", this->params()) , BOOL_MEMBER(poly_allow_detailed_fx, false, "Allow feature extractors to see the original outline", this->params()) , BOOL_INIT_MEMBER(tessedit_init_config_only, false, diff --git a/src/ccmain/tesseractclass.h b/src/ccmain/tesseractclass.h index 732bb9e62e..345e5cab79 100644 --- a/src/ccmain/tesseractclass.h +++ b/src/ccmain/tesseractclass.h @@ -920,6 +920,7 @@ class TESS_API Tesseract : public Wordrec { BOOL_VAR_H(tessedit_flip_0O); double_VAR_H(tessedit_lower_flip_hyphen); double_VAR_H(tessedit_upper_flip_hyphen); + BOOL_VAR_H(tsv_lang_info); BOOL_VAR_H(rej_trust_doc_dawg); BOOL_VAR_H(rej_1Il_use_dict_word); BOOL_VAR_H(rej_1Il_trust_permuter_type); diff --git a/src/tesseract.cpp b/src/tesseract.cpp index 480815564c..e6d480e0a0 100644 --- a/src/tesseract.cpp +++ b/src/tesseract.cpp @@ -534,8 +534,10 @@ static void PreloadRenderers(tesseract::TessBaseAPI &api, api.GetBoolVariable("tessedit_create_tsv", &b); if (b) { bool font_info; + bool lang_info; api.GetBoolVariable("hocr_font_info", &font_info); - auto renderer = std::make_unique(outputbase, font_info); + api.GetBoolVariable("tsv_lang_info", &lang_info); + auto renderer = std::make_unique(outputbase, font_info, lang_info); if (renderer->happy()) { renderers.push_back(std::move(renderer)); } else { diff --git a/tessdata/configs/tsv b/tessdata/configs/tsv index dc52478177..84fd7adc43 100644 --- a/tessdata/configs/tsv +++ b/tessdata/configs/tsv @@ -1 +1,2 @@ tessedit_create_tsv 1 +tsv_lang_info 0 From 30aef44b9fe6b233e047fbf7d3d44a1c432e8d23 Mon Sep 17 00:00:00 2001 From: Pablo Duboue Date: Thu, 28 Dec 2023 12:28:10 -0800 Subject: [PATCH 2/3] changes from review --- include/tesseract/baseapi.h | 3 +-- include/tesseract/renderer.h | 4 +--- src/api/baseapi.cpp | 29 +---------------------------- src/api/renderer.cpp | 15 ++------------- src/tesseract.cpp | 4 +--- 5 files changed, 6 insertions(+), 49 deletions(-) diff --git a/include/tesseract/baseapi.h b/include/tesseract/baseapi.h index b9979e04b7..c462fe4cbe 100644 --- a/include/tesseract/baseapi.h +++ b/include/tesseract/baseapi.h @@ -564,8 +564,7 @@ class TESS_API TessBaseAPI { * page_number is 0-based but will appear in the output as 1-based. * Returned string must be freed with the delete [] operator. */ - char *GetTSVText(int page_number, bool font_info=false, - bool lang_info=false); + char *GetTSVText(int page_number, bool lang_info=false); /** * Make a box file for LSTM training from the internal data structures. diff --git a/include/tesseract/renderer.h b/include/tesseract/renderer.h index ec37c62f71..f06ceab0ee 100644 --- a/include/tesseract/renderer.h +++ b/include/tesseract/renderer.h @@ -197,8 +197,7 @@ class TESS_API TessAltoRenderer : public TessResultRenderer { */ class TESS_API TessTsvRenderer : public TessResultRenderer { public: - TessTsvRenderer(const char *outputbase, bool font_info, bool lang_info); - explicit TessTsvRenderer(const char *outputbase, bool font_info); + explicit TessTsvRenderer(const char *outputbase, bool lang_info); explicit TessTsvRenderer(const char *outputbase); protected: @@ -207,7 +206,6 @@ class TESS_API TessTsvRenderer : public TessResultRenderer { bool EndDocumentHandler() override; private: - bool font_info_; // whether to print font information bool lang_info_; // whether to print language information }; diff --git a/src/api/baseapi.cpp b/src/api/baseapi.cpp index bf7fe1beda..6f2b4c21e4 100644 --- a/src/api/baseapi.cpp +++ b/src/api/baseapi.cpp @@ -1421,7 +1421,7 @@ static void AddBoxToTSV(const PageIterator *it, PageIteratorLevel level, std::st * page_number is 0-based but will appear in the output as 1-based. * Returned string must be freed with the delete [] operator. */ -char *TessBaseAPI::GetTSVText(int page_number, bool font_info, bool lang_info) { +char *TessBaseAPI::GetTSVText(int page_number, bool lang_info) { if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0)) { return nullptr; } @@ -1434,8 +1434,6 @@ char *TessBaseAPI::GetTSVText(int page_number, bool font_info, bool lang_info) { int par_num = 0; int line_num = 0; int word_num = 0; - std::string x_font; - int x_fsize = 0; std::string lang; std::string tsv_str; @@ -1449,10 +1447,6 @@ char *TessBaseAPI::GetTSVText(int page_number, bool font_info, bool lang_info) { tsv_str += "\t" + std::to_string(rect_width_); tsv_str += "\t" + std::to_string(rect_height_); tsv_str += "\t-1"; - if (font_info) { - tsv_str += "\t" + x_font; - tsv_str += "\t" + x_fsize; - } if (lang_info) { tsv_str += "\t" + lang; } @@ -1478,9 +1472,6 @@ char *TessBaseAPI::GetTSVText(int page_number, bool font_info, bool lang_info) { tsv_str += "\t" + std::to_string(word_num); AddBoxToTSV(res_it.get(), RIL_BLOCK, tsv_str); tsv_str += "\t-1"; - if (font_info) { - tsv_str += "\t\t"; - } if (lang_info) { tsv_str += "\t"; } @@ -1500,9 +1491,6 @@ char *TessBaseAPI::GetTSVText(int page_number, bool font_info, bool lang_info) { tsv_str += "\t" + std::to_string(word_num); AddBoxToTSV(res_it.get(), RIL_PARA, tsv_str); tsv_str += "\t-1"; - if (font_info) { - tsv_str += "\t\t"; - } if (lang_info) { tsv_str += "\t" + lang; } @@ -1518,9 +1506,6 @@ char *TessBaseAPI::GetTSVText(int page_number, bool font_info, bool lang_info) { tsv_str += "\t" + std::to_string(word_num); AddBoxToTSV(res_it.get(), RIL_TEXTLINE, tsv_str); tsv_str += "\t-1"; - if (font_info) { - tsv_str += "\t\t"; - } if (lang_info) { tsv_str += "\t"; } @@ -1542,18 +1527,6 @@ char *TessBaseAPI::GetTSVText(int page_number, bool font_info, bool lang_info) { tsv_str += "\t" + std::to_string(bottom - top); tsv_str += "\t" + std::to_string(res_it->Confidence(RIL_WORD)); - if (font_info) { - bool bold, italic, underlined, monospace, serif, smallcaps; - int pointsize, font_id; - const char *font_name = - res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, - &serif, &smallcaps, &pointsize, &font_id); - tsv_str += "\t"; - if (font_name) { - tsv_str += HOcrEscape(font_name); - } - tsv_str += "\t" + std::to_string(pointsize); - } if (lang_info) { const char *word_lang = res_it->WordRecognitionLanguage(); tsv_str += "\t"; diff --git a/src/api/renderer.cpp b/src/api/renderer.cpp index d3579cf763..aa25d905d6 100644 --- a/src/api/renderer.cpp +++ b/src/api/renderer.cpp @@ -156,19 +156,11 @@ bool TessTextRenderer::AddImageHandler(TessBaseAPI *api) { * TSV Text Renderer interface implementation **********************************************************************/ TessTsvRenderer::TessTsvRenderer(const char *outputbase) : TessResultRenderer(outputbase, "tsv") { - font_info_ = false; lang_info_ = false; } -TessTsvRenderer::TessTsvRenderer(const char *outputbase, bool font_info) +TessTsvRenderer::TessTsvRenderer(const char *outputbase, bool lang_info) : TessResultRenderer(outputbase, "tsv") { - font_info_ = font_info; - lang_info_ = false; -} - -TessTsvRenderer::TessTsvRenderer(const char *outputbase, bool font_info, bool lang_info) - : TessResultRenderer(outputbase, "tsv") { - font_info_ = font_info; lang_info_ = lang_info; } @@ -177,9 +169,6 @@ bool TessTsvRenderer::BeginDocumentHandler() { AppendString( "level\tpage_num\tblock_num\tpar_num\tline_num\tword_" "num\tleft\ttop\twidth\theight\tconf\t"); - if (font_info_) { - AppendString("x_font\tx_fsize\t"); - } if (lang_info_) { AppendString("lang\t"); } @@ -192,7 +181,7 @@ bool TessTsvRenderer::EndDocumentHandler() { } bool TessTsvRenderer::AddImageHandler(TessBaseAPI *api) { - const std::unique_ptr tsv(api->GetTSVText(imagenum(), font_info_, lang_info_)); + const std::unique_ptr tsv(api->GetTSVText(imagenum(), lang_info_)); if (tsv == nullptr) { return false; } diff --git a/src/tesseract.cpp b/src/tesseract.cpp index e6d480e0a0..25d1c6aaa4 100644 --- a/src/tesseract.cpp +++ b/src/tesseract.cpp @@ -533,11 +533,9 @@ static void PreloadRenderers(tesseract::TessBaseAPI &api, api.GetBoolVariable("tessedit_create_tsv", &b); if (b) { - bool font_info; bool lang_info; - api.GetBoolVariable("hocr_font_info", &font_info); api.GetBoolVariable("tsv_lang_info", &lang_info); - auto renderer = std::make_unique(outputbase, font_info, lang_info); + auto renderer = std::make_unique(outputbase, lang_info); if (renderer->happy()) { renderers.push_back(std::move(renderer)); } else { From efb267097d920aa4055e741cc9084fdfb700c33a Mon Sep 17 00:00:00 2001 From: Pablo Duboue Date: Wed, 3 Apr 2024 04:41:40 -0700 Subject: [PATCH 3/3] Using overload to avoid API change --- include/tesseract/baseapi.h | 10 +++++++++- src/api/baseapi.cpp | 10 ++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/include/tesseract/baseapi.h b/include/tesseract/baseapi.h index c462fe4cbe..afb5595ea5 100644 --- a/include/tesseract/baseapi.h +++ b/include/tesseract/baseapi.h @@ -564,7 +564,15 @@ class TESS_API TessBaseAPI { * page_number is 0-based but will appear in the output as 1-based. * Returned string must be freed with the delete [] operator. */ - char *GetTSVText(int page_number, bool lang_info=false); + char *GetTSVText(int page_number); + + /** + * Make a TSV-formatted string from the internal data structures. + * Allows additional column with detected language. + * page_number is 0-based but will appear in the output as 1-based. + * Returned string must be freed with the delete [] operator. + */ + char *GetTSVText(int page_number, bool lang_info); /** * Make a box file for LSTM training from the internal data structures. diff --git a/src/api/baseapi.cpp b/src/api/baseapi.cpp index 6f2b4c21e4..c6ead6fc7b 100644 --- a/src/api/baseapi.cpp +++ b/src/api/baseapi.cpp @@ -1421,6 +1421,16 @@ static void AddBoxToTSV(const PageIterator *it, PageIteratorLevel level, std::st * page_number is 0-based but will appear in the output as 1-based. * Returned string must be freed with the delete [] operator. */ +char *TessBaseAPI::GetTSVText(int page_number) { + return GetTSVText(page_number, false); +} + +/** + * Make a TSV-formatted string from the internal data structures. + * Allows additional column with detected language. + * page_number is 0-based but will appear in the output as 1-based. + * Returned string must be freed with the delete [] operator. + */ char *TessBaseAPI::GetTSVText(int page_number, bool lang_info) { if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0)) { return nullptr;