Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add language information to the TSV output (fixes #1861) #4168

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions include/tesseract/baseapi.h
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,14 @@ class TESS_API TessBaseAPI {
*/
char *GetTSVText(int page_number);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is an API change. It requires a new major version (Tesseract 6.0.0) and changes in other software, for example tesserocr.

Therefore we cannot simply merge this pull request.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Understood. If you have a suggestion for how to provide this functionality without modifying the API, I could steer the PR in that direction.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use an overload instead of a default argument: change

char *GetTSVText(int page_number, bool lang_info=false);
to
char *GetTSVText(int page_number, bool lang_info);

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have pushed the overload approach; it no longer breaks the API.


/**
* Make a TSV-formatted string from the internal data structures.
* When lang_info is true, every row carries one extra column, placed
* before the text column, for the recognition language. In the visible
* implementation that column is filled in for paragraph and word rows
* and left empty for page, block and line rows.
* When lang_info is false, no extra column is emitted.
* page_number is 0-based but will appear in the output as 1-based.
* Returned string must be freed with the delete [] operator.
*/
char *GetTSVText(int page_number, bool lang_info);

/**
* Make a box file for LSTM training from the internal data structures.
* Constructs coordinates in the original image - not just the rectangle.
Expand Down
4 changes: 2 additions & 2 deletions include/tesseract/renderer.h
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ class TESS_API TessAltoRenderer : public TessResultRenderer {
*/
class TESS_API TessTsvRenderer : public TessResultRenderer {
public:
explicit TessTsvRenderer(const char *outputbase, bool font_info);
explicit TessTsvRenderer(const char *outputbase, bool lang_info);
explicit TessTsvRenderer(const char *outputbase);

protected:
Expand All @@ -206,7 +206,7 @@ class TESS_API TessTsvRenderer : public TessResultRenderer {
bool EndDocumentHandler() override;

private:
bool font_info_; // whether to print font information
bool lang_info_; // whether to print language information
};

/**
Expand Down
48 changes: 43 additions & 5 deletions src/api/baseapi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1422,6 +1422,16 @@ static void AddBoxToTSV(const PageIterator *it, PageIteratorLevel level, std::st
* Returned string must be freed with the delete [] operator.
*/
char *TessBaseAPI::GetTSVText(int page_number) {
  // One-argument entry point: forward to the two-argument overload with the
  // language column switched off, so no extra column is added to the rows.
  constexpr bool kWithLangColumn = false;
  char *tsv_text = GetTSVText(page_number, kWithLangColumn);
  return tsv_text;
}

/**
* Make a TSV-formatted string from the internal data structures.
* When lang_info is true, adds a column with the detected language.
* page_number is 0-based but will appear in the output as 1-based.
* Returned string must be freed with the delete [] operator.
*/
char *TessBaseAPI::GetTSVText(int page_number, bool lang_info) {
if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0)) {
return nullptr;
}
Expand All @@ -1434,6 +1444,7 @@ char *TessBaseAPI::GetTSVText(int page_number) {
int par_num = 0;
int line_num = 0;
int word_num = 0;
std::string lang;

std::string tsv_str;
tsv_str += "1\t" + std::to_string(page_num); // level 1 - page
Expand All @@ -1445,7 +1456,11 @@ char *TessBaseAPI::GetTSVText(int page_number) {
tsv_str += "\t" + std::to_string(rect_top_);
tsv_str += "\t" + std::to_string(rect_width_);
tsv_str += "\t" + std::to_string(rect_height_);
tsv_str += "\t-1\t\n";
tsv_str += "\t-1";
if (lang_info) {
tsv_str += "\t" + lang;
}
tsv_str += "\t\n";

const std::unique_ptr</*non-const*/ ResultIterator> res_it(GetIterator());
while (!res_it->Empty(RIL_BLOCK)) {
Expand All @@ -1466,9 +1481,16 @@ char *TessBaseAPI::GetTSVText(int page_number) {
tsv_str += "\t" + std::to_string(line_num);
tsv_str += "\t" + std::to_string(word_num);
AddBoxToTSV(res_it.get(), RIL_BLOCK, tsv_str);
tsv_str += "\t-1\t\n"; // end of row for block
tsv_str += "\t-1";
if (lang_info) {
tsv_str += "\t";
}
tsv_str += "\t\n"; // end of row for block
}
if (res_it->IsAtBeginningOf(RIL_PARA)) {
if (lang_info) {
lang = res_it->WordRecognitionLanguage();
}
par_num++;
line_num = 0;
word_num = 0;
Expand All @@ -1478,7 +1500,11 @@ char *TessBaseAPI::GetTSVText(int page_number) {
tsv_str += "\t" + std::to_string(line_num);
tsv_str += "\t" + std::to_string(word_num);
AddBoxToTSV(res_it.get(), RIL_PARA, tsv_str);
tsv_str += "\t-1\t\n"; // end of row for para
tsv_str += "\t-1";
if (lang_info) {
tsv_str += "\t" + lang;
}
tsv_str += "\t\n"; // end of row for para
}
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
line_num++;
Expand All @@ -1489,7 +1515,11 @@ char *TessBaseAPI::GetTSVText(int page_number) {
tsv_str += "\t" + std::to_string(line_num);
tsv_str += "\t" + std::to_string(word_num);
AddBoxToTSV(res_it.get(), RIL_TEXTLINE, tsv_str);
tsv_str += "\t-1\t\n"; // end of row for line
tsv_str += "\t-1";
if (lang_info) {
tsv_str += "\t";
}
tsv_str += "\t\n"; // end of row for line
}

// Now, process the word...
Expand All @@ -1506,9 +1536,17 @@ char *TessBaseAPI::GetTSVText(int page_number) {
tsv_str += "\t" + std::to_string(right - left);
tsv_str += "\t" + std::to_string(bottom - top);
tsv_str += "\t" + std::to_string(res_it->Confidence(RIL_WORD));
tsv_str += "\t";

if (lang_info) {
const char *word_lang = res_it->WordRecognitionLanguage();
tsv_str += "\t";
if (word_lang) {
tsv_str += word_lang;
}
}

// Increment counts if at end of block/paragraph/textline.
tsv_str += "\t";
if (res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD)) {
lcnt++;
}
Expand Down
14 changes: 9 additions & 5 deletions src/api/renderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,19 +156,23 @@ bool TessTextRenderer::AddImageHandler(TessBaseAPI *api) {
* TSV Text Renderer interface implementation
**********************************************************************/
TessTsvRenderer::TessTsvRenderer(const char *outputbase) : TessResultRenderer(outputbase, "tsv") {
font_info_ = false;
lang_info_ = false;
}

TessTsvRenderer::TessTsvRenderer(const char *outputbase, bool font_info)
TessTsvRenderer::TessTsvRenderer(const char *outputbase, bool lang_info)
: TessResultRenderer(outputbase, "tsv") {
font_info_ = font_info;
lang_info_ = lang_info;
}

bool TessTsvRenderer::BeginDocumentHandler() {
// Output TSV column headings
AppendString(
"level\tpage_num\tblock_num\tpar_num\tline_num\tword_"
"num\tleft\ttop\twidth\theight\tconf\ttext\n");
"num\tleft\ttop\twidth\theight\tconf\t");
if (lang_info_) {
AppendString("lang\t");
}
AppendString("text\n");
return true;
}

Expand All @@ -177,7 +181,7 @@ bool TessTsvRenderer::EndDocumentHandler() {
}

bool TessTsvRenderer::AddImageHandler(TessBaseAPI *api) {
const std::unique_ptr<const char[]> tsv(api->GetTSVText(imagenum()));
const std::unique_ptr<const char[]> tsv(api->GetTSVText(imagenum(), lang_info_));
if (tsv == nullptr) {
return false;
}
Expand Down
1 change: 1 addition & 0 deletions src/ccmain/tesseractclass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,7 @@ Tesseract::Tesseract()
this->params())
, BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding", this->params())
, BOOL_MEMBER(textord_use_cjk_fp_model, false, "Use CJK fixed pitch model", this->params())
, BOOL_MEMBER(tsv_lang_info, false, "Include language info in the .tsv output file", this->params())
, BOOL_MEMBER(poly_allow_detailed_fx, false,
"Allow feature extractors to see the original outline", this->params())
, BOOL_INIT_MEMBER(tessedit_init_config_only, false,
Expand Down
1 change: 1 addition & 0 deletions src/ccmain/tesseractclass.h
Original file line number Diff line number Diff line change
Expand Up @@ -920,6 +920,7 @@ class TESS_API Tesseract : public Wordrec {
BOOL_VAR_H(tessedit_flip_0O);
double_VAR_H(tessedit_lower_flip_hyphen);
double_VAR_H(tessedit_upper_flip_hyphen);
BOOL_VAR_H(tsv_lang_info);
BOOL_VAR_H(rej_trust_doc_dawg);
BOOL_VAR_H(rej_1Il_use_dict_word);
BOOL_VAR_H(rej_1Il_trust_permuter_type);
Expand Down
6 changes: 3 additions & 3 deletions src/tesseract.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -533,9 +533,9 @@ static void PreloadRenderers(tesseract::TessBaseAPI &api,

api.GetBoolVariable("tessedit_create_tsv", &b);
if (b) {
bool font_info;
api.GetBoolVariable("hocr_font_info", &font_info);
auto renderer = std::make_unique<tesseract::TessTsvRenderer>(outputbase, font_info);
bool lang_info;
api.GetBoolVariable("tsv_lang_info", &lang_info);
auto renderer = std::make_unique<tesseract::TessTsvRenderer>(outputbase, lang_info);
if (renderer->happy()) {
renderers.push_back(std::move(renderer));
} else {
Expand Down
1 change: 1 addition & 0 deletions tessdata/configs/tsv
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
tessedit_create_tsv 1
tsv_lang_info 0