From 0e868ef377103c448cfe58d873957f966a53ce28 Mon Sep 17 00:00:00 2001 From: Ray Smith Date: Tue, 12 May 2015 16:47:02 -0700 Subject: [PATCH] Major change to improve layout analysis for heavily diacritic languages: Tha, Vie, Kan, Tel etc. There is a new overlap detector that detects when diacritics cause a big increase in textline overlap. In such cases, diacritics from overlap regions are kept separate from layout analysis completely, allowing textline formation to happen without them. The diacritics are then assigned to 0, 1 or 2 close words at the end of layout analysis, using and modifying an old noise detection data path. The stored diacritics are used or not during recognition according to the character classifier's liking for them. --- ccmain/control.cpp | 378 +++++++++++- ccmain/fixspace.cpp | 3 +- ccmain/pageiterator.cpp | 132 +++-- ccmain/pageiterator.h | 18 + ccmain/pagesegmain.cpp | 40 +- ccmain/pgedit.cpp | 3 +- ccmain/recogtraining.cpp | 3 +- ccmain/tesseractclass.cpp | 1054 ++++++++++++++++++---------------- ccmain/tesseractclass.h | 64 ++- ccstruct/blobbox.h | 9 + ccstruct/ocrblock.cpp | 12 + ccstruct/ocrblock.h | 8 +- ccstruct/ocrrow.cpp | 11 + ccstruct/ocrrow.h | 3 + ccstruct/pageres.cpp | 44 +- ccstruct/pageres.h | 6 +- ccstruct/pdblock.cpp | 4 +- ccstruct/pdblock.h | 4 +- ccstruct/werd.cpp | 130 ++++- ccstruct/werd.h | 28 +- textord/colfind.cpp | 30 +- textord/colfind.h | 12 +- textord/colpartition.cpp | 27 + textord/colpartition.h | 5 + textord/colpartitiongrid.cpp | 59 ++ textord/colpartitiongrid.h | 9 + textord/strokewidth.cpp | 112 +++- textord/strokewidth.h | 42 +- textord/tablefind.cpp | 4 +- textord/textord.cpp | 6 +- textord/textord.h | 53 +- textord/topitch.cpp | 27 +- textord/tordmain.cpp | 239 ++++++-- textord/tordmain.h | 21 +- 34 files changed, 1856 insertions(+), 744 deletions(-) diff --git a/ccmain/control.cpp b/ccmain/control.cpp index a765a97c8a..3abf216e34 100644 --- a/ccmain/control.cpp +++ b/ccmain/control.cpp @@ -93,8 +93,7 
@@ BOOL8 Tesseract::recog_interactive(PAGE_RES_IT* pr_it) { WordData word_data(*pr_it); SetupWordPassN(2, &word_data); - classify_word_and_language(&Tesseract::classify_word_pass2, pr_it, - &word_data); + classify_word_and_language(2, pr_it, &word_data); if (tessedit_debug_quality_metrics) { WERD_RES* word_res = pr_it->word(); word_char_quality(word_res, pr_it->row()->row, &char_qual, &good_char_qual); @@ -190,6 +189,7 @@ void Tesseract::SetupWordPassN(int pass_n, WordData* word) { if (word->word->x_height == 0.0f) word->word->x_height = word->row->x_height(); } + word->lang_words.truncate(0); for (int s = 0; s <= sub_langs_.size(); ++s) { // The sub_langs_.size() entry is for the master language. Tesseract* lang_t = s < sub_langs_.size() ? sub_langs_[s] : this; @@ -249,15 +249,23 @@ bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor, while (pr_it->word() != NULL && pr_it->word() != word->word) pr_it->forward(); ASSERT_HOST(pr_it->word() != NULL); - WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1 - : &Tesseract::classify_word_pass2; - classify_word_and_language(recognizer, pr_it, word); - if (tessedit_dump_choices) { + bool make_next_word_fuzzy = false; + if (!AnyLSTMLang() && + ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) { + // Needs to be setup again to see the new outlines in the chopped_word. + SetupWordPassN(pass_n, word); + } + + classify_word_and_language(pass_n, pr_it, word); + if (tessedit_dump_choices || debug_noise_removal) { tprintf("Pass%d: %s [%s]\n", pass_n, word->word->best_choice->unichar_string().string(), word->word->best_choice->debug_string().string()); } pr_it->forward(); + if (make_next_word_fuzzy && pr_it->word() != NULL) { + pr_it->MakeCurrentWordFuzzy(); + } } return true; } @@ -898,6 +906,359 @@ static bool WordsAcceptable(const PointerVector& words) { return true; } +// Moves good-looking "noise"/diacritics from the reject list to the main +// blob list on the current word. 
Returns true if anything was done, and +// sets make_next_word_fuzzy if blob(s) were added to the end of the word. +bool Tesseract::ReassignDiacritics(int pass, PAGE_RES_IT* pr_it, + bool* make_next_word_fuzzy) { + *make_next_word_fuzzy = false; + WERD* real_word = pr_it->word()->word; + if (real_word->rej_cblob_list()->empty() || + real_word->cblob_list()->empty() || + real_word->rej_cblob_list()->length() > noise_maxperword) + return false; + real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle); + // Get the noise outlines into a vector with matching bool map. + GenericVector outlines; + real_word->GetNoiseOutlines(&outlines); + GenericVector word_wanted; + GenericVector overlapped_any_blob; + GenericVector target_blobs; + AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it, + &word_wanted, &overlapped_any_blob, + &target_blobs); + // Filter the outlines that overlapped any blob and put them into the word + // now. This simplifies the remaining task and also makes it more accurate + // as it has more completed blobs to work on. 
+ GenericVector wanted; + GenericVector wanted_blobs; + GenericVector wanted_outlines; + int num_overlapped = 0; + int num_overlapped_used = 0; + for (int i = 0; i < overlapped_any_blob.size(); ++i) { + if (overlapped_any_blob[i]) { + ++num_overlapped; + if (word_wanted[i]) ++num_overlapped_used; + wanted.push_back(word_wanted[i]); + wanted_blobs.push_back(target_blobs[i]); + wanted_outlines.push_back(outlines[i]); + outlines[i] = NULL; + } + } + real_word->AddSelectedOutlines(wanted, wanted_blobs, wanted_outlines, NULL); + AssignDiacriticsToNewBlobs(outlines, pass, real_word, pr_it, &word_wanted, + &target_blobs); + int non_overlapped = 0; + int non_overlapped_used = 0; + for (int i = 0; i < word_wanted.size(); ++i) { + if (word_wanted[i]) ++non_overlapped_used; + if (outlines[i] != NULL) ++non_overlapped; + } + if (debug_noise_removal) { + tprintf("Used %d/%d overlapped %d/%d non-overlaped diacritics on word:", + num_overlapped_used, num_overlapped, non_overlapped_used, + non_overlapped); + real_word->bounding_box().print(); + } + // Now we have decided which outlines we want, put them into the real_word. + if (real_word->AddSelectedOutlines(word_wanted, target_blobs, outlines, + make_next_word_fuzzy)) { + pr_it->MakeCurrentWordFuzzy(); + } + // TODO(rays) Parts of combos have a deep copy of the real word, and need + // to have their noise outlines moved/assigned in the same way!! + return num_overlapped_used != 0 || non_overlapped_used != 0; +} + +// Attempts to put noise/diacritic outlines into the blobs that they overlap. +// Input: a set of noisy outlines that probably belong to the real_word. +// Output: word_wanted indicates which outlines are to be assigned to a blob, +// target_blobs indicates which to assign to, and overlapped_any_blob is +// true for all outlines that overlapped a blob. 
+void Tesseract::AssignDiacriticsToOverlappingBlobs( + const GenericVector& outlines, int pass, WERD* real_word, + PAGE_RES_IT* pr_it, GenericVector* word_wanted, + GenericVector* overlapped_any_blob, + GenericVector* target_blobs) { + GenericVector blob_wanted; + word_wanted->init_to_size(outlines.size(), false); + overlapped_any_blob->init_to_size(outlines.size(), false); + target_blobs->init_to_size(outlines.size(), NULL); + // For each real blob, find the outlines that seriously overlap it. + // A single blob could be several merged characters, so there can be quite + // a few outlines overlapping, and the full engine needs to be used to chop + // and join to get a sensible result. + C_BLOB_IT blob_it(real_word->cblob_list()); + for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { + C_BLOB* blob = blob_it.data(); + TBOX blob_box = blob->bounding_box(); + blob_wanted.init_to_size(outlines.size(), false); + int num_blob_outlines = 0; + for (int i = 0; i < outlines.size(); ++i) { + if (blob_box.major_x_overlap(outlines[i]->bounding_box()) && + !(*word_wanted)[i]) { + blob_wanted[i] = true; + (*overlapped_any_blob)[i] = true; + ++num_blob_outlines; + } + } + if (debug_noise_removal) { + tprintf("%d noise outlines overlap blob at:", num_blob_outlines); + blob_box.print(); + } + // If any outlines overlap the blob, and not too many, classify the blob + // (using the full engine, languages and all), and choose the maximal + // combination of outlines that doesn't hurt the end-result classification + // by too much. Mark them as wanted. + if (0 < num_blob_outlines && num_blob_outlines < noise_maxperblob) { + if (SelectGoodDiacriticOutlines(pass, noise_cert_basechar, pr_it, blob, + outlines, num_blob_outlines, + &blob_wanted)) { + for (int i = 0; i < blob_wanted.size(); ++i) { + if (blob_wanted[i]) { + // Claim the outline and record where it is going. 
+ (*word_wanted)[i] = true; + (*target_blobs)[i] = blob; + } + } + } + } + } +} + +// Attempts to assign non-overlapping outlines to their nearest blobs or +// make new blobs out of them. +void Tesseract::AssignDiacriticsToNewBlobs( + const GenericVector& outlines, int pass, WERD* real_word, + PAGE_RES_IT* pr_it, GenericVector* word_wanted, + GenericVector* target_blobs) { + GenericVector blob_wanted; + word_wanted->init_to_size(outlines.size(), false); + target_blobs->init_to_size(outlines.size(), NULL); + // Check for outlines that need to be turned into stand-alone blobs. + for (int i = 0; i < outlines.size(); ++i) { + if (outlines[i] == NULL) continue; + // Get a set of adjacent outlines that don't overlap any existing blob. + blob_wanted.init_to_size(outlines.size(), false); + int num_blob_outlines = 0; + TBOX total_ol_box(outlines[i]->bounding_box()); + while (i < outlines.size() && outlines[i] != NULL) { + blob_wanted[i] = true; + total_ol_box += outlines[i]->bounding_box(); + ++i; + ++num_blob_outlines; + } + // Find the insertion point. + C_BLOB_IT blob_it(real_word->cblob_list()); + while (!blob_it.at_last() && + blob_it.data_relative(1)->bounding_box().left() <= + total_ol_box.left()) { + blob_it.forward(); + } + // Choose which combination of them we actually want and where to put + // them. + if (debug_noise_removal) + tprintf("Num blobless outlines = %d\n", num_blob_outlines); + C_BLOB* left_blob = blob_it.data(); + TBOX left_box = left_blob->bounding_box(); + C_BLOB* right_blob = blob_it.at_last() ? 
NULL : blob_it.data_relative(1); + if ((left_box.x_overlap(total_ol_box) || right_blob == NULL || + !right_blob->bounding_box().x_overlap(total_ol_box)) && + SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, left_blob, + outlines, num_blob_outlines, + &blob_wanted)) { + if (debug_noise_removal) tprintf("Added to left blob\n"); + for (int j = 0; j < blob_wanted.size(); ++j) { + if (blob_wanted[j]) { + (*word_wanted)[j] = true; + (*target_blobs)[j] = left_blob; + } + } + } else if (right_blob != NULL && + (!left_box.x_overlap(total_ol_box) || + right_blob->bounding_box().x_overlap(total_ol_box)) && + SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, + right_blob, outlines, + num_blob_outlines, &blob_wanted)) { + if (debug_noise_removal) tprintf("Added to right blob\n"); + for (int j = 0; j < blob_wanted.size(); ++j) { + if (blob_wanted[j]) { + (*word_wanted)[j] = true; + (*target_blobs)[j] = right_blob; + } + } + } else if (SelectGoodDiacriticOutlines(pass, noise_cert_punc, pr_it, NULL, + outlines, num_blob_outlines, + &blob_wanted)) { + if (debug_noise_removal) tprintf("Fitted between blobs\n"); + for (int j = 0; j < blob_wanted.size(); ++j) { + if (blob_wanted[j]) { + (*word_wanted)[j] = true; + (*target_blobs)[j] = NULL; + } + } + } + } +} + +// Starting with ok_outlines set to indicate which outlines overlap the blob, +// chooses the optimal set (approximately) and returns true if any outlines +// are desired, in which case ok_outlines indicates which ones. 
+bool Tesseract::SelectGoodDiacriticOutlines( + int pass, float certainty_threshold, PAGE_RES_IT* pr_it, C_BLOB* blob, + const GenericVector& outlines, int num_outlines, + GenericVector* ok_outlines) { + STRING best_str; + float target_cert = certainty_threshold; + if (blob != NULL) { + float target_c2; + target_cert = ClassifyBlobAsWord(pass, pr_it, blob, &best_str, &target_c2); + if (debug_noise_removal) { + tprintf("No Noise blob classified as %s=%g(%g) at:", best_str.string(), + target_cert, target_c2); + blob->bounding_box().print(); + } + target_cert -= (target_cert - certainty_threshold) * noise_cert_factor; + } + GenericVector test_outlines = *ok_outlines; + // Start with all the outlines in. + STRING all_str; + GenericVector best_outlines = *ok_outlines; + float best_cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass, + pr_it, blob, &all_str); + if (debug_noise_removal) { + TBOX ol_box; + for (int i = 0; i < test_outlines.size(); ++i) { + if (test_outlines[i]) ol_box += outlines[i]->bounding_box(); + } + tprintf("All Noise blob classified as %s=%g, delta=%g at:", + all_str.string(), best_cert, best_cert - target_cert); + ol_box.print(); + } + // Iteratively zero out the bit that improves the certainty the most, until + // we get past the threshold, have zero bits, or fail to improve. + int best_index = 0; // To zero out. + while (num_outlines > 1 && best_index >= 0 && + (blob == NULL || best_cert < target_cert || blob != NULL)) { + // Find the best bit to zero out. 
+ best_index = -1; + for (int i = 0; i < outlines.size(); ++i) { + if (test_outlines[i]) { + test_outlines[i] = false; + STRING str; + float cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass, + pr_it, blob, &str); + if (debug_noise_removal) { + TBOX ol_box; + for (int j = 0; j < outlines.size(); ++j) { + if (test_outlines[j]) ol_box += outlines[j]->bounding_box(); + tprintf("%d", test_outlines[j]); + } + tprintf(" blob classified as %s=%g, delta=%g) at:", str.string(), + cert, cert - target_cert); + ol_box.print(); + } + if (cert > best_cert) { + best_cert = cert; + best_index = i; + best_outlines = test_outlines; + } + test_outlines[i] = true; + } + } + if (best_index >= 0) { + test_outlines[best_index] = false; + --num_outlines; + } + } + if (best_cert >= target_cert) { + // Save the best combination. + *ok_outlines = best_outlines; + if (debug_noise_removal) { + tprintf("%s noise combination ", blob ? "Adding" : "New"); + for (int i = 0; i < best_outlines.size(); ++i) { + tprintf("%d", best_outlines[i]); + } + tprintf(" yields certainty %g, beating target of %g\n", best_cert, + target_cert); + } + return true; + } + return false; +} + +// Classifies the given blob plus the outlines flagged by ok_outlines, undoes +// the inclusion of the outlines, and returns the certainty of the raw choice. +float Tesseract::ClassifyBlobPlusOutlines( + const GenericVector& ok_outlines, + const GenericVector& outlines, int pass_n, PAGE_RES_IT* pr_it, + C_BLOB* blob, STRING* best_str) { + C_OUTLINE_IT ol_it; + C_OUTLINE* first_to_keep = NULL; + if (blob != NULL) { + // Add the required outlines to the blob. + ol_it.set_to_list(blob->out_list()); + first_to_keep = ol_it.data(); + } + for (int i = 0; i < ok_outlines.size(); ++i) { + if (ok_outlines[i]) { + // This outline is to be added. 
+ if (blob == NULL) { + blob = new C_BLOB(outlines[i]); + ol_it.set_to_list(blob->out_list()); + } else { + ol_it.add_before_stay_put(outlines[i]); + } + } + } + float c2; + float cert = ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2); + ol_it.move_to_first(); + if (first_to_keep == NULL) { + // We created blob. Empty its outlines and delete it. + for (; !ol_it.empty(); ol_it.forward()) ol_it.extract(); + delete blob; + cert = -c2; + } else { + // Remove the outlines that we put in. + for (; ol_it.data() != first_to_keep; ol_it.forward()) { + ol_it.extract(); + } + } + return cert; +} + +// Classifies the given blob (part of word_data->word->word) as an individual +// word, using languages, chopper etc, returning only the certainty of the +// best raw choice, and undoing all the work done to fake out the word. +float Tesseract::ClassifyBlobAsWord(int pass_n, PAGE_RES_IT* pr_it, + C_BLOB* blob, STRING* best_str, float* c2) { + WERD* real_word = pr_it->word()->word; + WERD* word = real_word->ConstructFromSingleBlob( + real_word->flag(W_BOL), real_word->flag(W_EOL), C_BLOB::deep_copy(blob)); + WERD_RES* word_res = pr_it->InsertSimpleCloneWord(*pr_it->word(), word); + // Get a new iterator that points to the new word. + PAGE_RES_IT it(pr_it->page_res); + while (it.word() != word_res && it.word() != NULL) it.forward(); + ASSERT_HOST(it.word() == word_res); + WordData wd(it); + // Force full initialization. + SetupWordPassN(1, &wd); + classify_word_and_language(pass_n, &it, &wd); + if (debug_noise_removal) { + tprintf("word xheight=%g, row=%g, range=[%g,%g]\n", word_res->x_height, + wd.row->x_height(), wd.word->raw_choice->min_x_height(), + wd.word->raw_choice->max_x_height()); + } + float cert = wd.word->raw_choice->certainty(); + float rat = wd.word->raw_choice->rating(); + *c2 = rat > 0.0f ? 
cert * cert / rat : 0.0f; + *best_str = wd.word->raw_choice->unichar_string(); + it.DeleteCurrentWord(); + pr_it->ResetWordIterator(); + return cert; +} + // Generic function for classifying a word. Can be used either for pass1 or // pass2 according to the function passed to recognizer. // word_data holds the word to be recognized, and its block and row, and @@ -906,9 +1267,10 @@ static bool WordsAcceptable(const PointerVector& words) { // Recognizes in the current language, and if successful that is all. // If recognition was not successful, tries all available languages until // it gets a successful result or runs out of languages. Keeps the best result. -void Tesseract::classify_word_and_language(WordRecognizer recognizer, - PAGE_RES_IT* pr_it, +void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it, WordData* word_data) { + WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1 + : &Tesseract::classify_word_pass2; // Best result so far. PointerVector best_words; // Points to the best result. May be word or in lang_words. 
diff --git a/ccmain/fixspace.cpp b/ccmain/fixspace.cpp index 17c4f96ed1..0a561ac9a0 100644 --- a/ccmain/fixspace.cpp +++ b/ccmain/fixspace.cpp @@ -205,8 +205,7 @@ void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row, if ((!word->part_of_combo) && (word->box_word == NULL)) { WordData word_data(block, row, word); SetupWordPassN(2, &word_data); - classify_word_and_language(&Tesseract::classify_word_pass2, NULL, - &word_data); + classify_word_and_language(2, NULL, &word_data); } prev_word_best_choice_ = word->best_choice; } diff --git a/ccmain/pageiterator.cpp b/ccmain/pageiterator.cpp index c8e025c13f..ed03ceaba5 100644 --- a/ccmain/pageiterator.cpp +++ b/ccmain/pageiterator.cpp @@ -26,15 +26,23 @@ namespace tesseract { -PageIterator::PageIterator(PAGE_RES* page_res, Tesseract* tesseract, - int scale, int scaled_yres, - int rect_left, int rect_top, +PageIterator::PageIterator(PAGE_RES* page_res, Tesseract* tesseract, int scale, + int scaled_yres, int rect_left, int rect_top, int rect_width, int rect_height) - : page_res_(page_res), tesseract_(tesseract), - word_(NULL), word_length_(0), blob_index_(0), cblob_it_(NULL), - scale_(scale), scaled_yres_(scaled_yres), - rect_left_(rect_left), rect_top_(rect_top), - rect_width_(rect_width), rect_height_(rect_height) { + : page_res_(page_res), + tesseract_(tesseract), + word_(NULL), + word_length_(0), + blob_index_(0), + cblob_it_(NULL), + include_upper_dots_(false), + include_lower_dots_(false), + scale_(scale), + scaled_yres_(scaled_yres), + rect_left_(rect_left), + rect_top_(rect_top), + rect_width_(rect_width), + rect_height_(rect_height) { it_ = new PAGE_RES_IT(page_res); PageIterator::Begin(); } @@ -50,12 +58,20 @@ PageIterator::~PageIterator() { * objects at a higher level. 
*/ PageIterator::PageIterator(const PageIterator& src) - : page_res_(src.page_res_), tesseract_(src.tesseract_), - word_(NULL), word_length_(src.word_length_), - blob_index_(src.blob_index_), cblob_it_(NULL), - scale_(src.scale_), scaled_yres_(src.scaled_yres_), - rect_left_(src.rect_left_), rect_top_(src.rect_top_), - rect_width_(src.rect_width_), rect_height_(src.rect_height_) { + : page_res_(src.page_res_), + tesseract_(src.tesseract_), + word_(NULL), + word_length_(src.word_length_), + blob_index_(src.blob_index_), + cblob_it_(NULL), + include_upper_dots_(src.include_upper_dots_), + include_lower_dots_(src.include_lower_dots_), + scale_(src.scale_), + scaled_yres_(src.scaled_yres_), + rect_left_(src.rect_left_), + rect_top_(src.rect_top_), + rect_width_(src.rect_width_), + rect_height_(src.rect_height_) { it_ = new PAGE_RES_IT(*src.it_); BeginWord(src.blob_index_); } @@ -63,6 +79,8 @@ PageIterator::PageIterator(const PageIterator& src) const PageIterator& PageIterator::operator=(const PageIterator& src) { page_res_ = src.page_res_; tesseract_ = src.tesseract_; + include_upper_dots_ = src.include_upper_dots_; + include_lower_dots_ = src.include_lower_dots_; scale_ = src.scale_; scaled_yres_ = src.scaled_yres_; rect_left_ = src.rect_left_; @@ -252,16 +270,19 @@ bool PageIterator::BoundingBoxInternal(PageIteratorLevel level, PARA *para = NULL; switch (level) { case RIL_BLOCK: - box = it_->block()->block->bounding_box(); + box = it_->block()->block->restricted_bounding_box(include_upper_dots_, + include_lower_dots_); break; case RIL_PARA: para = it_->row()->row->para(); // explicit fall-through. 
case RIL_TEXTLINE: - box = it_->row()->row->bounding_box(); + box = it_->row()->row->restricted_bounding_box(include_upper_dots_, + include_lower_dots_); break; case RIL_WORD: - box = it_->word()->word->bounding_box(); + box = it_->word()->word->restricted_bounding_box(include_upper_dots_, + include_lower_dots_); break; case RIL_SYMBOL: if (cblob_it_ == NULL) @@ -387,39 +408,23 @@ Pix* PageIterator::GetBinaryImage(PageIteratorLevel level) const { int left, top, right, bottom; if (!BoundingBoxInternal(level, &left, &top, &right, &bottom)) return NULL; - Pix* pix = NULL; - switch (level) { - case RIL_BLOCK: - case RIL_PARA: - int bleft, btop, bright, bbottom; - BoundingBoxInternal(RIL_BLOCK, &bleft, &btop, &bright, &bbottom); - pix = it_->block()->block->render_mask(); - // AND the mask and the image. - pixRasterop(pix, 0, 0, pixGetWidth(pix), pixGetHeight(pix), - PIX_SRC & PIX_DST, tesseract_->pix_binary(), - bleft, btop); - if (level == RIL_PARA) { - // RIL_PARA needs further attention: - // clip the paragraph from the block mask. - Box* box = boxCreate(left - bleft, top - btop, - right - left, bottom - top); - Pix* pix2 = pixClipRectangle(pix, box, NULL); - boxDestroy(&box); - pixDestroy(&pix); - pix = pix2; - } - break; - case RIL_TEXTLINE: - case RIL_WORD: - case RIL_SYMBOL: - if (level == RIL_SYMBOL && cblob_it_ != NULL && - cblob_it_->data()->area() != 0) - return cblob_it_->data()->render(); - // Just clip from the bounding box. - Box* box = boxCreate(left, top, right - left, bottom - top); - pix = pixClipRectangle(tesseract_->pix_binary(), box, NULL); - boxDestroy(&box); - break; + if (level == RIL_SYMBOL && cblob_it_ != NULL && + cblob_it_->data()->area() != 0) + return cblob_it_->data()->render(); + Box* box = boxCreate(left, top, right - left, bottom - top); + Pix* pix = pixClipRectangle(tesseract_->pix_binary(), box, NULL); + boxDestroy(&box); + if (level == RIL_BLOCK || level == RIL_PARA) { + // Clip to the block polygon as well. 
+ TBOX mask_box; + Pix* mask = it_->block()->block->render_mask(&mask_box); + int mask_x = left - mask_box.left(); + int mask_y = top - (tesseract_->ImageHeight() - mask_box.top()); + // AND the mask and pix, putting the result in pix. + pixRasterop(pix, MAX(0, -mask_x), MAX(0, -mask_y), pixGetWidth(pix), + pixGetHeight(pix), PIX_SRC & PIX_DST, mask, MAX(0, mask_x), + MAX(0, mask_y)); + pixDestroy(&mask); } return pix; } @@ -452,17 +457,24 @@ Pix* PageIterator::GetImage(PageIteratorLevel level, int padding, Box* box = boxCreate(*left, *top, right - *left, bottom - *top); Pix* grey_pix = pixClipRectangle(original_img, box, NULL); boxDestroy(&box); - if (level == RIL_BLOCK) { - Pix* mask = it_->block()->block->render_mask(); - Pix* expanded_mask = pixCreate(right - *left, bottom - *top, 1); - pixRasterop(expanded_mask, padding, padding, - pixGetWidth(mask), pixGetHeight(mask), - PIX_SRC, mask, 0, 0); + if (level == RIL_BLOCK || level == RIL_PARA) { + // Clip to the block polygon as well. + TBOX mask_box; + Pix* mask = it_->block()->block->render_mask(&mask_box); + // Copy the mask registered correctly into an image the size of grey_pix. 
+ int mask_x = *left - mask_box.left(); + int mask_y = *top - (pixGetHeight(original_img) - mask_box.top()); + int width = pixGetWidth(grey_pix); + int height = pixGetHeight(grey_pix); + Pix* resized_mask = pixCreate(width, height, 1); + pixRasterop(resized_mask, MAX(0, -mask_x), MAX(0, -mask_y), width, height, + PIX_SRC, mask, MAX(0, mask_x), MAX(0, mask_y)); pixDestroy(&mask); - pixDilateBrick(expanded_mask, expanded_mask, 2*padding + 1, 2*padding + 1); - pixInvert(expanded_mask, expanded_mask); - pixSetMasked(grey_pix, expanded_mask, MAX_UINT32); - pixDestroy(&expanded_mask); + pixDilateBrick(resized_mask, resized_mask, 2 * padding + 1, + 2 * padding + 1); + pixInvert(resized_mask, resized_mask); + pixSetMasked(grey_pix, resized_mask, MAX_UINT32); + pixDestroy(&resized_mask); } return grey_pix; } diff --git a/ccmain/pageiterator.h b/ccmain/pageiterator.h index 27b02ddf8f..56c78150a8 100644 --- a/ccmain/pageiterator.h +++ b/ccmain/pageiterator.h @@ -179,6 +179,21 @@ class TESS_API PageIterator { // If an image rectangle has been set in the API, then returned coordinates // relate to the original (full) image, rather than the rectangle. + /** + * Controls what to include in a bounding box. Bounding boxes of all levels + * between RIL_WORD and RIL_BLOCK can include or exclude potential diacritics. + * Between layout analysis and recognition, it isn't known where all + * diacritics belong, so this control is used to include or exclude some + * diacritics that are above or below the main body of the word. In most cases + * where the placement is obvious, and after recognition, it doesn't make as + * much difference, as the diacritics will already be included in the word. + */ + void SetBoundingBoxComponents(bool include_upper_dots, + bool include_lower_dots) { + include_upper_dots_ = include_upper_dots; + include_lower_dots_ = include_lower_dots; + } + /** * Returns the bounding rectangle of the current object at the given level. 
* See comment on coordinate system above. @@ -332,6 +347,9 @@ class TESS_API PageIterator { * Owned by this ResultIterator. */ C_BLOB_IT* cblob_it_; + /** Control over what to include in bounding boxes. */ + bool include_upper_dots_; + bool include_lower_dots_; /** Parameters saved from the Thresholder. Needed to rebuild coordinates.*/ int scale_; int scaled_yres_; diff --git a/ccmain/pagesegmain.cpp b/ccmain/pagesegmain.cpp index 396be13048..6ced2d4c40 100644 --- a/ccmain/pagesegmain.cpp +++ b/ccmain/pagesegmain.cpp @@ -134,12 +134,20 @@ int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks, // UNLV file present. Use PSM_SINGLE_BLOCK. pageseg_mode = PSM_SINGLE_BLOCK; } + // The diacritic_blobs holds noise blobs that may be diacritics. They + // are separated out on areas of the image that seem noisy and short-circuit + // the layout process, going straight from the initial partition creation + // right through to after word segmentation, where they are added to the + // rej_cblobs list of the most appropriate word. From there classification + // will determine whether they are used. + BLOBNBOX_LIST diacritic_blobs; int auto_page_seg_ret_val = 0; TO_BLOCK_LIST to_blocks; if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) || PSM_SPARSE(pageseg_mode)) { - auto_page_seg_ret_val = - AutoPageSeg(pageseg_mode, blocks, &to_blocks, osd_tess, osr); + auto_page_seg_ret_val = AutoPageSeg( + pageseg_mode, blocks, &to_blocks, + enable_noise_removal ? 
&diacritic_blobs : NULL, osd_tess, osr); if (pageseg_mode == PSM_OSD_ONLY) return auto_page_seg_ret_val; // To create blobs from the image region bounds uncomment this line: @@ -171,7 +179,7 @@ int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks, textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_, pix_thresholds_, pix_grey_, splitting || cjk_mode, - blocks, &to_blocks); + &diacritic_blobs, blocks, &to_blocks); return auto_page_seg_ret_val; } @@ -197,7 +205,6 @@ static void WriteDebugBackgroundImage(bool printable, Pix* pix_binary) { pixDestroy(&grey_pix); } - /** * Auto page segmentation. Divide the page image into blocks of uniform * text linespacing and images. @@ -207,9 +214,14 @@ static void WriteDebugBackgroundImage(bool printable, Pix* pix_binary) { * The output goes in the blocks list with corresponding TO_BLOCKs in the * to_blocks list. * - * If single_column is true, then no attempt is made to divide the image - * into columns, but multiple blocks are still made if the text is of - * non-uniform linespacing. + * If !PSM_COL_FIND_ENABLED(pageseg_mode), then no attempt is made to divide + * the image into columns, but multiple blocks are still made if the text is + * of non-uniform linespacing. + * + * If diacritic_blobs is non-null, then diacritics/noise blobs, that would + * confuse layout analysis by causing textline overlap, are placed there, + * with the expectation that they will be reassigned to words later and + * noise/diacriticness determined via classification. * * If osd (orientation and script detection) is true then that is performed * as well. If only_osd is true, then only orientation and script detection is @@ -217,9 +229,10 @@ static void WriteDebugBackgroundImage(bool printable, Pix* pix_binary) { * another Tesseract that was initialized especially for osd, and the results * will be output into osr (orientation and script result). 
*/ -int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, - BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks, - Tesseract* osd_tess, OSResults* osr) { +int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST* blocks, + TO_BLOCK_LIST* to_blocks, + BLOBNBOX_LIST* diacritic_blobs, Tesseract* osd_tess, + OSResults* osr) { if (textord_debug_images) { WriteDebugBackgroundImage(textord_debug_printable, pix_binary_); } @@ -247,10 +260,9 @@ int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, if (equ_detect_) { finder->SetEquationDetect(equ_detect_); } - result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_, - to_block, photomask_pix, - pix_thresholds_, pix_grey_, - &found_blocks, to_blocks); + result = finder->FindBlocks( + pageseg_mode, scaled_color_, scaled_factor_, to_block, photomask_pix, + pix_thresholds_, pix_grey_, &found_blocks, diacritic_blobs, to_blocks); if (result >= 0) finder->GetDeskewVectors(&deskew_, &reskew_); delete finder; diff --git a/ccmain/pgedit.cpp b/ccmain/pgedit.cpp index 7c8f626b6b..ea44ead7c9 100644 --- a/ccmain/pgedit.cpp +++ b/ccmain/pgedit.cpp @@ -655,7 +655,8 @@ void show_point(PAGE_RES* page_res, float x, float y) { FCOORD pt(x, y); PAGE_RES_IT pr_it(page_res); - char msg[160]; + const int kBufsize = 512; + char msg[kBufsize]; char *msg_ptr = msg; msg_ptr += sprintf(msg_ptr, "Pt:(%0.3f, %0.3f) ", x, y); diff --git a/ccmain/recogtraining.cpp b/ccmain/recogtraining.cpp index 2dc94886ed..27d7e97ea0 100644 --- a/ccmain/recogtraining.cpp +++ b/ccmain/recogtraining.cpp @@ -207,8 +207,7 @@ void Tesseract::ambigs_classify_and_output(const char *label, fflush(stdout); WordData word_data(*pr_it); SetupWordPassN(1, &word_data); - classify_word_and_language(&Tesseract::classify_word_pass1, - pr_it, &word_data); + classify_word_and_language(1, pr_it, &word_data); WERD_RES* werd_res = word_data.word; WERD_CHOICE *best_choice = werd_res->best_choice; ASSERT_HOST(best_choice != NULL); diff --git a/ccmain/tesseractclass.cpp 
b/ccmain/tesseractclass.cpp index c262bbc95e..25819e8cdd 100644 --- a/ccmain/tesseractclass.cpp +++ b/ccmain/tesseractclass.cpp @@ -55,507 +55,569 @@ namespace tesseract { Tesseract::Tesseract() - : BOOL_MEMBER(tessedit_resegment_from_boxes, false, - "Take segmentation and labeling from box file", - this->params()), - BOOL_MEMBER(tessedit_resegment_from_line_boxes, false, - "Conversion of word/line box file to char box file", - this->params()), - BOOL_MEMBER(tessedit_train_from_boxes, false, - "Generate training data from boxed chars", this->params()), - BOOL_MEMBER(tessedit_make_boxes_from_boxes, false, - "Generate more boxes from boxed chars", this->params()), - BOOL_MEMBER(tessedit_dump_pageseg_images, false, - "Dump intermediate images made during page segmentation", - this->params()), - // The default for pageseg_mode is the old behaviour, so as not to - // upset anything that relies on that. - INT_MEMBER(tessedit_pageseg_mode, PSM_SINGLE_BLOCK, - "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block," - " 5=line, 6=word, 7=char" - " (Values from PageSegMode enum in publictypes.h)", - this->params()), - INT_INIT_MEMBER(tessedit_ocr_engine_mode, tesseract::OEM_TESSERACT_ONLY, - "Which OCR engine(s) to run (Tesseract, Cube, both)." - " Defaults to loading and running only Tesseract" - " (no Cube,no combiner)." 
- " Values from OcrEngineMode enum in tesseractclass.h)", - this->params()), - STRING_MEMBER(tessedit_char_blacklist, "", - "Blacklist of chars not to recognize", this->params()), - STRING_MEMBER(tessedit_char_whitelist, "", - "Whitelist of chars to recognize", this->params()), - STRING_MEMBER(tessedit_char_unblacklist, "", - "List of chars to override tessedit_char_blacklist", - this->params()), - BOOL_MEMBER(tessedit_ambigs_training, false, - "Perform training for ambiguities", this->params()), - INT_MEMBER(pageseg_devanagari_split_strategy, - tesseract::ShiroRekhaSplitter::NO_SPLIT, - "Whether to use the top-line splitting process for Devanagari " - "documents while performing page-segmentation.", this->params()), - INT_MEMBER(ocr_devanagari_split_strategy, - tesseract::ShiroRekhaSplitter::NO_SPLIT, - "Whether to use the top-line splitting process for Devanagari " - "documents while performing ocr.", this->params()), - STRING_MEMBER(tessedit_write_params_to_file, "", - "Write all parameters to the given file.", this->params()), - BOOL_MEMBER(tessedit_adaption_debug, false, "Generate and print debug" - " information for adaption", this->params()), - INT_MEMBER(bidi_debug, 0, "Debug level for BiDi", this->params()), - INT_MEMBER(applybox_debug, 1, "Debug level", this->params()), - INT_MEMBER(applybox_page, 0, - "Page number to apply boxes from", this->params()), - STRING_MEMBER(applybox_exposure_pattern, ".exp", "Exposure value follows" - " this pattern in the image filename. The name of the image" - " files are expected to be in the form" - " [lang].[fontname].exp[num].tif", this->params()), - BOOL_MEMBER(applybox_learn_chars_and_char_frags_mode, false, - "Learn both character fragments (as is done in the" - " special low exposure mode) as well as unfragmented" - " characters.", this->params()), - BOOL_MEMBER(applybox_learn_ngrams_mode, false, "Each bounding box" - " is assumed to contain ngrams. 
Only learn the ngrams" - " whose outlines overlap horizontally.", this->params()), - BOOL_MEMBER(tessedit_display_outwords, false, - "Draw output words", this->params()), - BOOL_MEMBER(tessedit_dump_choices, false, - "Dump char choices", this->params()), - BOOL_MEMBER(tessedit_timing_debug, false, "Print timing stats", - this->params()), - BOOL_MEMBER(tessedit_fix_fuzzy_spaces, true, - "Try to improve fuzzy spaces", this->params()), - BOOL_MEMBER(tessedit_unrej_any_wd, false, - "Dont bother with word plausibility", this->params()), - BOOL_MEMBER(tessedit_fix_hyphens, true, - "Crunch double hyphens?", this->params()), - BOOL_MEMBER(tessedit_redo_xheight, true, - "Check/Correct x-height", this->params()), - BOOL_MEMBER(tessedit_enable_doc_dict, true, - "Add words to the document dictionary", this->params()), - BOOL_MEMBER(tessedit_debug_fonts, false, - "Output font info per char", this->params()), - BOOL_MEMBER(tessedit_debug_block_rejection, false, - "Block and Row stats", this->params()), - BOOL_MEMBER(tessedit_enable_bigram_correction, true, - "Enable correction based on the word bigram dictionary.", - this->params()), - BOOL_MEMBER(tessedit_enable_dict_correction, false, - "Enable single word correction based on the dictionary.", - this->params()), - INT_MEMBER(tessedit_bigram_debug, 0, - "Amount of debug output for bigram correction.", - this->params()), - INT_MEMBER(debug_x_ht_level, 0, "Reestimate debug", this->params()), - BOOL_MEMBER(debug_acceptable_wds, false, - "Dump word pass/fail chk", this->params()), - STRING_MEMBER(chs_leading_punct, "('`\"", - "Leading punctuation", this->params()), - STRING_MEMBER(chs_trailing_punct1, ").,;:?!", - "1st Trailing punctuation", this->params()), - STRING_MEMBER(chs_trailing_punct2, ")'`\"", - "2nd Trailing punctuation", this->params()), - double_MEMBER(quality_rej_pc, 0.08, - "good_quality_doc lte rejection limit", this->params()), - double_MEMBER(quality_blob_pc, 0.0, - "good_quality_doc gte good blobs limit", 
this->params()), - double_MEMBER(quality_outline_pc, 1.0, - "good_quality_doc lte outline error limit", this->params()), - double_MEMBER(quality_char_pc, 0.95, - "good_quality_doc gte good char limit", this->params()), - INT_MEMBER(quality_min_initial_alphas_reqd, 2, - "alphas in a good word", this->params()), - INT_MEMBER(tessedit_tess_adaption_mode, 0x27, - "Adaptation decision algorithm for tess", this->params()), - BOOL_MEMBER(tessedit_minimal_rej_pass1, false, - "Do minimal rejection on pass 1 output", this->params()), - BOOL_MEMBER(tessedit_test_adaption, false, - "Test adaption criteria", this->params()), - BOOL_MEMBER(tessedit_matcher_log, false, - "Log matcher activity", this->params()), - INT_MEMBER(tessedit_test_adaption_mode, 3, - "Adaptation decision algorithm for tess", this->params()), - BOOL_MEMBER(test_pt, false, "Test for point", this->params()), - double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params()), - double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params()), - INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.", - this->params()), - BOOL_MEMBER(paragraph_text_based, true, - "Run paragraph detection on the post-text-recognition " - "(more accurate)", this->params()), - INT_MEMBER(cube_debug_level, 0, "Print cube debug info.", this->params()), - STRING_MEMBER(outlines_odd, "%| ", "Non standard number of outlines", - this->params()), - STRING_MEMBER(outlines_2, "ij!?%\":;", - "Non standard number of outlines", this->params()), - BOOL_MEMBER(docqual_excuse_outline_errs, false, - "Allow outline errs in unrejection?", this->params()), - BOOL_MEMBER(tessedit_good_quality_unrej, true, - "Reduce rejection on good docs", this->params()), - BOOL_MEMBER(tessedit_use_reject_spaces, true, - "Reject spaces?", this->params()), - double_MEMBER(tessedit_reject_doc_percent, 65.00, - "%rej allowed before rej whole doc", this->params()), - double_MEMBER(tessedit_reject_block_percent, 45.00, - "%rej allowed before rej whole block", 
this->params()), - double_MEMBER(tessedit_reject_row_percent, 40.00, - "%rej allowed before rej whole row", this->params()), - double_MEMBER(tessedit_whole_wd_rej_row_percent, 70.00, - "Number of row rejects in whole word rejects" - "which prevents whole row rejection", this->params()), - BOOL_MEMBER(tessedit_preserve_blk_rej_perfect_wds, true, - "Only rej partially rejected words in block rejection", - this->params()), - BOOL_MEMBER(tessedit_preserve_row_rej_perfect_wds, true, - "Only rej partially rejected words in row rejection", - this->params()), - BOOL_MEMBER(tessedit_dont_blkrej_good_wds, false, - "Use word segmentation quality metric", this->params()), - BOOL_MEMBER(tessedit_dont_rowrej_good_wds, false, - "Use word segmentation quality metric", this->params()), - INT_MEMBER(tessedit_preserve_min_wd_len, 2, - "Only preserve wds longer than this", this->params()), - BOOL_MEMBER(tessedit_row_rej_good_docs, true, - "Apply row rejection to good docs", this->params()), - double_MEMBER(tessedit_good_doc_still_rowrej_wd, 1.1, - "rej good doc wd if more than this fraction rejected", - this->params()), - BOOL_MEMBER(tessedit_reject_bad_qual_wds, true, - "Reject all bad quality wds", this->params()), - BOOL_MEMBER(tessedit_debug_doc_rejection, false, - "Page stats", this->params()), - BOOL_MEMBER(tessedit_debug_quality_metrics, false, - "Output data to debug file", this->params()), - BOOL_MEMBER(bland_unrej, false, - "unrej potential with no chekcs", this->params()), - double_MEMBER(quality_rowrej_pc, 1.1, - "good_quality_doc gte good char limit", this->params()), - BOOL_MEMBER(unlv_tilde_crunching, true, - "Mark v.bad words for tilde crunch", this->params()), - BOOL_MEMBER(hocr_font_info, false, - "Add font info to hocr output", this->params()), - BOOL_MEMBER(crunch_early_merge_tess_fails, true, - "Before word crunch?", this->params()), - BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false, - "Take out ~^ early?", this->params()), - 
double_MEMBER(crunch_terrible_rating, 80.0, - "crunch rating lt this", this->params()), - BOOL_MEMBER(crunch_terrible_garbage, true, "As it says", this->params()), - double_MEMBER(crunch_poor_garbage_cert, -9.0, - "crunch garbage cert lt this", this->params()), - double_MEMBER(crunch_poor_garbage_rate, 60, - "crunch garbage rating lt this", this->params()), - double_MEMBER(crunch_pot_poor_rate, 40, - "POTENTIAL crunch rating lt this", this->params()), - double_MEMBER(crunch_pot_poor_cert, -8.0, - "POTENTIAL crunch cert lt this", this->params()), - BOOL_MEMBER(crunch_pot_garbage, true, - "POTENTIAL crunch garbage", this->params()), - double_MEMBER(crunch_del_rating, 60, - "POTENTIAL crunch rating lt this", this->params()), - double_MEMBER(crunch_del_cert, -10.0, - "POTENTIAL crunch cert lt this", this->params()), - double_MEMBER(crunch_del_min_ht, 0.7, - "Del if word ht lt xht x this", this->params()), - double_MEMBER(crunch_del_max_ht, 3.0, - "Del if word ht gt xht x this", this->params()), - double_MEMBER(crunch_del_min_width, 3.0, - "Del if word width lt xht x this", this->params()), - double_MEMBER(crunch_del_high_word, 1.5, - "Del if word gt xht x this above bl", this->params()), - double_MEMBER(crunch_del_low_word, 0.5, - "Del if word gt xht x this below bl", this->params()), - double_MEMBER(crunch_small_outlines_size, 0.6, - "Small if lt xht x this", this->params()), - INT_MEMBER(crunch_rating_max, 10, - "For adj length in rating per ch", this->params()), - INT_MEMBER(crunch_pot_indicators, 1, - "How many potential indicators needed", this->params()), - BOOL_MEMBER(crunch_leave_ok_strings, true, - "Dont touch sensible strings", this->params()), - BOOL_MEMBER(crunch_accept_ok, true, - "Use acceptability in okstring", this->params()), - BOOL_MEMBER(crunch_leave_accept_strings, false, - "Dont pot crunch sensible strings", this->params()), - BOOL_MEMBER(crunch_include_numerals, false, - "Fiddle alpha figures", this->params()), - 
INT_MEMBER(crunch_leave_lc_strings, 4, - "Dont crunch words with long lower case strings", - this->params()), - INT_MEMBER(crunch_leave_uc_strings, 4, - "Dont crunch words with long lower case strings", - this->params()), - INT_MEMBER(crunch_long_repetitions, 3, - "Crunch words with long repetitions", this->params()), - INT_MEMBER(crunch_debug, 0, "As it says", this->params()), - INT_MEMBER(fixsp_non_noise_limit, 1, - "How many non-noise blbs either side?", this->params()), - double_MEMBER(fixsp_small_outlines_size, 0.28, - "Small if lt xht x this", this->params()), - BOOL_MEMBER(tessedit_prefer_joined_punct, false, - "Reward punctation joins", this->params()), - INT_MEMBER(fixsp_done_mode, 1, - "What constitues done for spacing", this->params()), - INT_MEMBER(debug_fix_space_level, 0, - "Contextual fixspace debug", this->params()), - STRING_MEMBER(numeric_punctuation, ".,", - "Punct. chs expected WITHIN numbers", this->params()), - INT_MEMBER(x_ht_acceptance_tolerance, 8, - "Max allowed deviation of blob top outside of font data", - this->params()), - INT_MEMBER(x_ht_min_change, 8, - "Min change in xht before actually trying it", this->params()), - INT_MEMBER(superscript_debug, 0, "Debug level for sub & superscript fixer", - this->params()), - double_MEMBER(superscript_worse_certainty, 2.0, "How many times worse " - "certainty does a superscript position glyph need to be for " - "us to try classifying it as a char with a different " - "baseline?", this->params()), - double_MEMBER(superscript_bettered_certainty, 0.97, "What reduction in " - "badness do we think sufficient to choose a superscript " - "over what we'd thought. For example, a value of 0.6 means " - "we want to reduce badness of certainty by at least 40%", - this->params()), - double_MEMBER(superscript_scaledown_ratio, 0.4, - "A superscript scaled down more than this is unbelievably " - "small. 
For example, 0.3 means we expect the font size to " - "be no smaller than 30% of the text line font size.", - this->params()), - double_MEMBER(subscript_max_y_top, 0.5, - "Maximum top of a character measured as a multiple of " - "x-height above the baseline for us to reconsider whether " - "it's a subscript.", this->params()), - double_MEMBER(superscript_min_y_bottom, 0.3, - "Minimum bottom of a character measured as a multiple of " - "x-height above the baseline for us to reconsider whether " - "it's a superscript.", this->params()), - BOOL_MEMBER(tessedit_write_block_separators, false, - "Write block separators in output", this->params()), - BOOL_MEMBER(tessedit_write_rep_codes, false, - "Write repetition char code", this->params()), - BOOL_MEMBER(tessedit_write_unlv, false, - "Write .unlv output file", this->params()), - BOOL_MEMBER(tessedit_create_txt, true, - "Write .txt output file", this->params()), - BOOL_MEMBER(tessedit_create_hocr, false, - "Write .html hOCR output file", this->params()), - BOOL_MEMBER(tessedit_create_pdf, false, - "Write .pdf output file", this->params()), - STRING_MEMBER(unrecognised_char, "|", - "Output char for unidentified blobs", this->params()), - INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()), - INT_MEMBER(suspect_space_level, 100, - "Min suspect level for rejecting spaces", this->params()), - INT_MEMBER(suspect_short_words, 2, - "Dont Suspect dict wds longer than this", this->params()), - BOOL_MEMBER(suspect_constrain_1Il, false, - "UNLV keep 1Il chars rejected", this->params()), - double_MEMBER(suspect_rating_per_ch, 999.9, - "Dont touch bad rating limit", this->params()), - double_MEMBER(suspect_accept_rating, -999.9, - "Accept good rating limit", this->params()), - BOOL_MEMBER(tessedit_minimal_rejection, false, - "Only reject tess failures", this->params()), - BOOL_MEMBER(tessedit_zero_rejection, false, - "Dont reject ANYTHING", this->params()), - BOOL_MEMBER(tessedit_word_for_word, false, - "Make output 
have exactly one word per WERD", this->params()), - BOOL_MEMBER(tessedit_zero_kelvin_rejection, false, - "Dont reject ANYTHING AT ALL", this->params()), - BOOL_MEMBER(tessedit_consistent_reps, true, - "Force all rep chars the same", this->params()), - INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm", this->params()), - BOOL_MEMBER(tessedit_rejection_debug, false, - "Adaption debug", this->params()), - BOOL_MEMBER(tessedit_flip_0O, true, - "Contextual 0O O0 flips", this->params()), - double_MEMBER(tessedit_lower_flip_hyphen, 1.5, - "Aspect ratio dot/hyphen test", this->params()), - double_MEMBER(tessedit_upper_flip_hyphen, 1.8, - "Aspect ratio dot/hyphen test", this->params()), - BOOL_MEMBER(rej_trust_doc_dawg, false, - "Use DOC dawg in 11l conf. detector", this->params()), - BOOL_MEMBER(rej_1Il_use_dict_word, false, - "Use dictword test", this->params()), - BOOL_MEMBER(rej_1Il_trust_permuter_type, true, - "Dont double check", this->params()), - BOOL_MEMBER(rej_use_tess_accepted, true, - "Individual rejection control", this->params()), - BOOL_MEMBER(rej_use_tess_blanks, true, - "Individual rejection control", this->params()), - BOOL_MEMBER(rej_use_good_perm, true, - "Individual rejection control", this->params()), - BOOL_MEMBER(rej_use_sensible_wd, false, - "Extend permuter check", this->params()), - BOOL_MEMBER(rej_alphas_in_number_perm, false, - "Extend permuter check", this->params()), - double_MEMBER(rej_whole_of_mostly_reject_word_fract, 0.85, - "if >this fract", this->params()), - INT_MEMBER(tessedit_image_border, 2, - "Rej blbs near image edge limit", this->params()), - STRING_MEMBER(ok_repeated_ch_non_alphanum_wds, "-?*\075", - "Allow NN to unrej", this->params()), - STRING_MEMBER(conflict_set_I_l_1, "Il1[]", - "Il1 conflict set", this->params()), - INT_MEMBER(min_sane_x_ht_pixels, 8, - "Reject any x-ht lt or eq than this", this->params()), - BOOL_MEMBER(tessedit_create_boxfile, false, - "Output text with boxes", this->params()), - 
INT_MEMBER(tessedit_page_number, -1, "-1 -> All pages" - " , else specifc page to process", this->params()), - BOOL_MEMBER(tessedit_write_images, false, - "Capture the image from the IPE", this->params()), - BOOL_MEMBER(interactive_display_mode, false, "Run interactively?", - this->params()), - STRING_MEMBER(file_type, ".tif", "Filename extension", this->params()), - BOOL_MEMBER(tessedit_override_permuter, true, - "According to dict_word", this->params()), - INT_MEMBER(tessdata_manager_debug_level, 0, "Debug level for" - " TessdataManager functions.", this->params()), - STRING_MEMBER(tessedit_load_sublangs, "", - "List of languages to load with this one", this->params()), - BOOL_MEMBER(tessedit_use_primary_params_model, false, - "In multilingual mode use params model of the" - " primary language", this->params()), - double_MEMBER(min_orientation_margin, 7.0, - "Min acceptable orientation margin", this->params()), - BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding", - this->params()), - BOOL_MEMBER(textord_use_cjk_fp_model, FALSE, "Use CJK fixed pitch model", - this->params()), - BOOL_MEMBER(poly_allow_detailed_fx, false, - "Allow feature extractors to see the original outline", - this->params()), - BOOL_INIT_MEMBER(tessedit_init_config_only, false, - "Only initialize with the config file. 
Useful if the " - "instance is not going to be used for OCR but say only " - "for layout analysis.", this->params()), - BOOL_MEMBER(textord_equation_detect, false, "Turn on equation detector", - this->params()), - BOOL_MEMBER(textord_tabfind_vertical_text, true, - "Enable vertical detection", this->params()), - BOOL_MEMBER(textord_tabfind_force_vertical_text, false, - "Force using vertical text page mode", this->params()), - double_MEMBER(textord_tabfind_vertical_text_ratio, 0.5, - "Fraction of textlines deemed vertical to use vertical page " - "mode", this->params()), - double_MEMBER(textord_tabfind_aligned_gap_fraction, 0.75, - "Fraction of height used as a minimum gap for aligned blobs.", - this->params()), - INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible", - this->params()), - BOOL_MEMBER(preserve_interword_spaces, false, - "Preserve multiple interword spaces", this->params()), - BOOL_MEMBER(include_page_breaks, FALSE, - "Include page separator string in output text after each " - "image/page.", this->params()), - STRING_MEMBER(page_separator, "\f", - "Page separator (default is form feed control character)", + : BOOL_MEMBER(tessedit_resegment_from_boxes, false, + "Take segmentation and labeling from box file", this->params()), + BOOL_MEMBER(tessedit_resegment_from_line_boxes, false, + "Conversion of word/line box file to char box file", + this->params()), + BOOL_MEMBER(tessedit_train_from_boxes, false, + "Generate training data from boxed chars", this->params()), + BOOL_MEMBER(tessedit_make_boxes_from_boxes, false, + "Generate more boxes from boxed chars", this->params()), + BOOL_MEMBER(tessedit_dump_pageseg_images, false, + "Dump intermediate images made during page segmentation", + this->params()), + // The default for pageseg_mode is the old behaviour, so as not to + // upset anything that relies on that. 
+ INT_MEMBER( + tessedit_pageseg_mode, PSM_SINGLE_BLOCK, + "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block," + " 5=line, 6=word, 7=char" + " (Values from PageSegMode enum in publictypes.h)", + this->params()), + INT_INIT_MEMBER(tessedit_ocr_engine_mode, tesseract::OEM_TESSERACT_ONLY, + "Which OCR engine(s) to run (Tesseract, Cube, both)." + " Defaults to loading and running only Tesseract" + " (no Cube,no combiner)." + " Values from OcrEngineMode enum in tesseractclass.h)", + this->params()), + STRING_MEMBER(tessedit_char_blacklist, "", + "Blacklist of chars not to recognize", this->params()), + STRING_MEMBER(tessedit_char_whitelist, "", + "Whitelist of chars to recognize", this->params()), + STRING_MEMBER(tessedit_char_unblacklist, "", + "List of chars to override tessedit_char_blacklist", + this->params()), + BOOL_MEMBER(tessedit_ambigs_training, false, + "Perform training for ambiguities", this->params()), + INT_MEMBER(pageseg_devanagari_split_strategy, + tesseract::ShiroRekhaSplitter::NO_SPLIT, + "Whether to use the top-line splitting process for Devanagari " + "documents while performing page-segmentation.", + this->params()), + INT_MEMBER(ocr_devanagari_split_strategy, + tesseract::ShiroRekhaSplitter::NO_SPLIT, + "Whether to use the top-line splitting process for Devanagari " + "documents while performing ocr.", + this->params()), + STRING_MEMBER(tessedit_write_params_to_file, "", + "Write all parameters to the given file.", this->params()), + BOOL_MEMBER(tessedit_adaption_debug, false, + "Generate and print debug" + " information for adaption", + this->params()), + INT_MEMBER(bidi_debug, 0, "Debug level for BiDi", this->params()), + INT_MEMBER(applybox_debug, 1, "Debug level", this->params()), + INT_MEMBER(applybox_page, 0, "Page number to apply boxes from", + this->params()), + STRING_MEMBER(applybox_exposure_pattern, ".exp", + "Exposure value follows" + " this pattern in the image filename. 
The name of the image" + " files are expected to be in the form" + " [lang].[fontname].exp[num].tif", + this->params()), + BOOL_MEMBER(applybox_learn_chars_and_char_frags_mode, false, + "Learn both character fragments (as is done in the" + " special low exposure mode) as well as unfragmented" + " characters.", + this->params()), + BOOL_MEMBER(applybox_learn_ngrams_mode, false, + "Each bounding box" + " is assumed to contain ngrams. Only learn the ngrams" + " whose outlines overlap horizontally.", + this->params()), + BOOL_MEMBER(tessedit_display_outwords, false, "Draw output words", + this->params()), + BOOL_MEMBER(tessedit_dump_choices, false, "Dump char choices", + this->params()), + BOOL_MEMBER(tessedit_timing_debug, false, "Print timing stats", + this->params()), + BOOL_MEMBER(tessedit_fix_fuzzy_spaces, true, + "Try to improve fuzzy spaces", this->params()), + BOOL_MEMBER(tessedit_unrej_any_wd, false, + "Dont bother with word plausibility", this->params()), + BOOL_MEMBER(tessedit_fix_hyphens, true, "Crunch double hyphens?", + this->params()), + BOOL_MEMBER(tessedit_redo_xheight, true, "Check/Correct x-height", + this->params()), + BOOL_MEMBER(tessedit_enable_doc_dict, true, + "Add words to the document dictionary", this->params()), + BOOL_MEMBER(tessedit_debug_fonts, false, "Output font info per char", + this->params()), + BOOL_MEMBER(tessedit_debug_block_rejection, false, "Block and Row stats", + this->params()), + BOOL_MEMBER(tessedit_enable_bigram_correction, true, + "Enable correction based on the word bigram dictionary.", + this->params()), + BOOL_MEMBER(tessedit_enable_dict_correction, false, + "Enable single word correction based on the dictionary.", + this->params()), + INT_MEMBER(tessedit_bigram_debug, 0, + "Amount of debug output for bigram correction.", + this->params()), + BOOL_MEMBER(enable_noise_removal, true, + "Remove and conditionally reassign small outlines when they" + " confuse layout analysis, determining diacritics vs noise", + 
this->params()), + INT_MEMBER(debug_noise_removal, 0, "Debug reassignment of small outlines", + this->params()), + // Worst (min) certainty, for which a diacritic is allowed to make the + // base + // character worse and still be included. + double_MEMBER(noise_cert_basechar, -8.0, + "Hingepoint for base char certainty", this->params()), + // Worst (min) certainty, for which a non-overlapping diacritic is allowed + // to make the base character worse and still be included. + double_MEMBER(noise_cert_disjoint, -1.0, + "Hingepoint for disjoint certainty", this->params()), + // Worst (min) certainty, for which a diacritic is allowed to make a new + // stand-alone blob. + double_MEMBER(noise_cert_punc, -3.0, + "Threshold for new punc char certainty", this->params()), + // Factor of certainty margin for adding diacritics to not count as worse. + double_MEMBER(noise_cert_factor, 0.375, + "Scaling on certainty diff from Hingepoint", + this->params()), + INT_MEMBER(noise_maxperblob, 8, "Max diacritics to apply to a blob", + this->params()), + INT_MEMBER(noise_maxperword, 16, "Max diacritics to apply to a word", + this->params()), + INT_MEMBER(debug_x_ht_level, 0, "Reestimate debug", this->params()), + BOOL_MEMBER(debug_acceptable_wds, false, "Dump word pass/fail chk", + this->params()), + STRING_MEMBER(chs_leading_punct, "('`\"", "Leading punctuation", + this->params()), + STRING_MEMBER(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation", + this->params()), + STRING_MEMBER(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation", + this->params()), + double_MEMBER(quality_rej_pc, 0.08, + "good_quality_doc lte rejection limit", this->params()), + double_MEMBER(quality_blob_pc, 0.0, + "good_quality_doc gte good blobs limit", this->params()), + double_MEMBER(quality_outline_pc, 1.0, + "good_quality_doc lte outline error limit", this->params()), + double_MEMBER(quality_char_pc, 0.95, + "good_quality_doc gte good char limit", this->params()), + 
INT_MEMBER(quality_min_initial_alphas_reqd, 2, "alphas in a good word", + this->params()), + INT_MEMBER(tessedit_tess_adaption_mode, 0x27, + "Adaptation decision algorithm for tess", this->params()), + BOOL_MEMBER(tessedit_minimal_rej_pass1, false, + "Do minimal rejection on pass 1 output", this->params()), + BOOL_MEMBER(tessedit_test_adaption, false, "Test adaption criteria", + this->params()), + BOOL_MEMBER(tessedit_matcher_log, false, "Log matcher activity", + this->params()), + INT_MEMBER(tessedit_test_adaption_mode, 3, + "Adaptation decision algorithm for tess", this->params()), + BOOL_MEMBER(test_pt, false, "Test for point", this->params()), + double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params()), + double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params()), + INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.", + this->params()), + BOOL_MEMBER(paragraph_text_based, true, + "Run paragraph detection on the post-text-recognition " + "(more accurate)", + this->params()), + INT_MEMBER(cube_debug_level, 0, "Print cube debug info.", this->params()), + STRING_MEMBER(outlines_odd, "%| ", "Non standard number of outlines", + this->params()), + STRING_MEMBER(outlines_2, "ij!?%\":;", "Non standard number of outlines", + this->params()), + BOOL_MEMBER(docqual_excuse_outline_errs, false, + "Allow outline errs in unrejection?", this->params()), + BOOL_MEMBER(tessedit_good_quality_unrej, true, + "Reduce rejection on good docs", this->params()), + BOOL_MEMBER(tessedit_use_reject_spaces, true, "Reject spaces?", + this->params()), + double_MEMBER(tessedit_reject_doc_percent, 65.00, + "%rej allowed before rej whole doc", this->params()), + double_MEMBER(tessedit_reject_block_percent, 45.00, + "%rej allowed before rej whole block", this->params()), + double_MEMBER(tessedit_reject_row_percent, 40.00, + "%rej allowed before rej whole row", this->params()), + double_MEMBER(tessedit_whole_wd_rej_row_percent, 70.00, + "Number of row rejects in whole word 
rejects" + "which prevents whole row rejection", + this->params()), + BOOL_MEMBER(tessedit_preserve_blk_rej_perfect_wds, true, + "Only rej partially rejected words in block rejection", + this->params()), + BOOL_MEMBER(tessedit_preserve_row_rej_perfect_wds, true, + "Only rej partially rejected words in row rejection", + this->params()), + BOOL_MEMBER(tessedit_dont_blkrej_good_wds, false, + "Use word segmentation quality metric", this->params()), + BOOL_MEMBER(tessedit_dont_rowrej_good_wds, false, + "Use word segmentation quality metric", this->params()), + INT_MEMBER(tessedit_preserve_min_wd_len, 2, + "Only preserve wds longer than this", this->params()), + BOOL_MEMBER(tessedit_row_rej_good_docs, true, + "Apply row rejection to good docs", this->params()), + double_MEMBER(tessedit_good_doc_still_rowrej_wd, 1.1, + "rej good doc wd if more than this fraction rejected", + this->params()), + BOOL_MEMBER(tessedit_reject_bad_qual_wds, true, + "Reject all bad quality wds", this->params()), + BOOL_MEMBER(tessedit_debug_doc_rejection, false, "Page stats", + this->params()), + BOOL_MEMBER(tessedit_debug_quality_metrics, false, + "Output data to debug file", this->params()), + BOOL_MEMBER(bland_unrej, false, "unrej potential with no chekcs", + this->params()), + double_MEMBER(quality_rowrej_pc, 1.1, + "good_quality_doc gte good char limit", this->params()), + BOOL_MEMBER(unlv_tilde_crunching, true, + "Mark v.bad words for tilde crunch", this->params()), + BOOL_MEMBER(hocr_font_info, false, "Add font info to hocr output", + this->params()), + BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?", + this->params()), + BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false, + "Take out ~^ early?", this->params()), + double_MEMBER(crunch_terrible_rating, 80.0, "crunch rating lt this", + this->params()), + BOOL_MEMBER(crunch_terrible_garbage, true, "As it says", this->params()), + double_MEMBER(crunch_poor_garbage_cert, -9.0, + "crunch garbage cert lt this", 
this->params()), + double_MEMBER(crunch_poor_garbage_rate, 60, + "crunch garbage rating lt this", this->params()), + double_MEMBER(crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this", + this->params()), + double_MEMBER(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this", + this->params()), + BOOL_MEMBER(crunch_pot_garbage, true, "POTENTIAL crunch garbage", + this->params()), + double_MEMBER(crunch_del_rating, 60, "POTENTIAL crunch rating lt this", + this->params()), + double_MEMBER(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this", + this->params()), + double_MEMBER(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this", + this->params()), + double_MEMBER(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this", + this->params()), + double_MEMBER(crunch_del_min_width, 3.0, + "Del if word width lt xht x this", this->params()), + double_MEMBER(crunch_del_high_word, 1.5, + "Del if word gt xht x this above bl", this->params()), + double_MEMBER(crunch_del_low_word, 0.5, + "Del if word gt xht x this below bl", this->params()), + double_MEMBER(crunch_small_outlines_size, 0.6, "Small if lt xht x this", + this->params()), + INT_MEMBER(crunch_rating_max, 10, "For adj length in rating per ch", + this->params()), + INT_MEMBER(crunch_pot_indicators, 1, + "How many potential indicators needed", this->params()), + BOOL_MEMBER(crunch_leave_ok_strings, true, "Dont touch sensible strings", + this->params()), + BOOL_MEMBER(crunch_accept_ok, true, "Use acceptability in okstring", + this->params()), + BOOL_MEMBER(crunch_leave_accept_strings, false, + "Dont pot crunch sensible strings", this->params()), + BOOL_MEMBER(crunch_include_numerals, false, "Fiddle alpha figures", + this->params()), + INT_MEMBER(crunch_leave_lc_strings, 4, + "Dont crunch words with long lower case strings", + this->params()), + INT_MEMBER(crunch_leave_uc_strings, 4, + "Dont crunch words with long lower case strings", + this->params()), + INT_MEMBER(crunch_long_repetitions, 3, + "Crunch words 
with long repetitions", this->params()), + INT_MEMBER(crunch_debug, 0, "As it says", this->params()), + INT_MEMBER(fixsp_non_noise_limit, 1, + "How many non-noise blbs either side?", this->params()), + double_MEMBER(fixsp_small_outlines_size, 0.28, "Small if lt xht x this", + this->params()), + BOOL_MEMBER(tessedit_prefer_joined_punct, false, + "Reward punctation joins", this->params()), + INT_MEMBER(fixsp_done_mode, 1, "What constitues done for spacing", + this->params()), + INT_MEMBER(debug_fix_space_level, 0, "Contextual fixspace debug", + this->params()), + STRING_MEMBER(numeric_punctuation, ".,", + "Punct. chs expected WITHIN numbers", this->params()), + INT_MEMBER(x_ht_acceptance_tolerance, 8, + "Max allowed deviation of blob top outside of font data", + this->params()), + INT_MEMBER(x_ht_min_change, 8, + "Min change in xht before actually trying it", this->params()), + INT_MEMBER(superscript_debug, 0, + "Debug level for sub & superscript fixer", this->params()), + double_MEMBER( + superscript_worse_certainty, 2.0, + "How many times worse " + "certainty does a superscript position glyph need to be for " + "us to try classifying it as a char with a different " + "baseline?", + this->params()), + double_MEMBER( + superscript_bettered_certainty, 0.97, + "What reduction in " + "badness do we think sufficient to choose a superscript " + "over what we'd thought. For example, a value of 0.6 means " + "we want to reduce badness of certainty by at least 40%", + this->params()), + double_MEMBER(superscript_scaledown_ratio, 0.4, + "A superscript scaled down more than this is unbelievably " + "small. 
For example, 0.3 means we expect the font size to " + "be no smaller than 30% of the text line font size.", + this->params()), + double_MEMBER(subscript_max_y_top, 0.5, + "Maximum top of a character measured as a multiple of " + "x-height above the baseline for us to reconsider whether " + "it's a subscript.", + this->params()), + double_MEMBER(superscript_min_y_bottom, 0.3, + "Minimum bottom of a character measured as a multiple of " + "x-height above the baseline for us to reconsider whether " + "it's a superscript.", + this->params()), + BOOL_MEMBER(tessedit_write_block_separators, false, + "Write block separators in output", this->params()), + BOOL_MEMBER(tessedit_write_rep_codes, false, "Write repetition char code", + this->params()), + BOOL_MEMBER(tessedit_write_unlv, false, "Write .unlv output file", + this->params()), + BOOL_MEMBER(tessedit_create_txt, true, "Write .txt output file", + this->params()), + BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file", + this->params()), + BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file", + this->params()), + STRING_MEMBER(unrecognised_char, "|", + "Output char for unidentified blobs", this->params()), + INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()), + INT_MEMBER(suspect_space_level, 100, + "Min suspect level for rejecting spaces", this->params()), + INT_MEMBER(suspect_short_words, 2, + "Dont Suspect dict wds longer than this", this->params()), + BOOL_MEMBER(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected", + this->params()), + double_MEMBER(suspect_rating_per_ch, 999.9, "Dont touch bad rating limit", + this->params()), + double_MEMBER(suspect_accept_rating, -999.9, "Accept good rating limit", + this->params()), + BOOL_MEMBER(tessedit_minimal_rejection, false, + "Only reject tess failures", this->params()), + BOOL_MEMBER(tessedit_zero_rejection, false, "Dont reject ANYTHING", + this->params()), + BOOL_MEMBER(tessedit_word_for_word, false, + "Make 
output have exactly one word per WERD", this->params()), + BOOL_MEMBER(tessedit_zero_kelvin_rejection, false, + "Dont reject ANYTHING AT ALL", this->params()), + BOOL_MEMBER(tessedit_consistent_reps, true, + "Force all rep chars the same", this->params()), + INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm", + this->params()), + BOOL_MEMBER(tessedit_rejection_debug, false, "Adaption debug", + this->params()), + BOOL_MEMBER(tessedit_flip_0O, true, "Contextual 0O O0 flips", + this->params()), + double_MEMBER(tessedit_lower_flip_hyphen, 1.5, + "Aspect ratio dot/hyphen test", this->params()), + double_MEMBER(tessedit_upper_flip_hyphen, 1.8, + "Aspect ratio dot/hyphen test", this->params()), + BOOL_MEMBER(rej_trust_doc_dawg, false, + "Use DOC dawg in 11l conf. detector", this->params()), + BOOL_MEMBER(rej_1Il_use_dict_word, false, "Use dictword test", + this->params()), + BOOL_MEMBER(rej_1Il_trust_permuter_type, true, "Dont double check", + this->params()), + BOOL_MEMBER(rej_use_tess_accepted, true, "Individual rejection control", + this->params()), + BOOL_MEMBER(rej_use_tess_blanks, true, "Individual rejection control", + this->params()), + BOOL_MEMBER(rej_use_good_perm, true, "Individual rejection control", + this->params()), + BOOL_MEMBER(rej_use_sensible_wd, false, "Extend permuter check", + this->params()), + BOOL_MEMBER(rej_alphas_in_number_perm, false, "Extend permuter check", + this->params()), + double_MEMBER(rej_whole_of_mostly_reject_word_fract, 0.85, + "if >this fract", this->params()), + INT_MEMBER(tessedit_image_border, 2, "Rej blbs near image edge limit", + this->params()), + STRING_MEMBER(ok_repeated_ch_non_alphanum_wds, "-?*\075", + "Allow NN to unrej", this->params()), + STRING_MEMBER(conflict_set_I_l_1, "Il1[]", "Il1 conflict set", + this->params()), + INT_MEMBER(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this", + this->params()), + BOOL_MEMBER(tessedit_create_boxfile, false, "Output text with boxes", + this->params()), + 
INT_MEMBER(tessedit_page_number, -1, + "-1 -> All pages" + " , else specifc page to process", + this->params()), + BOOL_MEMBER(tessedit_write_images, false, + "Capture the image from the IPE", this->params()), + BOOL_MEMBER(interactive_display_mode, false, "Run interactively?", + this->params()), + STRING_MEMBER(file_type, ".tif", "Filename extension", this->params()), + BOOL_MEMBER(tessedit_override_permuter, true, "According to dict_word", + this->params()), + INT_MEMBER(tessdata_manager_debug_level, 0, + "Debug level for" + " TessdataManager functions.", + this->params()), + STRING_MEMBER(tessedit_load_sublangs, "", + "List of languages to load with this one", this->params()), + BOOL_MEMBER(tessedit_use_primary_params_model, false, + "In multilingual mode use params model of the" + " primary language", + this->params()), + double_MEMBER(min_orientation_margin, 7.0, + "Min acceptable orientation margin", this->params()), + BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding", + this->params()), + BOOL_MEMBER(textord_use_cjk_fp_model, FALSE, "Use CJK fixed pitch model", + this->params()), + BOOL_MEMBER(poly_allow_detailed_fx, false, + "Allow feature extractors to see the original outline", + this->params()), + BOOL_INIT_MEMBER(tessedit_init_config_only, false, + "Only initialize with the config file. 
Useful if the " + "instance is not going to be used for OCR but say only " + "for layout analysis.", + this->params()), + BOOL_MEMBER(textord_equation_detect, false, "Turn on equation detector", + this->params()), + BOOL_MEMBER(textord_tabfind_vertical_text, true, + "Enable vertical detection", this->params()), + BOOL_MEMBER(textord_tabfind_force_vertical_text, false, + "Force using vertical text page mode", this->params()), + double_MEMBER( + textord_tabfind_vertical_text_ratio, 0.5, + "Fraction of textlines deemed vertical to use vertical page " + "mode", + this->params()), + double_MEMBER( + textord_tabfind_aligned_gap_fraction, 0.75, + "Fraction of height used as a minimum gap for aligned blobs.", + this->params()), + INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible", + this->params()), + BOOL_MEMBER(preserve_interword_spaces, false, + "Preserve multiple interword spaces", this->params()), + BOOL_MEMBER(include_page_breaks, FALSE, + "Include page separator string in output text after each " + "image/page.", + this->params()), + STRING_MEMBER(page_separator, "\f", + "Page separator (default is form feed control character)", + this->params()), - // The following parameters were deprecated and removed from their original - // locations. The parameters are temporarily kept here to give Tesseract - // users a chance to updated their [lang].traineddata and config files - // without introducing failures during Tesseract initialization. - // TODO(ocr-team): remove these parameters from the code once we are - // reasonably sure that Tesseract users have updated their data files. - // - // BEGIN DEPRECATED PARAMETERS - BOOL_MEMBER(textord_tabfind_vertical_horizontal_mix, true, - "find horizontal lines such as headers in vertical page mode", - this->params()), - INT_MEMBER(tessedit_ok_mode, 5, - "Acceptance decision algorithm", this->params()), - BOOL_INIT_MEMBER(load_fixed_length_dawgs, true, "Load fixed length dawgs" - " (e.g. 
for non-space delimited languages)", - this->params()), - INT_MEMBER(segment_debug, 0, "Debug the whole segmentation process", - this->params()), - BOOL_MEMBER(permute_debug, 0, "Debug char permutation process", - this->params()), - double_MEMBER(bestrate_pruning_factor, 2.0, "Multiplying factor of" - " current best rate to prune other hypotheses", - this->params()), - BOOL_MEMBER(permute_script_word, 0, - "Turn on word script consistency permuter", - this->params()), - BOOL_MEMBER(segment_segcost_rating, 0, - "incorporate segmentation cost in word rating?", - this->params()), - double_MEMBER(segment_reward_script, 0.95, - "Score multipler for script consistency within a word. " - "Being a 'reward' factor, it should be <= 1. " - "Smaller value implies bigger reward.", - this->params()), - BOOL_MEMBER(permute_fixed_length_dawg, 0, - "Turn on fixed-length phrasebook search permuter", - this->params()), - BOOL_MEMBER(permute_chartype_word, 0, - "Turn on character type (property) consistency permuter", - this->params()), - double_MEMBER(segment_reward_chartype, 0.97, - "Score multipler for char type consistency within a word. ", - this->params()), - double_MEMBER(segment_reward_ngram_best_choice, 0.99, - "Score multipler for ngram permuter's best choice" - " (only used in the Han script path).", - this->params()), - BOOL_MEMBER(ngram_permuter_activated, false, - "Activate character-level n-gram-based permuter", - this->params()), - BOOL_MEMBER(permute_only_top, false, "Run only the top choice permuter", - this->params()), - INT_MEMBER(language_model_fixed_length_choices_depth, 3, - "Depth of blob choice lists to explore" - " when fixed length dawgs are on", - this->params()), - BOOL_MEMBER(use_new_state_cost, FALSE, - "use new state cost heuristics for segmentation state" - " evaluation", this->params()), - double_MEMBER(heuristic_segcost_rating_base, 1.25, - "base factor for adding segmentation cost into word rating." 
- "It's a multiplying factor, the larger the value above 1, " - "the bigger the effect of segmentation cost.", - this->params()), - double_MEMBER(heuristic_weight_rating, 1.0, - "weight associated with char rating in combined cost of" - "state", this->params()), - double_MEMBER(heuristic_weight_width, 1000.0, - "weight associated with width evidence in combined cost of" - " state", this->params()), - double_MEMBER(heuristic_weight_seamcut, 0.0, - "weight associated with seam cut in combined cost of state", - this->params()), - double_MEMBER(heuristic_max_char_wh_ratio, 2.0, - "max char width-to-height ratio allowed in segmentation", - this->params()), - BOOL_MEMBER(enable_new_segsearch, true, - "Enable new segmentation search path.", this->params()), - double_MEMBER(segsearch_max_fixed_pitch_char_wh_ratio, 2.0, - "Maximum character width-to-height ratio for" - " fixed-pitch fonts", - this->params()), - // END DEPRECATED PARAMETERS + // The following parameters were deprecated and removed from their + // original + // locations. The parameters are temporarily kept here to give Tesseract + // users a chance to updated their [lang].traineddata and config files + // without introducing failures during Tesseract initialization. + // TODO(ocr-team): remove these parameters from the code once we are + // reasonably sure that Tesseract users have updated their data files. + // + // BEGIN DEPRECATED PARAMETERS + BOOL_MEMBER(textord_tabfind_vertical_horizontal_mix, true, + "find horizontal lines such as headers in vertical page mode", + this->params()), + INT_MEMBER(tessedit_ok_mode, 5, "Acceptance decision algorithm", + this->params()), + BOOL_INIT_MEMBER(load_fixed_length_dawgs, true, + "Load fixed length dawgs" + " (e.g. 
for non-space delimited languages)", + this->params()), + INT_MEMBER(segment_debug, 0, "Debug the whole segmentation process", + this->params()), + BOOL_MEMBER(permute_debug, 0, "Debug char permutation process", + this->params()), + double_MEMBER(bestrate_pruning_factor, 2.0, + "Multiplying factor of" + " current best rate to prune other hypotheses", + this->params()), + BOOL_MEMBER(permute_script_word, 0, + "Turn on word script consistency permuter", this->params()), + BOOL_MEMBER(segment_segcost_rating, 0, + "incorporate segmentation cost in word rating?", + this->params()), + double_MEMBER(segment_reward_script, 0.95, + "Score multipler for script consistency within a word. " + "Being a 'reward' factor, it should be <= 1. " + "Smaller value implies bigger reward.", + this->params()), + BOOL_MEMBER(permute_fixed_length_dawg, 0, + "Turn on fixed-length phrasebook search permuter", + this->params()), + BOOL_MEMBER(permute_chartype_word, 0, + "Turn on character type (property) consistency permuter", + this->params()), + double_MEMBER(segment_reward_chartype, 0.97, + "Score multipler for char type consistency within a word. ", + this->params()), + double_MEMBER(segment_reward_ngram_best_choice, 0.99, + "Score multipler for ngram permuter's best choice" + " (only used in the Han script path).", + this->params()), + BOOL_MEMBER(ngram_permuter_activated, false, + "Activate character-level n-gram-based permuter", + this->params()), + BOOL_MEMBER(permute_only_top, false, "Run only the top choice permuter", + this->params()), + INT_MEMBER(language_model_fixed_length_choices_depth, 3, + "Depth of blob choice lists to explore" + " when fixed length dawgs are on", + this->params()), + BOOL_MEMBER(use_new_state_cost, FALSE, + "use new state cost heuristics for segmentation state" + " evaluation", + this->params()), + double_MEMBER(heuristic_segcost_rating_base, 1.25, + "base factor for adding segmentation cost into word rating." 
+ "It's a multiplying factor, the larger the value above 1, " + "the bigger the effect of segmentation cost.", + this->params()), + double_MEMBER(heuristic_weight_rating, 1.0, + "weight associated with char rating in combined cost of" + "state", + this->params()), + double_MEMBER(heuristic_weight_width, 1000.0, + "weight associated with width evidence in combined cost of" + " state", + this->params()), + double_MEMBER(heuristic_weight_seamcut, 0.0, + "weight associated with seam cut in combined cost of state", + this->params()), + double_MEMBER(heuristic_max_char_wh_ratio, 2.0, + "max char width-to-height ratio allowed in segmentation", + this->params()), + BOOL_MEMBER(enable_new_segsearch, true, + "Enable new segmentation search path.", this->params()), + double_MEMBER(segsearch_max_fixed_pitch_char_wh_ratio, 2.0, + "Maximum character width-to-height ratio for" + " fixed-pitch fonts", + this->params()), + // END DEPRECATED PARAMETERS - backup_config_file_(NULL), - pix_binary_(NULL), - cube_binary_(NULL), - pix_grey_(NULL), - pix_thresholds_(NULL), - source_resolution_(0), - textord_(this), - right_to_left_(false), - scaled_color_(NULL), - scaled_factor_(-1), - deskew_(1.0f, 0.0f), - reskew_(1.0f, 0.0f), - most_recently_used_(this), - font_table_size_(0), + backup_config_file_(NULL), + pix_binary_(NULL), + cube_binary_(NULL), + pix_grey_(NULL), + pix_thresholds_(NULL), + source_resolution_(0), + textord_(this), + right_to_left_(false), + scaled_color_(NULL), + scaled_factor_(-1), + deskew_(1.0f, 0.0f), + reskew_(1.0f, 0.0f), + most_recently_used_(this), + font_table_size_(0), #ifndef ANDROID_BUILD - cube_cntxt_(NULL), - tess_cube_combiner_(NULL), + cube_cntxt_(NULL), + tess_cube_combiner_(NULL), #endif - equ_detect_(NULL) { + equ_detect_(NULL) { } Tesseract::~Tesseract() { diff --git a/ccmain/tesseractclass.h b/ccmain/tesseractclass.h index bd03fff642..d488fd30f3 100644 --- a/ccmain/tesseractclass.h +++ b/ccmain/tesseractclass.h @@ -283,8 +283,8 @@ class Tesseract 
: public Wordrec { int SegmentPage(const STRING* input_file, BLOCK_LIST* blocks, Tesseract* osd_tess, OSResults* osr); void SetupWordScripts(BLOCK_LIST* blocks); - int AutoPageSeg(PageSegMode pageseg_mode, - BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks, + int AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST* blocks, + TO_BLOCK_LIST* to_blocks, BLOBNBOX_LIST* diacritic_blobs, Tesseract* osd_tess, OSResults* osr); ColumnFinder* SetupPageSegAndDetectOrientation( bool single_column, bool osd, bool only_osd, @@ -328,8 +328,46 @@ class Tesseract : public Wordrec { WordRecognizer recognizer, WERD_RES** in_word, PointerVector* best_words); - void classify_word_and_language(WordRecognizer recognizer, - PAGE_RES_IT* pr_it, + // Moves good-looking "noise"/diacritics from the reject list to the main + // blob list on the current word. Returns true if anything was done, and + // sets make_next_word_fuzzy if blob(s) were added to the end of the word. + bool ReassignDiacritics(int pass, PAGE_RES_IT* pr_it, + bool* make_next_word_fuzzy); + // Attempts to put noise/diacritic outlines into the blobs that they overlap. + // Input: a set of noisy outlines that probably belong to the real_word. + // Output: outlines that overlapped blobs are set to NULL and put back into + // the word, either in the blobs or in the reject list. + void AssignDiacriticsToOverlappingBlobs( + const GenericVector& outlines, int pass, WERD* real_word, + PAGE_RES_IT* pr_it, GenericVector* word_wanted, + GenericVector* overlapped_any_blob, + GenericVector* target_blobs); + // Attempts to assign non-overlapping outlines to their nearest blobs or + // make new blobs out of them. 
+ void AssignDiacriticsToNewBlobs(const GenericVector& outlines, + int pass, WERD* real_word, PAGE_RES_IT* pr_it, + GenericVector* word_wanted, + GenericVector* target_blobs); + // Starting with ok_outlines set to indicate which outlines overlap the blob, + // chooses the optimal set (approximately) and returns true if any outlines + // are desired, in which case ok_outlines indicates which ones. + bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, + PAGE_RES_IT* pr_it, C_BLOB* blob, + const GenericVector& outlines, + int num_outlines, + GenericVector* ok_outlines); + // Classifies the given blob plus the outlines flagged by ok_outlines, undoes + // the inclusion of the outlines, and returns the certainty of the raw choice. + float ClassifyBlobPlusOutlines(const GenericVector& ok_outlines, + const GenericVector& outlines, + int pass_n, PAGE_RES_IT* pr_it, C_BLOB* blob, + STRING* best_str); + // Classifies the given blob (part of word_data->word->word) as an individual + // word, using languages, chopper etc, returning only the certainty of the + // best raw choice, and undoing all the work done to fake out the word. + float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT* pr_it, C_BLOB* blob, + STRING* best_str, float* c2); + void classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it, WordData* word_data); void classify_word_pass1(const WordData& word_data, WERD_RES** in_word, @@ -808,6 +846,24 @@ class Tesseract : public Wordrec { "Enable single word correction based on the dictionary."); INT_VAR_H(tessedit_bigram_debug, 0, "Amount of debug output for bigram " "correction."); + BOOL_VAR_H(enable_noise_removal, true, + "Remove and conditionally reassign small outlines when they" + " confuse layout analysis, determining diacritics vs noise"); + INT_VAR_H(debug_noise_removal, 0, "Debug reassignment of small outlines"); + // Worst (min) certainty, for which a diacritic is allowed to make the base + // character worse and still be included. 
+ double_VAR_H(noise_cert_basechar, -8.0, "Hingepoint for base char certainty"); + // Worst (min) certainty, for which a non-overlapping diacritic is allowed to + // make the base character worse and still be included. + double_VAR_H(noise_cert_disjoint, -2.5, "Hingepoint for disjoint certainty"); + // Worst (min) certainty, for which a diacritic is allowed to make a new + // stand-alone blob. + double_VAR_H(noise_cert_punc, -2.5, "Threshold for new punc char certainty"); + // Factor of certainty margin for adding diacritics to not count as worse. + double_VAR_H(noise_cert_factor, 0.375, + "Scaling on certainty diff from Hingepoint"); + INT_VAR_H(noise_maxperblob, 8, "Max diacritics to apply to a blob"); + INT_VAR_H(noise_maxperword, 16, "Max diacritics to apply to a word"); INT_VAR_H(debug_x_ht_level, 0, "Reestimate debug"); BOOL_VAR_H(debug_acceptable_wds, false, "Dump word pass/fail chk"); STRING_VAR_H(chs_leading_punct, "('`\"", "Leading punctuation"); diff --git a/ccstruct/blobbox.h b/ccstruct/blobbox.h index bd26e1be95..b09d82f4da 100644 --- a/ccstruct/blobbox.h +++ b/ccstruct/blobbox.h @@ -137,6 +137,9 @@ class BLOBNBOX:public ELIST_LINK cblob_ptr = srcblob; area = static_cast(srcblob->area()); } + ~BLOBNBOX() { + if (owns_cblob_) delete cblob_ptr; + } static BLOBNBOX* RealBlob(C_OUTLINE* outline) { C_BLOB* blob = new C_BLOB(outline); return new BLOBNBOX(blob); @@ -387,6 +390,7 @@ class BLOBNBOX:public ELIST_LINK void set_base_char_blob(BLOBNBOX* blob) { base_char_blob_ = blob; } + void set_owns_cblob(bool value) { owns_cblob_ = value; } bool UniquelyVertical() const { return vert_possible_ && !horz_possible_; @@ -450,6 +454,7 @@ class BLOBNBOX:public ELIST_LINK // construction time. void ConstructionInit() { cblob_ptr = NULL; + owns_cblob_ = false; area = 0; area_stroke_width_ = 0.0f; horz_stroke_width_ = 0.0f; @@ -525,6 +530,10 @@ class BLOBNBOX:public ELIST_LINK bool vert_possible_; // Could be part of vertical flow. 
bool leader_on_left_; // There is a leader to the left. bool leader_on_right_; // There is a leader to the right. + // Iff true, then the destructor should delete the cblob_ptr. + // TODO(rays) migrate all uses to correctly setting this flag instead of + // deleting the C_BLOB before deleting the BLOBNBOX. + bool owns_cblob_; }; class TO_ROW: public ELIST2_LINK diff --git a/ccstruct/ocrblock.cpp b/ccstruct/ocrblock.cpp index a328e03887..ad7893b05a 100644 --- a/ccstruct/ocrblock.cpp +++ b/ccstruct/ocrblock.cpp @@ -86,6 +86,18 @@ void BLOCK::rotate(const FCOORD& rotation) { box = *poly_block()->bounding_box(); } +// Returns the bounding box including the desired combination of upper and +// lower noise/diacritic elements. +TBOX BLOCK::restricted_bounding_box(bool upper_dots, bool lower_dots) const { + TBOX box; + // This is a read-only iteration of the rows in the block. + ROW_IT it(const_cast(&rows)); + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + box += it.data()->restricted_bounding_box(upper_dots, lower_dots); + } + return box; +} + /** * BLOCK::reflect_polygon_in_y_axis * diff --git a/ccstruct/ocrblock.h b/ccstruct/ocrblock.h index 207c1e8579..c93aaf8a4c 100644 --- a/ccstruct/ocrblock.h +++ b/ccstruct/ocrblock.h @@ -161,10 +161,14 @@ class BLOCK:public ELIST_LINK, public PDBLK median_size_.set_y(y); } - Pix* render_mask() { - return PDBLK::render_mask(re_rotation_); + Pix* render_mask(TBOX* mask_box) { + return PDBLK::render_mask(re_rotation_, mask_box); } + // Returns the bounding box including the desired combination of upper and + // lower noise/diacritic elements. + TBOX restricted_bounding_box(bool upper_dots, bool lower_dots) const; + // Reflects the polygon in the y-axis and recomputes the bounding_box. // Does nothing to any contained rows/words/blobs etc. 
void reflect_polygon_in_y_axis(); diff --git a/ccstruct/ocrrow.cpp b/ccstruct/ocrrow.cpp index a7ad6ba791..c6f919ca12 100644 --- a/ccstruct/ocrrow.cpp +++ b/ccstruct/ocrrow.cpp @@ -80,6 +80,17 @@ ROW::ROW( //constructor rmargin_ = 0; } +// Returns the bounding box including the desired combination of upper and +// lower noise/diacritic elements. +TBOX ROW::restricted_bounding_box(bool upper_dots, bool lower_dots) const { + TBOX box; + // This is a read-only iteration of the words in the row. + WERD_IT it(const_cast(&words)); + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + box += it.data()->restricted_bounding_box(upper_dots, lower_dots); + } + return box; +} /********************************************************************** * ROW::recalc_bounding_box diff --git a/ccstruct/ocrrow.h b/ccstruct/ocrrow.h index 1a23889279..45384b710f 100644 --- a/ccstruct/ocrrow.h +++ b/ccstruct/ocrrow.h @@ -85,6 +85,9 @@ class ROW:public ELIST_LINK TBOX bounding_box() const { //return bounding box return bound_box; } + // Returns the bounding box including the desired combination of upper and + // lower noise/diacritic elements. + TBOX restricted_bounding_box(bool upper_dots, bool lower_dots) const; void set_lmargin(inT16 lmargin) { lmargin_ = lmargin; diff --git a/ccstruct/pageres.cpp b/ccstruct/pageres.cpp index 5304451929..9c1b13c5c3 100644 --- a/ccstruct/pageres.cpp +++ b/ccstruct/pageres.cpp @@ -1258,23 +1258,16 @@ int PAGE_RES_IT::cmp(const PAGE_RES_IT &other) const { return 0; } -// Inserts the new_word and a corresponding WERD_RES before the current -// position. The simple fields of the WERD_RES are copied from clone_res and -// the resulting WERD_RES is returned for further setup with best_choice etc. +// Inserts the new_word as a combination owned by a corresponding WERD_RES +// before the current position. The simple fields of the WERD_RES are copied +// from clone_res and the resulting WERD_RES is returned for further setup +// with best_choice etc. 
WERD_RES* PAGE_RES_IT::InsertSimpleCloneWord(const WERD_RES& clone_res, WERD* new_word) { - // Insert new_word into the ROW. - WERD_IT w_it(row()->row->word_list()); - for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { - WERD* word = w_it.data(); - if (word == word_res->word) - break; - } - ASSERT_HOST(!w_it.cycled_list()); - w_it.add_before_then_move(new_word); // Make a WERD_RES for the new_word. WERD_RES* new_res = new WERD_RES(new_word); new_res->CopySimpleFields(clone_res); + new_res->combination = true; // Insert into the appropriate place in the ROW_RES. WERD_RES_IT wr_it(&row()->word_res_list); for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) { @@ -1477,6 +1470,33 @@ void PAGE_RES_IT::DeleteCurrentWord() { ResetWordIterator(); } +// Makes the current word a fuzzy space if not already fuzzy. Updates +// corresponding part of combo if required. +void PAGE_RES_IT::MakeCurrentWordFuzzy() { + WERD* real_word = word_res->word; + if (!real_word->flag(W_FUZZY_SP) && !real_word->flag(W_FUZZY_NON)) { + real_word->set_flag(W_FUZZY_SP, true); + tprintf("Made word fuzzy at:"); + real_word->bounding_box().print(); + if (word_res->combination) { + // The next word should be the corresponding part of combo, but we have + // already stepped past it, so find it by search. 
+ WERD_RES_IT wr_it(&row()->word_res_list); + for (wr_it.mark_cycle_pt(); + !wr_it.cycled_list() && wr_it.data() != word_res; wr_it.forward()) { + } + wr_it.forward(); + ASSERT_HOST(wr_it.data()->part_of_combo); + real_word = wr_it.data()->word; + ASSERT_HOST(!real_word->flag(W_FUZZY_SP) && + !real_word->flag(W_FUZZY_NON)); + real_word->set_flag(W_FUZZY_SP, true); + tprintf("Made part of combo word fuzzy at:"); + real_word->bounding_box().print(); + } + } +} + /************************************************************************* * PAGE_RES_IT::restart_page * diff --git a/ccstruct/pageres.h b/ccstruct/pageres.h index 75798113d4..a6a8404275 100644 --- a/ccstruct/pageres.h +++ b/ccstruct/pageres.h @@ -708,6 +708,10 @@ class PAGE_RES_IT { // Deletes the current WERD_RES and its underlying WERD. void DeleteCurrentWord(); + // Makes the current word a fuzzy space if not already fuzzy. Updates + // corresponding part of combo if required. + void MakeCurrentWordFuzzy(); + WERD_RES *forward() { // Get next word. return internal_forward(false, false); } @@ -747,9 +751,9 @@ class PAGE_RES_IT { return next_block_res; } void rej_stat_word(); // for page/block/row + void ResetWordIterator(); private: - void ResetWordIterator(); WERD_RES *internal_forward(bool new_block, bool empty_ok); WERD_RES * prev_word_res; // previous word diff --git a/ccstruct/pdblock.cpp b/ccstruct/pdblock.cpp index 97365b53e7..cf3289f2e7 100644 --- a/ccstruct/pdblock.cpp +++ b/ccstruct/pdblock.cpp @@ -77,7 +77,6 @@ void PDBLK::set_sides( //set vertex lists right_it.add_list_before (right); } - /********************************************************************** * PDBLK::contains * @@ -126,7 +125,7 @@ void PDBLK::move( // reposition block // Returns a binary Pix mask with a 1 pixel for every pixel within the // block. Rotates the coordinate system by rerotation prior to rendering. 
-Pix* PDBLK::render_mask(const FCOORD& rerotation) { +Pix* PDBLK::render_mask(const FCOORD& rerotation, TBOX* mask_box) { TBOX rotated_box(box); rotated_box.rotate(rerotation); Pix* pix = pixCreate(rotated_box.width(), rotated_box.height(), 1); @@ -163,6 +162,7 @@ Pix* PDBLK::render_mask(const FCOORD& rerotation) { pixRasterop(pix, 0, 0, rotated_box.width(), rotated_box.height(), PIX_SET, NULL, 0, 0); } + if (mask_box != NULL) *mask_box = rotated_box; return pix; } diff --git a/ccstruct/pdblock.h b/ccstruct/pdblock.h index 34f5518e3c..0dd0bf2ef8 100644 --- a/ccstruct/pdblock.h +++ b/ccstruct/pdblock.h @@ -89,7 +89,9 @@ class PDBLK // Returns a binary Pix mask with a 1 pixel for every pixel within the // block. Rotates the coordinate system by rerotation prior to rendering. - Pix* render_mask(const FCOORD& rerotation); + // If not NULL, mask_box is filled with the position box of the returned + // mask image. + Pix *render_mask(const FCOORD &rerotation, TBOX *mask_box); #ifndef GRAPHICS_DISABLED ///draw histogram diff --git a/ccstruct/werd.cpp b/ccstruct/werd.cpp index 24c8a41b33..aaaee9cc23 100644 --- a/ccstruct/werd.cpp +++ b/ccstruct/werd.cpp @@ -160,23 +160,37 @@ WERD* WERD::ConstructFromSingleBlob(bool bol, bool eol, C_BLOB* blob) { * row being marked as FUZZY space. */ -TBOX WERD::bounding_box() { - TBOX box; // box being built - C_BLOB_IT rej_cblob_it = &rej_cblobs; // rejected blobs - - for (rej_cblob_it.mark_cycle_pt(); !rej_cblob_it.cycled_list(); - rej_cblob_it.forward()) { - box += rej_cblob_it.data()->bounding_box(); +TBOX WERD::bounding_box() const { return restricted_bounding_box(true, true); } + +// Returns the bounding box including the desired combination of upper and +// lower noise/diacritic elements. +TBOX WERD::restricted_bounding_box(bool upper_dots, bool lower_dots) const { + TBOX box = true_bounding_box(); + int bottom = box.bottom(); + int top = box.top(); + // This is a read-only iteration of the rejected blobs. 
+ C_BLOB_IT it(const_cast(&rej_cblobs)); + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + TBOX dot_box = it.data()->bounding_box(); + if ((upper_dots || dot_box.bottom() <= top) && + (lower_dots || dot_box.top() >= bottom)) { + box += dot_box; + } } + return box; +} - C_BLOB_IT it = &cblobs; // blobs of WERD +// Returns the bounding box of only the good blobs. +TBOX WERD::true_bounding_box() const { + TBOX box; // box being built + // This is a read-only iteration of the good blobs. + C_BLOB_IT it(const_cast(&cblobs)); for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { box += it.data()->bounding_box(); } return box; } - /** * WERD::move * @@ -489,3 +503,101 @@ WERD* WERD::ConstructWerdWithNewBlobs(C_BLOB_LIST* all_blobs, } return new_werd; } + +// Removes noise from the word by moving small outlines to the rej_cblobs +// list, based on the size_threshold. +void WERD::CleanNoise(float size_threshold) { + C_BLOB_IT blob_it(&cblobs); + C_BLOB_IT rej_it(&rej_cblobs); + for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { + C_BLOB* blob = blob_it.data(); + C_OUTLINE_IT ol_it(blob->out_list()); + for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) { + C_OUTLINE* outline = ol_it.data(); + TBOX ol_box = outline->bounding_box(); + int ol_size = + ol_box.width() > ol_box.height() ? ol_box.width() : ol_box.height(); + if (ol_size < size_threshold) { + // This outline is too small. Move it to a separate blob in the + // reject blobs list. + C_BLOB* rej_blob = new C_BLOB(ol_it.extract()); + rej_it.add_after_then_move(rej_blob); + } + } + if (blob->out_list()->empty()) delete blob_it.extract(); + } +} + +// Extracts all the noise outlines and stuffs the pointers into the given +// vector of outlines. Afterwards, the outlines vector owns the pointers. 
+void WERD::GetNoiseOutlines(GenericVector* outlines) { + C_BLOB_IT rej_it(&rej_cblobs); + for (rej_it.mark_cycle_pt(); !rej_it.empty(); rej_it.forward()) { + C_BLOB* blob = rej_it.extract(); + C_OUTLINE_IT ol_it(blob->out_list()); + outlines->push_back(ol_it.extract()); + delete blob; + } +} + +// Adds the selected outlines to the indicated real blobs, and puts the rest +// back in rej_cblobs where they came from. Where the target_blobs entry is +// NULL, a run of wanted outlines is put into a single new blob. +// Ownership of the outlines is transferred back to the word. (Hence +// GenericVector and not PointerVector.) +// Returns true if any new blob was added to the start of the word, which +// suggests that it might need joining to the word before it, and likewise +// sets make_next_word_fuzzy true if any new blob was added to the end. +bool WERD::AddSelectedOutlines(const GenericVector& wanted, + const GenericVector& target_blobs, + const GenericVector& outlines, + bool* make_next_word_fuzzy) { + bool outline_added_to_start = false; + if (make_next_word_fuzzy != NULL) *make_next_word_fuzzy = false; + C_BLOB_IT rej_it(&rej_cblobs); + for (int i = 0; i < outlines.size(); ++i) { + C_OUTLINE* outline = outlines[i]; + if (outline == NULL) continue; // Already used it. + if (wanted[i]) { + C_BLOB* target_blob = target_blobs[i]; + TBOX noise_box = outline->bounding_box(); + if (target_blob == NULL) { + target_blob = new C_BLOB(outline); + // Need to find the insertion point. + C_BLOB_IT blob_it(&cblobs); + for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); + blob_it.forward()) { + C_BLOB* blob = blob_it.data(); + TBOX blob_box = blob->bounding_box(); + if (blob_box.left() > noise_box.left()) { + if (blob_it.at_first() && !flag(W_FUZZY_SP) && !flag(W_FUZZY_NON)) { + // We might want to join this word to its predecessor.
+ outline_added_to_start = true; + } + blob_it.add_before_stay_put(target_blob); + break; + } + } + if (blob_it.cycled_list()) { + blob_it.add_to_end(target_blob); + if (make_next_word_fuzzy != NULL) *make_next_word_fuzzy = true; + } + // Add all consecutive wanted, but null-blob outlines to same blob. + C_OUTLINE_IT ol_it(target_blob->out_list()); + while (i + 1 < outlines.size() && wanted[i + 1] && + target_blobs[i + 1] == NULL) { + ++i; + ol_it.add_to_end(outlines[i]); + } + } else { + // Insert outline into this blob. + C_OUTLINE_IT ol_it(target_blob->out_list()); + ol_it.add_to_end(outline); + } + } else { + // Put back on noise list. + rej_it.add_to_end(new C_BLOB(outline)); + } + } + return outline_added_to_start; +} diff --git a/ccstruct/werd.h b/ccstruct/werd.h index 43ecb84b6e..f9a89fb5b5 100644 --- a/ccstruct/werd.h +++ b/ccstruct/werd.h @@ -114,7 +114,13 @@ class WERD : public ELIST2_LINK { script_id_ = id; } - TBOX bounding_box(); // compute bounding box + // Returns the (default) bounding box including all the dots. + TBOX bounding_box() const; // compute bounding box + // Returns the bounding box including the desired combination of upper and + // lower noise/diacritic elements. + TBOX restricted_bounding_box(bool upper_dots, bool lower_dots) const; + // Returns the bounding box of only the good blobs. + TBOX true_bounding_box() const; const char *text() const { return correct.string(); } void set_text(const char *new_text) { correct = new_text; } @@ -155,6 +161,26 @@ class WERD : public ELIST2_LINK { void plot_rej_blobs(ScrollView *window); #endif // GRAPHICS_DISABLED + // Removes noise from the word by moving small outlines to the rej_cblobs + // list, based on the size_threshold. + void CleanNoise(float size_threshold); + + // Extracts all the noise outlines and stuffs the pointers into the given + // vector of outlines. Afterwards, the outlines vector owns the pointers. 
+ void GetNoiseOutlines(GenericVector *outlines); + // Adds the selected outlines to the indicated real blobs, and puts the rest + // back in rej_cblobs where they came from. Where the target_blobs entry is + // NULL, a run of wanted outlines is put into a single new blob. + // Ownership of the outlines is transferred back to the word. (Hence + // GenericVector and not PointerVector.) + // Returns true if any new blob was added to the start of the word, which + // suggests that it might need joining to the word before it, and likewise + // sets make_next_word_fuzzy true if any new blob was added to the end. + bool AddSelectedOutlines(const GenericVector &wanted, + const GenericVector &target_blobs, + const GenericVector &outlines, + bool *make_next_word_fuzzy); + private: uinT8 blanks; // no of blanks uinT8 dummy; // padding diff --git a/textord/colfind.cpp b/textord/colfind.cpp index b9b10649af..41b3895602 100644 --- a/textord/colfind.cpp +++ b/textord/colfind.cpp @@ -286,22 +286,27 @@ void ColumnFinder::CorrectOrientation(TO_BLOCK* block, // thresholds_pix is expected to be present iff grey_pix is present and // can be an integer factor reduction of the grey_pix. It represents the // thresholds that were used to create the binary_pix from the grey_pix. +// If diacritic_blobs is non-null, then diacritics/noise blobs, that would +// confuse layout analysis by causing textline overlap, are placed there, +// with the expectation that they will be reassigned to words later and +// noise/diacriticness determined via classification. // Returns -1 if the user hits the 'd' key in the blocks window while running // in debug mode, which requests a retry with more debug info.
-int ColumnFinder::FindBlocks(PageSegMode pageseg_mode, - Pix* scaled_color, int scaled_factor, - TO_BLOCK* input_block, Pix* photo_mask_pix, - Pix* thresholds_pix, Pix* grey_pix, - BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks) { +int ColumnFinder::FindBlocks(PageSegMode pageseg_mode, Pix* scaled_color, + int scaled_factor, TO_BLOCK* input_block, + Pix* photo_mask_pix, Pix* thresholds_pix, + Pix* grey_pix, BLOCK_LIST* blocks, + BLOBNBOX_LIST* diacritic_blobs, + TO_BLOCK_LIST* to_blocks) { pixOr(photo_mask_pix, photo_mask_pix, nontext_map_); stroke_width_->FindLeaderPartitions(input_block, &part_grid_); stroke_width_->RemoveLineResidue(&big_parts_); FindInitialTabVectors(NULL, min_gutter_width_, tabfind_aligned_gap_fraction_, input_block); SetBlockRuleEdges(input_block); - stroke_width_->GradeBlobsIntoPartitions(rerotate_, input_block, nontext_map_, - denorm_, cjk_script_, &projection_, - &part_grid_, &big_parts_); + stroke_width_->GradeBlobsIntoPartitions( + rerotate_, input_block, nontext_map_, denorm_, cjk_script_, &projection_, + diacritic_blobs, &part_grid_, &big_parts_); if (!PSM_SPARSE(pageseg_mode)) { ImageFind::FindImagePartitions(photo_mask_pix, rotation_, rerotate_, input_block, this, &part_grid_, &big_parts_); @@ -1134,9 +1139,13 @@ void ColumnFinder::GridMergePartitions() { neighbour->Print(); } rsearch.RemoveBBox(); - gsearch.RepositionIterator(); + if (!modified_box) { + // We are going to modify part, so remove it and re-insert it after. + gsearch.RemoveBBox(); + rsearch.RepositionIterator(); + modified_box = true; + } part->Absorb(neighbour, WidthCB()); - modified_box = true; } else if (debug) { tprintf("Neighbour failed hgap test\n"); } @@ -1151,7 +1160,6 @@ void ColumnFinder::GridMergePartitions() { // or it will never be found by a full search. // Because the box has changed, it has to be removed first, otherwise // add_sorted may fail to keep a single copy of the pointer. 
- gsearch.RemoveBBox(); part_grid_.InsertBBox(true, true, part); gsearch.RepositionIterator(); } diff --git a/textord/colfind.h b/textord/colfind.h index 04ad1684de..eedd4c407e 100644 --- a/textord/colfind.h +++ b/textord/colfind.h @@ -155,13 +155,15 @@ class ColumnFinder : public TabFind { // thresholds_pix is expected to be present iff grey_pix is present and // can be an integer factor reduction of the grey_pix. It represents the // thresholds that were used to create the binary_pix from the grey_pix. + // Small blobs that confuse the segmentation into lines are placed into + // diacritic_blobs, with the intention that they be put into the most + // appropriate word after the rest of layout analysis. // Returns -1 if the user hits the 'd' key in the blocks window while running // in debug mode, which requests a retry with more debug info. - int FindBlocks(PageSegMode pageseg_mode, - Pix* scaled_color, int scaled_factor, - TO_BLOCK* block, Pix* photo_mask_pix, - Pix* thresholds_pix, Pix* grey_pix, - BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks); + int FindBlocks(PageSegMode pageseg_mode, Pix* scaled_color, int scaled_factor, + TO_BLOCK* block, Pix* photo_mask_pix, Pix* thresholds_pix, + Pix* grey_pix, BLOCK_LIST* blocks, + BLOBNBOX_LIST* diacritic_blobs, TO_BLOCK_LIST* to_blocks); // Get the rotation required to deskew, and its inverse rotation. void GetDeskewVectors(FCOORD* deskew, FCOORD* reskew); diff --git a/textord/colpartition.cpp b/textord/colpartition.cpp index e9ce568aa3..565c660bb2 100644 --- a/textord/colpartition.cpp +++ b/textord/colpartition.cpp @@ -297,6 +297,25 @@ void ColPartition::DisownBoxesNoAssert() { } } +// NULLs the owner of the blobs in this partition that are owned by this +// partition and not leader blobs, removing them from the boxes_ list, thus +// turning this partition back to a leader partition if it contains a leader, +// or otherwise leaving it empty. Returns true if any boxes remain. 
+bool ColPartition::ReleaseNonLeaderBoxes() { + BLOBNBOX_C_IT bb_it(&boxes_); + for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) { + BLOBNBOX* bblob = bb_it.data(); + if (bblob->flow() != BTFT_LEADER) { + if (bblob->owner() == this) bblob->set_owner(NULL); + bb_it.extract(); + } + } + if (bb_it.empty()) return false; + flow_ = BTFT_LEADER; + ComputeLimits(); + return true; +} + // Delete the boxes that this partition owns. void ColPartition::DeleteBoxes() { // Although the boxes_ list is a C_LIST, in some cases it owns the @@ -831,6 +850,10 @@ ColPartition* ColPartition::SplitAt(int split_x) { bbox->set_owner(split_part); } } + if (it.empty()) { + // Possible if split-x passes through the first blob. + it.add_list_after(&split_part->boxes_); + } ASSERT_HOST(!it.empty()); if (split_part->IsEmpty()) { // Split part ended up with nothing. Possible if split_x passes @@ -1130,6 +1153,7 @@ bool ColPartition::MarkAsLeaderIfMonospaced() { if (best_end != NULL && best_end->total_cost() < blob_count) { // Good enough. Call it a leader. 
result = true; + bool modified_blob_list = false; for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { BLOBNBOX* blob = it.data(); TBOX box = blob->bounding_box(); @@ -1139,6 +1163,7 @@ bool ColPartition::MarkAsLeaderIfMonospaced() { blob->bounding_box().right(); if (blob->bounding_box().width() + gap > max_step) { it.extract(); + modified_blob_list = true; continue; } } @@ -1147,12 +1172,14 @@ bool ColPartition::MarkAsLeaderIfMonospaced() { it.data_relative(-1)->bounding_box().right(); if (blob->bounding_box().width() + gap > max_step) { it.extract(); + modified_blob_list = true; break; } } blob->set_region_type(BRT_TEXT); blob->set_flow(BTFT_LEADER); } + if (modified_blob_list) ComputeLimits(); blob_type_ = BRT_TEXT; flow_ = BTFT_LEADER; } else if (textord_debug_tabfind) { diff --git a/textord/colpartition.h b/textord/colpartition.h index 7f6cd64328..1b35d48545 100644 --- a/textord/colpartition.h +++ b/textord/colpartition.h @@ -481,6 +481,11 @@ class ColPartition : public ELIST2_LINK { // Any blobs that are not owned by this partition get to keep their owner // without an assert failure. void DisownBoxesNoAssert(); + // NULLs the owner of the blobs in this partition that are owned by this + // partition and not leader blobs, removing them from the boxes_ list, thus + // turning this partition back to a leader partition if it contains a leader, + // or otherwise leaving it empty. Returns true if any boxes remain. + bool ReleaseNonLeaderBoxes(); // Delete the boxes that this partition owns. void DeleteBoxes(); diff --git a/textord/colpartitiongrid.cpp b/textord/colpartitiongrid.cpp index 6cd8f31c93..800cbcb3c9 100644 --- a/textord/colpartitiongrid.cpp +++ b/textord/colpartitiongrid.cpp @@ -324,6 +324,40 @@ static bool TestCompatibleCandidates(const ColPartition& part, bool debug, return true; } +// Computes and returns the total overlap of all partitions in the grid. 
+// If overlap_grid is non-null, it is filled with a grid that holds empty +// partitions representing the union of all overlapped partitions. +int ColPartitionGrid::ComputeTotalOverlap(ColPartitionGrid** overlap_grid) { + int total_overlap = 0; + // Iterate the ColPartitions in the grid. + ColPartitionGridSearch gsearch(this); + gsearch.StartFullSearch(); + ColPartition* part; + while ((part = gsearch.NextFullSearch()) != NULL) { + ColPartition_CLIST neighbors; + const TBOX& part_box = part->bounding_box(); + FindOverlappingPartitions(part_box, part, &neighbors); + ColPartition_C_IT n_it(&neighbors); + bool any_part_overlap = false; + for (n_it.mark_cycle_pt(); !n_it.cycled_list(); n_it.forward()) { + const TBOX& n_box = n_it.data()->bounding_box(); + int overlap = n_box.intersection(part_box).area(); + if (overlap > 0 && overlap_grid != NULL) { + if (*overlap_grid == NULL) { + *overlap_grid = new ColPartitionGrid(gridsize(), bleft(), tright()); + } + (*overlap_grid)->InsertBBox(true, true, n_it.data()->ShallowCopy()); + if (!any_part_overlap) { + (*overlap_grid)->InsertBBox(true, true, part->ShallowCopy()); + } + } + any_part_overlap = true; + total_overlap += overlap; + } + } + return total_overlap; +} + // Finds all the ColPartitions in the grid that overlap with the given // box and returns them SortByBoxLeft(ed) and uniqued in the given list. // Any partition equal to not_this (may be NULL) is excluded. 
@@ -901,6 +935,7 @@ void ColPartitionGrid::ReTypeBlobs(BLOBNBOX_LIST* im_blobs) { while ((part = gsearch.NextFullSearch()) != NULL) { BlobRegionType blob_type = part->blob_type(); BlobTextFlowType flow = part->flow(); + bool any_blobs_moved = false; if (blob_type == BRT_POLYIMAGE || blob_type == BRT_RECTIMAGE) { BLOBNBOX_C_IT blob_it(part->boxes()); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { @@ -918,6 +953,7 @@ void ColPartitionGrid::ReTypeBlobs(BLOBNBOX_LIST* im_blobs) { ASSERT_HOST(blob->cblob()->area() != 0); blob->set_owner(NULL); blob_it.extract(); + any_blobs_moved = true; } else { blob->set_region_type(blob_type); if (blob->flow() != BTFT_LEADER) @@ -938,6 +974,11 @@ void ColPartitionGrid::ReTypeBlobs(BLOBNBOX_LIST* im_blobs) { delete blob; } } + } else if (any_blobs_moved) { + gsearch.RemoveBBox(); + part->ComputeLimits(); + InsertBBox(true, true, part); + gsearch.RepositionIterator(); } } } @@ -1048,6 +1089,24 @@ void ColPartitionGrid::DeleteUnknownParts(TO_BLOCK* block) { block->DeleteUnownedNoise(); } +// Deletes all the partitions in the grid that are NOT of flow type BTFT_LEADER. +void ColPartitionGrid::DeleteNonLeaderParts() { + ColPartitionGridSearch gsearch(this); + gsearch.StartFullSearch(); + ColPartition* part; + while ((part = gsearch.NextFullSearch()) != NULL) { + if (part->flow() != BTFT_LEADER) { + gsearch.RemoveBBox(); + if (part->ReleaseNonLeaderBoxes()) { + InsertBBox(true, true, part); + gsearch.RepositionIterator(); + } else { + delete part; + } + } + } +} + // Finds and marks text partitions that represent figure captions. 
void ColPartitionGrid::FindFigureCaptions() { // For each image region find its best candidate text caption region, diff --git a/textord/colpartitiongrid.h b/textord/colpartitiongrid.h index 40946e5746..94e7da2c43 100644 --- a/textord/colpartitiongrid.h +++ b/textord/colpartitiongrid.h @@ -63,6 +63,11 @@ class ColPartitionGrid : public BBGrid* confirm_cb, ColPartition* part); + // Computes and returns the total overlap of all partitions in the grid. + // If overlap_grid is non-null, it is filled with a grid that holds empty + // partitions representing the union of all overlapped partitions. + int ComputeTotalOverlap(ColPartitionGrid** overlap_grid); + // Finds all the ColPartitions in the grid that overlap with the given // box and returns them SortByBoxLeft(ed) and uniqued in the given list. // Any partition equal to not_this (may be NULL) is excluded. @@ -165,6 +170,10 @@ class ColPartitionGrid : public BBGridConstructProjection(block, rerotation, nontext_map_); if (textord_tabfind_show_strokewidths) { ScrollView* line_blobs_win = MakeWindow(0, 0, "Initial textline Blobs"); @@ -375,7 +379,19 @@ void StrokeWidth::GradeBlobsIntoPartitions(const FCOORD& rerotation, // Clear and re Insert to take advantage of the removed diacritics. Clear(); InsertBlobs(block); - FindInitialPartitions(rerotation, block, part_grid, big_parts); + FCOORD skew; + FindTextlineFlowDirection(true); + PartitionFindResult r = FindInitialPartitions( + rerotation, true, block, diacritic_blobs, part_grid, big_parts, &skew); + if (r == PFR_NOISE) { + tprintf("Detected %d diacritics\n", diacritic_blobs->length()); + // Noise was found, and removed. 
+ Clear(); + InsertBlobs(block); + FindTextlineFlowDirection(true); + r = FindInitialPartitions(rerotation, false, block, diacritic_blobs, + part_grid, big_parts, &skew); + } nontext_map_ = NULL; projection_ = NULL; denorm_ = NULL; @@ -1220,10 +1236,17 @@ void StrokeWidth::SmoothNeighbourTypes(BLOBNBOX* blob, bool reset_all) { // minimize overlap and smoothes the types with neighbours and the color // image if provided. rerotation is used to rotate the coordinate space // back to the nontext_map_ image. -void StrokeWidth::FindInitialPartitions(const FCOORD& rerotation, - TO_BLOCK* block, - ColPartitionGrid* part_grid, - ColPartition_LIST* big_parts) { +// If find_problems is true, detects possible noise pollution by the amount +// of partition overlap that is created by the diacritics. If excessive, the +// noise is separated out into diacritic blobs, and PFR_NOISE is returned. +// [TODO(rays): if the partition overlap is caused by heavy skew, deskews +// the components, saves the skew_angle and returns PFR_SKEW.] If the return +// is not PFR_OK, the job is incomplete, and FindInitialPartitions must be +// called again after cleaning up the partly done work. +PartitionFindResult StrokeWidth::FindInitialPartitions( + const FCOORD& rerotation, bool find_problems, TO_BLOCK* block, + BLOBNBOX_LIST* diacritic_blobs, ColPartitionGrid* part_grid, + ColPartition_LIST* big_parts, FCOORD* skew_angle) { FindVerticalTextChains(part_grid); FindHorizontalTextChains(part_grid); if (textord_tabfind_show_strokewidths) { @@ -1231,6 +1254,10 @@ void StrokeWidth::FindInitialPartitions(const FCOORD& rerotation, part_grid->DisplayBoxes(chains_win_); projection_->DisplayProjection(); } + if (find_problems) { + // TODO(rays) Do something to find skew, set skew_angle and return if there + // is some. 
+ } part_grid->SplitOverlappingPartitions(big_parts); EasyMerges(part_grid); RemoveLargeUnusedBlobs(block, part_grid, big_parts); @@ -1239,8 +1266,14 @@ void StrokeWidth::FindInitialPartitions(const FCOORD& rerotation, rerotation)); while (part_grid->GridSmoothNeighbours(BTFT_NEIGHBOURS, nontext_map_, grid_box, rerotation)); + int pre_overlap = part_grid->ComputeTotalOverlap(NULL); TestDiacritics(part_grid, block); MergeDiacritics(block, part_grid); + if (find_problems && diacritic_blobs != NULL && + DetectAndRemoveNoise(pre_overlap, grid_box, block, part_grid, + diacritic_blobs)) { + return PFR_NOISE; + } if (textord_tabfind_show_strokewidths) { textlines_win_ = MakeWindow(400, 400, "GoodTextline blobs"); part_grid->DisplayBoxes(textlines_win_); @@ -1260,6 +1293,57 @@ void StrokeWidth::FindInitialPartitions(const FCOORD& rerotation, smoothed_win_ = MakeWindow(800, 400, "Smoothed blobs"); part_grid->DisplayBoxes(smoothed_win_); } + return PFR_OK; +} + +// Detects noise by a significant increase in partition overlap from +// pre_overlap to now, and removes noise from the union of all the overlapping +// partitions, placing the blobs in diacritic_blobs. Returns true if any noise +// was found and removed. +bool StrokeWidth::DetectAndRemoveNoise(int pre_overlap, const TBOX& grid_box, + TO_BLOCK* block, + ColPartitionGrid* part_grid, + BLOBNBOX_LIST* diacritic_blobs) { + ColPartitionGrid* noise_grid = NULL; + int post_overlap = part_grid->ComputeTotalOverlap(&noise_grid); + if (pre_overlap == 0) pre_overlap = 1; + BLOBNBOX_IT diacritic_it(diacritic_blobs); + if (noise_grid != NULL) { + if (post_overlap > pre_overlap * kNoiseOverlapGrowthFactor && + post_overlap > grid_box.area() * kNoiseOverlapAreaFactor) { + // This is noisy enough to fix. 
+ if (textord_tabfind_show_strokewidths) { + ScrollView* noise_win = MakeWindow(1000, 500, "Noise Areas"); + noise_grid->DisplayBoxes(noise_win); + } + part_grid->DeleteNonLeaderParts(); + BLOBNBOX_IT blob_it(&block->noise_blobs); + ColPartitionGridSearch rsearch(noise_grid); + for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { + BLOBNBOX* blob = blob_it.data(); + blob->ClearNeighbours(); + if (!blob->IsDiacritic() || blob->owner() != NULL) + continue; // Not a noise candidate. + TBOX blob_box(blob->bounding_box()); + TBOX search_box(blob->bounding_box()); + search_box.pad(gridsize(), gridsize()); + rsearch.StartRectSearch(search_box); + ColPartition* part = rsearch.NextRectSearch(); + if (part != NULL) { + // Consider blob as possible noise. + blob->set_owns_cblob(true); + blob->compute_bounding_box(); + diacritic_it.add_after_then_move(blob_it.extract()); + } + } + noise_grid->DeleteParts(); + delete noise_grid; + return true; + } + noise_grid->DeleteParts(); + delete noise_grid; + } + return false; } // Helper verifies that blob's neighbour in direction dir is good to add to a diff --git a/textord/strokewidth.h b/textord/strokewidth.h index 5d649b5708..12cb3c91f6 100644 --- a/textord/strokewidth.h +++ b/textord/strokewidth.h @@ -41,6 +41,14 @@ enum LeftOrRight { LR_RIGHT }; +// Return value from FindInitialPartitions indicates detection of severe +// skew or noise. +enum PartitionFindResult { + PFR_OK, // Everything is OK. + PFR_SKEW, // Skew was detected and rotated. + PFR_NOISE // Noise was detected and removed. +}; + /** * The StrokeWidth class holds all the normal and large blobs. * It is used to find good large blobs and move them to the normal blobs @@ -110,12 +118,10 @@ class StrokeWidth : public BlobGrid { // part_grid is the output grid of textline partitions. // Large blobs that cause overlap are put in separate partitions and added // to the big_parts list. 
- void GradeBlobsIntoPartitions(const FCOORD& rerotation, - TO_BLOCK* block, - Pix* nontext_pix, - const DENORM* denorm, - bool cjk_script, - TextlineProjection* projection, + void GradeBlobsIntoPartitions(const FCOORD& rerotation, TO_BLOCK* block, + Pix* nontext_pix, const DENORM* denorm, + bool cjk_script, TextlineProjection* projection, + BLOBNBOX_LIST* diacritic_blobs, ColPartitionGrid* part_grid, ColPartition_LIST* big_parts); @@ -205,10 +211,26 @@ class StrokeWidth : public BlobGrid { // minimize overlap and smoothes the types with neighbours and the color // image if provided. rerotation is used to rotate the coordinate space // back to the nontext_map_ image. - void FindInitialPartitions(const FCOORD& rerotation, - TO_BLOCK* block, - ColPartitionGrid* part_grid, - ColPartition_LIST* big_parts); + // If find_problems is true, detects possible noise pollution by the amount + // of partition overlap that is created by the diacritics. If excessive, the + // noise is separated out into diacritic blobs, and PFR_NOISE is returned. + // [TODO(rays): if the partition overlap is caused by heavy skew, deskews + // the components, saves the skew_angle and returns PFR_SKEW.] If the return + // is not PFR_OK, the job is incomplete, and FindInitialPartitions must be + // called again after cleaning up the partly done work. + PartitionFindResult FindInitialPartitions(const FCOORD& rerotation, + bool find_problems, TO_BLOCK* block, + BLOBNBOX_LIST* diacritic_blobs, + ColPartitionGrid* part_grid, + ColPartition_LIST* big_parts, + FCOORD* skew_angle); + // Detects noise by a significant increase in partition overlap from + // pre_overlap to now, and removes noise from the union of all the overlapping + // partitions, placing the blobs in diacritic_blobs. Returns true if any noise + // was found and removed. 
+ bool DetectAndRemoveNoise(int pre_overlap, const TBOX& grid_box, + TO_BLOCK* block, ColPartitionGrid* part_grid, + BLOBNBOX_LIST* diacritic_blobs); // Finds vertical chains of text-like blobs and puts them in ColPartitions. void FindVerticalTextChains(ColPartitionGrid* part_grid); // Finds horizontal chains of text-like blobs and puts them in ColPartitions. diff --git a/textord/tablefind.cpp b/textord/tablefind.cpp index 888fe145f5..2e38bada0b 100644 --- a/textord/tablefind.cpp +++ b/textord/tablefind.cpp @@ -974,12 +974,12 @@ bool TableFinder::HasLeaderAdjacent(const ColPartition& part) { hsearch.StartSideSearch(x, bottom, top); ColPartition* leader = NULL; while ((leader = hsearch.NextSideSearch(right_to_left)) != NULL) { - // This should not happen, they are in different grids. - ASSERT_HOST(&part != leader); // The leader could be a horizontal ruling in the grid. // Make sure it is actually a leader. if (leader->flow() != BTFT_LEADER) continue; + // This should not happen, they are in different grids. + ASSERT_HOST(&part != leader); // Make sure the leader shares a page column with the partition, // otherwise we are spreading across columns. if (!part.IsInSameColumnAs(*leader)) diff --git a/textord/textord.cpp b/textord/textord.cpp index cf2fc04fe3..6156e45b3b 100644 --- a/textord/textord.cpp +++ b/textord/textord.cpp @@ -268,7 +268,7 @@ Textord::~Textord() { void Textord::TextordPage(PageSegMode pageseg_mode, const FCOORD& reskew, int width, int height, Pix* binary_pix, Pix* thresholds_pix, Pix* grey_pix, - bool use_box_bottoms, + bool use_box_bottoms, BLOBNBOX_LIST* diacritic_blobs, BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks) { page_tr_.set_x(width); page_tr_.set_y(height); @@ -340,9 +340,9 @@ void Textord::TextordPage(PageSegMode pageseg_mode, const FCOORD& reskew, make_single_word(pageseg_mode == PSM_SINGLE_CHAR, to_block->get_rows(), to_block->block->row_list()); } - cleanup_blocks(PSM_WORD_FIND_ENABLED(pageseg_mode), blocks); // Remove empties. 
- + cleanup_blocks(PSM_WORD_FIND_ENABLED(pageseg_mode), blocks); + TransferDiacriticsToBlockGroups(diacritic_blobs, blocks); // Compute the margins for each row in the block, to be used later for // paragraph detection. BLOCK_IT b_it(blocks); diff --git a/textord/textord.h b/textord/textord.h index b99541efce..cc9cb1d341 100644 --- a/textord/textord.h +++ b/textord/textord.h @@ -22,6 +22,7 @@ #define TESSERACT_TEXTORD_TEXTORD_H__ #include "ccstruct.h" +#include "bbgrid.h" #include "blobbox.h" #include "gap_map.h" #include "publictypes.h" // For PageSegMode. @@ -35,6 +36,35 @@ class ScrollView; namespace tesseract { +// A simple class that can be used by BBGrid to hold a word and an expanded +// bounding box that makes it easy to find words to put diacritics. +class WordWithBox { + public: + WordWithBox() : word_(NULL) {} + explicit WordWithBox(WERD *word) + : word_(word), bounding_box_(word->bounding_box()) { + int height = bounding_box_.height(); + bounding_box_.pad(height, height); + } + + const TBOX &bounding_box() const { return bounding_box_; } + // Returns the bounding box of only the good blobs. + TBOX true_bounding_box() const { return word_->true_bounding_box(); } + C_BLOB_LIST *RejBlobs() const { return word_->rej_cblob_list(); } + const WERD *word() const { return word_; } + + private: + // Borrowed pointer to a real word somewhere that must outlive this class. + WERD *word_; + // Cached expanded bounding box of the word, padded all round by its height. + TBOX bounding_box_; +}; + +// Make it usable by BBGrid. +CLISTIZEH(WordWithBox) +typedef BBGrid WordGrid; +typedef GridSearch WordSearch; + class Textord { public: explicit Textord(CCStruct* ccstruct); @@ -47,11 +77,13 @@ class Textord { // thresholds_pix is expected to be present iff grey_pix is present and // can be an integer factor reduction of the grey_pix. It represents the // thresholds that were used to create the binary_pix from the grey_pix. 
- void TextordPage(PageSegMode pageseg_mode, const FCOORD& reskew, - int width, int height, Pix* binary_pix, - Pix* thresholds_pix, Pix* grey_pix, - bool use_box_bottoms, - BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks); + // diacritic_blobs contain small confusing components that should be added + // to the appropriate word(s) in case they are really diacritics. + void TextordPage(PageSegMode pageseg_mode, const FCOORD &reskew, int width, + int height, Pix *binary_pix, Pix *thresholds_pix, + Pix *grey_pix, bool use_box_bottoms, + BLOBNBOX_LIST *diacritic_blobs, BLOCK_LIST *blocks, + TO_BLOCK_LIST *to_blocks); // If we were supposed to return only a single textline, and there is more // than one, clean up and leave only the best. @@ -212,6 +244,17 @@ class Textord { // Remove outlines that are a tiny fraction in either width or height // of the word height. void clean_small_noise_from_words(ROW *row); + // Groups blocks by rotation, then, for each group, makes a WordGrid and calls + // TransferDiacriticsToWords to copy the diacritic blobs to the most + // appropriate words in the group of blocks. Source blobs are not touched. + void TransferDiacriticsToBlockGroups(BLOBNBOX_LIST* diacritic_blobs, + BLOCK_LIST* blocks); + // Places a copy of blobs that are near a word (after applying rotation to the + // blob) in the most appropriate word, unless there is doubt, in which case a + // blob can end up in two words. Source blobs are not touched. 
+ void TransferDiacriticsToWords(BLOBNBOX_LIST *diacritic_blobs, + const FCOORD &rotation, WordGrid *word_grid); + public: // makerow.cpp /////////////////////////////////////////// BOOL_VAR_H(textord_single_height_mode, false, diff --git a/textord/topitch.cpp b/textord/topitch.cpp index 3136a9417e..e918f14c36 100644 --- a/textord/topitch.cpp +++ b/textord/topitch.cpp @@ -283,12 +283,13 @@ void fix_row_pitch(TO_ROW *bad_row, // row to fix bad_row->space_threshold = (bad_row->min_space + bad_row->max_nonspace) / 2; bad_row->space_size = bad_row->fixed_pitch; - if (bad_row->char_cells.empty ()) + if (bad_row->char_cells.empty() && !bad_row->blob_list()->empty()) { tune_row_pitch (bad_row, &bad_row->projection, bad_row->projection_left, bad_row->projection_right, (bad_row->fixed_pitch + bad_row->max_nonspace * 3) / 4, bad_row->fixed_pitch, sp_sd, mid_cuts, &bad_row->char_cells, FALSE); + } } else if (bad_row->pitch_decision == PITCH_CORR_PROP || bad_row->pitch_decision == PITCH_DEF_PROP) { @@ -1279,13 +1280,13 @@ float tune_row_pitch2( //find fp cells best_sp_sd = initial_pitch; - if (textord_disable_pitch_test) { + best_pitch = static_cast(initial_pitch); + if (textord_disable_pitch_test || best_pitch <= textord_pitch_range) { return initial_pitch; } sum_proj = new STATS[textord_pitch_range * 2 + 1]; if (sum_proj == NULL) return initial_pitch; - best_pitch = (inT32) initial_pitch; for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range; pitch_delta++) @@ -1293,12 +1294,12 @@ float tune_row_pitch2( //find fp cells best_pitch + pitch_delta + 1); for (pixel = projection_left; pixel <= projection_right; pixel++) { - for (pitch_delta = -textord_pitch_range; - pitch_delta <= textord_pitch_range; pitch_delta++) - sum_proj[textord_pitch_range + - pitch_delta].add ((pixel - projection_left) % (best_pitch + - pitch_delta), - projection->pile_count (pixel)); + for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range; + pitch_delta++) { + 
sum_proj[textord_pitch_range + pitch_delta].add( + (pixel - projection_left) % (best_pitch + pitch_delta), + projection->pile_count(pixel)); + } } best_count = sum_proj[textord_pitch_range].pile_count (0); best_delta = 0; @@ -1427,7 +1428,7 @@ float compute_pitch_sd( //find fp cells if (blob_it.empty ()) return space_size * 10; #ifndef GRAPHICS_DISABLED - if (testing_on && to_win > 0) { + if (testing_on && to_win != NULL) { blob_box = blob_it.data ()->bounding_box (); projection->plot (to_win, projection_left, row->intercept (), 1.0f, -1.0f, ScrollView::CORAL); @@ -1476,7 +1477,7 @@ float compute_pitch_sd( //find fp cells tprintf ("\n"); } #ifndef GRAPHICS_DISABLED - if (textord_show_fixed_cuts && blob_count > 0 && to_win > 0) + if (textord_show_fixed_cuts && blob_count > 0 && to_win != NULL) plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list); #endif seg_it.set_to_list (&seg_list); @@ -1566,7 +1567,7 @@ float compute_pitch_sd2( //find fp cells return initial_pitch * 10; } #ifndef GRAPHICS_DISABLED - if (testing_on && to_win > 0) { + if (testing_on && to_win != NULL) { projection->plot (to_win, projection_left, row->intercept (), 1.0f, -1.0f, ScrollView::CORAL); } @@ -1602,7 +1603,7 @@ float compute_pitch_sd2( //find fp cells tprintf ("\n"); } #ifndef GRAPHICS_DISABLED - if (textord_show_fixed_cuts && blob_count > 0 && to_win > 0) + if (textord_show_fixed_cuts && blob_count > 0 && to_win != NULL) plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list); #endif seg_it.set_to_list (&seg_list); diff --git a/textord/tordmain.cpp b/textord/tordmain.cpp index eb229eaa1a..e9e59261da 100644 --- a/textord/tordmain.cpp +++ b/textord/tordmain.cpp @@ -38,13 +38,18 @@ #include "allheaders.h" -const ERRCODE BLOCKLESS_BLOBS = "Warning:some blobs assigned to no block"; +// Gridsize for word grid when reassigning diacritics to words. Not critical. 
+const int kWordGridSize = 50; #undef EXTERN #define EXTERN #define MAX_NEAREST_DIST 600 //for block skew stats +namespace tesseract { + +CLISTIZE(WordWithBox) + /********************************************************************** * SetBlobStrokeWidth * @@ -143,7 +148,6 @@ void SetBlobStrokeWidth(Pix* pix, BLOBNBOX* blob) { } } - /********************************************************************** * assign_blobs_to_blocks2 * @@ -193,7 +197,6 @@ void assign_blobs_to_blocks2(Pix* pix, } } -namespace tesseract { /********************************************************************** * find_components * @@ -400,7 +403,7 @@ void Textord::cleanup_nontext_block(BLOCK* block) { * Delete empty blocks, rows from the page. **********************************************************************/ -void Textord::cleanup_blocks(bool clean_noise, BLOCK_LIST *blocks) { +void Textord::cleanup_blocks(bool clean_noise, BLOCK_LIST* blocks) { BLOCK_IT block_it = blocks; //iterator ROW_IT row_it; //row iterator @@ -420,18 +423,18 @@ void Textord::cleanup_blocks(bool clean_noise, BLOCK_LIST *blocks) { if (clean_noise) { row_it.set_to_list(block->row_list()); for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { + ROW* row = row_it.data(); ++num_rows_all; - clean_small_noise_from_words(row_it.data()); - if ((textord_noise_rejrows && !row_it.data()->word_list()->empty() && - clean_noise_from_row(row_it.data())) || - row_it.data()->word_list()->empty()) { + clean_small_noise_from_words(row); + if ((textord_noise_rejrows && !row->word_list()->empty() && + clean_noise_from_row(row)) || + row->word_list()->empty()) { delete row_it.extract(); // lose empty row. 
} else { if (textord_noise_rejwords) clean_noise_from_words(row_it.data()); if (textord_blshift_maxshift >= 0) - tweak_row_baseline(row_it.data(), - textord_blshift_maxshift, + tweak_row_baseline(row, textord_blshift_maxshift, textord_blshift_xfraction); ++num_rows; } @@ -640,16 +643,16 @@ void Textord::clean_noise_from_words( //remove empties && (!word_it.at_first () || !blob_it.at_first ())) dot_count += 2; } - if (dot_count > 2) { + if (dot_count > 2 && !word->flag(W_REP_CHAR)) { if (dot_count > norm_count * textord_noise_normratio * 2) word_dud[word_index] = 2; else if (dot_count > norm_count * textord_noise_normratio) word_dud[word_index] = 1; else word_dud[word_index] = 0; - } - else + } else { word_dud[word_index] = 0; + } if (word_dud[word_index] == 2) dud_words++; else @@ -661,11 +664,11 @@ void Textord::clean_noise_from_words( //remove empties for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { if (word_dud[word_index] == 2 || (word_dud[word_index] == 1 && dud_words > ok_words)) { - word = word_it.data (); //current word - //rejected blobs - blob_it.set_to_list (word->rej_cblob_list ()); - //move from blobs - blob_it.add_list_after (word->cblob_list ()); + word = word_it.data(); // Current word. + // Previously we threw away the entire word. + // Now just aggressively throw all small blobs into the reject list, where + // the classifier can decide whether they are actually needed. + word->CleanNoise(textord_noise_sizelimit * row->x_height()); } word_index++; } @@ -705,6 +708,176 @@ void Textord::clean_small_noise_from_words(ROW *row) { } } } + +// Local struct to hold a group of blocks. +struct BlockGroup { + BlockGroup() : rotation(1.0f, 0.0f), angle(0.0f), min_xheight(1.0f) {} + explicit BlockGroup(BLOCK* block) + : bounding_box(block->bounding_box()), + rotation(block->re_rotation()), + angle(block->re_rotation().angle()), + min_xheight(block->x_height()) { + blocks.push_back(block); + } + // Union of block bounding boxes. 
+ TBOX bounding_box; + // Common rotation of the blocks. + FCOORD rotation; + // Angle of rotation. + float angle; + // Min xheight of the blocks. + float min_xheight; + // Collection of borrowed pointers to the blocks in the group. + GenericVector blocks; +}; + +// Groups blocks by rotation, then, for each group, makes a WordGrid and calls +// TransferDiacriticsToWords to copy the diacritic blobs to the most +// appropriate words in the group of blocks. Source blobs are not touched. +void Textord::TransferDiacriticsToBlockGroups(BLOBNBOX_LIST* diacritic_blobs, + BLOCK_LIST* blocks) { + // Angle difference larger than this is too much to consider equal. + // They should only be in multiples of M_PI/2 anyway. + const double kMaxAngleDiff = 0.01; // About 0.6 degrees. + PointerVector groups; + BLOCK_IT bk_it(blocks); + for (bk_it.mark_cycle_pt(); !bk_it.cycled_list(); bk_it.forward()) { + BLOCK* block = bk_it.data(); + if (block->poly_block() != NULL && !block->poly_block()->IsText()) { + continue; + } + // Linear search of the groups to find a matching rotation. + float block_angle = block->re_rotation().angle(); + int best_g = 0; + float best_angle_diff = MAX_FLOAT32; + for (int g = 0; g < groups.size(); ++g) { + double angle_diff = fabs(block_angle - groups[g]->angle); + if (angle_diff > M_PI) angle_diff = fabs(angle_diff - 2.0 * M_PI); + if (angle_diff < best_angle_diff) { + best_angle_diff = angle_diff; + best_g = g; + } + } + if (best_angle_diff > kMaxAngleDiff) { + groups.push_back(new BlockGroup(block)); + } else { + groups[best_g]->blocks.push_back(block); + groups[best_g]->bounding_box += block->bounding_box(); + float x_height = block->x_height(); + if (x_height < groups[best_g]->min_xheight) + groups[best_g]->min_xheight = x_height; + } + } + // Now process each group of blocks. 
+ PointerVector word_ptrs; + for (int g = 0; g < groups.size(); ++g) { + const BlockGroup* group = groups[g]; + tprintf("group %d, xh=%g, %d blocks\n", g, group->min_xheight, + group->blocks.size()); + WordGrid word_grid(group->min_xheight, group->bounding_box.botleft(), + group->bounding_box.topright()); + for (int b = 0; b < group->blocks.size(); ++b) { + tprintf("block %d, %d rows\n", b, group->blocks[b]->row_list()->length()); + ROW_IT row_it(group->blocks[b]->row_list()); + for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { + ROW* row = row_it.data(); + tprintf("%d words in row\n", row->word_list()->length()); + // Put the words of the row into the grid. + WERD_IT w_it(row->word_list()); + for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { + WERD* word = w_it.data(); + WordWithBox* box_word = new WordWithBox(word); + word_grid.InsertBBox(true, true, box_word); + // Save the pointer where it will be auto-deleted. + word_ptrs.push_back(box_word); + } + } + } + FCOORD rotation = group->rotation; + // Make it a forward rotation that will transform blob coords to block. + rotation.set_y(-rotation.y()); + TransferDiacriticsToWords(diacritic_blobs, rotation, &word_grid); + } +} + +// Places a copy of blobs that are near a word (after applying rotation to the +// blob) in the most appropriate word, unless there is doubt, in which case a +// blob can end up in two words. Source blobs are not touched. +void Textord::TransferDiacriticsToWords(BLOBNBOX_LIST* diacritic_blobs, + const FCOORD& rotation, + WordGrid* word_grid) { + WordSearch ws(word_grid); + BLOBNBOX_IT b_it(diacritic_blobs); + // Apply rotation to each blob before finding the nearest words. The rotation + // allows us to only consider above/below placement and not left/right on + // vertical text, because all text is horizontal here. 
+ for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { + BLOBNBOX* blobnbox = b_it.data(); + TBOX blob_box = blobnbox->bounding_box(); + blob_box.rotate(rotation); + ws.StartRectSearch(blob_box); + // Above/below refer to word position relative to diacritic. Since some + // scripts eg Kannada/Telugu habitually put diacritics below words, and + // others eg Thai/Vietnamese/Latin put most diacritics above words, try + // for both if there isn't much in it. + WordWithBox* best_above_word = NULL; + WordWithBox* best_below_word = NULL; + int best_above_distance = 0; + int best_below_distance = 0; + for (WordWithBox* word = ws.NextRectSearch(); word != NULL; + word = ws.NextRectSearch()) { + if (word->word()->flag(W_REP_CHAR)) continue; + TBOX word_box = word->true_bounding_box(); + int x_distance = blob_box.x_gap(word_box); + int y_distance = blob_box.y_gap(word_box); + if (x_distance > 0) { + // Arbitrarily divide x-distance by 2 if there is a major y overlap, + // and the word is to the left of the diacritic. If the + // diacritic is a dropped broken character between two words, this will + // help send all the pieces to a single word, instead of splitting them + // over the 2 words. 
+ if (word_box.major_y_overlap(blob_box) && + blob_box.left() > word_box.right()) { + x_distance /= 2; + } + y_distance += x_distance; + } + if (word_box.y_middle() > blob_box.y_middle() && + (best_above_word == NULL || y_distance < best_above_distance)) { + best_above_word = word; + best_above_distance = y_distance; + } + if (word_box.y_middle() <= blob_box.y_middle() && + (best_below_word == NULL || y_distance < best_below_distance)) { + best_below_word = word; + best_below_distance = y_distance; + } + } + bool above_good = + best_above_word != NULL && + (best_below_word == NULL || + best_above_distance < best_below_distance + blob_box.height()); + bool below_good = + best_below_word != NULL && best_below_word != best_above_word && + (best_above_word == NULL || + best_below_distance < best_above_distance + blob_box.height()); + if (below_good) { + C_BLOB* copied_blob = C_BLOB::deep_copy(blobnbox->cblob()); + copied_blob->rotate(rotation); + // Put the blob into the word's reject blobs list. + C_BLOB_IT blob_it(best_below_word->RejBlobs()); + blob_it.add_to_end(copied_blob); + } + if (above_good) { + C_BLOB* copied_blob = C_BLOB::deep_copy(blobnbox->cblob()); + copied_blob->rotate(rotation); + // Put the blob into the word's reject blobs list. + C_BLOB_IT blob_it(best_above_word->RejBlobs()); + blob_it.add_to_end(copied_blob); + } + } +} + } // tesseract /********************************************************************** @@ -820,33 +993,3 @@ void tweak_row_baseline(ROW *row, free_mem(xstarts); free_mem(coeffs); } - -/********************************************************************** - * blob_y_order - * - * Sort function to sort blobs in y from page top. 
- **********************************************************************/ - -inT32 blob_y_order( //sort function - void *item1, //items to compare - void *item2) { - //converted ptr - BLOBNBOX *blob1 = *(BLOBNBOX **) item1; - //converted ptr - BLOBNBOX *blob2 = *(BLOBNBOX **) item2; - - if (blob1->bounding_box ().bottom () > blob2->bounding_box ().bottom ()) - return -1; - else if (blob1->bounding_box ().bottom () < - blob2->bounding_box ().bottom ()) - return 1; - else { - if (blob1->bounding_box ().left () < blob2->bounding_box ().left ()) - return -1; - else if (blob1->bounding_box ().left () > - blob2->bounding_box ().left ()) - return 1; - else - return 0; - } -} diff --git a/textord/tordmain.h b/textord/tordmain.h index 340ff1aabe..cb5a6a1ef2 100644 --- a/textord/tordmain.h +++ b/textord/tordmain.h @@ -29,29 +29,14 @@ struct Pix; namespace tesseract { class Tesseract; -} -void make_blocks_from_blobs( //convert & textord - TBLOB *tessblobs, //tess style input - const char *filename, //blob file - ICOORD page_tr, //top right - BOOL8 do_shift, //shift tess coords - BLOCK_LIST *blocks //block list - ); void SetBlobStrokeWidth(Pix* pix, BLOBNBOX* blob); void assign_blobs_to_blocks2(Pix* pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *port_blocks); -void textord_page( //make rows & words - ICOORD page_tr, //top right - BLOCK_LIST *blocks, //block list - TO_BLOCK_LIST *land_blocks, //rotated for landscape - TO_BLOCK_LIST *port_blocks, //output list - tesseract::Tesseract* - ); +} // namespace tesseract + void tweak_row_baseline(ROW *row, double blshift_maxshift, double blshift_xfraction); -inT32 blob_y_order( //sort function - void *item1, //items to compare - void *item2); + #endif