diff --git a/Makefile.am b/Makefile.am index b0e0baeeb8..28e772d172 100644 --- a/Makefile.am +++ b/Makefile.am @@ -260,6 +260,7 @@ endif # Rules for src/ccstruct. noinst_HEADERS += src/ccstruct/blamer.h +noinst_HEADERS += src/ccstruct/blob_bounds_calculator.h noinst_HEADERS += src/ccstruct/blobbox.h noinst_HEADERS += src/ccstruct/blobs.h noinst_HEADERS += src/ccstruct/blread.h @@ -303,6 +304,7 @@ noinst_HEADERS += src/ccstruct/params_training_featdef.h endif libtesseract_la_SOURCES += src/ccstruct/blamer.cpp +libtesseract_la_SOURCES += src/ccstruct/blob_bounds_calculator.cpp libtesseract_la_SOURCES += src/ccstruct/blobbox.cpp libtesseract_la_SOURCES += src/ccstruct/blobs.cpp libtesseract_la_SOURCES += src/ccstruct/blread.cpp diff --git a/src/ccstruct/blob_bounds_calculator.cpp b/src/ccstruct/blob_bounds_calculator.cpp new file mode 100644 index 0000000000..54ea9294df --- /dev/null +++ b/src/ccstruct/blob_bounds_calculator.cpp @@ -0,0 +1,471 @@ +/////////////////////////////////////////////////////////////////////// +// File: blob_bounds_calculator.cpp +// Description: Module for calculation of blob bounds from LSTM data +// Author: Povilas Kanapickas +// +// (C) Copyright 2022, Povilas Kanapickas +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +/////////////////////////////////////////////////////////////////////// + +#include "blob_bounds_calculator.h" +#include +#include +#include + +namespace tesseract { + +std::ostream& operator<<(std::ostream& out, const CharBoundaryByBoxIndex& d) { + out << "CharBoundaryByBoxIndex{ "; + if (d.index.has_value()) { + out << d.index.value() << ", "; + } else { + out << "no-index, "; + } + out << d.split_index << " " << d.split_count << " }"; + return out; +} + +std::ostream& operator<<(std::ostream& out, const CharacterPlaceDecision& d) { + out << "CharacterPlaceDecision{" + << " prev_index: " << d.prev_index + << " begin: " << d.begin + << " end: " << d.end + << " prev_pos_diff: " << d.prev_pos_diff + << " cost: " << d.cost + << " }"; + return out; +} + +void CharacterPlaceDecisions::add_place(unsigned prev_index, + CharBoundaryByBoxIndex begin, + CharBoundaryByBoxIndex end, + double prev_pos_diff, + double cost, double max_cost_diff) { + if (cost > min_cost + max_cost_diff) { + return; + } + + int replace_existing_decision_index = -1; + for (std::size_t i = 0; i < decisions.size(); ++i) { + if (decisions[i].end == end) { + if (cost < decisions[i].cost) { + replace_existing_decision_index = i; + break; + } else { + // existing decision is better + return; + } + } + } + + CharacterPlaceDecision new_decision{prev_index, begin, end, + prev_pos_diff, cost}; + if (replace_existing_decision_index >= 0) { + decisions[replace_existing_decision_index] = new_decision; + } else { + decisions.push_back(new_decision); + } + + if (cost < min_cost) { + min_cost = cost; + + // Remove all decisions that no longer satisfy maximum cost difference + // requirement. 
+ auto last_it = std::remove_if(decisions.begin(), decisions.end(), + [=](const auto& d) { + return d.cost > min_cost + max_cost_diff; + }); + decisions.erase(last_it, decisions.end()); + } +} + +bool CharacterBoundaries::operator==(const CharacterBoundaries& other) const { + return begin_x == other.begin_x && + begin_box_index == other.begin_box_index && + end_x == other.end_x && + end_box_index == other.end_box_index; +} + +std::ostream& operator<<(std::ostream& out, const CharacterBoundaries& bounds) { + out << "CharacterBoundaries{" << bounds.begin_x << ", " + << bounds.begin_box_index << ", " + << bounds.end_x << ", " + << bounds.end_box_index << "}"; + return out; +} + +BoxBoundariesCalculator::BoxBoundariesCalculator( + const std::vector& bounds, + const BoxBoundariesCalculatorConfig& config) : + bounds_{bounds}, + config_{config} +{ + if (!bounds_.empty()) { + double width_sum = 0; + for (const auto& b : bounds) { + width_sum += b.end - b.begin; + } + average_box_width_ = width_sum / static_cast(bounds.size()); + } +} + +std::vector + BoxBoundariesCalculator::calculate_bounds(const std::vector& symbols) +{ + std::vector decisions; + decisions.resize(symbols.size()); + + // The initial state + CharacterPlaceDecisions init_decisions; + init_decisions.add_place(0, {0, 0, 0}, {0, 0, 0}, 0, 0, + config_.max_character_cost_diff); + + for (std::size_t is = 0; is != symbols.size(); ++is) { + const auto& symbol = symbols[is]; + const auto& prev_decisions = is == 0 ? init_decisions : decisions[is - 1]; + auto& next_decisions = decisions[is]; + + auto [symbol_min_box, symbol_max_box] = possible_boxes_for_symbol(symbol); + + unsigned prev_farthest_index = farthest_decision_index(prev_decisions); + const auto& prev_farthest_decision = + prev_decisions.decisions[prev_farthest_index]; + + if (symbol_min_box == symbol_max_box) { + // There are no boxes for the current symbol. Select the previous + // decision which went farthest and was at box boundary. 
+ // + // We ignore everything that affects the cost for this symbol because the + // cost will be the same for all decision paths, thus will not affect + // which decision path is ultimately selected. + // + // We reset prev_pos_diff as we are effectively starting over. + next_decisions.add_place(prev_farthest_index, {{}, 0, 0}, {{}, 0, 0}, + 0, prev_farthest_decision.cost, + config_.max_character_cost_diff); + continue; + } + + if (prev_farthest_decision.end.index < symbol_min_box) { + // There are boxes that can't be attributed to any of the symbols because + // they are too far away. In this case we pick the previous decision path + // that went farthest and force the first box to be attributed to the + // symbol. + // + // We ignore everything that affects the cost for this symbol because the + // cost will be the same for all decision paths, thus will not affect + // which decision path is ultimately selected. + // + // We reset prev_pos_diff as we are effectively starting over. + try_decisions_from_prev_decision(next_decisions, prev_farthest_index, + {symbol_min_box, 0, 0}, + 0, prev_farthest_decision.cost, + symbol, symbol_max_box); + continue; + } + + for (std::size_t i_d = 0; i_d < prev_decisions.decisions.size(); ++i_d) { + const auto& prev_decision = prev_decisions.decisions[i_d]; + try_decisions_from_prev_decision(next_decisions, i_d, + prev_decision.end, + prev_decision.prev_pos_diff, + prev_decision.cost, + symbol, symbol_max_box); + } + } + + auto best_decision_path = pick_best_decision_path(decisions); + fix_decisions_split_count(best_decision_path); + return decisions_to_results(symbols, best_decision_path); +} + +void BoxBoundariesCalculator::try_decisions_from_prev_decision( + CharacterPlaceDecisions& next_decisions, + unsigned prev_decision_index, + CharBoundaryByBoxIndex start_bound, + double prev_decision_pos_diff, + double prev_decision_cost, + const BoxBoundaries& symbol, unsigned symbol_max_box) +{ + if (start_bound.split_index > 0) { + // 
attempt to split the start box once again + try_decision_from_prev_decision(next_decisions, prev_decision_index, + start_bound, + {start_bound.index, + start_bound.split_index + 1, + start_bound.split_count + 1}, + prev_decision_pos_diff, prev_decision_cost, + symbol); + // attempt to take the remaining split of the start box + try_decision_from_prev_decision(next_decisions, prev_decision_index, + start_bound, {start_bound.index, 0, 0}, + prev_decision_pos_diff, prev_decision_cost, + symbol); + } + for (unsigned end_box = start_bound.index.value() + 1; + end_box <= symbol_max_box; ++end_box) { + // try one or more full boxes + try_decision_from_prev_decision(next_decisions, prev_decision_index, + start_bound, {end_box, 0, 0}, + prev_decision_pos_diff, prev_decision_cost, + symbol); + // try zero or more full boxes and a split box + try_decision_from_prev_decision(next_decisions, prev_decision_index, + start_bound, {end_box, 1, 2}, + prev_decision_pos_diff, prev_decision_cost, + symbol); + } +} + +void BoxBoundariesCalculator::try_decision_from_prev_decision( + CharacterPlaceDecisions& next_decisions, + unsigned prev_decision_index, + CharBoundaryByBoxIndex start_bound, CharBoundaryByBoxIndex end_bound, + double prev_decision_pos_diff, + double prev_decision_cost, + const BoxBoundaries& symbol) +{ + // The following computes the additional cost of the decision. The + // following rules are used: + // + // - The center of the resulting merged boxes that we assign to the symbol + // is just the middle between the start and end boundaries. We don't use + // anything like weighted averages because presumably the boxes actually + // represent a single symbol and were split into parts due to bad quality + // input or a segmenter error. Instead we just consider whole area as a + // single box. 
+ // + // - In case of split box, the boundary position is computed according to + // the currently known split factor without taking into account that + // future decisions may split the box further. In theory we could go back + // to previous decisions and adjust the cost, but this is not currently + // implemented. + double cost = prev_decision_cost; + + bool is_split = end_bound.split_index != 0; + if (is_split) { + cost += config_.split_cost; + } + + unsigned merge_count = end_bound.index.value() - start_bound.index.value(); + if (start_bound.split_index == 0) { + merge_count--; + } + + cost += config_.merge_cost * merge_count; + + double merged_box_center = (get_box_pos_begin(start_bound) + + get_box_pos_end(end_bound)) / 2; + double symbol_center = symbol.middle(); + + double pos_diff = symbol_center - merged_box_center; + double pos_diff_for_cost = 0; + + if (pos_diff < 0 && pos_diff < prev_decision_pos_diff) { + if (prev_decision_pos_diff < 0) { + pos_diff_for_cost = prev_decision_pos_diff - pos_diff; + } else { + pos_diff_for_cost = -pos_diff; + } + } + + if (pos_diff > 0 && pos_diff > prev_decision_pos_diff) { + if (prev_decision_pos_diff > 0) { + pos_diff_for_cost = pos_diff - prev_decision_pos_diff; + } else { + pos_diff_for_cost = pos_diff; + } + } + + cost += config_.pos_diff_cost * pos_diff_for_cost / average_box_width_; + + next_decisions.add_place(prev_decision_index, start_bound, end_bound, + pos_diff, cost, config_.max_character_cost_diff); +} + + +double BoxBoundariesCalculator::get_box_pos_begin(CharBoundaryByBoxIndex bound) +{ + assert(bound.index.has_value()); + + if (bound.split_index == 0) { + return bounds_[*bound.index].begin; + } + assert(bound.index.value() > 0); + return get_box_split_pos(bounds_[bound.index.value() - 1], + bound.split_index, bound.split_count); +} + +double BoxBoundariesCalculator::get_box_pos_end(CharBoundaryByBoxIndex bound) +{ + assert(bound.index.has_value()); + assert(bound.index.value() > 0); + + if 
(bound.split_index == 0) { + return bounds_[bound.index.value() - 1].end; + } + return get_box_split_pos(bounds_[bound.index.value() - 1], + bound.split_index, bound.split_count); +} + + +int BoxBoundariesCalculator::farthest_decision_index( + const CharacterPlaceDecisions& decisions) +{ + unsigned best_decision = 0; + unsigned max_box_index = 0; + for (std::size_t i = 0; i < decisions.decisions.size(); ++i) { + const auto& decision = decisions.decisions[i]; + if (!decision.end.index.has_value()) { + continue; + } + + if (decision.end.split_index == 0 && + decision.end.index.value() > max_box_index) { + max_box_index = decision.end.index.value(); + best_decision = i; + } + } + return best_decision; +} + +std::pair + BoxBoundariesCalculator::possible_boxes_for_symbol(const BoxBoundaries& symbol) +{ + auto min = symbol.begin - config_.max_pos_diff * average_box_width_; + auto max = symbol.end + config_.max_pos_diff * average_box_width_; + + auto range_begin = std::partition_point(bounds_.begin(), bounds_.end(), + [min](const auto& b){ + return b.middle() < min; + }); + + auto range_end = std::partition_point(range_begin, bounds_.end(), + [max](const auto& b){ + return b.middle() < max; + }); + + if (range_begin == bounds_.end()) { + return { 0, 0 }; + } + return { std::distance(bounds_.begin(), range_begin), + std::distance(bounds_.begin(), range_end) }; +} + +std::vector + BoxBoundariesCalculator::pick_best_decision_path( + std::vector& decisions) { + + std::vector result; + result.resize(decisions.size()); + + unsigned next_best_decision = get_best_end_decision(decisions.back()); + for (int i = decisions.size(); i > 0; --i) { + int curr_index = i - 1; + const auto& curr_decisions = decisions[curr_index]; + const auto& curr_best_decision = curr_decisions.decisions[next_best_decision]; + next_best_decision = curr_best_decision.prev_index; + + result[curr_index] = curr_best_decision; + } + + return result; +} + +void BoxBoundariesCalculator::fix_decisions_split_count( 
+ std::vector& decisions) { + unsigned last_box_index = std::numeric_limits::max(); + unsigned last_box_split_count = 0; + + auto adjust_index = [&](CharBoundaryByBoxIndex& index) { + if (!index.index.has_value()) + return; + + // The box indexes are always increasing and the last index with nonzero + // split_count contains the largest split_count that we must apply to the + // rest of indexes with nonzero split_count and the same box index. + // Note that we iterate backwards in the loop below, so the order reverses + // here. + if (index.index.value() == last_box_index) { + if (index.split_count != 0) { + last_box_split_count = index.split_count; + } + index.split_count = last_box_split_count; + } else { + last_box_index = index.index.value(); + last_box_split_count = index.split_count; + } + }; + + for (auto it = decisions.rbegin(); it != decisions.rend(); it++) { + adjust_index(it->end); + adjust_index(it->begin); + } +} + +std::vector BoxBoundariesCalculator::decisions_to_results( + const std::vector& symbols, + const std::vector& decisions) +{ + std::vector results; + results.resize(symbols.size()); + + for (int i = decisions.size(); i > 0; --i) { + int curr_index = i - 1; + const auto& decision = decisions[curr_index]; + const auto& symbol = symbols[curr_index]; + + if (!decision.begin.index.has_value() || + !decision.end.index.has_value()) { + results[curr_index] = CharacterBoundaries{symbol.begin, 0, symbol.end, 0}; + continue; + } + + // The result is in terms of boxes that are at least partially assigned to + // characters. Decisions store bounds which need adjustment in case of + // split boxes. 
+ auto begin_index = decision.begin.index.value(); + if (decision.begin.split_count > 0) { + begin_index--; + } + + results[curr_index] = CharacterBoundaries{ + static_cast(get_box_pos_begin(decision.begin)), + begin_index, + static_cast(get_box_pos_end(decision.end)), + decision.end.index.value()}; + } + + return results; +} + +int BoxBoundariesCalculator::get_best_end_decision( + const CharacterPlaceDecisions& decisions) { + assert(!decisions.decisions.empty()); + + unsigned best_decision = 0; + double min_cost = std::numeric_limits::infinity(); + + for (unsigned i = 0; i < decisions.decisions.size(); ++i) { + const auto& decision = decisions.decisions[i]; + if (decision.end.split_index != 0) + continue; + if (decision.cost < min_cost) { + best_decision = i; + min_cost = decision.cost; + } + } + + return best_decision; +} + +} // namespace tesseract diff --git a/src/ccstruct/blob_bounds_calculator.h b/src/ccstruct/blob_bounds_calculator.h new file mode 100644 index 0000000000..fe49b96080 --- /dev/null +++ b/src/ccstruct/blob_bounds_calculator.h @@ -0,0 +1,264 @@ +/////////////////////////////////////////////////////////////////////// +// File: blob_bounds_calculator.h +// Description: Module for calculation of blob bounds from LSTM data +// Author: Povilas Kanapickas +// +// (C) Copyright 2022, Povilas Kanapickas +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_CCSTRUCT_BLOB_BOUNDS_CALCULATOR_H +#define TESSERACT_CCSTRUCT_BLOB_BOUNDS_CALCULATOR_H + +#include +#include +#include +#include + +namespace tesseract { + +/* This file contains an implementation of an algorithm for improving character + positions when using LSTM models. LSTM model output produces only approximate + character positions without boundary data. Matching it to the blobs that + comprise the characters is a non-trivial task, because the character positions + in the LSTM output have drift that is large enough for simple algorithms such + as "pick nearest blobs" to produce large amounts of errors. + + It can be noticed that while LSTM model output produces only approximate + character positions, the regular segmenter is pretty good. Most of the blob + boundaries correspond to boundaries of characters and most significant errors + are occasional blobs that correspond to multiple characters or multiple blobs + that correspond to a single character. + + Thus the basic idea of the algorithm is to treat the output of the regular + segmenter as a template to which LSTM model output is matched. The selection + of best match is done by assigning each unwanted property a cost and + then minimizing the total cost of the solution. The algorithm uses the + following costs: + + - cost for merging multiple blobs to represent a character + - cost for splitting a blob to represent multiple characters + - cost for difference between the positions of the blobs and characters + that they are matched to. + + The cost of difference between positions is computed not by simply + accumulating the sum of all position differences, but by only taking into + account additional difference of each character compared to previous + character. This way the algorithm does not attempt to "optimize" out of + place characters by adding unneeded blob merges and splits. 
+ + The optimization problem is solved by dynamic programming techniques by + noticing that assigning specific blobs to a character leaves us with a + slightly smaller problem. + + The approach is to place the first character in all potential positions + and record the outcomes. Then for each of these outcomes attempts are made + to place the second character at all potential positions and so on. + Whenever there are multiple decision paths to arrive at a situation where the + end of a specific character is at the same position, the path with the + lowest cost is picked and others are ignored. +*/ + +// Represents a character boundary in terms of index of a box in a list and +// potentially a partition within that box. +struct CharBoundaryByBoxIndex { + // The index of the box following the boundary. Box may be invalid. + std::optional index = 0; + + // The location of the boundary within the box. split_count == 0 means that + // the boundary is just before the box. Otherwise, the location is + // (split_index / split_count) position within the preceding box. + unsigned split_index = 0; + unsigned split_count = 0; + + bool operator==(const CharBoundaryByBoxIndex& other) const { + return index == other.index && + split_index == other.split_index && + split_count == other.split_count; + } + + bool operator!=(const CharBoundaryByBoxIndex& other) const { + return !(*this == other); + } +}; + +std::ostream& operator<<(std::ostream& out, const CharBoundaryByBoxIndex& d); + + +// Represents a placement of a specific character at specific location. +struct CharacterPlaceDecision { + // Index of the placement decision of the previous character. + unsigned prev_index; + // Placement of the start of a character in the input box list. + CharBoundaryByBoxIndex begin; + // Placement of the end of a character in the input box list. 
+ CharBoundaryByBoxIndex end; + // The difference of positions between the center of the previous character + // and the center of the assigned boxes + double prev_pos_diff = 0; + // The cost incurred so far + double cost = 0; +}; + +std::ostream& operator<<(std::ostream& out, const CharacterPlaceDecision& d); + + +// Represents a set of placement decisions for a specific character +struct CharacterPlaceDecisions { + std::vector decisions; + // minimum cost across all decisions + double min_cost = std::numeric_limits::infinity(); + + // Adds a character placement decision. + void add_place(unsigned prev_index, + CharBoundaryByBoxIndex begin, CharBoundaryByBoxIndex end, + double prev_pos_diff, double cost, double max_cost_diff); +}; + +// Represents bounds of a box in X direction +struct BoxBoundaries { + int begin = 0; + int end = 0; + + double middle() const { return (double(begin) + end) / 2; } +}; + + +// Represents resulting character boundaries. The exact X positions are +// provided as well as which input blobs the character corresponds to, which +// allows computing correct boundaries in the Y axis. +struct CharacterBoundaries { + int begin_x = 0; + + // Inclusive index of the beginning box. + unsigned begin_box_index = 0; + + int end_x = 0; + + // Exclusive index of the ending box. If box data is invalid, + // begin_box_index == end_box_index + unsigned end_box_index = 0; + + bool operator==(const CharacterBoundaries& other) const; +}; + +std::ostream& operator<<(std::ostream& out, const CharacterBoundaries& bounds); + + +struct BoxBoundariesCalculatorConfig +{ + // The cost of each merging of two input boxes. + double merge_cost = 2; + + // The cost of each split of an input box. + double split_cost = 2; + + // The cost of difference between the center of the symbol and the center of + // the input box. This cost is only incurred whenever a subsequent character + // "moves" in the wrong direction. 
The total cost is computed by multiplying + // the multiplier and the difference of positions relative to the average + // width of input boxes. + double pos_diff_cost = 1; + + // Defines which boxes to potentially consider for symbol. The number is + // relative to the average width of input boxes. + double max_pos_diff = 2; + + // Defines the maximum difference between minimum and maximum cost for all + // placements of a character. + double max_character_cost_diff = 5; +}; + +// See the description of the algorithm at the top of the file. +class BoxBoundariesCalculator { +public: + // Constructs the calculator for blob boundaries computed by regular + // segmenter. + BoxBoundariesCalculator(const std::vector& bounds, + const BoxBoundariesCalculatorConfig& config); + + // Computes improved character positions given LSTM model output. For the + // purposes of character positioning only the center coordinate is used. + // The start and end coordinates are used only as a fallback when the data + // does not match any input blobs. + std::vector + calculate_bounds(const std::vector& symbols); + +private: + + // This function takes all possible combinations of box boundaries between + // start_bound and symbol_max_box, computes the costs of each option and adds + // them to next_decisions array. The number of possibilities is approximately + // (symbol_max_box - start_bound.index) * 2. The number is twice the number + // of available boxes in range because we may want to split each box with + // subsequent symbol. 
+ void try_decisions_from_prev_decision(CharacterPlaceDecisions& next_decisions, + unsigned prev_decision_index, + CharBoundaryByBoxIndex start_bound, + double prev_decision_pos_diff, + double prev_decision_cost, + const BoxBoundaries& symbol, + unsigned symbol_max_box); + + void try_decision_from_prev_decision(CharacterPlaceDecisions& next_decisions, + unsigned prev_decision_index, + CharBoundaryByBoxIndex start_bound, + CharBoundaryByBoxIndex end_bound, + double prev_decision_pos_diff, + double prev_decision_cost, + const BoxBoundaries& symbol); + + double get_box_pos_begin(CharBoundaryByBoxIndex bound); + double get_box_pos_end(CharBoundaryByBoxIndex bound); + + double get_box_split_pos(const BoxBoundaries& b, unsigned split_index, + unsigned split_count) + { + return b.begin + (b.end - b.begin) * double(split_index) / split_count; + } + + static int farthest_decision_index(const CharacterPlaceDecisions& decisions); + + std::pair + possible_boxes_for_symbol(const BoxBoundaries& symbol); + + // Goes through the final decisions and picks full path of the best placement + // decision. + std::vector pick_best_decision_path( + std::vector& decisions); + + // When constructing decisions we didn't care to update split sizes of + // blobs when splitting more than once. As a result, splitting a blob into 4 + // parts splits at 0.5, 0.66 and 0.75 of the blob whereas the correct + // splits are at 0.25, 0.5, 0.75. We assume this does not matter when + // computing the costs, but for positions of the characters we need to + // produce exact results. + void fix_decisions_split_count(std::vector& decisions); + + std::vector + decisions_to_results(const std::vector& symbols, + const std::vector& decisions); + + // Finds the best decision from the final decisions. The best decision is + // such that it has minimum cost among decisions that end at an proper box + // boundary. 
+ static int get_best_end_decision(const CharacterPlaceDecisions& decisions); + +private: + std::vector bounds_; + BoxBoundariesCalculatorConfig config_; + double average_box_width_ = 0; +}; + +} // namespace tesseract + +#endif // TESSERACT_CCSTRUCT_BLOB_BOUNDS_CALCULATOR_H diff --git a/src/ccstruct/pageres.cpp b/src/ccstruct/pageres.cpp index 9e2a4ad587..6ef5da2d4c 100644 --- a/src/ccstruct/pageres.cpp +++ b/src/ccstruct/pageres.cpp @@ -24,6 +24,7 @@ #include "pageres.h" #include "blamer.h" // for BlamerBundle +#include "blob_bounds_calculator.h" // for BoxBoundariesCalculator #include "blobs.h" // for TWERD, TBLOB #include "boxword.h" // for BoxWord #include "errcode.h" // for ASSERT_HOST @@ -1269,36 +1270,6 @@ WERD_RES *PAGE_RES_IT::InsertSimpleCloneWord(const WERD_RES &clone_res, return new_res; } -// Helper computes the boundaries between blobs in the word. The blob bounds -// are likely very poor, if they come from LSTM, where it only outputs the -// character at one pixel within it, so we find the midpoints between them. -static void ComputeBlobEnds(const WERD_RES &word, const TBOX &clip_box, - C_BLOB_LIST *next_word_blobs, - std::vector *blob_ends) { - C_BLOB_IT blob_it(word.word->cblob_list()); - for (int length : word.best_state) { - // Get the bounding box of the fake blobs - TBOX blob_box = blob_it.data()->bounding_box(); - blob_it.forward(); - for (int b = 1; b < length; ++b) { - blob_box += blob_it.data()->bounding_box(); - blob_it.forward(); - } - // This blob_box is crap, so for now we are only looking for the - // boundaries between them. 
- int blob_end = INT32_MAX; - if (!blob_it.at_first() || next_word_blobs != nullptr) { - if (blob_it.at_first()) { - blob_it.set_to_list(next_word_blobs); - } - blob_end = (blob_box.right() + blob_it.data()->bounding_box().left()) / 2; - } - blob_end = ClipToRange(blob_end, clip_box.left(), clip_box.right()); - blob_ends->push_back(blob_end); - } - blob_ends->back() = clip_box.right(); -} - // Helper computes the bounds of a word by restricting it to existing words // that significantly overlap. static TBOX ComputeWordBounds(const tesseract::PointerVector &words, @@ -1345,6 +1316,40 @@ static TBOX ComputeWordBounds(const tesseract::PointerVector &words, return clipped_box; } +// Helper to compute input for BoxBoundariesCalculator +static std::vector ComputeFakeWordBlobXBounds( + const PointerVector &words) { + + std::vector result; + + for (size_t w = 0; w < words.size(); ++w) { + WERD_RES *word_w = words[w]; + + C_BLOB_IT blob_it(word_w->word->cblob_list()); + for (int length : word_w->best_state) { + TBOX blob_box = blob_it.data()->bounding_box(); + blob_it.forward(); + for (int b = 1; b < length; ++b) { + blob_box += blob_it.data()->bounding_box(); + blob_it.forward(); + } + result.push_back({blob_box.left(), blob_box.right()}); + } + } + return result; +} + +// Helper to compute input for BoxBoundariesCalculator +static std::vector ComputeBlobXBoundsFromTBOX( + const std::vector &boxes) { + std::vector result; + result.reserve(boxes.size()); + for (const auto& box : boxes) { + result.push_back({box.left(), box.right()}); + } + return result; +} + // Helper moves the src_blob to dest. If it isn't contained by clip_box, // the blob is replaced by a fake that is contained. The helper takes ownership // of the blob. 
@@ -1368,6 +1373,13 @@ static TBOX ClipAndAddBlob(C_BLOB *src_blob, C_BLOB_IT *dest_it, return box; } +// Helper to clip a box only in X direction +static TBOX ClipBoxX(const TBOX &box, int left, int right) { + int clip_left = ClipToRange(box.left(), left, right - 1); + int clip_right = ClipToRange(box.right(), left + 1, right); + return TBOX(clip_left, box.bottom(), clip_right, box.top()); +} + // Replaces the current WERD/WERD_RES with the given words. The given words // contain fake blobs that indicate the position of the characters. These are // replaced with real blobs from the current word as much as possible. @@ -1412,21 +1424,31 @@ void PAGE_RES_IT::ReplaceCurrentWord( } } ASSERT_HOST(!wr_it.cycled_list()); - // Since we only have an estimate of the bounds between blobs, use the blob - // x-middle as the determiner of where to put the blobs + + std::vector blob_boxes; + C_BLOB_IT src_b_it(input_word->word->cblob_list()); src_b_it.sort(&C_BLOB::SortByXMiddle); + for (src_b_it.mark_cycle_pt(); !src_b_it.cycled_list(); src_b_it.forward()) { + blob_boxes.push_back(src_b_it.data()->bounding_box()); + } + src_b_it.move_to_first(); + C_BLOB_IT rej_b_it(input_word->word->rej_cblob_list()); rej_b_it.sort(&C_BLOB::SortByXMiddle); + + auto fake_blob_bounds = ComputeFakeWordBlobXBounds(*words); + BoxBoundariesCalculator calculator{ComputeBlobXBoundsFromTBOX(blob_boxes), {}}; + auto char_bounds = calculator.calculate_bounds(fake_blob_bounds); + size_t char_bounds_i = 0; + size_t box_bounds_i = 0; + TBOX last_blob_box; + TBOX clip_box; for (size_t w = 0; w < words->size(); ++w) { WERD_RES *word_w = (*words)[w]; clip_box = ComputeWordBounds(*words, w, clip_box, wr_it_of_current_word); - // Compute blob boundaries. - std::vector blob_ends; - C_BLOB_LIST *next_word_blobs = - w + 1 < words->size() ? 
(*words)[w + 1]->word->cblob_list() : nullptr; - ComputeBlobEnds(*word_w, clip_box, next_word_blobs, &blob_ends); + // Remove the fake blobs on the current word, but keep safe for back-up if // no blob can be found. C_BLOB_LIST fake_blobs; @@ -1437,26 +1459,64 @@ void PAGE_RES_IT::ReplaceCurrentWord( C_BLOB_IT dest_it(word_w->word->cblob_list()); // Build the box word as we move the blobs. auto *box_word = new tesseract::BoxWord; - for (size_t i = 0; i < blob_ends.size(); ++i, fake_b_it.forward()) { - int end_x = blob_ends[i]; + + for (size_t i = 0; i < word_w->best_state.size(); ++i) { + const auto& char_bound = char_bounds[char_bounds_i++]; + TBOX blob_box; - // Add the blobs up to end_x. - while (!src_b_it.empty() && - src_b_it.data()->bounding_box().x_middle() < end_x) { - blob_box += ClipAndAddBlob(src_b_it.extract(), &dest_it, clip_box); - src_b_it.forward(); + if (char_bound.begin_box_index != char_bound.end_box_index) { + // The box indices in curr_char_bound will always be increasing, thus + // we can iterate src_b_it in the same order. + while (box_bounds_i < char_bound.begin_box_index) { + box_bounds_i++; + src_b_it.forward(); + } + + if (box_bounds_i > char_bound.begin_box_index) { + // The blob was split across multiple characters and has already + // been extracted for a previous character. We have the bounds + // of the blob and can create a fake blob out of it. + TBOX fake_box = ClipBoxX(last_blob_box, + char_bound.begin_x, char_bound.end_x); + blob_box += ClipAndAddBlob(C_BLOB::FakeBlob(fake_box), + &dest_it, clip_box); + } + + // Add all blobs that have not yet been assigned to any of the + // characters. 
+ while (box_bounds_i < char_bound.end_box_index) { + auto* src_blob = src_b_it.extract(); + last_blob_box = src_blob->bounding_box(); + TBOX inserted_box = ClipAndAddBlob(src_blob, &dest_it, clip_box); + + box_bounds_i++; + src_b_it.forward(); + + // Note that the blob may be split across multiple characters in + // which case we want to clip the box to the part that was "assigned" + // to the character. + blob_box += ClipBoxX(inserted_box, + char_bound.begin_x, char_bound.end_x); + } } + + // It's not clear where rejected blobs should be added because by + // definition we don't have enough information about them. So we just + // add them to whatever character follows. while (!rej_b_it.empty() && - rej_b_it.data()->bounding_box().x_middle() < end_x) { + rej_b_it.data()->bounding_box().x_middle() < char_bound.end_x) { blob_box += ClipAndAddBlob(rej_b_it.extract(), &dest_it, clip_box); rej_b_it.forward(); } + if (blob_box.null_box()) { // Use the original box as a back-up. blob_box = ClipAndAddBlob(fake_b_it.extract(), &dest_it, clip_box); } box_word->InsertBox(i, blob_box); + fake_b_it.forward(); } + delete word_w->box_word; word_w->box_word = box_word; if (!input_word->combination) {