From 51a3398a3c1448fdedc9dc7de7790639d877ca7d Mon Sep 17 00:00:00 2001 From: Povilas Kanapickas Date: Sun, 10 Apr 2022 22:59:47 +0300 Subject: [PATCH] Improved character position tracking when LSTM models are used When using LSTM models the accuracy of character bounding boxes is low with many blobs assigned to wrong characters. This is caused by the fact that LSTM model output produces only approximate character positions without boundary data. As a result the input blobs cannot be accurately mapped to characters, which compromises the accuracy of character bounding boxes. Currently this problem is solved as follows. The character boundaries are computed according to the character positions from the LSTM output by placing the boundaries at the middle between two character positions. The blobs are then assigned according to which character the center of the blob falls into. In other words the blobs are assigned to the nearest characters. This unfortunately produces a lot of errors because the character positions in the LSTM output have a tendency to drift, thus the nearest character is often not the right one. Fortunately while the LSTM model produces approximate positions, the blob boundaries produced by the regular segmenter are pretty good. Most of the time a single blob corresponds to a single character and vice-versa. The above is used to create an optimization algorithm that treats the output of the regular segmenter as a template to which LSTM model output is matched. The selection of best match is done by assigning each unwanted property of the outcome a cost and then minimizing the total cost of the solution. This reliably solves the most frequent error present in the current solution, in which blobs are simply assigned to the wrong character. As a result the new algorithm produces up to 20 times fewer errors. Fixes https://github.com/tesseract-ocr/tesseract/issues/1712. 
--- Makefile.am | 7 + src/ccstruct/blob_bounds_calculator.cpp | 491 ++++++++++++++++++++++++ src/ccstruct/blob_bounds_calculator.h | 278 ++++++++++++++ src/ccstruct/pageres.cpp | 150 +++++--- unittest/blob_bounds_calculator_test.cc | 197 ++++++++++ 5 files changed, 1078 insertions(+), 45 deletions(-) create mode 100644 src/ccstruct/blob_bounds_calculator.cpp create mode 100644 src/ccstruct/blob_bounds_calculator.h create mode 100644 unittest/blob_bounds_calculator_test.cc diff --git a/Makefile.am b/Makefile.am index 56cdd695ed..083b4eb2d7 100644 --- a/Makefile.am +++ b/Makefile.am @@ -250,6 +250,7 @@ endif # Rules for src/ccstruct. noinst_HEADERS += src/ccstruct/blamer.h +noinst_HEADERS += src/ccstruct/blob_bounds_calculator.h noinst_HEADERS += src/ccstruct/blobbox.h noinst_HEADERS += src/ccstruct/blobs.h noinst_HEADERS += src/ccstruct/blread.h @@ -293,6 +294,7 @@ noinst_HEADERS += src/ccstruct/params_training_featdef.h endif libtesseract_la_SOURCES += src/ccstruct/blamer.cpp +libtesseract_la_SOURCES += src/ccstruct/blob_bounds_calculator.cpp libtesseract_la_SOURCES += src/ccstruct/blobbox.cpp libtesseract_la_SOURCES += src/ccstruct/blobs.cpp libtesseract_la_SOURCES += src/ccstruct/blread.cpp @@ -1197,6 +1199,7 @@ if !DISABLED_LEGACY_ENGINE check_PROGRAMS += bitvector_test endif # !DISABLED_LEGACY_ENGINE endif # ENABLE_TRAINING +check_PROGRAMS += blob_bounds_calculator_test check_PROGRAMS += cleanapi_test check_PROGRAMS += colpartition_test if ENABLE_TRAINING @@ -1309,6 +1312,10 @@ bitvector_test_CPPFLAGS = $(unittest_CPPFLAGS) bitvector_test_LDADD = $(TRAINING_LIBS) endif # !DISABLED_LEGACY_ENGINE +blob_bounds_calculator_test_SOURCES = unittest/blob_bounds_calculator_test.cc +blob_bounds_calculator_test_CPPFLAGS = $(unittest_CPPFLAGS) +blob_bounds_calculator_test_LDADD = $(TESS_LIBS) + cleanapi_test_SOURCES = unittest/cleanapi_test.cc cleanapi_test_CPPFLAGS = $(unittest_CPPFLAGS) cleanapi_test_LDADD = $(TESS_LIBS) diff --git 
a/src/ccstruct/blob_bounds_calculator.cpp b/src/ccstruct/blob_bounds_calculator.cpp new file mode 100644 index 0000000000..786749bda0 --- /dev/null +++ b/src/ccstruct/blob_bounds_calculator.cpp @@ -0,0 +1,491 @@ +/////////////////////////////////////////////////////////////////////// +// File: blob_bounds_calculator.cpp +// Description: Module for calculation of blob bounds from LSTM data +// Author: Povilas Kanapickas +// +// (C) Copyright 2022, Povilas Kanapickas +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +/////////////////////////////////////////////////////////////////////// + +#include "blob_bounds_calculator.h" +#include +#include +#include + +namespace tesseract { + +std::ostream& operator<<(std::ostream& out, const CharBoundaryByBoxIndex& d) { + out << "CharBoundaryByBoxIndex{ " + << d.index << ", " + << d.split_index << " " << d.split_count << " }"; + return out; +} + +std::ostream& operator<<(std::ostream& out, const CharacterPlaceDecision& d) { + out << "CharacterPlaceDecision{" + << " prev_index: " << d.prev_index + << " has_boxes: " << d.has_boxes + << " begin: " << d.begin + << " end: " << d.end + << " prev_pos_diff: " << d.prev_pos_diff + << " cost: " << d.cost + << " }"; + return out; +} + +void CharacterPlaceDecisions::add_place(unsigned prev_index, bool has_boxes, + CharBoundaryByBoxIndex begin, + CharBoundaryByBoxIndex end, + double prev_pos_diff, + double cost, double max_cost_diff) { + if (cost > min_cost + max_cost_diff) { + return; + } + + int replace_existing_decision_index = -1; + for (std::size_t i = 0; i < decisions.size(); ++i) { + if (decisions[i].end == end) { + if (cost < decisions[i].cost) { + replace_existing_decision_index = i; + break; + } else { + // existing decision is better + return; + } + } + } + + CharacterPlaceDecision new_decision{prev_index, has_boxes, begin, end, + prev_pos_diff, cost}; + if (replace_existing_decision_index >= 0) { + decisions[replace_existing_decision_index] = new_decision; + } else { + decisions.push_back(new_decision); + } + + if (cost < min_cost) { + min_cost = cost; + + // Remove all decisions that no longer satisfy maximum cost difference + // requirement. 
+ auto last_it = std::remove_if(decisions.begin(), decisions.end(), + [=](const auto& d) { + return d.cost > min_cost + max_cost_diff; + }); + decisions.erase(last_it, decisions.end()); + } +} + +bool CharacterBoundaries::operator==(const CharacterBoundaries& other) const { + return begin_x == other.begin_x && + begin_box_index == other.begin_box_index && + end_x == other.end_x && + end_box_index == other.end_box_index; +} + +std::ostream& operator<<(std::ostream& out, const CharacterBoundaries& bounds) { + out << "CharacterBoundaries{" << bounds.begin_x << ", " + << bounds.begin_box_index << ", " + << bounds.end_x << ", " + << bounds.end_box_index << "}"; + return out; +} + +BoxBoundariesCalculator::BoxBoundariesCalculator( + const std::vector& bounds, + const BoxBoundariesCalculatorConfig& config) : + bounds_{bounds}, + config_{config} +{ + if (!bounds_.empty()) { + double width_sum = 0; + for (const auto& b : bounds) { + width_sum += b.end - b.begin; + } + average_box_width_ = width_sum / static_cast(bounds.size()); + } +} + +std::vector + BoxBoundariesCalculator::calculate_bounds(const std::vector& symbols) +{ + std::vector decisions; + decisions.resize(symbols.size()); + + // The initial state + CharacterPlaceDecisions init_decisions; + init_decisions.add_place(0, true, {0, 0, 0}, {0, 0, 0}, 0, 0, + config_.max_character_cost_diff); + + for (std::size_t is = 0; is != symbols.size(); ++is) { + const auto& symbol = symbols[is]; + const auto& prev_decisions = is == 0 ? init_decisions : decisions[is - 1]; + auto& next_decisions = decisions[is]; + + auto [symbol_min_box, symbol_max_box] = possible_boxes_for_symbol(symbol); + + unsigned prev_farthest_index = farthest_decision_index(prev_decisions); + const auto& prev_farthest_decision = + prev_decisions.decisions[prev_farthest_index]; + + if (symbol_min_box == symbol_max_box) { + // There are no boxes for the current symbol. Select the previous + // decision which went farthest and was at box boundary. 
+ // + // We ignore everything that affects the cost for this symbol because the + // cost will be the same for all decision paths, thus will not affect + // which decision path is ultimately selected. + auto new_cost = prev_farthest_decision.cost + + config_.symbol_with_no_box_cost; + + // We reset prev_pos_diff as we are effectively starting over. + next_decisions.add_place(prev_farthest_index, false, {{}, 0, 0}, + prev_farthest_decision.end, + 0, new_cost, + config_.max_character_cost_diff); + continue; + } + + if (prev_farthest_decision.end.index < symbol_min_box) { + // There are boxes that can't be attributed to any of the symbols because + // they are too far away. In this case we pick the previous decision path + // that went farthest and force the first box to be attributed to the + // symbol. + // + // We ignore everything that affects the cost for this symbol because the + // cost will be the same for all decision paths, thus will not affect + // which decision path is ultimately selected. + + auto boxes_with_no_symbols = + symbol_min_box - prev_farthest_decision.end.index; + + auto new_cost = prev_farthest_decision.cost + + config_.box_with_no_symbol_cost * boxes_with_no_symbols; + + // We reset prev_pos_diff as we are effectively starting over. 
+ try_decisions_from_prev_decision(next_decisions, prev_farthest_index, + {symbol_min_box, 0, 0}, + 0, new_cost, + symbol, symbol_max_box); + continue; + } + + for (std::size_t i_d = 0; i_d < prev_decisions.decisions.size(); ++i_d) { + const auto& prev_decision = prev_decisions.decisions[i_d]; + try_decisions_from_prev_decision(next_decisions, i_d, + prev_decision.end, + prev_decision.prev_pos_diff, + prev_decision.cost, + symbol, symbol_max_box); + } + } + + add_costs_for_remaining_boxes(decisions.back()); + auto best_decision_path = pick_best_decision_path(decisions); + fix_decisions_split_count(best_decision_path); + return decisions_to_results(symbols, best_decision_path); +} + +void BoxBoundariesCalculator::try_decisions_from_prev_decision( + CharacterPlaceDecisions& next_decisions, + unsigned prev_decision_index, + CharBoundaryByBoxIndex start_bound, + double prev_decision_pos_diff, + double prev_decision_cost, + const BoxBoundaries& symbol, unsigned symbol_max_box) +{ + if (start_bound.split_index > 0) { + // attempt to split the start box once again + try_decision_from_prev_decision(next_decisions, prev_decision_index, + start_bound, + {start_bound.index, + start_bound.split_index + 1, + start_bound.split_count + 1}, + prev_decision_pos_diff, prev_decision_cost, + symbol); + // attempt to take the remaining split of the start box + try_decision_from_prev_decision(next_decisions, prev_decision_index, + start_bound, {start_bound.index, 0, 0}, + prev_decision_pos_diff, prev_decision_cost, + symbol); + } + for (unsigned end_box = start_bound.index + 1; + end_box <= symbol_max_box; ++end_box) { + // try one or more full boxes + try_decision_from_prev_decision(next_decisions, prev_decision_index, + start_bound, {end_box, 0, 0}, + prev_decision_pos_diff, prev_decision_cost, + symbol); + // try zero or more full boxes and a split box + try_decision_from_prev_decision(next_decisions, prev_decision_index, + start_bound, {end_box, 1, 2}, + prev_decision_pos_diff, 
prev_decision_cost, + symbol); + } +} + +void BoxBoundariesCalculator::try_decision_from_prev_decision( + CharacterPlaceDecisions& next_decisions, + unsigned prev_decision_index, + CharBoundaryByBoxIndex start_bound, CharBoundaryByBoxIndex end_bound, + double prev_decision_pos_diff, + double prev_decision_cost, + const BoxBoundaries& symbol) +{ + // The following computes the additional cost of the decision. The + // following rules are used: + // + // - The center of the resulting merged boxes that we assign to the symbol + // is just the middle between the start and end boundaries. We don't use + // anything like weighted averages because presumably the boxes actually + // represent a single symbol and were split into parts due to bad quality + // input or a segmenter error. Instead we just consider whole area as a + // single box. + // + // - In case of split box, the boundary position is computed according to + // the currently known split factor without taking into account that + // future decisions may split the box further. In theory we could go back + // to previous decisions and adjust the cost, but this is not currently + // implemented. 
+ double cost = prev_decision_cost; + + bool is_split = end_bound.split_index != 0; + if (is_split) { + cost += config_.split_cost; + } + + unsigned merge_count = end_bound.index - start_bound.index; + if (start_bound.split_index == 0) { + merge_count--; + } + + cost += config_.merge_cost * merge_count; + + double merged_box_center = (get_box_pos_begin(start_bound) + + get_box_pos_end(end_bound)) / 2; + double symbol_center = symbol.middle(); + + double pos_diff = symbol_center - merged_box_center; + double pos_diff_for_cost = 0; + + if (pos_diff < 0 && pos_diff < prev_decision_pos_diff) { + if (prev_decision_pos_diff < 0) { + pos_diff_for_cost = prev_decision_pos_diff - pos_diff; + } else { + pos_diff_for_cost = -pos_diff; + } + } + + if (pos_diff > 0 && pos_diff > prev_decision_pos_diff) { + if (prev_decision_pos_diff > 0) { + pos_diff_for_cost = pos_diff - prev_decision_pos_diff; + } else { + pos_diff_for_cost = pos_diff; + } + } + + cost += config_.pos_diff_cost * pos_diff_for_cost / average_box_width_; + + next_decisions.add_place(prev_decision_index, true, start_bound, end_bound, + pos_diff, cost, config_.max_character_cost_diff); +} + + +double BoxBoundariesCalculator::get_box_pos_begin(CharBoundaryByBoxIndex bound) +{ + if (bound.split_index == 0) { + return bounds_[bound.index].begin; + } + assert(bound.index > 0); + return get_box_split_pos(bounds_[bound.index - 1], + bound.split_index, bound.split_count); +} + +double BoxBoundariesCalculator::get_box_pos_end(CharBoundaryByBoxIndex bound) +{ + assert(bound.index > 0); + + if (bound.split_index == 0) { + return bounds_[bound.index - 1].end; + } + return get_box_split_pos(bounds_[bound.index - 1], + bound.split_index, bound.split_count); +} + + +int BoxBoundariesCalculator::farthest_decision_index( + const CharacterPlaceDecisions& decisions) +{ // Returns the index of the cheapest decision among those that end on the farthest whole-box boundary; 0 if none ends on a box boundary. + unsigned best_decision = 0; + unsigned max_box_index = 0; + double best_decision_cost = std::numeric_limits::infinity(); + + for (std::size_t i = 0; i < 
decisions.decisions.size(); ++i) { + const auto& decision = decisions.decisions[i]; + + if (decision.end.split_index == 0) { + if ((decision.end.index == max_box_index && + decision.cost < best_decision_cost) || + decision.end.index > max_box_index) { // a farther end wins outright; equal ends are decided by cost + max_box_index = decision.end.index; + best_decision_cost = decision.cost; + best_decision = i; + } + } + } + return best_decision; +} + +std::pair + BoxBoundariesCalculator::possible_boxes_for_symbol(const BoxBoundaries& symbol) +{ + auto min = symbol.begin - config_.max_pos_diff * average_box_width_; + auto max = symbol.end + config_.max_pos_diff * average_box_width_; + + auto range_begin = std::partition_point(bounds_.begin(), bounds_.end(), + [min](const auto& b){ + return b.middle() < min; + }); + + auto range_end = std::partition_point(range_begin, bounds_.end(), + [max](const auto& b){ + return b.middle() < max; + }); + + if (range_begin == bounds_.end()) { + return { 0, 0 }; + } + return { std::distance(bounds_.begin(), range_begin), + std::distance(bounds_.begin(), range_end) }; +} + +void BoxBoundariesCalculator::add_costs_for_remaining_boxes( + CharacterPlaceDecisions& decisions) { + + for (auto& decision : decisions.decisions) { + if (decision.end.split_index != 0) { + // We don't care about decisions that don't end on a box boundary. 
+ continue; + } + assert(decision.end.index > 0); + + auto unused_boxes = bounds_.size() - decision.end.index; + decision.cost += unused_boxes * config_.box_with_no_symbol_cost; + } +} + +std::vector + BoxBoundariesCalculator::pick_best_decision_path( + std::vector& decisions) { + + std::vector result; + result.resize(decisions.size()); + + unsigned next_best_decision = get_best_end_decision(decisions.back()); + for (int i = decisions.size(); i > 0; --i) { + int curr_index = i - 1; + const auto& curr_decisions = decisions[curr_index]; + const auto& curr_best_decision = curr_decisions.decisions[next_best_decision]; + next_best_decision = curr_best_decision.prev_index; + + result[curr_index] = curr_best_decision; + } + + return result; +} + +void BoxBoundariesCalculator::fix_decisions_split_count( + std::vector& decisions) { + unsigned last_box_index = std::numeric_limits::max(); + unsigned last_box_split_count = 0; + + auto adjust_index = [&](CharBoundaryByBoxIndex& index) { + // The box indexes are always increasing and the last index with nonzero + // split_count contains the largest split_count that we must apply to the + // rest of indexes with nonzero split_count and the same box index. + // Note that we iterate backwards in the loop below, so the order reverses + // here. 
+ if (index.index == last_box_index) { + if (index.split_count != 0) { + last_box_split_count = index.split_count; + } + index.split_count = last_box_split_count; + } else { + last_box_index = index.index; + last_box_split_count = index.split_count; + } + }; + + for (auto it = decisions.rbegin(); it != decisions.rend(); it++) { + if (it->has_boxes) { + adjust_index(it->end); + adjust_index(it->begin); + } + } +} + +std::vector BoxBoundariesCalculator::decisions_to_results( + const std::vector& symbols, + const std::vector& decisions) +{ + std::vector results; + results.resize(symbols.size()); + + for (int i = decisions.size(); i > 0; --i) { + int curr_index = i - 1; + const auto& decision = decisions[curr_index]; + const auto& symbol = symbols[curr_index]; + + if (!decision.has_boxes) { + results[curr_index] = CharacterBoundaries{symbol.begin, 0, symbol.end, 0}; + continue; + } + + // The result is in terms of boxes that are at least partially assigned to + // characters. Decisions store bounds which need adjustment in case of + // split boxes. 
+ auto begin_index = decision.begin.index; + if (decision.begin.split_count > 0) { + begin_index--; + } + + results[curr_index] = CharacterBoundaries{ + static_cast(get_box_pos_begin(decision.begin)), + begin_index, + static_cast(get_box_pos_end(decision.end)), + decision.end.index}; + } + + return results; +} + +int BoxBoundariesCalculator::get_best_end_decision( + const CharacterPlaceDecisions& decisions) { + assert(!decisions.decisions.empty()); + + unsigned best_decision = 0; + double min_cost = std::numeric_limits::infinity(); + + for (unsigned i = 0; i < decisions.decisions.size(); ++i) { + const auto& decision = decisions.decisions[i]; + if (decision.end.split_index != 0) + continue; + if (decision.cost < min_cost) { + best_decision = i; + min_cost = decision.cost; + } + } + + return best_decision; +} + +} // namespace tesseract diff --git a/src/ccstruct/blob_bounds_calculator.h b/src/ccstruct/blob_bounds_calculator.h new file mode 100644 index 0000000000..ed69e3669e --- /dev/null +++ b/src/ccstruct/blob_bounds_calculator.h @@ -0,0 +1,278 @@ +/////////////////////////////////////////////////////////////////////// +// File: blob_bounds_calculator.h +// Description: Module for calculation of blob bounds from LSTM data +// Author: Povilas Kanapickas +// +// (C) Copyright 2022, Povilas Kanapickas +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_CCSTRUCT_BLOB_BOUNDS_CALCULATOR_H +#define TESSERACT_CCSTRUCT_BLOB_BOUNDS_CALCULATOR_H + +#include +#include +#include +#include + +namespace tesseract { + +/* This file contains an implementation of an algorithm for improving character + positions when using LSTM models. LSTM model output produces only approximate + character positions without boundary data. Matching it to the blobs that + comprise the characters is a non-trivial task, because the character positions + in the LSTM output have drift that is large enough for simple algorithms such + as "pick nearest blobs" to produce large amounts of errors. + + It can be noticed that while LSTM model output produces only approximate + character positions, the regular segmenter is pretty good. Most of the blob + boundaries correspond to boundaries of characters and most significant errors + are occasional blobs that correspond to multiple characters or multiple blobs + that correspond to a single character. + + Thus the basic idea of the algorithm is to treat the output of the regular + segmenter as a template to which LSTM model output is matched. The selection + of best match is done by assigning each unwanted property a cost and + then minimizing the total cost of the solution. The algorithm uses the + following costs: + + - cost for merging multiple blobs to represent a character + - cost for splitting a blob to represent multiple characters + - cost for difference between the positions of the blobs and characters + that they are matched to. + + The cost of difference between positions is computed not by simply + accumulating the sum of all position differences, but by only taking into + account additional difference of each character compared to previous + character. This way the algorithm does not attempt to "optimize" out of + place characters by adding unneeded blob merges and splits. 
+ + The optimization problem is solved by dynamic programming techniques by + noticing that assigning specific blobs to a character leaves us with a + slightly smaller problem. + + The approach is to place the first character in all potential positions + and record the outcomes. Then for each of these outcomes attempts are made + to place the second character at all potential positions and so on. + Whenever there are multiple decision paths to arrive to a situation when the + end of a specific character is at the same position, the path with the + lowest cost is picked and others are ignored. +*/ + +// Represents a character boundary in terms of index of a box in a list and +// potentially partition within that box. +struct CharBoundaryByBoxIndex { + // The index of the box following the boundary. + unsigned index = 0; + + // The location of the boundary within the box. split_count == 0 means that + // the boundary is just before the box. Otherwise, the location is + // (split_index / split_count) position within the preceding box. + unsigned split_index = 0; + unsigned split_count = 0; + + bool operator==(const CharBoundaryByBoxIndex& other) const { + return index == other.index && + split_index == other.split_index && + split_count == other.split_count; + } + + bool operator!=(const CharBoundaryByBoxIndex& other) const { + return !(*this == other); + } +}; + +std::ostream& operator<<(std::ostream& out, const CharBoundaryByBoxIndex& d); + + +// Represents a placement of a specific character at specific location. +struct CharacterPlaceDecision { + // Index of the placement decision of the previous character. + unsigned prev_index; + // Whether the character had any boxes assigned to it. If not, then the + // data stored in `begin` in not defined. + bool has_boxes = false; + // Placement of the start of a character in the input box list. + CharBoundaryByBoxIndex begin; + // Placement of the end of a character in the input box list. 
+ CharBoundaryByBoxIndex end; + // The difference of positions between the center of the previous character + // and the center of the assigned boxes + double prev_pos_diff = 0; + // The cost incurred so far + double cost = 0; +}; + +std::ostream& operator<<(std::ostream& out, const CharacterPlaceDecision& d); + + +// Represents a set of placement decisions for a specific character +struct CharacterPlaceDecisions { + std::vector decisions; + // minimum cost across all decisions + double min_cost = std::numeric_limits::infinity(); + + // Adds a character placement decision. + void add_place(unsigned prev_index, bool has_boxes, + CharBoundaryByBoxIndex begin, CharBoundaryByBoxIndex end, + double prev_pos_diff, double cost, double max_cost_diff); +}; + +// Represents bounds of a box in X direction +struct BoxBoundaries { + int begin = 0; + int end = 0; + + double middle() const { return (double(begin) + end) / 2; } +}; + + +// Represents resulting character boundaries. The exact X positions are +// provided as well as which input blobs the character corresponds to, which +// allows computing correct boundaries in the Y axis. +struct CharacterBoundaries { + int begin_x = 0; + + // Inclusive index of the beginning box. + unsigned begin_box_index = 0; + + int end_x = 0; + + // Exclusive index of the ending box. If box data is invalid, + // begin_box_index == end_box_index + unsigned end_box_index = 0; + + bool operator==(const CharacterBoundaries& other) const; +}; + +std::ostream& operator<<(std::ostream& out, const CharacterBoundaries& bounds); + + +struct BoxBoundariesCalculatorConfig +{ + // The cost of each merging of two input boxes. + double merge_cost = 2; + + // The cost of each split of two input boxes. 
+ double split_cost = 2; + + // The cost of each box that is not attributed to any symbol + double box_with_no_symbol_cost = 2.2; + + // The cost of each symbol that has no boxes + double symbol_with_no_box_cost = 2.2; + + // The cost of difference between the center the symbol and the center of + // the input box. This cost is only incurred whenever subsequent character + // "moves" in wrong direction. The total cost is computed by multiplying + // the multiplier and the difference of positions relative to the average + // width of input boxes. + double pos_diff_cost = 1; + + // Defines which boxes to potentially consider for symbol. The number is + // relative to the average width of input boxes. + double max_pos_diff = 2; + + // Defines the maximum difference between minimum and maximum cost for all + // placements of a character. + double max_character_cost_diff = 5; +}; + +// See the description of the algorithm at the top of the file. +class BoxBoundariesCalculator { +public: + // Constructs the calculator for blob boundaries computed by regular + // segmenter. + BoxBoundariesCalculator(const std::vector& bounds, + const BoxBoundariesCalculatorConfig& config); + + // Computes improved character positions given LSTM model output. For the + // purposes of character positioning only the center coordinate is used. + // The start and end coordinates are used only as a fallback when the data + // does not match any input blobs. + std::vector + calculate_bounds(const std::vector& symbols); + +private: + + // This function takes all possible combinations of box boundaries between + // start_bound and symbol_max_box, computes the costs of each option and adds + // them to next_decisions array. The number of possibilities is approximately + // (symbol_max_box - start_bound.index) * 2. The number is twice the number + // of available boxes in range because we may want to split each box with + // subsequent symbol. 
+ void try_decisions_from_prev_decision(CharacterPlaceDecisions& next_decisions, + unsigned prev_decision_index, + CharBoundaryByBoxIndex start_bound, + double prev_decision_pos_diff, + double prev_decision_cost, + const BoxBoundaries& symbol, + unsigned symbol_max_box); + + void try_decision_from_prev_decision(CharacterPlaceDecisions& next_decisions, + unsigned prev_decision_index, + CharBoundaryByBoxIndex start_bound, + CharBoundaryByBoxIndex end_bound, + double prev_decision_pos_diff, + double prev_decision_cost, + const BoxBoundaries& symbol); + + double get_box_pos_begin(CharBoundaryByBoxIndex bound); + double get_box_pos_end(CharBoundaryByBoxIndex bound); + + double get_box_split_pos(const BoxBoundaries& b, unsigned split_index, + unsigned split_count) + { + return b.begin + (b.end - b.begin) * double(split_index) / split_count; + } + + static int farthest_decision_index(const CharacterPlaceDecisions& decisions); + + std::pair + possible_boxes_for_symbol(const BoxBoundaries& symbol); + + + // Goes through the decisions and adds costs for all boxes that have not + // been added to a symbol. + void add_costs_for_remaining_boxes(CharacterPlaceDecisions& decisions); + + // Goes through the final decisions and picks full path of the best placement + // decision. + std::vector pick_best_decision_path( + std::vector& decisions); + + // When constructing decisions we didn't care to update split sizes of + // blobs when splitting more than once. As a result, splitting a blob into 4 + // parts splits at 0.5, 0.66 and 0.75 of the blob whereas the correct + // splits are at 0.25, 0.5, 0.75. We assume this does not matter when + // computing the costs, but for positions of the characters we need to + // produce exact results. + void fix_decisions_split_count(std::vector& decisions); + + std::vector + decisions_to_results(const std::vector& symbols, + const std::vector& decisions); + + // Finds the best decision from the final decisions. 
The best decision is + // such that it has minimum cost among decisions that end at an proper box + // boundary. + static int get_best_end_decision(const CharacterPlaceDecisions& decisions); + +private: + std::vector bounds_; + BoxBoundariesCalculatorConfig config_; + double average_box_width_ = 0; +}; + +} // namespace tesseract + +#endif // TESSERACT_CCSTRUCT_BLOB_BOUNDS_CALCULATOR_H diff --git a/src/ccstruct/pageres.cpp b/src/ccstruct/pageres.cpp index 58401da46b..bc0090009c 100644 --- a/src/ccstruct/pageres.cpp +++ b/src/ccstruct/pageres.cpp @@ -24,6 +24,7 @@ #include "pageres.h" #include "blamer.h" // for BlamerBundle +#include "blob_bounds_calculator.h" // for BoxBoundariesCalculator #include "blobs.h" // for TWERD, TBLOB #include "boxword.h" // for BoxWord #include "errcode.h" // for ASSERT_HOST @@ -1273,36 +1274,6 @@ WERD_RES *PAGE_RES_IT::InsertSimpleCloneWord(const WERD_RES &clone_res, return new_res; } -// Helper computes the boundaries between blobs in the word. The blob bounds -// are likely very poor, if they come from LSTM, where it only outputs the -// character at one pixel within it, so we find the midpoints between them. -static void ComputeBlobEnds(const WERD_RES &word, const TBOX &clip_box, - C_BLOB_LIST *next_word_blobs, - std::vector *blob_ends) { - C_BLOB_IT blob_it(word.word->cblob_list()); - for (int length : word.best_state) { - // Get the bounding box of the fake blobs - TBOX blob_box = blob_it.data()->bounding_box(); - blob_it.forward(); - for (int b = 1; b < length; ++b) { - blob_box += blob_it.data()->bounding_box(); - blob_it.forward(); - } - // This blob_box is crap, so for now we are only looking for the - // boundaries between them. 
- int blob_end = INT32_MAX; - if (!blob_it.at_first() || next_word_blobs != nullptr) { - if (blob_it.at_first()) { - blob_it.set_to_list(next_word_blobs); - } - blob_end = (blob_box.right() + blob_it.data()->bounding_box().left()) / 2; - } - blob_end = ClipToRange(blob_end, clip_box.left(), clip_box.right()); - blob_ends->push_back(blob_end); - } - blob_ends->back() = clip_box.right(); -} - // Helper computes the bounds of a word by restricting it to existing words // that significantly overlap. static TBOX ComputeWordBounds(const tesseract::PointerVector &words, @@ -1349,6 +1320,40 @@ static TBOX ComputeWordBounds(const tesseract::PointerVector &words, return clipped_box; } +// Helper to compute input for BoxBoundariesCalculator +static std::vector ComputeFakeWordBlobXBounds( + const PointerVector &words) { + + std::vector result; + + for (size_t w = 0; w < words.size(); ++w) { + WERD_RES *word_w = words[w]; + + C_BLOB_IT blob_it(word_w->word->cblob_list()); + for (int length : word_w->best_state) { + TBOX blob_box = blob_it.data()->bounding_box(); + blob_it.forward(); + for (int b = 1; b < length; ++b) { + blob_box += blob_it.data()->bounding_box(); + blob_it.forward(); + } + result.push_back({blob_box.left(), blob_box.right()}); + } + } + return result; +} + +// Helper to compute input for BoxBoundariesCalculator +static std::vector ComputeBlobXBoundsFromTBOX( + const std::vector &boxes) { + std::vector result; + result.reserve(boxes.size()); + for (const auto& box : boxes) { + result.push_back({box.left(), box.right()}); + } + return result; +} + // Helper moves the src_blob to dest. If it isn't contained by clip_box, // the blob is replaced by a fake that is contained. The helper takes ownership // of the blob. 
@@ -1372,6 +1377,13 @@ static TBOX ClipAndAddBlob(C_BLOB *src_blob, C_BLOB_IT *dest_it, return box; } +// Helper to clip a box only in X direction +static TBOX ClipBoxX(const TBOX &box, int left, int right) { + int clip_left = ClipToRange(box.left(), left, right - 1); + int clip_right = ClipToRange(box.right(), left + 1, right); + return TBOX(clip_left, box.bottom(), clip_right, box.top()); +} + // Replaces the current WERD/WERD_RES with the given words. The given words // contain fake blobs that indicate the position of the characters. These are // replaced with real blobs from the current word as much as possible. @@ -1416,21 +1428,31 @@ void PAGE_RES_IT::ReplaceCurrentWord( } } ASSERT_HOST(!wr_it.cycled_list()); - // Since we only have an estimate of the bounds between blobs, use the blob - // x-middle as the determiner of where to put the blobs + + std::vector blob_boxes; + C_BLOB_IT src_b_it(input_word->word->cblob_list()); src_b_it.sort(&C_BLOB::SortByXMiddle); + for (src_b_it.mark_cycle_pt(); !src_b_it.cycled_list(); src_b_it.forward()) { + blob_boxes.push_back(src_b_it.data()->bounding_box()); + } + src_b_it.move_to_first(); + C_BLOB_IT rej_b_it(input_word->word->rej_cblob_list()); rej_b_it.sort(&C_BLOB::SortByXMiddle); + + auto fake_blob_bounds = ComputeFakeWordBlobXBounds(*words); + BoxBoundariesCalculator calculator{ComputeBlobXBoundsFromTBOX(blob_boxes), {}}; + auto char_bounds = calculator.calculate_bounds(fake_blob_bounds); + size_t char_bounds_i = 0; + size_t box_bounds_i = 0; + TBOX last_blob_box; + TBOX clip_box; for (size_t w = 0; w < words->size(); ++w) { WERD_RES *word_w = (*words)[w]; clip_box = ComputeWordBounds(*words, w, clip_box, wr_it_of_current_word); - // Compute blob boundaries. - std::vector blob_ends; - C_BLOB_LIST *next_word_blobs = - w + 1 < words->size() ? 
(*words)[w + 1]->word->cblob_list() : nullptr; - ComputeBlobEnds(*word_w, clip_box, next_word_blobs, &blob_ends); + // Remove the fake blobs on the current word, but keep safe for back-up if // no blob can be found. C_BLOB_LIST fake_blobs; @@ -1441,26 +1463,64 @@ void PAGE_RES_IT::ReplaceCurrentWord( C_BLOB_IT dest_it(word_w->word->cblob_list()); // Build the box word as we move the blobs. auto *box_word = new tesseract::BoxWord; - for (size_t i = 0; i < blob_ends.size(); ++i, fake_b_it.forward()) { - int end_x = blob_ends[i]; + + for (size_t i = 0; i < word_w->best_state.size(); ++i) { + const auto& char_bound = char_bounds[char_bounds_i++]; + TBOX blob_box; - // Add the blobs up to end_x. - while (!src_b_it.empty() && - src_b_it.data()->bounding_box().x_middle() < end_x) { - blob_box += ClipAndAddBlob(src_b_it.extract(), &dest_it, clip_box); - src_b_it.forward(); + if (char_bound.begin_box_index != char_bound.end_box_index) { + // The box indices in char_bounds will always be increasing, thus + // we can iterate src_b_it in the same order. + while (box_bounds_i < char_bound.begin_box_index) { + box_bounds_i++; + src_b_it.forward(); + } + + if (box_bounds_i > char_bound.begin_box_index) { + // The blob was split across multiple characters and has already + // been extracted for a previous character. We have the bounds + // of the blob and can create a fake blob out of it. + TBOX fake_box = ClipBoxX(last_blob_box, + char_bound.begin_x, char_bound.end_x); + blob_box += ClipAndAddBlob(C_BLOB::FakeBlob(fake_box), + &dest_it, clip_box); + } + + // Add all blobs that have not yet been assigned to any of the + // characters.
+ while (box_bounds_i < char_bound.end_box_index) { + auto* src_blob = src_b_it.extract(); + last_blob_box = src_blob->bounding_box(); + TBOX inserted_box = ClipAndAddBlob(src_blob, &dest_it, clip_box); + + box_bounds_i++; + src_b_it.forward(); + + // Note that the blob may be split across multiple characters in + // which case we want to clip the box to the part that was "assigned" + // to the character. + blob_box += ClipBoxX(inserted_box, + char_bound.begin_x, char_bound.end_x); + } } + + // It's not clear where rejected blobs should be added because by + // definition we don't have enough information about them. So we just + // add them to whatever character follows. while (!rej_b_it.empty() && - rej_b_it.data()->bounding_box().x_middle() < end_x) { + rej_b_it.data()->bounding_box().x_middle() < char_bound.end_x) { blob_box += ClipAndAddBlob(rej_b_it.extract(), &dest_it, clip_box); rej_b_it.forward(); } + if (blob_box.null_box()) { // Use the original box as a back-up. blob_box = ClipAndAddBlob(fake_b_it.extract(), &dest_it, clip_box); } box_word->InsertBox(i, blob_box); + fake_b_it.forward(); } + delete word_w->box_word; word_w->box_word = box_word; if (!input_word->combination) { diff --git a/unittest/blob_bounds_calculator_test.cc b/unittest/blob_bounds_calculator_test.cc new file mode 100644 index 0000000000..334e384c40 --- /dev/null +++ b/unittest/blob_bounds_calculator_test.cc @@ -0,0 +1,197 @@ +// (C) Copyright 2022, Povilas Kanapickas . +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + + +#include "blob_bounds_calculator.h" + +#include "include_gunit.h" + +namespace tesseract { + +namespace { + +BoxBoundariesCalculatorConfig get_default_config() { + BoxBoundariesCalculatorConfig config; + config.merge_cost = 1; + config.split_cost = 1; + config.pos_diff_cost = 1; + config.max_pos_diff = 2; + config.box_with_no_symbol_cost = 2; + config.symbol_with_no_box_cost = 2; + return config; +} + +} // namespace + +TEST(BoxBoundariesCalculatorTest, MatchesExactly) { + BoxBoundariesCalculator calc{{{10, 20}, {21, 30}, {31, 40}, {41, 50}}, + get_default_config()}; + + std::vector expected = { + {10, 0, 20, 1}, + {21, 1, 30, 2}, + {31, 2, 40, 3}, + {41, 3, 50, 4} + }; + + ASSERT_EQ(expected, calc.calculate_bounds({ + {10, 20}, {20, 30}, {30, 40}, {40, 50} + })); +} + +TEST(BoxBoundariesCalculatorTest, OneMergedInMiddle) { + BoxBoundariesCalculator calc{{{10, 20}, {21, 40}, {41, 50}}, + get_default_config()}; + + std::vector expected = { + {10, 0, 20, 1}, + {21, 1, 30, 2}, + {30, 1, 40, 2}, + {41, 2, 50, 3} + }; + + ASSERT_EQ(expected, calc.calculate_bounds({ + {10, 20}, {20, 30}, {30, 40}, {40, 50} + })); +} + +TEST(BoxBoundariesCalculatorTest, OneSplit) { + BoxBoundariesCalculator calc{{{10, 20}, {21, 25}, {26, 30}, {31, 40}, {41, 50}}, + get_default_config()}; + + std::vector expected = { + {10, 0, 20, 1}, + {21, 1, 30, 3}, + {31, 3, 40, 4}, + {41, 4, 50, 5} + }; + + ASSERT_EQ(expected, calc.calculate_bounds({ + {10, 20}, {20, 30}, {30, 40}, {40, 50} + })); +} + +TEST(BoxBoundariesCalculatorTest, ManySplitAtEnd) { + BoxBoundariesCalculator calc{ + { + {10, 20}, {21, 30}, {31, 40}, {41, 50}, {51, 60}, {61, 70} + }, + get_default_config()}; + + std::vector expected = { + {10, 0, 20, 1}, + {21, 1, 30, 2}, + {31, 2, 40, 3}, + {41, 3, 70, 6} + }; + + ASSERT_EQ(expected, calc.calculate_bounds({ + {10, 20}, {20, 30}, {30, 40}, {40, 50} + })); +} + 
+TEST(BoxBoundariesCalculatorTest, ShiftedSymbolPositionsForward) { + BoxBoundariesCalculator calc{{{10, 20}, {21, 30}, {31, 40}, {41, 50}}, + get_default_config()}; + + std::vector expected = { + {10, 0, 20, 1}, + {21, 1, 30, 2}, + {31, 2, 40, 3}, + {41, 3, 50, 4} + }; + + ASSERT_EQ(expected, calc.calculate_bounds({ + {15, 25}, {25, 35}, {35, 45}, {45, 55} + })); +} + +TEST(BoxBoundariesCalculatorTest, VeryShiftedSymbolPositionsForward) { + BoxBoundariesCalculator calc{{{10, 20}, {21, 30}, {31, 40}, {41, 50}}, + get_default_config()}; + + std::vector expected = { + {10, 0, 20, 1}, + {21, 1, 30, 2}, + {31, 2, 40, 3}, + {41, 3, 50, 4} + }; + + ASSERT_EQ(expected, calc.calculate_bounds({ + {25, 35}, {35, 45}, {45, 55}, {55, 65} + })); +} + +TEST(BoxBoundariesCalculatorTest, ShiftedSymbolPositionsBackward) { + BoxBoundariesCalculator calc{{{110, 120}, {121, 130}, {131, 140}, {141, 150}}, + get_default_config()}; + + std::vector expected = { + {110, 0, 120, 1}, + {121, 1, 130, 2}, + {131, 2, 140, 3}, + {141, 3, 150, 4} + }; + + ASSERT_EQ(expected, calc.calculate_bounds({ + {105, 115}, {115, 125}, {125, 135}, {135, 145} + })); +} + +TEST(BoxBoundariesCalculatorTest, VeryShiftedSymbolPositionsBackward) { + BoxBoundariesCalculator calc{{{110, 120}, {121, 130}, {131, 140}, {141, 150}}, + get_default_config()}; + + std::vector expected = { + {110, 0, 120, 1}, + {121, 1, 130, 2}, + {131, 2, 140, 3}, + {141, 3, 150, 4} + }; + + ASSERT_EQ(expected, calc.calculate_bounds({ + {95, 105}, {105, 115}, {115, 125}, {125, 135} + })); +} + +TEST(BoxBoundariesCalculatorTest, HoleInMiddle) { + BoxBoundariesCalculator calc{{{110, 120}, {121, 130}, {131, 140}, {141, 150}}, + get_default_config()}; + + std::vector expected = { + {110, 0, 120, 1}, + {121, 1, 130, 2}, + {131, 2, 140, 3}, + {141, 3, 150, 4} + }; + + ASSERT_EQ(expected, calc.calculate_bounds({ + {105, 115}, {115, 125}, {135, 145}, {145, 155} + })); +} + +TEST(BoxBoundariesCalculatorTest, LargeHoleInMiddle) { + 
BoxBoundariesCalculator calc{{{110, 120}, {121, 130}, {131, 140}, {141, 150}}, + get_default_config()}; + + std::vector expected = { + {110, 0, 120, 1}, + {121, 1, 130, 2}, + {131, 2, 140, 3}, + {141, 3, 150, 4} + }; + + ASSERT_EQ(expected, calc.calculate_bounds({ + {95, 105}, {105, 115}, {145, 155}, {155, 165} + })); +} + +} // namespace tesseract