From 3b7842bd845a74f039aa3076bf1ee86db4d56cbf Mon Sep 17 00:00:00 2001 From: woodjohndavid <57116722+woodjohndavid@users.noreply.github.com> Date: Sat, 26 Jun 2021 14:01:11 -0700 Subject: [PATCH 1/8] recodebeam changes --- src/lstm/jdwcrap.cpp | 1469 +++++++++++++++++++++++++++++++++++++++ src/lstm/recodebeam.cpp | 3 + src/lstm/recodebeam.h | 8 + 3 files changed, 1480 insertions(+) create mode 100644 src/lstm/jdwcrap.cpp diff --git a/src/lstm/jdwcrap.cpp b/src/lstm/jdwcrap.cpp new file mode 100644 index 0000000000..d3df8e7f0d --- /dev/null +++ b/src/lstm/jdwcrap.cpp @@ -0,0 +1,1469 @@ +/////////////////////////////////////////////////////////////////////// +// File: recodebeam.cpp +// Description: Beam search to decode from the re-encoded CJK as a sequence of +// smaller numbers in place of a single large code. +// Author: Ray Smith +// +// (C) Copyright 2015, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#include "recodebeam.h" + +#include "networkio.h" +#include "pageres.h" +#include "unicharcompress.h" + +#include // for std::reverse +#include +#include +#include +#include +#include +#include + +namespace tesseract { + +// The beam width at each code position. +const int RecodeBeamSearch::kBeamWidths[RecodedCharID::kMaxCodeLen + 1] = { + 5, 10, 16, 16, 16, 16, 16, 16, 16, 16, +}; + +static const char *kNodeContNames[] = {"Anything", "OnlyDup", "NoDup"}; + +// Prints debug details of the node. +void RecodeNode::Print(int null_char, const UNICHARSET &unicharset, int depth) const { + if (code == null_char) { + tprintf("null_char"); + } else { + tprintf("label=%d, uid=%d=%s", code, unichar_id, unicharset.debug_str(unichar_id).c_str()); + } + tprintf(" score=%g, c=%g,%s%s%s perm=%d, hash=%" PRIx64, score, certainty, + start_of_dawg ? " DawgStart" : "", start_of_word ? " Start" : "", + end_of_word ? " End" : "", permuter, code_hash); + if (depth > 0 && prev != nullptr) { + tprintf(" prev:"); + prev->Print(null_char, unicharset, depth - 1); + } else { + tprintf("\n"); + } +} + +// Borrows the pointer, which is expected to survive until *this is deleted. +RecodeBeamSearch::RecodeBeamSearch(const UnicharCompress &recoder, int null_char, bool simple_text, + Dict *dict) + : recoder_(recoder) + , beam_size_(0) + , top_code_(-1) + , second_code_(-1) + , in_double_whammy_(false) // JDWTODO + , first_whammy_(-1) // JDWTODO + , second_whammy_(-1) // JDWTODO + , dict_(dict) + , space_delimited_(true) + , is_simple_text_(simple_text) + , null_char_(null_char) { + if (dict_ != nullptr && !dict_->IsSpaceDelimitedLang()) { + space_delimited_ = false; + } +} + +RecodeBeamSearch::~RecodeBeamSearch() { + for (auto data : beam_) { + delete data; + } + for (auto data : secondary_beam_) { + delete data; + } +} + +// Decodes the set of network outputs, storing the lattice internally. +void RecodeBeamSearch::Decode(const NetworkIO &output, double dict_ratio, double cert_offset, + double worst_dict_cert, const UNICHARSET *charset, + int lstm_choice_mode) { + beam_size_ = 0; + int width = output.Width(); + fprintf(stderr, "recodebeam decode #1 outputwidth= %i lstmchoice= %i \n", width, lstm_choice_mode); // JDWDEBUG + if (lstm_choice_mode) { + timesteps.clear(); + } + for (int t = 0; t < width; ++t) { + fprintf(stderr, "recodebeam decode #1 unicharid,code= timestep# %i \n", t); // JDWDEBUG + ComputeTopN(output.f(t), output.NumFeatures(), kBeamWidths[0]); + DecodeStep(output.f(t), t, dict_ratio, cert_offset, worst_dict_cert, charset); + if (lstm_choice_mode) { + SaveMostCertainChoices(output.f(t), output.NumFeatures(), charset, t); + } + } +} + +void RecodeBeamSearch::Decode(const GENERIC_2D_ARRAY &output, double dict_ratio, + double cert_offset, double worst_dict_cert, + const UNICHARSET *charset) { + fprintf(stderr, "recodebeam decode #2 \n"); // JDWDEBUG + beam_size_ = 0; + int width = output.dim1(); + for (int t = 0; t < width; ++t) { + fprintf(stderr, "recodebeam decode #@ unicharid,code= timestep# %i \n", t); // JDWDEBUG + ComputeTopN(output[t], output.dim2(), kBeamWidths[0]); + DecodeStep(output[t], t, dict_ratio, cert_offset, worst_dict_cert, charset); + } +} + +void RecodeBeamSearch::DecodeSecondaryBeams(const NetworkIO &output, double dict_ratio, + double cert_offset, double worst_dict_cert, + const UNICHARSET *charset, int lstm_choice_mode) { + fprintf(stderr, "recodebeam decode secondary \n"); // JDWDEBUG + for (auto data : secondary_beam_) { + delete data; + } + secondary_beam_.clear(); + if (character_boundaries_.size() < 2) { + return; + } + int width = output.Width(); + int bucketNumber = 0; + for (int t = 0; t < width; ++t) { + while ((bucketNumber + 1) < character_boundaries_.size() && + t >= character_boundaries_[bucketNumber + 1]) { + ++bucketNumber; + } + ComputeSecTopN(&(excludedUnichars)[bucketNumber], output.f(t), output.NumFeatures(), + kBeamWidths[0]); + DecodeSecondaryStep(output.f(t), t, dict_ratio, cert_offset, worst_dict_cert, charset); + } +} + +void RecodeBeamSearch::SaveMostCertainChoices(const float *outputs, int num_outputs, + const UNICHARSET *charset, int xCoord) { + fprintf(stderr, "recodebeam savemostcertainchoices \n"); // JDWDEBUG + std::vector> choices; + for (int i = 0; i < num_outputs; ++i) { + if (outputs[i] >= 0.01f) { + const char *character; + if (i + 2 >= num_outputs) { + character = ""; + } else if (i > 0) { + character = charset->id_to_unichar_ext(i + 2); + } else { + character = charset->id_to_unichar_ext(i); + } + size_t pos = 0; + // order the possible choices within one timestep + // beginning with the most likely + while (choices.size() > pos && choices[pos].second > outputs[i]) { + pos++; + } + choices.insert(choices.begin() + pos, std::pair(character, outputs[i])); + } + } + timesteps.push_back(choices); +} + +void RecodeBeamSearch::segmentTimestepsByCharacters() { + for (int i = 1; i < character_boundaries_.size(); ++i) { + std::vector>> segment; + for (int j = character_boundaries_[i - 1]; j < character_boundaries_[i]; ++j) { + segment.push_back(timesteps[j]); + } + segmentedTimesteps.push_back(segment); + } +} +std::vector>> +RecodeBeamSearch::combineSegmentedTimesteps( + std::vector>>> *segmentedTimesteps) { + std::vector>> combined_timesteps; + for (auto &segmentedTimestep : *segmentedTimesteps) { + for (auto &j : segmentedTimestep) { + combined_timesteps.push_back(j); + } + } + return combined_timesteps; +} + +void RecodeBeamSearch::calculateCharBoundaries(std::vector *starts, std::vector *ends, + std::vector *char_bounds_, int maxWidth) { + fprintf(stderr, "recodebeam calculatecharboundaries maxwidth= %i \n", maxWidth); // JDWDEBUG + // char_bounds_->push_back(0); // JDWTODO + char_bounds_->push_back((*starts)[0]); // JDWTODO + for (int i = 0; i < ends->size(); ++i) { + int middle = ((*starts)[i + 1] - (*ends)[i]) / 2; + fprintf(stderr, "%s %i %i %i \n", "calculatecharboundaries start&end&middle=", (*starts)[i + 1], (*ends)[i], middle); + char_bounds_->push_back((*ends)[i] + middle); + } + char_bounds_->pop_back(); + char_bounds_->push_back(maxWidth); +} + +// Returns the best path as labels/scores/xcoords similar to simple CTC. +void RecodeBeamSearch::ExtractBestPathAsLabels(std::vector *labels, + std::vector *xcoords) const { + fprintf(stderr, "recodebeam extractbestpathaslabels \n"); // JDWDEBUG + labels->clear(); + xcoords->clear(); + std::vector best_nodes; + ExtractBestPaths(&best_nodes, nullptr); + // Now just run CTC on the best nodes. + int t = 0; + int width = best_nodes.size(); + while (t < width) { + int label = best_nodes[t]->code; + if (label != null_char_) { + labels->push_back(label); + xcoords->push_back(t); + } + while (++t < width && !is_simple_text_ && best_nodes[t]->code == label) { + } + } + xcoords->push_back(width); +} + +// Returns the best path as unichar-ids/certs/ratings/xcoords skipping +// duplicates, nulls and intermediate parts. +void RecodeBeamSearch::ExtractBestPathAsUnicharIds(bool debug, const UNICHARSET *unicharset, + std::vector *unichar_ids, + std::vector *certs, + std::vector *ratings, + std::vector *xcoords) const { + fprintf(stderr, "recodebeam extractbestpathasunicharids \n"); // JDWDEBUG + std::vector best_nodes; + ExtractBestPaths(&best_nodes, nullptr); + ExtractPathAsUnicharIds(best_nodes, unichar_ids, certs, ratings, xcoords); + if (debug) { + DebugPath(unicharset, best_nodes); + DebugUnicharPath(unicharset, best_nodes, *unichar_ids, *certs, *ratings, *xcoords); + } +} + +// Returns the best path as a set of WERD_RES. +void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX &line_box, float scale_factor, bool debug, + const UNICHARSET *unicharset, + PointerVector *words, + int lstm_choice_mode) { +fprintf(stderr, "recodebeam extractbestpathaswords \n"); // JDWDEBUG +words->truncate(0); + std::vector unichar_ids; + std::vector certs; + std::vector ratings; + std::vector xcoords; + std::vector best_nodes; + std::vector second_nodes; + character_boundaries_.clear(); + ExtractBestPaths(&best_nodes, &second_nodes); + if (debug) { + DebugPath(unicharset, best_nodes); + ExtractPathAsUnicharIds(second_nodes, &unichar_ids, &certs, &ratings, &xcoords); + tprintf("\nSecond choice path:\n"); + DebugUnicharPath(unicharset, second_nodes, unichar_ids, certs, ratings, xcoords); + } + // If lstm choice mode is required in granularity level 2, it stores the x + // Coordinates of every chosen character, to match the alternative choices to + // it. + ExtractPathAsUnicharIds(best_nodes, &unichar_ids, &certs, &ratings, &xcoords, + &character_boundaries_); + int num_ids = unichar_ids.size(); + // JDWDEBUG START + for (int i = 0; i < num_ids; i++){ + const char *c = unicharset->id_to_unichar_ext(unichar_ids[i]); + fprintf(stderr, "recodebeam extractbestpathaswords unichar,unicharid= %s %i \n", c, unichar_ids[i]); + } + // JDWDEBUG END + if (debug) { + DebugUnicharPath(unicharset, best_nodes, unichar_ids, certs, ratings, xcoords); + } + // Convert labels to unichar-ids. + int word_end = 0; + float prev_space_cert = 0.0f; + for (int word_start = 0; word_start < num_ids; word_start = word_end) { + for (word_end = word_start + 1; word_end < num_ids; ++word_end) { + // A word is terminated when a space character or start_of_word flag is + // hit. We also want to force a separate word for every non + // space-delimited character when not in a dictionary context. + if (unichar_ids[word_end] == UNICHAR_SPACE) { + break; + } + int index = xcoords[word_end]; + if (best_nodes[index]->start_of_word) { + break; + } + if (best_nodes[index]->permuter == TOP_CHOICE_PERM && + (!unicharset->IsSpaceDelimited(unichar_ids[word_end]) || + !unicharset->IsSpaceDelimited(unichar_ids[word_end - 1]))) { + break; + } + } + float space_cert = 0.0f; + if (word_end < num_ids && unichar_ids[word_end] == UNICHAR_SPACE) { + space_cert = certs[word_end]; + } + bool leading_space = word_start > 0 && unichar_ids[word_start - 1] == UNICHAR_SPACE; + // Create a WERD_RES for the output word. + WERD_RES *word_res = + InitializeWord(leading_space, line_box, word_start, word_end, + std::min(space_cert, prev_space_cert), unicharset, xcoords, scale_factor); + for (int i = word_start; i < word_end; ++i) { + auto *choices = new BLOB_CHOICE_LIST; + BLOB_CHOICE_IT bc_it(choices); + auto *choice = new BLOB_CHOICE(unichar_ids[i], ratings[i], certs[i], -1, 1.0f, + static_cast(INT16_MAX), 0.0f, BCC_STATIC_CLASSIFIER); + int col = i - word_start; + choice->set_matrix_cell(col, col); + bc_it.add_after_then_move(choice); + word_res->ratings->put(col, col, choices); + } + int index = xcoords[word_end - 1]; + word_res->FakeWordFromRatings(best_nodes[index]->permuter); + words->push_back(word_res); + prev_space_cert = space_cert; + if (word_end < num_ids && unichar_ids[word_end] == UNICHAR_SPACE) { + ++word_end; + } + } +} + +struct greater_than { + inline bool operator()(const RecodeNode *&node1, const RecodeNode *&node2) { + return (node1->score > node2->score); + } +}; + +void RecodeBeamSearch::PrintBeam2(bool uids, int num_outputs, const UNICHARSET *charset, + bool secondary) const { + std::vector> topology; + std::unordered_set visited; + const std::vector &beam = !secondary ? beam_ : secondary_beam_; + // create the topology + for (int step = beam.size() - 1; step >= 0; --step) { + std::vector layer; + topology.push_back(layer); + } + // fill the topology with depths first + for (int step = beam.size() - 1; step >= 0; --step) { + std::vector &heaps = beam.at(step)->beams_->heap(); + for (auto node : heaps) { + int backtracker = 0; + const RecodeNode *curr = &node.data(); + while (curr != nullptr && !visited.count(curr)) { + visited.insert(curr); + topology[step - backtracker].push_back(curr); + curr = curr->prev; + ++backtracker; + } + } + } + int ct = 0; + int cb = 1; + for (std::vector layer : topology) { + if (cb >= character_boundaries_.size()) { + break; + } + if (ct == character_boundaries_[cb]) { + tprintf("***\n"); + ++cb; + } + for (const RecodeNode *node : layer) { + const char *code; + int intCode; + if (node->unichar_id != INVALID_UNICHAR_ID) { + code = charset->id_to_unichar(node->unichar_id); + intCode = node->unichar_id; + } else if (node->code == null_char_) { + intCode = 0; + code = " "; + } else { + intCode = 666; + code = "*"; + } + int intPrevCode = 0; + const char *prevCode; + float prevScore = 0; + if (node->prev != nullptr) { + prevScore = node->prev->score; + if (node->prev->unichar_id != INVALID_UNICHAR_ID) { + prevCode = charset->id_to_unichar(node->prev->unichar_id); + intPrevCode = node->prev->unichar_id; + } else if (node->code == null_char_) { + intPrevCode = 0; + prevCode = " "; + } else { + prevCode = "*"; + intPrevCode = 666; + } + } else { + prevCode = " "; + } + if (uids) { + tprintf("%x(|)%f(>)%x(|)%f\n", intPrevCode, prevScore, intCode, node->score); + } else { + tprintf("%s(|)%f(>)%s(|)%f\n", prevCode, prevScore, code, node->score); + } + } + tprintf("-\n"); + ++ct; + } + tprintf("***\n"); +} + +void RecodeBeamSearch::extractSymbolChoices(const UNICHARSET *unicharset) { + if (character_boundaries_.size() < 2) { + return; + } + fprintf(stderr, "recodebeam extractsymbolchoices \n"); // JDWDEBUG + // For the first iteration the original beam is analyzed. After that a + // new beam is calculated based on the results from the original beam. + std::vector ¤tBeam = secondary_beam_.empty() ? beam_ : secondary_beam_; + character_boundaries_[0] = 0; + for (int j = 1; j < character_boundaries_.size(); ++j) { + std::vector unichar_ids; + std::vector certs; + std::vector ratings; + std::vector xcoords; + int backpath = character_boundaries_[j] - character_boundaries_[j - 1]; + std::vector &heaps = + currentBeam.at(character_boundaries_[j] - 1)->beams_->heap(); + std::vector best_nodes; + std::vector best; + // Scan the segmented node chain for valid unichar ids. + for (auto entry : heaps) { + bool validChar = false; + int backcounter = 0; + const RecodeNode *node = &entry.data(); + while (node != nullptr && backcounter < backpath) { + if (node->code != null_char_ && node->unichar_id != INVALID_UNICHAR_ID) { + validChar = true; + break; + } + node = node->prev; + ++backcounter; + } + if (validChar) { + best.push_back(&entry.data()); + } + } + // find the best rated segmented node chain and extract the unichar id. + if (!best.empty()) { + std::sort(best.begin(), best.end(), greater_than()); + ExtractPath(best[0], &best_nodes, backpath); + ExtractPathAsUnicharIds(best_nodes, &unichar_ids, &certs, &ratings, &xcoords); + } + if (!unichar_ids.empty()) { + int bestPos = 0; + for (int i = 1; i < unichar_ids.size(); ++i) { + if (ratings[i] < ratings[bestPos]) { + bestPos = i; + } + } + // TODO: bestCode is currently unused (see commit 2dd5d0d60). + int bestCode = -10; + for (auto &node : best_nodes) { + if (node->unichar_id == unichar_ids[bestPos]) { + bestCode = node->code; + } + } + // Exclude the best choice for the followup decoding. + std::unordered_set excludeCodeList; + for (auto &best_node : best_nodes) { + if (best_node->code != null_char_) { + excludeCodeList.insert(best_node->code); + } + } + if (j - 1 < excludedUnichars.size()) { + for (auto elem : excludeCodeList) { + excludedUnichars[j - 1].insert(elem); + } + } else { + excludedUnichars.push_back(excludeCodeList); + } + // Save the best choice for the choice iterator. + if (j - 1 < ctc_choices.size()) { + int id = unichar_ids[bestPos]; + const char *result = unicharset->id_to_unichar_ext(id); + float rating = ratings[bestPos]; + ctc_choices[j - 1].push_back(std::pair(result, rating)); + } else { + std::vector> choice; + int id = unichar_ids[bestPos]; + const char *result = unicharset->id_to_unichar_ext(id); + float rating = ratings[bestPos]; + choice.emplace_back(result, rating); + ctc_choices.push_back(choice); + } + // fill the blank spot with an empty array + } else { + if (j - 1 >= excludedUnichars.size()) { + std::unordered_set excludeCodeList; + excludedUnichars.push_back(excludeCodeList); + } + if (j - 1 >= ctc_choices.size()) { + std::vector> choice; + ctc_choices.push_back(choice); + } + } + } + for (auto data : secondary_beam_) { + delete data; + } + secondary_beam_.clear(); +} + +// Generates debug output of the content of the beams after a Decode. +void RecodeBeamSearch::DebugBeams(const UNICHARSET &unicharset) const { + fprintf(stderr, "recodebeam debugbeams \n"); // JDWDEBUG + for (int p = 0; p < beam_size_; ++p) { + for (int d = 0; d < 2; ++d) { + for (int c = 0; c < NC_COUNT; ++c) { + auto cont = static_cast(c); + int index = BeamIndex(d, cont, 0); + if (beam_[p]->beams_[index].empty()) { + continue; + } + // Print all the best scoring nodes for each unichar found. + tprintf("Position %d: %s+%s beam\n", p, d ? "Dict" : "Non-Dict", kNodeContNames[c]); + DebugBeamPos(unicharset, beam_[p]->beams_[index]); + } + } + } +} + +// Generates debug output of the content of a single beam position. +void RecodeBeamSearch::DebugBeamPos(const UNICHARSET &unicharset, const RecodeHeap &heap) const { + std::vector unichar_bests(unicharset.size()); + const RecodeNode *null_best = nullptr; + int heap_size = heap.size(); + for (int i = 0; i < heap_size; ++i) { + const RecodeNode *node = &heap.get(i).data(); + if (node->unichar_id == INVALID_UNICHAR_ID) { + if (null_best == nullptr || null_best->score < node->score) { + null_best = node; + } + } else { + if (unichar_bests[node->unichar_id] == nullptr || + unichar_bests[node->unichar_id]->score < node->score) { + unichar_bests[node->unichar_id] = node; + } + } + } + for (auto &unichar_best : unichar_bests) { + if (unichar_best != nullptr) { + const RecodeNode &node = *unichar_best; + node.Print(null_char_, unicharset, 1); + } + } + if (null_best != nullptr) { + null_best->Print(null_char_, unicharset, 1); + } +} + +// Returns the given best_nodes as unichar-ids/certs/ratings/xcoords skipping +// duplicates, nulls and intermediate parts. +/* static */ +void RecodeBeamSearch::ExtractPathAsUnicharIds(const std::vector &best_nodes, + std::vector *unichar_ids, + std::vector *certs, + std::vector *ratings, + std::vector *xcoords, + std::vector *character_boundaries) { + unichar_ids->clear(); + certs->clear(); + ratings->clear(); + xcoords->clear(); + std::vector starts; + std::vector ends; + // Backtrack extracting only valid, non-duplicate unichar-ids. + fprintf(stderr, "recodebeam extractpathasunicharids \n"); // JDWDEBUG + int t = 0; + int width = best_nodes.size(); + fprintf(stderr, "%s %i \n", "extractpathasunicharids width=", width); + while (t < width) { + double certainty = 0.0; + double rating = 0.0; + bool foundInvalid = false; // JDWDEBUG + while (t < width && best_nodes[t]->unichar_id == INVALID_UNICHAR_ID) { + fprintf(stderr, "%s %i \n", "extractpathasunicharids bypass invalid unicharid code=", best_nodes[t]->code); // JDWDEBUG + foundInvalid = true; // JDWDEBUG + double cert = best_nodes[t++]->certainty; + if (cert < certainty) { + certainty = cert; + } + rating -= cert; + } + // starts.push_back(t); // JDWTODO + if (t < width) { + starts.push_back(t); // JDWTODO + fprintf(stderr, "%s %i %i %f \n", "extractpathasunicharids valid unicharid,code,score=", best_nodes[t]->unichar_id, best_nodes[t]->code, best_nodes[t]->score); // JDWDEBUG + // JDWDEBUG START + if (!foundInvalid) + fprintf(stderr, "%s \n", "extractpathasunicharids foundvalid with no invalid"); + // JDWDEBUG END + int unichar_id = best_nodes[t]->unichar_id; + if (unichar_id == UNICHAR_SPACE && !certs->empty() && best_nodes[t]->permuter != NO_PERM) { + // All the rating and certainty go on the previous character except + // for the space itself. + fprintf(stderr, "%s %i \n", "extractpathasunicharids unicharid space", best_nodes[t]->code); // JDWDEBUG + if (certainty < certs->back()) { + certs->back() = certainty; + } + ratings->back() += rating; + certainty = 0.0; + rating = 0.0; + } + xcoords->push_back(t); + unichar_ids->push_back(unichar_id); + t++; // JDWTODO + // do { // JDWTODO + while (t < width && best_nodes[t]->duplicate) { // JDWTODO + // double cert = best_nodes[t++]->certainty; // JDWTODO + double cert = best_nodes[t]->certainty; // JDWTODO + // Special-case NO-PERM space to forget the certainty of the previous + // nulls. See long comment in ContinueContext. + if (cert < certainty || + (unichar_id == UNICHAR_SPACE && best_nodes[t - 1]->permuter == NO_PERM)) { + certainty = cert; + } + rating -= cert; + // JDWDEBUG START + if (t < width && best_nodes[t]->duplicate) + fprintf(stderr, "%s %i %i \n", "extractpathasunicharids duplicate removed unicharid,code=", best_nodes[t]->unichar_id, best_nodes[t]->code); // JDWDEBUG + // JDWDEBUG END + t++; // JDWTODO + } // JDWTODO + // } while (t < width && best_nodes[t]->duplicate); // JDWTODO + ends.push_back(t); + certs->push_back(certainty); + ratings->push_back(rating); + } else if (!certs->empty()) { + if (certainty < certs->back()) { + certs->back() = certainty; + } + ratings->back() += rating; + } + } + starts.push_back(width); + if (character_boundaries != nullptr) { + calculateCharBoundaries(&starts, &ends, character_boundaries, width); + } + xcoords->push_back(width); +} + +// Sets up a word with the ratings matrix and fake blobs with boxes in the +// right places. +WERD_RES *RecodeBeamSearch::InitializeWord(bool leading_space, const TBOX &line_box, int word_start, + int word_end, float space_certainty, + const UNICHARSET *unicharset, + const std::vector &xcoords, float scale_factor) { + // Make a fake blob for each non-zero label. + fprintf(stderr, "recodebeam initializeword scalefactor= %f \n", scale_factor); // JDWDEBUG + fprintf(stderr, "recodebeam initializeword start,end= %i %i \n", word_start, word_end); // JDWDEBUG + C_BLOB_LIST blobs; + C_BLOB_IT b_it(&blobs); + for (int i = word_start; i < word_end; ++i) { + if (character_boundaries_.size() > (i + 1)) { + TBOX box(static_cast(std::floor(character_boundaries_[i] * scale_factor)) + + line_box.left(), + line_box.bottom(), + static_cast(std::ceil(character_boundaries_[i + 1] * scale_factor)) + + line_box.left(), + line_box.top()); + // JDWDEBUG START + std::string debug_str; + debug_str = "fake boxblob for werd being built in recodebeam "; + box.print_to_str(debug_str); + fprintf(stderr, "%s %i %i %i \n", debug_str.c_str(), i, character_boundaries_[i], character_boundaries_[i + 1]); + // JDWDEBUG END + b_it.add_after_then_move(C_BLOB::FakeBlob(box)); + } + } + // Make a fake word from the blobs. + WERD *word = new WERD(&blobs, leading_space, nullptr); + // Make a WERD_RES from the word. + auto *word_res = new WERD_RES(word); + word_res->end = word_end - word_start + leading_space; + word_res->uch_set = unicharset; + word_res->combination = true; // Give it ownership of the word. + word_res->space_certainty = space_certainty; + word_res->ratings = new MATRIX(word_end - word_start, 1); + return word_res; +} + +// Fills top_n_flags_ with bools that are true iff the corresponding output +// is one of the top_n. +void RecodeBeamSearch::ComputeTopN(const float *outputs, int num_outputs, int top_n) { + fprintf(stderr, "recodebeam computetopn \n"); // JDWDEBUG + top_n_flags_.resize(num_outputs, TN_ALSO_RAN); + top_code_ = -1; + second_code_ = -1; + top_heap_.clear(); + for (int i = 0; i < num_outputs; ++i) { + if (top_heap_.size() < top_n || outputs[i] > top_heap_.PeekTop().key()) { + TopPair entry(outputs[i], i); + top_heap_.Push(&entry); + if (top_heap_.size() > top_n) { + top_heap_.Pop(&entry); + } + } + } + + float top_key = 0.0F; // JDWTODO + float second_key = 0.0F; // JDWTODO + bool found_first_whammy = false; // JDWTODO + bool found_second_whammy = false; // JDWTODO + while (!top_heap_.empty()) { + TopPair entry; + top_heap_.Pop(&entry); + if (in_double_whammy_ && entry.data() == first_whammy_) // JDWTODO + found_first_whammy = true; // JDWTODO + if (in_double_whammy_ && entry.data() == second_whammy_) // JDWTODO + found_second_whammy = true; // JDWTODO + if (top_heap_.size() > 1) { + top_n_flags_[entry.data()] = TN_TOPN; + fprintf(stderr, "recodebeam computetopn topn code,key= %i %f \n", entry.data(), entry.key()); // JDWDEBUG + } else { + top_n_flags_[entry.data()] = TN_TOP2; + fprintf(stderr, "recodebeam computetopn top2 code,key= %i %f \n", entry.data(), entry.key()); // JDWDEBUG + if (top_heap_.empty()) { + top_code_ = entry.data(); + top_key = entry.key(); // JDWTODO + } else { + second_code_ = entry.data(); + second_key = entry.key(); // JDWTODO + } + } + } + + // JDWTODO START + if (in_double_whammy_) { + if (!found_first_whammy && !found_second_whammy){ + in_double_whammy_ = false; + first_whammy_ = -1; + second_whammy_ = -1; + fprintf(stderr, "recodebeam computetopn double whammy cleared unicharid,code= \n"); + } + } + // JDWTODO END + + // JDWTODO START + if (!in_double_whammy_) { + if (top_code_ != null_char_ && second_code_ != null_char_ && top_key > 0.25F && second_key > 0.25F){ + in_double_whammy_ = true; + first_whammy_ = top_code_; + second_whammy_ = second_code_; + fprintf(stderr, "recodebeam computetopn double whammy found unicharid,code= %f %f \n", top_key, second_key); + } + } + // JDWTODO END + + fprintf(stderr, "recodebeam computetopn unicharid,code= top_code,second_code= %i %i \n", top_code_, second_code_); // JDWDEBUG + top_n_flags_[null_char_] = TN_TOP2; +} + +void RecodeBeamSearch::ComputeSecTopN(std::unordered_set *exList, const float *outputs, + int num_outputs, int top_n) { + fprintf(stderr, "recodebeam computesectopn \n"); // JDWDEBUG + top_n_flags_.resize(num_outputs, TN_ALSO_RAN); + top_code_ = -1; + second_code_ = -1; + top_heap_.clear(); + for (int i = 0; i < num_outputs; ++i) { + if ((top_heap_.size() < top_n || outputs[i] > top_heap_.PeekTop().key()) && !exList->count(i)) { + TopPair entry(outputs[i], i); + top_heap_.Push(&entry); + if (top_heap_.size() > top_n) { + top_heap_.Pop(&entry); + } + } + } + while (!top_heap_.empty()) { + TopPair entry; + top_heap_.Pop(&entry); + if (top_heap_.size() > 1) { + top_n_flags_[entry.data()] = TN_TOPN; + } else { + top_n_flags_[entry.data()] = TN_TOP2; + if (top_heap_.empty()) { + top_code_ = entry.data(); + } else { + second_code_ = entry.data(); + } + } + } + top_n_flags_[null_char_] = TN_TOP2; +} + +// Adds the computation for the current time-step to the beam. Call at each +// time-step in sequence from left to right. outputs is the activation vector +// for the current timestep. +void RecodeBeamSearch::DecodeStep(const float *outputs, int t, double dict_ratio, + double cert_offset, double worst_dict_cert, + const UNICHARSET *charset, bool debug) { + fprintf(stderr, "recodebeam decodestep timestep= %i \n", t); // JDWDEBUG + if (t == beam_.size()) { + beam_.push_back(new RecodeBeam); + } + RecodeBeam *step = beam_[t]; + beam_size_ = t + 1; + step->Clear(); + if (t == 0) { + // The first step can only use singles and initials. + ContinueContext(nullptr, BeamIndex(false, NC_ANYTHING, 0), outputs, TN_TOP2, charset, + dict_ratio, cert_offset, worst_dict_cert, step); + if (dict_ != nullptr) { + ContinueContext(nullptr, BeamIndex(true, NC_ANYTHING, 0), outputs, TN_TOP2, charset, + dict_ratio, cert_offset, worst_dict_cert, step); + } + } else { + RecodeBeam *prev = beam_[t - 1]; + if (debug) { + int beam_index = BeamIndex(true, NC_ANYTHING, 0); + for (int i = prev->beams_[beam_index].size() - 1; i >= 0; --i) { + std::vector path; + ExtractPath(&prev->beams_[beam_index].get(i).data(), &path); + tprintf("Step %d: Dawg beam %d:\n", t, i); + DebugPath(charset, path); + } + beam_index = BeamIndex(false, NC_ANYTHING, 0); + for (int i = prev->beams_[beam_index].size() - 1; i >= 0; --i) { + std::vector path; + ExtractPath(&prev->beams_[beam_index].get(i).data(), &path); + tprintf("Step %d: Non-Dawg beam %d:\n", t, i); + DebugPath(charset, path); + } + } + int total_beam = 0; + // Work through the scores by group (top-2, top-n, the rest) while the beam + // is empty. This enables extending the context using only the top-n results + // first, which may have an empty intersection with the valid codes, so we + // fall back to the rest if the beam is empty. + for (int tn = 0; tn < TN_COUNT && total_beam == 0; ++tn) { + auto top_n = static_cast(tn); + for (int index = 0; index < kNumBeams; ++index) { + // Working backwards through the heaps doesn't guarantee that we see the + // best first, but it comes before a lot of the worst, so it is slightly + // more efficient than going forwards. + for (int i = prev->beams_[index].size() - 1; i >= 0; --i) { + ContinueContext(&prev->beams_[index].get(i).data(), index, outputs, top_n, charset, + dict_ratio, cert_offset, worst_dict_cert, step); + } + } + for (int index = 0; index < kNumBeams; ++index) { + if (ContinuationFromBeamsIndex(index) == NC_ANYTHING) { + total_beam += step->beams_[index].size(); + } + } + } + // Special case for the best initial dawg. Push it on the heap if good + // enough, but there is only one, so it doesn't blow up the beam. + for (int c = 0; c < NC_COUNT; ++c) { + if (step->best_initial_dawgs_[c].code >= 0) { + fprintf(stderr, "recodebeam decodestep special case initial dawg %i \n", t); // JDWDEBUG + int index = BeamIndex(true, static_cast(c), 0); + RecodeHeap *dawg_heap = &step->beams_[index]; + PushHeapIfBetter(kBeamWidths[0], &step->best_initial_dawgs_[c], dawg_heap); + } + } + } +} + +void RecodeBeamSearch::DecodeSecondaryStep(const float *outputs, int t, double dict_ratio, + double cert_offset, double worst_dict_cert, + const UNICHARSET *charset, bool debug) { + fprintf(stderr, "recodebeam decodesecondarystep \n"); // JDWDEBUG + if (t == secondary_beam_.size()) { + secondary_beam_.push_back(new RecodeBeam); + } + RecodeBeam *step = secondary_beam_[t]; + step->Clear(); + if (t == 0) { + // The first step can only use singles and initials. + ContinueContext(nullptr, BeamIndex(false, NC_ANYTHING, 0), outputs, TN_TOP2, charset, + dict_ratio, cert_offset, worst_dict_cert, step); + if (dict_ != nullptr) { + ContinueContext(nullptr, BeamIndex(true, NC_ANYTHING, 0), outputs, TN_TOP2, charset, + dict_ratio, cert_offset, worst_dict_cert, step); + } + } else { + RecodeBeam *prev = secondary_beam_[t - 1]; + if (debug) { + int beam_index = BeamIndex(true, NC_ANYTHING, 0); + for (int i = prev->beams_[beam_index].size() - 1; i >= 0; --i) { + std::vector path; + ExtractPath(&prev->beams_[beam_index].get(i).data(), &path); + tprintf("Step %d: Dawg beam %d:\n", t, i); + DebugPath(charset, path); + } + beam_index = BeamIndex(false, NC_ANYTHING, 0); + for (int i = prev->beams_[beam_index].size() - 1; i >= 0; --i) { + std::vector path; + ExtractPath(&prev->beams_[beam_index].get(i).data(), &path); + tprintf("Step %d: Non-Dawg beam %d:\n", t, i); + DebugPath(charset, path); + } + } + int total_beam = 0; + // Work through the scores by group (top-2, top-n, the rest) while the beam + // is empty. This enables extending the context using only the top-n results + // first, which may have an empty intersection with the valid codes, so we + // fall back to the rest if the beam is empty. + for (int tn = 0; tn < TN_COUNT && total_beam == 0; ++tn) { + auto top_n = static_cast(tn); + for (int index = 0; index < kNumBeams; ++index) { + // Working backwards through the heaps doesn't guarantee that we see the + // best first, but it comes before a lot of the worst, so it is slightly + // more efficient than going forwards. + for (int i = prev->beams_[index].size() - 1; i >= 0; --i) { + ContinueContext(&prev->beams_[index].get(i).data(), index, outputs, top_n, charset, + dict_ratio, cert_offset, worst_dict_cert, step); + } + } + for (int index = 0; index < kNumBeams; ++index) { + if (ContinuationFromBeamsIndex(index) == NC_ANYTHING) { + total_beam += step->beams_[index].size(); + } + } + } + // Special case for the best initial dawg. Push it on the heap if good + // enough, but there is only one, so it doesn't blow up the beam. + for (int c = 0; c < NC_COUNT; ++c) { + if (step->best_initial_dawgs_[c].code >= 0) { + int index = BeamIndex(true, static_cast(c), 0); + RecodeHeap *dawg_heap = &step->beams_[index]; + PushHeapIfBetter(kBeamWidths[0], &step->best_initial_dawgs_[c], dawg_heap); + } + } + } +} + +// Adds to the appropriate beams the legal (according to recoder) +// continuations of context prev, which is of the given length, using the +// given network outputs to provide scores to the choices. Uses only those +// choices for which top_n_flags[index] == top_n_flag. +void RecodeBeamSearch::ContinueContext(const RecodeNode *prev, int index, const float *outputs, + TopNState top_n_flag, const UNICHARSET *charset, + double dict_ratio, double cert_offset, + double worst_dict_cert, RecodeBeam *step) { + // JDWDEBUG START + if (prev != nullptr) { + const char *ucc = charset->id_to_unichar_ext(prev->unichar_id); + fprintf(stderr, "recodebeam continuecontext unicharid,code,unichar,index,topn = %i %i %s %i %i \n", prev->unichar_id, prev->code, ucc, index, top_n_flag); + } + else { + fprintf(stderr, "recodebeam continuecontext top prev null index,topn = %i %i \n", index, top_n_flag); + } + // JDWDEBUG END + RecodedCharID prefix; + RecodedCharID full_code; + const RecodeNode *previous = prev; + int length = LengthFromBeamsIndex(index); + bool use_dawgs = IsDawgFromBeamsIndex(index); + NodeContinuation prev_cont = ContinuationFromBeamsIndex(index); + for (int p = length - 1; p >= 0; --p, previous = previous->prev) { + while (previous != nullptr && (previous->duplicate || previous->code == null_char_)) { + fprintf(stderr, "recodebeam continuecontext stepping back code= %i \n", previous->code); + previous = previous->prev; + } + if (previous != nullptr) { + prefix.Set(p, previous->code); + full_code.Set(p, previous->code); + } + } + if (prev != nullptr && !is_simple_text_) { + if (top_n_flags_[prev->code] == top_n_flag) { + if (prev_cont != NC_NO_DUP) { + float cert = NetworkIO::ProbToCertainty(outputs[prev->code]) + cert_offset; + fprintf(stderr, "recodebeam continuecontext before pushdupornodawgifbetter unicharid,dup= %i %i \n", prev->unichar_id, 1); + PushDupOrNoDawgIfBetter(length, true, prev->code, prev->unichar_id, cert, worst_dict_cert, + dict_ratio, use_dawgs, NC_ANYTHING, prev, step); + } + if (prev_cont == NC_ANYTHING && top_n_flag == TN_TOP2 && prev->code != null_char_) { + float cert = + NetworkIO::ProbToCertainty(outputs[prev->code] + outputs[null_char_]) + cert_offset; + fprintf(stderr, "recodebeam continuecontext before pushdupornodawgifbetter unicharid,dup= %i %i \n", prev->unichar_id, 1); + PushDupOrNoDawgIfBetter(length, true, prev->code, prev->unichar_id, cert, worst_dict_cert, + dict_ratio, use_dawgs, NC_NO_DUP, prev, step); + } + } + if (prev_cont == NC_ONLY_DUP) { + return; + } + if (prev->code != null_char_ && length > 0 && top_n_flags_[null_char_] == top_n_flag) { + // Allow nulls within multi code sequences, as the nulls within are not + // explicitly included in the code sequence. + float cert = NetworkIO::ProbToCertainty(outputs[null_char_]) + cert_offset; + fprintf(stderr, "recodebeam continuecontext before pushdupornodawgifbetter unicharid,dup= %i %i \n", INVALID_UNICHAR_ID, 0); + PushDupOrNoDawgIfBetter(length, false, null_char_, INVALID_UNICHAR_ID, cert, worst_dict_cert, + dict_ratio, use_dawgs, NC_ANYTHING, prev, step); + } + } + const std::vector *final_codes = recoder_.GetFinalCodes(prefix); + if (final_codes != nullptr) { + for (int code : *final_codes) { + if (top_n_flags_[code] != top_n_flag) { + continue; + } + if (prev != nullptr && prev->code == code && !is_simple_text_) { + continue; + } + float cert = NetworkIO::ProbToCertainty(outputs[code]) + cert_offset; + if (cert < kMinCertainty && code != null_char_) { + continue; + } + full_code.Set(length, code); + int unichar_id = recoder_.DecodeUnichar(full_code); + // Map the null char to INVALID. + if (length == 0 && code == null_char_) { + unichar_id = INVALID_UNICHAR_ID; + } + if (unichar_id != INVALID_UNICHAR_ID && charset != nullptr && + !charset->get_enabled(unichar_id)) { + continue; // disabled by whitelist/blacklist + } + ContinueUnichar(code, unichar_id, cert, worst_dict_cert, dict_ratio, use_dawgs, NC_ANYTHING, + prev, step); + if (top_n_flag == TN_TOP2 && code != null_char_) { + float prob = outputs[code] + outputs[null_char_]; + if (prev != nullptr && prev_cont == NC_ANYTHING && prev->code != null_char_ && + ((prev->code == top_code_ && code == second_code_) || + (code == top_code_ && prev->code == second_code_))) { + prob += outputs[prev->code]; + } + float cert = NetworkIO::ProbToCertainty(prob) + cert_offset; + ContinueUnichar(code, unichar_id, cert, worst_dict_cert, dict_ratio, use_dawgs, NC_ONLY_DUP, + prev, step); + } + } + } + const std::vector *next_codes = recoder_.GetNextCodes(prefix); + if (next_codes != nullptr) { + for (int code : *next_codes) { + if (top_n_flags_[code] != top_n_flag) { + continue; + } + if (prev != nullptr && prev->code == code && !is_simple_text_) { + continue; + } + float cert = NetworkIO::ProbToCertainty(outputs[code]) + cert_offset; + fprintf(stderr, "recodebeam continuecontext before pushdupornodawgifbetter unicharid,dup= %i %i \n", INVALID_UNICHAR_ID, 0); + PushDupOrNoDawgIfBetter(length + 1, false, code, INVALID_UNICHAR_ID, cert, worst_dict_cert, + dict_ratio, use_dawgs, NC_ANYTHING, prev, step); + if (top_n_flag == TN_TOP2 && code != null_char_) { + float prob = outputs[code] + outputs[null_char_]; + if (prev != nullptr && prev_cont == NC_ANYTHING && prev->code != null_char_ && + ((prev->code == top_code_ && code == second_code_) || + (code == top_code_ && prev->code == second_code_))) { + prob += outputs[prev->code]; + } + float cert = NetworkIO::ProbToCertainty(prob) + cert_offset; + fprintf(stderr, "recodebeam continuecontext before pushdupornodawgifbetter unicharid,dup= %i %i \n", INVALID_UNICHAR_ID, 0); + PushDupOrNoDawgIfBetter(length + 1, false, code, INVALID_UNICHAR_ID, cert, worst_dict_cert, + dict_ratio, use_dawgs, NC_ONLY_DUP, prev, step); + } + } + } +} + +// Continues for a new unichar, using dawg or non-dawg as per flag. +void RecodeBeamSearch::ContinueUnichar(int code, int unichar_id, float cert, float worst_dict_cert, + float dict_ratio, bool use_dawgs, NodeContinuation cont, + const RecodeNode *prev, RecodeBeam *step) { + fprintf(stderr, "recodebeam ContinueUnichar unicharid,code,cont= %i %i %i \n", unichar_id, code, cont); // JDWDEBUG + if (use_dawgs) { + if (cert > worst_dict_cert) { + ContinueDawg(code, unichar_id, cert, cont, prev, step); + } + } else { + RecodeHeap *nodawg_heap = &step->beams_[BeamIndex(false, cont, 0)]; + fprintf(stderr, "recodebeam ContinueUnichar before pushheapifbetter unicharid,dup= %i %i \n", unichar_id, 0 ); // JDWDEBUG + PushHeapIfBetter(kBeamWidths[0], code, unichar_id, TOP_CHOICE_PERM, false, false, false, false, + cert * dict_ratio, prev, nullptr, nodawg_heap); + if (dict_ != nullptr && ((unichar_id == UNICHAR_SPACE && cert > worst_dict_cert) || + !dict_->getUnicharset().IsSpaceDelimited(unichar_id))) { + // Any top choice position that can start a new word, ie a space or + // any non-space-delimited character, should also be considered + // by the dawg search, so push initial dawg to the dawg heap. + float dawg_cert = cert; + PermuterType permuter = TOP_CHOICE_PERM; + // Since we use the space either side of a dictionary word in the + // certainty of the word, (to properly handle weak spaces) and the + // space is coming from a non-dict word, we need special conditions + // to avoid degrading the certainty of the dict word that follows. + // With a space we don't multiply the certainty by dict_ratio, and we + // flag the space with NO_PERM to indicate that we should not use the + // predecessor nulls to generate the confidence for the space, as they + // have already been multiplied by dict_ratio, and we can't go back to + // insert more entries in any previous heaps. + if (unichar_id == UNICHAR_SPACE) { + permuter = NO_PERM; + } else { + dawg_cert *= dict_ratio; + } + fprintf(stderr, "recodebeam ContinueUnichar before pushinitialdawgifbetter unicharid,dup= %i %i \n", unichar_id, 0 ); // JDWDEBUG + PushInitialDawgIfBetter(code, unichar_id, permuter, false, false, dawg_cert, cont, prev, + step); + } + } +} + +// Adds a RecodeNode composed of the tuple (code, unichar_id, cert, prev, +// appropriate-dawg-args, cert) to the given heap (dawg_beam_) if unichar_id +// is a valid continuation of whatever is in prev. +void RecodeBeamSearch::ContinueDawg(int code, int unichar_id, float cert, NodeContinuation cont, + const RecodeNode *prev, RecodeBeam *step) { + fprintf(stderr, "recodebeam ContinueDawg unicharid,code,cont= %i %i %i \n", unichar_id, code, cont); // JDWDEBUG + RecodeHeap *dawg_heap = &step->beams_[BeamIndex(true, cont, 0)]; + RecodeHeap *nodawg_heap = &step->beams_[BeamIndex(false, cont, 0)]; + if (unichar_id == INVALID_UNICHAR_ID) { + fprintf(stderr, "recodebeam ContinueDawg before pushheapifbetter unicharid,dup= %i %i \n", unichar_id, 0 ); // JDWDEBUG + PushHeapIfBetter(kBeamWidths[0], code, unichar_id, NO_PERM, false, false, false, false, cert, + prev, nullptr, dawg_heap); + return; + } + // Avoid dictionary probe if score a total loss. + float score = cert; + if (prev != nullptr) { + score += prev->score; + } + if (dawg_heap->size() >= kBeamWidths[0] && score <= dawg_heap->PeekTop().data().score && + nodawg_heap->size() >= kBeamWidths[0] && score <= nodawg_heap->PeekTop().data().score) { + return; + } + const RecodeNode *uni_prev = prev; + // Prev may be a partial code, null_char, or duplicate, so scan back to the + // last valid unichar_id. + while (uni_prev != nullptr && + (uni_prev->unichar_id == INVALID_UNICHAR_ID || uni_prev->duplicate)) { + uni_prev = uni_prev->prev; + } + if (unichar_id == UNICHAR_SPACE) { + if (uni_prev != nullptr && uni_prev->end_of_word) { + // Space is good. Push initial state, to the dawg beam and a regular + // space to the top choice beam. + fprintf(stderr, "recodebeam ContinueDawg before PushInitialDawgIfBetter unicharid= %i \n", unichar_id); // JDWDEBUG + PushInitialDawgIfBetter(code, unichar_id, uni_prev->permuter, false, false, cert, cont, prev, + step); + fprintf(stderr, "recodebeam ContinueDawg before pushheapifbetter unicharid,dup= %i %i \n", unichar_id, 0 ); // JDWDEBUG + PushHeapIfBetter(kBeamWidths[0], code, unichar_id, uni_prev->permuter, false, false, false, + false, cert, prev, nullptr, nodawg_heap); + } + return; + } else if (uni_prev != nullptr && uni_prev->start_of_dawg && + uni_prev->unichar_id != UNICHAR_SPACE && + dict_->getUnicharset().IsSpaceDelimited(uni_prev->unichar_id) && + dict_->getUnicharset().IsSpaceDelimited(unichar_id)) { + return; // Can't break words between space delimited chars. + } + DawgPositionVector initial_dawgs; + auto *updated_dawgs = new DawgPositionVector; + DawgArgs dawg_args(&initial_dawgs, updated_dawgs, NO_PERM); + bool word_start = false; + if (uni_prev == nullptr) { + // Starting from beginning of line. + dict_->default_dawgs(&initial_dawgs, false); + word_start = true; + } else if (uni_prev->dawgs != nullptr) { + // Continuing a previous dict word. + dawg_args.active_dawgs = uni_prev->dawgs; + word_start = uni_prev->start_of_dawg; + } else { + return; // Can't continue if not a dict word. + } + auto permuter = static_cast( + dict_->def_letter_is_okay(&dawg_args, dict_->getUnicharset(), unichar_id, false)); + if (permuter != NO_PERM) { + PushHeapIfBetter(kBeamWidths[0], code, unichar_id, permuter, false, word_start, + dawg_args.valid_end, false, cert, prev, dawg_args.updated_dawgs, dawg_heap); + if (dawg_args.valid_end && !space_delimited_) { + // We can start another word right away, so push initial state as well, + // to the dawg beam, and the regular character to the top choice beam, + // since non-dict words can start here too. + fprintf(stderr, "recodebeam ContinueDawg before PushInitialDawgIfBetter unicharid= %i \n", unichar_id); // JDWDEBUG + PushInitialDawgIfBetter(code, unichar_id, permuter, word_start, true, cert, cont, prev, step); + fprintf(stderr, "recodebeam ContinueDawg before pushheapifbetter unicharid,dup= %i %i \n", unichar_id, 0 ); // JDWDEBUG + PushHeapIfBetter(kBeamWidths[0], code, unichar_id, permuter, false, word_start, true, false, + cert, prev, nullptr, nodawg_heap); + } + } else { + delete updated_dawgs; + } +} + +// Adds a RecodeNode composed of the tuple (code, unichar_id, +// initial-dawg-state, prev, cert) to the given heap if/ there is room or if +// better than the current worst element if already full. +void RecodeBeamSearch::PushInitialDawgIfBetter(int code, int unichar_id, PermuterType permuter, + bool start, bool end, float cert, + NodeContinuation cont, const RecodeNode *prev, + RecodeBeam *step) { + fprintf(stderr, "recodebeam PushInitialDawgIfBetter unicharid,code= %i %i \n", unichar_id, code); // JDWDEBUG + RecodeNode *best_initial_dawg = &step->best_initial_dawgs_[cont]; + float score = cert; + if (prev != nullptr) { + score += prev->score; + } + if (best_initial_dawg->code < 0 || score > best_initial_dawg->score) { + auto *initial_dawgs = new DawgPositionVector; + dict_->default_dawgs(initial_dawgs, false); + fprintf(stderr, "recodebeam PushInitialDawgIfBetter adding new node unicharid,code= %i %i \n", unichar_id, code); // JDWDEBUG + RecodeNode node(code, unichar_id, permuter, true, start, end, false, cert, score, prev, + initial_dawgs, ComputeCodeHash(code, false, prev)); + *best_initial_dawg = node; + } +} + +// Adds a RecodeNode composed of the tuple (code, unichar_id, permuter, +// false, false, false, false, cert, prev, nullptr) to heap if there is room +// or if better than the current worst element if already full. +/* static */ +void RecodeBeamSearch::PushDupOrNoDawgIfBetter(int length, bool dup, int code, int unichar_id, + float cert, float worst_dict_cert, float dict_ratio, + bool use_dawgs, NodeContinuation cont, + const RecodeNode *prev, RecodeBeam *step) { + fprintf(stderr, "recodebeam PushDupOrNoDawgIfBetter %i \n", unichar_id); // JDWDEBUG + int index = BeamIndex(use_dawgs, cont, length); + if (use_dawgs) { + if (cert > worst_dict_cert) { + PushHeapIfBetter(kBeamWidths[length], code, unichar_id, prev ? prev->permuter : NO_PERM, + false, false, false, dup, cert, prev, nullptr, &step->beams_[index]); + } + } else { + cert *= dict_ratio; + if (cert >= kMinCertainty || code == null_char_) { + PushHeapIfBetter(kBeamWidths[length], code, unichar_id, + prev ? prev->permuter : TOP_CHOICE_PERM, false, false, false, dup, cert, + prev, nullptr, &step->beams_[index]); + } + } +} + +// Adds a RecodeNode composed of the tuple (code, unichar_id, permuter, +// dawg_start, word_start, end, dup, cert, prev, d) to heap if there is room +// or if better than the current worst element if already full. +void RecodeBeamSearch::PushHeapIfBetter(int max_size, int code, int unichar_id, + PermuterType permuter, bool dawg_start, bool word_start, + bool end, bool dup, float cert, const RecodeNode *prev, + DawgPositionVector *d, RecodeHeap *heap) { + fprintf(stderr, "recodebeam PushHeapIfBetter #1 %i \n", unichar_id); // JDWDEBUG + float score = cert; + if (prev != nullptr) { + score += prev->score; + } + if (heap->size() < max_size || score > heap->PeekTop().data().score) { + uint64_t hash = ComputeCodeHash(code, dup, prev); + RecodeNode node(code, unichar_id, permuter, dawg_start, word_start, end, dup, cert, score, prev, + d, hash); + if (UpdateHeapIfMatched(&node, heap)) { + return; + } + // JDWTODO START + if (!AddToHeapIsAllowed(&node)) { + return; + } + // JDWTODO END + fprintf(stderr, "recodebeam PushHeapIfBetter #1 adding node unicharid,code= %i %i \n", unichar_id, code); // JDWDEBUG + RecodePair entry(score, node); + heap->Push(&entry); + ASSERT_HOST(entry.data().dawgs == nullptr); + if (heap->size() > max_size) { + heap->Pop(&entry); + } + } else { + delete d; + } +} + +// Adds a RecodeNode to heap if there is room +// or if better than the current worst element if already full. +void RecodeBeamSearch::PushHeapIfBetter(int max_size, RecodeNode *node, RecodeHeap *heap) { + fprintf(stderr, "recodebeam PushHeapIfBetter #1 %i \n", node->unichar_id); // JDWDEBUG + if (heap->size() < max_size || node->score > heap->PeekTop().data().score) { + if (UpdateHeapIfMatched(node, heap)) { + return; + } + fprintf(stderr, "recodebeam PushHeapIfBetter #2 adding node unicharid,code= %i %i \n", node->unichar_id, node->code); // JDWDEBUG + RecodePair entry(node->score, *node); + heap->Push(&entry); + ASSERT_HOST(entry.data().dawgs == nullptr); + if (heap->size() > max_size) { + heap->Pop(&entry); + } + } +} + +// Searches the heap for a matching entry, and updates the score with +// reshuffle if needed. Returns true if there was a match. +bool RecodeBeamSearch::UpdateHeapIfMatched(RecodeNode *new_node, RecodeHeap *heap) { + // TODO(rays) consider hash map instead of linear search. + // It might not be faster because the hash map would have to be updated + // every time a heap reshuffle happens, and that would be a lot of overhead. + fprintf(stderr, "recodebeam UpdateHeapIfMatched %i \n", new_node->unichar_id); // JDWDEBUG + std::vector &nodes = heap->heap(); + for (auto &i : nodes) { + RecodeNode &node = i.data(); + if (node.code == new_node->code && node.code_hash == new_node->code_hash && + node.permuter == new_node->permuter && node.start_of_dawg == new_node->start_of_dawg) { + if (new_node->score > node.score) { + // The new one is better. Update the entire node in the heap and + // reshuffle. + fprintf(stderr, "recodebeam UpdateHeapIfMatched doing update unicharid,code= %i %i \n", new_node->unichar_id, new_node->code); // JDWDEBUG + node = *new_node; + i.key() = node.score; + heap->Reshuffle(&i); + } + return true; + } + } + return false; +} + +// JDWTODO START +bool RecodeBeamSearch::AddToHeapIsAllowed(RecodeNode *new_node) { + if (!in_double_whammy_) + return true; + const RecodeNode *prev_node = new_node->prev; + if (prev_node != nullptr && prev_node->code == first_whammy_ && new_node->code == second_whammy_) { + fprintf(stderr, "recodebeam AddToHeapIsAllowed second whammy not allowed unicharid,code= + prevcode %i %i %i \n", new_node->unichar_id, new_node->code, prev_node->code); + return false; + } + if (prev_node != nullptr && prev_node->code == second_whammy_ && new_node->code == first_whammy_) { + fprintf(stderr, "recodebeam AddToHeapIsAllowed first whammy not allowed unicharid,code= + prevcode %i %i %i \n", new_node->unichar_id, new_node->code, prev_node->code); + return false; + } +if (prev_node != nullptr){ + fprintf(stderr, "recodebeam AddToHeapIsAllowed allowed unicharid,code= + prevcode %i %i %i \n", new_node->unichar_id, new_node->code, prev_node->code); +} +else { + fprintf(stderr, "recodebeam AddToHeapIsAllowed allowed unicharid,code= + prevcode %i %i null \n", new_node->unichar_id, new_node->code); +} + return true; +} +// JDWTODO END + +// Computes and returns the code-hash for the given code and prev. +uint64_t RecodeBeamSearch::ComputeCodeHash(int code, bool dup, const RecodeNode *prev) const { + uint64_t hash = prev == nullptr ? 0 : prev->code_hash; + if (!dup && code != null_char_) { + int num_classes = recoder_.code_range(); + uint64_t carry = (((hash >> 32) * num_classes) >> 32); + hash *= num_classes; + hash += carry; + hash += code; + } + return hash; +} + +// Backtracks to extract the best path through the lattice that was built +// during Decode. On return the best_nodes vector essentially contains the set +// of code, score pairs that make the optimal path with the constraint that +// the recoder can decode the code sequence back to a sequence of unichar-ids. +void RecodeBeamSearch::ExtractBestPaths(std::vector *best_nodes, + std::vector *second_nodes) const { + // Scan both beams to extract the best and second best paths. + fprintf(stderr, "recodebeam extractbestpaths \n"); // JDWDEBUG + const RecodeNode *best_node = nullptr; + const RecodeNode *second_best_node = nullptr; + const RecodeBeam *last_beam = beam_[beam_size_ - 1]; + for (int c = 0; c < NC_COUNT; ++c) { + if (c == NC_ONLY_DUP) { + continue; + } + auto cont = static_cast(c); + for (int is_dawg = 0; is_dawg < 2; ++is_dawg) { + int beam_index = BeamIndex(is_dawg, cont, 0); + int heap_size = last_beam->beams_[beam_index].size(); + for (int h = 0; h < heap_size; ++h) { + const RecodeNode *node = &last_beam->beams_[beam_index].get(h).data(); + if (is_dawg) { + // dawg_node may be a null_char, or duplicate, so scan back to the + // last valid unichar_id. + const RecodeNode *dawg_node = node; + while (dawg_node != nullptr && + (dawg_node->unichar_id == INVALID_UNICHAR_ID || dawg_node->duplicate)) { + dawg_node = dawg_node->prev; + } + if (dawg_node == nullptr || + (!dawg_node->end_of_word && dawg_node->unichar_id != UNICHAR_SPACE)) { + // Dawg node is not valid. + continue; + } + } + if (best_node == nullptr || node->score > best_node->score) { + second_best_node = best_node; + best_node = node; + fprintf(stderr, "recodebeam extractbestpaths bestnodebeam= %i \n", beam_index); // JDWDEBUG + } else if (second_best_node == nullptr || node->score > second_best_node->score) { + second_best_node = node; + fprintf(stderr, "recodebeam extractbestpaths secondbestnodebeam= %i \n", beam_index); // JDWDEBUG + } + } + } + } + if (second_nodes != nullptr) { + fprintf(stderr, "recodebeam extractbestpaths extract second best \n"); // JDWDEBUG + ExtractPath(second_best_node, second_nodes); + } + fprintf(stderr, "recodebeam extractbestpaths extract best \n"); // JDWDEBUG + ExtractPath(best_node, best_nodes); +} + +// Helper backtracks through the lattice from the given node, storing the +// path and reversing it. +void RecodeBeamSearch::ExtractPath(const RecodeNode *node, + std::vector *path) const { + path->clear(); + while (node != nullptr) { + fprintf(stderr, "recodebeam extractpath unicharid,code,cert,score= %i %i %f %f %i \n", node->unichar_id, node->code, node->certainty, node->score, node->duplicate); // JDWDEBUG + path->push_back(node); + node = node->prev; + } + std::reverse(path->begin(), path->end()); +} + +void RecodeBeamSearch::ExtractPath(const RecodeNode *node, std::vector *path, + int limiter) const { + int pathcounter = 0; + path->clear(); + while (node != nullptr && pathcounter < limiter) { + path->push_back(node); + node = node->prev; + ++pathcounter; + } + std::reverse(path->begin(), path->end()); +} + +// Helper prints debug information on the given lattice path. +void RecodeBeamSearch::DebugPath(const UNICHARSET *unicharset, + const std::vector &path) const { + for (int c = 0; c < path.size(); ++c) { + const RecodeNode &node = *path[c]; + tprintf("%d ", c); + node.Print(null_char_, *unicharset, 1); + } +} + +// Helper prints debug information on the given unichar path. +void RecodeBeamSearch::DebugUnicharPath(const UNICHARSET *unicharset, + const std::vector &path, + const std::vector &unichar_ids, + const std::vector &certs, + const std::vector &ratings, + const std::vector &xcoords) const { + int num_ids = unichar_ids.size(); + double total_rating = 0.0; + for (int c = 0; c < num_ids; ++c) { + int coord = xcoords[c]; + tprintf("%d %d=%s r=%g, c=%g, s=%d, e=%d, perm=%d\n", coord, unichar_ids[c], + unicharset->debug_str(unichar_ids[c]).c_str(), ratings[c], certs[c], + path[coord]->start_of_word, path[coord]->end_of_word, path[coord]->permuter); + total_rating += ratings[c]; + } + tprintf("Path total rating = %g\n", total_rating); +} + +} // namespace tesseract. diff --git a/src/lstm/recodebeam.cpp b/src/lstm/recodebeam.cpp index 485723ea45..816198d43d 100644 --- a/src/lstm/recodebeam.cpp +++ b/src/lstm/recodebeam.cpp @@ -65,6 +65,9 @@ RecodeBeamSearch::RecodeBeamSearch(const UnicharCompress &recoder, int null_char , beam_size_(0) , top_code_(-1) , second_code_(-1) + , in_possible_diplopia_(false) + , first_diplopia_code_(-1) + , second_diplopia_code_(-1) , dict_(dict) , space_delimited_(true) , is_simple_text_(simple_text) diff --git a/src/lstm/recodebeam.h b/src/lstm/recodebeam.h index 62703e36c6..78ff911bd4 100644 --- a/src/lstm/recodebeam.h +++ b/src/lstm/recodebeam.h @@ -377,6 +377,9 @@ class TESS_API RecodeBeamSearch { // Searches the heap for an entry matching new_node, and updates the entry // with reshuffle if needed. Returns true if there was a match. bool UpdateHeapIfMatched(RecodeNode *new_node, RecodeHeap *heap); + // Determines if new node can be added to the heap for the current beam. + // Returns false if we are in possible diplopia situation + bool AddToHeapIsAllowed(RecodeNode *new_node); // Computes and returns the code-hash for the given code and prev. uint64_t ComputeCodeHash(int code, bool dup, const RecodeNode *prev) const; // Backtracks to extract the best path through the lattice that was built @@ -425,6 +428,11 @@ class TESS_API RecodeBeamSearch { bool is_simple_text_; // The encoded (class label) of the null/reject character. int null_char_; + // Variables used in tracking possible diplopia case + // Refer to ComputeTopN routine for more information + bool in_possible_diplopia_; + int first_diplopia_code_; + int second_diplopia_code_; }; } // namespace tesseract. From b29668135e26669486430add7f4cddb9d4934366 Mon Sep 17 00:00:00 2001 From: woodjohndavid <57116722+woodjohndavid@users.noreply.github.com> Date: Sat, 26 Jun 2021 14:19:01 -0700 Subject: [PATCH 2/8] Update recodebeam.cpp --- src/lstm/recodebeam.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/lstm/recodebeam.cpp b/src/lstm/recodebeam.cpp index 816198d43d..96e5fbb276 100644 --- a/src/lstm/recodebeam.cpp +++ b/src/lstm/recodebeam.cpp @@ -185,7 +185,7 @@ RecodeBeamSearch::combineSegmentedTimesteps( void RecodeBeamSearch::calculateCharBoundaries(std::vector *starts, std::vector *ends, std::vector *char_bounds_, int maxWidth) { - char_bounds_->push_back(0); + char_bounds_->push_back((*starts)[0]); for (int i = 0; i < ends->size(); ++i) { int middle = ((*starts)[i + 1] - (*ends)[i]) / 2; char_bounds_->push_back((*ends)[i] + middle); @@ -570,8 +570,8 @@ void RecodeBeamSearch::ExtractPathAsUnicharIds(const std::vectorunichar_id; if (unichar_id == UNICHAR_SPACE && !certs->empty() && best_nodes[t]->permuter != NO_PERM) { // All the rating and certainty go on the previous character except @@ -585,8 +585,9 @@ void RecodeBeamSearch::ExtractPathAsUnicharIds(const std::vectorpush_back(unichar_id); xcoords->push_back(t); - do { - double cert = best_nodes[t++]->certainty; + t++; + while (t < width && best_nodes[t]->duplicate) { + double cert = best_nodes[t]->certainty; // Special-case NO-PERM space to forget the certainty of the previous // nulls. See long comment in ContinueContext. if (cert < certainty || @@ -594,7 +595,8 @@ void RecodeBeamSearch::ExtractPathAsUnicharIds(const std::vectorduplicate); + t++; + } ends.push_back(t); certs->push_back(certainty); ratings->push_back(rating); From d9244fc063be8f435e49eca64e1bbd9632b98463 Mon Sep 17 00:00:00 2001 From: woodjohndavid <57116722+woodjohndavid@users.noreply.github.com> Date: Sun, 27 Jun 2021 10:07:24 -0700 Subject: [PATCH 3/8] diplopia interim solution --- src/lstm/jdwcrap.cpp | 1469 --------------------------------------- src/lstm/recodebeam.cpp | 44 ++ 2 files changed, 44 insertions(+), 1469 deletions(-) delete mode 100644 src/lstm/jdwcrap.cpp diff --git a/src/lstm/jdwcrap.cpp b/src/lstm/jdwcrap.cpp deleted file mode 100644 index d3df8e7f0d..0000000000 --- a/src/lstm/jdwcrap.cpp +++ /dev/null @@ -1,1469 +0,0 @@ -/////////////////////////////////////////////////////////////////////// -// File: recodebeam.cpp -// Description: Beam search to decode from the re-encoded CJK as a sequence of -// smaller numbers in place of a single large code. -// Author: Ray Smith -// -// (C) Copyright 2015, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -/////////////////////////////////////////////////////////////////////// - -#include "recodebeam.h" - -#include "networkio.h" -#include "pageres.h" -#include "unicharcompress.h" - -#include // for std::reverse -#include -#include -#include -#include -#include -#include - -namespace tesseract { - -// The beam width at each code position. -const int RecodeBeamSearch::kBeamWidths[RecodedCharID::kMaxCodeLen + 1] = { - 5, 10, 16, 16, 16, 16, 16, 16, 16, 16, -}; - -static const char *kNodeContNames[] = {"Anything", "OnlyDup", "NoDup"}; - -// Prints debug details of the node. -void RecodeNode::Print(int null_char, const UNICHARSET &unicharset, int depth) const { - if (code == null_char) { - tprintf("null_char"); - } else { - tprintf("label=%d, uid=%d=%s", code, unichar_id, unicharset.debug_str(unichar_id).c_str()); - } - tprintf(" score=%g, c=%g,%s%s%s perm=%d, hash=%" PRIx64, score, certainty, - start_of_dawg ? " DawgStart" : "", start_of_word ? " Start" : "", - end_of_word ? " End" : "", permuter, code_hash); - if (depth > 0 && prev != nullptr) { - tprintf(" prev:"); - prev->Print(null_char, unicharset, depth - 1); - } else { - tprintf("\n"); - } -} - -// Borrows the pointer, which is expected to survive until *this is deleted. -RecodeBeamSearch::RecodeBeamSearch(const UnicharCompress &recoder, int null_char, bool simple_text, - Dict *dict) - : recoder_(recoder) - , beam_size_(0) - , top_code_(-1) - , second_code_(-1) - , in_double_whammy_(false) // JDWTODO - , first_whammy_(-1) // JDWTODO - , second_whammy_(-1) // JDWTODO - , dict_(dict) - , space_delimited_(true) - , is_simple_text_(simple_text) - , null_char_(null_char) { - if (dict_ != nullptr && !dict_->IsSpaceDelimitedLang()) { - space_delimited_ = false; - } -} - -RecodeBeamSearch::~RecodeBeamSearch() { - for (auto data : beam_) { - delete data; - } - for (auto data : secondary_beam_) { - delete data; - } -} - -// Decodes the set of network outputs, storing the lattice internally. -void RecodeBeamSearch::Decode(const NetworkIO &output, double dict_ratio, double cert_offset, - double worst_dict_cert, const UNICHARSET *charset, - int lstm_choice_mode) { - beam_size_ = 0; - int width = output.Width(); - fprintf(stderr, "recodebeam decode #1 outputwidth= %i lstmchoice= %i \n", width, lstm_choice_mode); // JDWDEBUG - if (lstm_choice_mode) { - timesteps.clear(); - } - for (int t = 0; t < width; ++t) { - fprintf(stderr, "recodebeam decode #1 unicharid,code= timestep# %i \n", t); // JDWDEBUG - ComputeTopN(output.f(t), output.NumFeatures(), kBeamWidths[0]); - DecodeStep(output.f(t), t, dict_ratio, cert_offset, worst_dict_cert, charset); - if (lstm_choice_mode) { - SaveMostCertainChoices(output.f(t), output.NumFeatures(), charset, t); - } - } -} - -void RecodeBeamSearch::Decode(const GENERIC_2D_ARRAY &output, double dict_ratio, - double cert_offset, double worst_dict_cert, - const UNICHARSET *charset) { - fprintf(stderr, "recodebeam decode #2 \n"); // JDWDEBUG - beam_size_ = 0; - int width = output.dim1(); - for (int t = 0; t < width; ++t) { - fprintf(stderr, "recodebeam decode #@ unicharid,code= timestep# %i \n", t); // JDWDEBUG - ComputeTopN(output[t], output.dim2(), kBeamWidths[0]); - DecodeStep(output[t], t, dict_ratio, cert_offset, worst_dict_cert, charset); - } -} - -void RecodeBeamSearch::DecodeSecondaryBeams(const NetworkIO &output, double dict_ratio, - double cert_offset, double worst_dict_cert, - const UNICHARSET *charset, int lstm_choice_mode) { - fprintf(stderr, "recodebeam decode secondary \n"); // JDWDEBUG - for (auto data : secondary_beam_) { - delete data; - } - secondary_beam_.clear(); - if (character_boundaries_.size() < 2) { - return; - } - int width = output.Width(); - int bucketNumber = 0; - for (int t = 0; t < width; ++t) { - while ((bucketNumber + 1) < character_boundaries_.size() && - t >= character_boundaries_[bucketNumber + 1]) { - ++bucketNumber; - } - ComputeSecTopN(&(excludedUnichars)[bucketNumber], output.f(t), output.NumFeatures(), - kBeamWidths[0]); - DecodeSecondaryStep(output.f(t), t, dict_ratio, cert_offset, worst_dict_cert, charset); - } -} - -void RecodeBeamSearch::SaveMostCertainChoices(const float *outputs, int num_outputs, - const UNICHARSET *charset, int xCoord) { - fprintf(stderr, "recodebeam savemostcertainchoices \n"); // JDWDEBUG - std::vector> choices; - for (int i = 0; i < num_outputs; ++i) { - if (outputs[i] >= 0.01f) { - const char *character; - if (i + 2 >= num_outputs) { - character = ""; - } else if (i > 0) { - character = charset->id_to_unichar_ext(i + 2); - } else { - character = charset->id_to_unichar_ext(i); - } - size_t pos = 0; - // order the possible choices within one timestep - // beginning with the most likely - while (choices.size() > pos && choices[pos].second > outputs[i]) { - pos++; - } - choices.insert(choices.begin() + pos, std::pair(character, outputs[i])); - } - } - timesteps.push_back(choices); -} - -void RecodeBeamSearch::segmentTimestepsByCharacters() { - for (int i = 1; i < character_boundaries_.size(); ++i) { - std::vector>> segment; - for (int j = character_boundaries_[i - 1]; j < character_boundaries_[i]; ++j) { - segment.push_back(timesteps[j]); - } - segmentedTimesteps.push_back(segment); - } -} -std::vector>> -RecodeBeamSearch::combineSegmentedTimesteps( - std::vector>>> *segmentedTimesteps) { - std::vector>> combined_timesteps; - for (auto &segmentedTimestep : *segmentedTimesteps) { - for (auto &j : segmentedTimestep) { - combined_timesteps.push_back(j); - } - } - return combined_timesteps; -} - -void RecodeBeamSearch::calculateCharBoundaries(std::vector *starts, std::vector *ends, - std::vector *char_bounds_, int maxWidth) { - fprintf(stderr, "recodebeam calculatecharboundaries maxwidth= %i \n", maxWidth); // JDWDEBUG - // char_bounds_->push_back(0); // JDWTODO - char_bounds_->push_back((*starts)[0]); // JDWTODO - for (int i = 0; i < ends->size(); ++i) { - int middle = ((*starts)[i + 1] - (*ends)[i]) / 2; - fprintf(stderr, "%s %i %i %i \n", "calculatecharboundaries start&end&middle=", (*starts)[i + 1], (*ends)[i], middle); - char_bounds_->push_back((*ends)[i] + middle); - } - char_bounds_->pop_back(); - char_bounds_->push_back(maxWidth); -} - -// Returns the best path as labels/scores/xcoords similar to simple CTC. -void RecodeBeamSearch::ExtractBestPathAsLabels(std::vector *labels, - std::vector *xcoords) const { - fprintf(stderr, "recodebeam extractbestpathaslabels \n"); // JDWDEBUG - labels->clear(); - xcoords->clear(); - std::vector best_nodes; - ExtractBestPaths(&best_nodes, nullptr); - // Now just run CTC on the best nodes. - int t = 0; - int width = best_nodes.size(); - while (t < width) { - int label = best_nodes[t]->code; - if (label != null_char_) { - labels->push_back(label); - xcoords->push_back(t); - } - while (++t < width && !is_simple_text_ && best_nodes[t]->code == label) { - } - } - xcoords->push_back(width); -} - -// Returns the best path as unichar-ids/certs/ratings/xcoords skipping -// duplicates, nulls and intermediate parts. -void RecodeBeamSearch::ExtractBestPathAsUnicharIds(bool debug, const UNICHARSET *unicharset, - std::vector *unichar_ids, - std::vector *certs, - std::vector *ratings, - std::vector *xcoords) const { - fprintf(stderr, "recodebeam extractbestpathasunicharids \n"); // JDWDEBUG - std::vector best_nodes; - ExtractBestPaths(&best_nodes, nullptr); - ExtractPathAsUnicharIds(best_nodes, unichar_ids, certs, ratings, xcoords); - if (debug) { - DebugPath(unicharset, best_nodes); - DebugUnicharPath(unicharset, best_nodes, *unichar_ids, *certs, *ratings, *xcoords); - } -} - -// Returns the best path as a set of WERD_RES. -void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX &line_box, float scale_factor, bool debug, - const UNICHARSET *unicharset, - PointerVector *words, - int lstm_choice_mode) { -fprintf(stderr, "recodebeam extractbestpathaswords \n"); // JDWDEBUG -words->truncate(0); - std::vector unichar_ids; - std::vector certs; - std::vector ratings; - std::vector xcoords; - std::vector best_nodes; - std::vector second_nodes; - character_boundaries_.clear(); - ExtractBestPaths(&best_nodes, &second_nodes); - if (debug) { - DebugPath(unicharset, best_nodes); - ExtractPathAsUnicharIds(second_nodes, &unichar_ids, &certs, &ratings, &xcoords); - tprintf("\nSecond choice path:\n"); - DebugUnicharPath(unicharset, second_nodes, unichar_ids, certs, ratings, xcoords); - } - // If lstm choice mode is required in granularity level 2, it stores the x - // Coordinates of every chosen character, to match the alternative choices to - // it. - ExtractPathAsUnicharIds(best_nodes, &unichar_ids, &certs, &ratings, &xcoords, - &character_boundaries_); - int num_ids = unichar_ids.size(); - // JDWDEBUG START - for (int i = 0; i < num_ids; i++){ - const char *c = unicharset->id_to_unichar_ext(unichar_ids[i]); - fprintf(stderr, "recodebeam extractbestpathaswords unichar,unicharid= %s %i \n", c, unichar_ids[i]); - } - // JDWDEBUG END - if (debug) { - DebugUnicharPath(unicharset, best_nodes, unichar_ids, certs, ratings, xcoords); - } - // Convert labels to unichar-ids. - int word_end = 0; - float prev_space_cert = 0.0f; - for (int word_start = 0; word_start < num_ids; word_start = word_end) { - for (word_end = word_start + 1; word_end < num_ids; ++word_end) { - // A word is terminated when a space character or start_of_word flag is - // hit. We also want to force a separate word for every non - // space-delimited character when not in a dictionary context. - if (unichar_ids[word_end] == UNICHAR_SPACE) { - break; - } - int index = xcoords[word_end]; - if (best_nodes[index]->start_of_word) { - break; - } - if (best_nodes[index]->permuter == TOP_CHOICE_PERM && - (!unicharset->IsSpaceDelimited(unichar_ids[word_end]) || - !unicharset->IsSpaceDelimited(unichar_ids[word_end - 1]))) { - break; - } - } - float space_cert = 0.0f; - if (word_end < num_ids && unichar_ids[word_end] == UNICHAR_SPACE) { - space_cert = certs[word_end]; - } - bool leading_space = word_start > 0 && unichar_ids[word_start - 1] == UNICHAR_SPACE; - // Create a WERD_RES for the output word. - WERD_RES *word_res = - InitializeWord(leading_space, line_box, word_start, word_end, - std::min(space_cert, prev_space_cert), unicharset, xcoords, scale_factor); - for (int i = word_start; i < word_end; ++i) { - auto *choices = new BLOB_CHOICE_LIST; - BLOB_CHOICE_IT bc_it(choices); - auto *choice = new BLOB_CHOICE(unichar_ids[i], ratings[i], certs[i], -1, 1.0f, - static_cast(INT16_MAX), 0.0f, BCC_STATIC_CLASSIFIER); - int col = i - word_start; - choice->set_matrix_cell(col, col); - bc_it.add_after_then_move(choice); - word_res->ratings->put(col, col, choices); - } - int index = xcoords[word_end - 1]; - word_res->FakeWordFromRatings(best_nodes[index]->permuter); - words->push_back(word_res); - prev_space_cert = space_cert; - if (word_end < num_ids && unichar_ids[word_end] == UNICHAR_SPACE) { - ++word_end; - } - } -} - -struct greater_than { - inline bool operator()(const RecodeNode *&node1, const RecodeNode *&node2) { - return (node1->score > node2->score); - } -}; - -void RecodeBeamSearch::PrintBeam2(bool uids, int num_outputs, const UNICHARSET *charset, - bool secondary) const { - std::vector> topology; - std::unordered_set visited; - const std::vector &beam = !secondary ? beam_ : secondary_beam_; - // create the topology - for (int step = beam.size() - 1; step >= 0; --step) { - std::vector layer; - topology.push_back(layer); - } - // fill the topology with depths first - for (int step = beam.size() - 1; step >= 0; --step) { - std::vector &heaps = beam.at(step)->beams_->heap(); - for (auto node : heaps) { - int backtracker = 0; - const RecodeNode *curr = &node.data(); - while (curr != nullptr && !visited.count(curr)) { - visited.insert(curr); - topology[step - backtracker].push_back(curr); - curr = curr->prev; - ++backtracker; - } - } - } - int ct = 0; - int cb = 1; - for (std::vector layer : topology) { - if (cb >= character_boundaries_.size()) { - break; - } - if (ct == character_boundaries_[cb]) { - tprintf("***\n"); - ++cb; - } - for (const RecodeNode *node : layer) { - const char *code; - int intCode; - if (node->unichar_id != INVALID_UNICHAR_ID) { - code = charset->id_to_unichar(node->unichar_id); - intCode = node->unichar_id; - } else if (node->code == null_char_) { - intCode = 0; - code = " "; - } else { - intCode = 666; - code = "*"; - } - int intPrevCode = 0; - const char *prevCode; - float prevScore = 0; - if (node->prev != nullptr) { - prevScore = node->prev->score; - if (node->prev->unichar_id != INVALID_UNICHAR_ID) { - prevCode = charset->id_to_unichar(node->prev->unichar_id); - intPrevCode = node->prev->unichar_id; - } else if (node->code == null_char_) { - intPrevCode = 0; - prevCode = " "; - } else { - prevCode = "*"; - intPrevCode = 666; - } - } else { - prevCode = " "; - } - if (uids) { - tprintf("%x(|)%f(>)%x(|)%f\n", intPrevCode, prevScore, intCode, node->score); - } else { - tprintf("%s(|)%f(>)%s(|)%f\n", prevCode, prevScore, code, node->score); - } - } - tprintf("-\n"); - ++ct; - } - tprintf("***\n"); -} - -void RecodeBeamSearch::extractSymbolChoices(const UNICHARSET *unicharset) { - if (character_boundaries_.size() < 2) { - return; - } - fprintf(stderr, "recodebeam extractsymbolchoices \n"); // JDWDEBUG - // For the first iteration the original beam is analyzed. After that a - // new beam is calculated based on the results from the original beam. - std::vector ¤tBeam = secondary_beam_.empty() ? beam_ : secondary_beam_; - character_boundaries_[0] = 0; - for (int j = 1; j < character_boundaries_.size(); ++j) { - std::vector unichar_ids; - std::vector certs; - std::vector ratings; - std::vector xcoords; - int backpath = character_boundaries_[j] - character_boundaries_[j - 1]; - std::vector &heaps = - currentBeam.at(character_boundaries_[j] - 1)->beams_->heap(); - std::vector best_nodes; - std::vector best; - // Scan the segmented node chain for valid unichar ids. - for (auto entry : heaps) { - bool validChar = false; - int backcounter = 0; - const RecodeNode *node = &entry.data(); - while (node != nullptr && backcounter < backpath) { - if (node->code != null_char_ && node->unichar_id != INVALID_UNICHAR_ID) { - validChar = true; - break; - } - node = node->prev; - ++backcounter; - } - if (validChar) { - best.push_back(&entry.data()); - } - } - // find the best rated segmented node chain and extract the unichar id. - if (!best.empty()) { - std::sort(best.begin(), best.end(), greater_than()); - ExtractPath(best[0], &best_nodes, backpath); - ExtractPathAsUnicharIds(best_nodes, &unichar_ids, &certs, &ratings, &xcoords); - } - if (!unichar_ids.empty()) { - int bestPos = 0; - for (int i = 1; i < unichar_ids.size(); ++i) { - if (ratings[i] < ratings[bestPos]) { - bestPos = i; - } - } - // TODO: bestCode is currently unused (see commit 2dd5d0d60). - int bestCode = -10; - for (auto &node : best_nodes) { - if (node->unichar_id == unichar_ids[bestPos]) { - bestCode = node->code; - } - } - // Exclude the best choice for the followup decoding. - std::unordered_set excludeCodeList; - for (auto &best_node : best_nodes) { - if (best_node->code != null_char_) { - excludeCodeList.insert(best_node->code); - } - } - if (j - 1 < excludedUnichars.size()) { - for (auto elem : excludeCodeList) { - excludedUnichars[j - 1].insert(elem); - } - } else { - excludedUnichars.push_back(excludeCodeList); - } - // Save the best choice for the choice iterator. - if (j - 1 < ctc_choices.size()) { - int id = unichar_ids[bestPos]; - const char *result = unicharset->id_to_unichar_ext(id); - float rating = ratings[bestPos]; - ctc_choices[j - 1].push_back(std::pair(result, rating)); - } else { - std::vector> choice; - int id = unichar_ids[bestPos]; - const char *result = unicharset->id_to_unichar_ext(id); - float rating = ratings[bestPos]; - choice.emplace_back(result, rating); - ctc_choices.push_back(choice); - } - // fill the blank spot with an empty array - } else { - if (j - 1 >= excludedUnichars.size()) { - std::unordered_set excludeCodeList; - excludedUnichars.push_back(excludeCodeList); - } - if (j - 1 >= ctc_choices.size()) { - std::vector> choice; - ctc_choices.push_back(choice); - } - } - } - for (auto data : secondary_beam_) { - delete data; - } - secondary_beam_.clear(); -} - -// Generates debug output of the content of the beams after a Decode. -void RecodeBeamSearch::DebugBeams(const UNICHARSET &unicharset) const { - fprintf(stderr, "recodebeam debugbeams \n"); // JDWDEBUG - for (int p = 0; p < beam_size_; ++p) { - for (int d = 0; d < 2; ++d) { - for (int c = 0; c < NC_COUNT; ++c) { - auto cont = static_cast(c); - int index = BeamIndex(d, cont, 0); - if (beam_[p]->beams_[index].empty()) { - continue; - } - // Print all the best scoring nodes for each unichar found. - tprintf("Position %d: %s+%s beam\n", p, d ? "Dict" : "Non-Dict", kNodeContNames[c]); - DebugBeamPos(unicharset, beam_[p]->beams_[index]); - } - } - } -} - -// Generates debug output of the content of a single beam position. -void RecodeBeamSearch::DebugBeamPos(const UNICHARSET &unicharset, const RecodeHeap &heap) const { - std::vector unichar_bests(unicharset.size()); - const RecodeNode *null_best = nullptr; - int heap_size = heap.size(); - for (int i = 0; i < heap_size; ++i) { - const RecodeNode *node = &heap.get(i).data(); - if (node->unichar_id == INVALID_UNICHAR_ID) { - if (null_best == nullptr || null_best->score < node->score) { - null_best = node; - } - } else { - if (unichar_bests[node->unichar_id] == nullptr || - unichar_bests[node->unichar_id]->score < node->score) { - unichar_bests[node->unichar_id] = node; - } - } - } - for (auto &unichar_best : unichar_bests) { - if (unichar_best != nullptr) { - const RecodeNode &node = *unichar_best; - node.Print(null_char_, unicharset, 1); - } - } - if (null_best != nullptr) { - null_best->Print(null_char_, unicharset, 1); - } -} - -// Returns the given best_nodes as unichar-ids/certs/ratings/xcoords skipping -// duplicates, nulls and intermediate parts. -/* static */ -void RecodeBeamSearch::ExtractPathAsUnicharIds(const std::vector &best_nodes, - std::vector *unichar_ids, - std::vector *certs, - std::vector *ratings, - std::vector *xcoords, - std::vector *character_boundaries) { - unichar_ids->clear(); - certs->clear(); - ratings->clear(); - xcoords->clear(); - std::vector starts; - std::vector ends; - // Backtrack extracting only valid, non-duplicate unichar-ids. - fprintf(stderr, "recodebeam extractpathasunicharids \n"); // JDWDEBUG - int t = 0; - int width = best_nodes.size(); - fprintf(stderr, "%s %i \n", "extractpathasunicharids width=", width); - while (t < width) { - double certainty = 0.0; - double rating = 0.0; - bool foundInvalid = false; // JDWDEBUG - while (t < width && best_nodes[t]->unichar_id == INVALID_UNICHAR_ID) { - fprintf(stderr, "%s %i \n", "extractpathasunicharids bypass invalid unicharid code=", best_nodes[t]->code); // JDWDEBUG - foundInvalid = true; // JDWDEBUG - double cert = best_nodes[t++]->certainty; - if (cert < certainty) { - certainty = cert; - } - rating -= cert; - } - // starts.push_back(t); // JDWTODO - if (t < width) { - starts.push_back(t); // JDWTODO - fprintf(stderr, "%s %i %i %f \n", "extractpathasunicharids valid unicharid,code,score=", best_nodes[t]->unichar_id, best_nodes[t]->code, best_nodes[t]->score); // JDWDEBUG - // JDWDEBUG START - if (!foundInvalid) - fprintf(stderr, "%s \n", "extractpathasunicharids foundvalid with no invalid"); - // JDWDEBUG END - int unichar_id = best_nodes[t]->unichar_id; - if (unichar_id == UNICHAR_SPACE && !certs->empty() && best_nodes[t]->permuter != NO_PERM) { - // All the rating and certainty go on the previous character except - // for the space itself. - fprintf(stderr, "%s %i \n", "extractpathasunicharids unicharid space", best_nodes[t]->code); // JDWDEBUG - if (certainty < certs->back()) { - certs->back() = certainty; - } - ratings->back() += rating; - certainty = 0.0; - rating = 0.0; - } - xcoords->push_back(t); - unichar_ids->push_back(unichar_id); - t++; // JDWTODO - // do { // JDWTODO - while (t < width && best_nodes[t]->duplicate) { // JDWTODO - // double cert = best_nodes[t++]->certainty; // JDWTODO - double cert = best_nodes[t]->certainty; // JDWTODO - // Special-case NO-PERM space to forget the certainty of the previous - // nulls. See long comment in ContinueContext. - if (cert < certainty || - (unichar_id == UNICHAR_SPACE && best_nodes[t - 1]->permuter == NO_PERM)) { - certainty = cert; - } - rating -= cert; - // JDWDEBUG START - if (t < width && best_nodes[t]->duplicate) - fprintf(stderr, "%s %i %i \n", "extractpathasunicharids duplicate removed unicharid,code=", best_nodes[t]->unichar_id, best_nodes[t]->code); // JDWDEBUG - // JDWDEBUG END - t++; // JDWTODO - } // JDWTODO - // } while (t < width && best_nodes[t]->duplicate); // JDWTODO - ends.push_back(t); - certs->push_back(certainty); - ratings->push_back(rating); - } else if (!certs->empty()) { - if (certainty < certs->back()) { - certs->back() = certainty; - } - ratings->back() += rating; - } - } - starts.push_back(width); - if (character_boundaries != nullptr) { - calculateCharBoundaries(&starts, &ends, character_boundaries, width); - } - xcoords->push_back(width); -} - -// Sets up a word with the ratings matrix and fake blobs with boxes in the -// right places. -WERD_RES *RecodeBeamSearch::InitializeWord(bool leading_space, const TBOX &line_box, int word_start, - int word_end, float space_certainty, - const UNICHARSET *unicharset, - const std::vector &xcoords, float scale_factor) { - // Make a fake blob for each non-zero label. - fprintf(stderr, "recodebeam initializeword scalefactor= %f \n", scale_factor); // JDWDEBUG - fprintf(stderr, "recodebeam initializeword start,end= %i %i \n", word_start, word_end); // JDWDEBUG - C_BLOB_LIST blobs; - C_BLOB_IT b_it(&blobs); - for (int i = word_start; i < word_end; ++i) { - if (character_boundaries_.size() > (i + 1)) { - TBOX box(static_cast(std::floor(character_boundaries_[i] * scale_factor)) + - line_box.left(), - line_box.bottom(), - static_cast(std::ceil(character_boundaries_[i + 1] * scale_factor)) + - line_box.left(), - line_box.top()); - // JDWDEBUG START - std::string debug_str; - debug_str = "fake boxblob for werd being built in recodebeam "; - box.print_to_str(debug_str); - fprintf(stderr, "%s %i %i %i \n", debug_str.c_str(), i, character_boundaries_[i], character_boundaries_[i + 1]); - // JDWDEBUG END - b_it.add_after_then_move(C_BLOB::FakeBlob(box)); - } - } - // Make a fake word from the blobs. - WERD *word = new WERD(&blobs, leading_space, nullptr); - // Make a WERD_RES from the word. - auto *word_res = new WERD_RES(word); - word_res->end = word_end - word_start + leading_space; - word_res->uch_set = unicharset; - word_res->combination = true; // Give it ownership of the word. - word_res->space_certainty = space_certainty; - word_res->ratings = new MATRIX(word_end - word_start, 1); - return word_res; -} - -// Fills top_n_flags_ with bools that are true iff the corresponding output -// is one of the top_n. -void RecodeBeamSearch::ComputeTopN(const float *outputs, int num_outputs, int top_n) { - fprintf(stderr, "recodebeam computetopn \n"); // JDWDEBUG - top_n_flags_.resize(num_outputs, TN_ALSO_RAN); - top_code_ = -1; - second_code_ = -1; - top_heap_.clear(); - for (int i = 0; i < num_outputs; ++i) { - if (top_heap_.size() < top_n || outputs[i] > top_heap_.PeekTop().key()) { - TopPair entry(outputs[i], i); - top_heap_.Push(&entry); - if (top_heap_.size() > top_n) { - top_heap_.Pop(&entry); - } - } - } - - float top_key = 0.0F; // JDWTODO - float second_key = 0.0F; // JDWTODO - bool found_first_whammy = false; // JDWTODO - bool found_second_whammy = false; // JDWTODO - while (!top_heap_.empty()) { - TopPair entry; - top_heap_.Pop(&entry); - if (in_double_whammy_ && entry.data() == first_whammy_) // JDWTODO - found_first_whammy = true; // JDWTODO - if (in_double_whammy_ && entry.data() == second_whammy_) // JDWTODO - found_second_whammy = true; // JDWTODO - if (top_heap_.size() > 1) { - top_n_flags_[entry.data()] = TN_TOPN; - fprintf(stderr, "recodebeam computetopn topn code,key= %i %f \n", entry.data(), entry.key()); // JDWDEBUG - } else { - top_n_flags_[entry.data()] = TN_TOP2; - fprintf(stderr, "recodebeam computetopn top2 code,key= %i %f \n", entry.data(), entry.key()); // JDWDEBUG - if (top_heap_.empty()) { - top_code_ = entry.data(); - top_key = entry.key(); // JDWTODO - } else { - second_code_ = entry.data(); - second_key = entry.key(); // JDWTODO - } - } - } - - // JDWTODO START - if (in_double_whammy_) { - if (!found_first_whammy && !found_second_whammy){ - in_double_whammy_ = false; - first_whammy_ = -1; - second_whammy_ = -1; - fprintf(stderr, "recodebeam computetopn double whammy cleared unicharid,code= \n"); - } - } - // JDWTODO END - - // JDWTODO START - if (!in_double_whammy_) { - if (top_code_ != null_char_ && second_code_ != null_char_ && top_key > 0.25F && second_key > 0.25F){ - in_double_whammy_ = true; - first_whammy_ = top_code_; - second_whammy_ = second_code_; - fprintf(stderr, "recodebeam computetopn double whammy found unicharid,code= %f %f \n", top_key, second_key); - } - } - // JDWTODO END - - fprintf(stderr, "recodebeam computetopn unicharid,code= top_code,second_code= %i %i \n", top_code_, second_code_); // JDWDEBUG - top_n_flags_[null_char_] = TN_TOP2; -} - -void RecodeBeamSearch::ComputeSecTopN(std::unordered_set *exList, const float *outputs, - int num_outputs, int top_n) { - fprintf(stderr, "recodebeam computesectopn \n"); // JDWDEBUG - top_n_flags_.resize(num_outputs, TN_ALSO_RAN); - top_code_ = -1; - second_code_ = -1; - top_heap_.clear(); - for (int i = 0; i < num_outputs; ++i) { - if ((top_heap_.size() < top_n || outputs[i] > top_heap_.PeekTop().key()) && !exList->count(i)) { - TopPair entry(outputs[i], i); - top_heap_.Push(&entry); - if (top_heap_.size() > top_n) { - top_heap_.Pop(&entry); - } - } - } - while (!top_heap_.empty()) { - TopPair entry; - top_heap_.Pop(&entry); - if (top_heap_.size() > 1) { - top_n_flags_[entry.data()] = TN_TOPN; - } else { - top_n_flags_[entry.data()] = TN_TOP2; - if (top_heap_.empty()) { - top_code_ = entry.data(); - } else { - second_code_ = entry.data(); - } - } - } - top_n_flags_[null_char_] = TN_TOP2; -} - -// Adds the computation for the current time-step to the beam. Call at each -// time-step in sequence from left to right. outputs is the activation vector -// for the current timestep. -void RecodeBeamSearch::DecodeStep(const float *outputs, int t, double dict_ratio, - double cert_offset, double worst_dict_cert, - const UNICHARSET *charset, bool debug) { - fprintf(stderr, "recodebeam decodestep timestep= %i \n", t); // JDWDEBUG - if (t == beam_.size()) { - beam_.push_back(new RecodeBeam); - } - RecodeBeam *step = beam_[t]; - beam_size_ = t + 1; - step->Clear(); - if (t == 0) { - // The first step can only use singles and initials. - ContinueContext(nullptr, BeamIndex(false, NC_ANYTHING, 0), outputs, TN_TOP2, charset, - dict_ratio, cert_offset, worst_dict_cert, step); - if (dict_ != nullptr) { - ContinueContext(nullptr, BeamIndex(true, NC_ANYTHING, 0), outputs, TN_TOP2, charset, - dict_ratio, cert_offset, worst_dict_cert, step); - } - } else { - RecodeBeam *prev = beam_[t - 1]; - if (debug) { - int beam_index = BeamIndex(true, NC_ANYTHING, 0); - for (int i = prev->beams_[beam_index].size() - 1; i >= 0; --i) { - std::vector path; - ExtractPath(&prev->beams_[beam_index].get(i).data(), &path); - tprintf("Step %d: Dawg beam %d:\n", t, i); - DebugPath(charset, path); - } - beam_index = BeamIndex(false, NC_ANYTHING, 0); - for (int i = prev->beams_[beam_index].size() - 1; i >= 0; --i) { - std::vector path; - ExtractPath(&prev->beams_[beam_index].get(i).data(), &path); - tprintf("Step %d: Non-Dawg beam %d:\n", t, i); - DebugPath(charset, path); - } - } - int total_beam = 0; - // Work through the scores by group (top-2, top-n, the rest) while the beam - // is empty. This enables extending the context using only the top-n results - // first, which may have an empty intersection with the valid codes, so we - // fall back to the rest if the beam is empty. - for (int tn = 0; tn < TN_COUNT && total_beam == 0; ++tn) { - auto top_n = static_cast(tn); - for (int index = 0; index < kNumBeams; ++index) { - // Working backwards through the heaps doesn't guarantee that we see the - // best first, but it comes before a lot of the worst, so it is slightly - // more efficient than going forwards. - for (int i = prev->beams_[index].size() - 1; i >= 0; --i) { - ContinueContext(&prev->beams_[index].get(i).data(), index, outputs, top_n, charset, - dict_ratio, cert_offset, worst_dict_cert, step); - } - } - for (int index = 0; index < kNumBeams; ++index) { - if (ContinuationFromBeamsIndex(index) == NC_ANYTHING) { - total_beam += step->beams_[index].size(); - } - } - } - // Special case for the best initial dawg. Push it on the heap if good - // enough, but there is only one, so it doesn't blow up the beam. - for (int c = 0; c < NC_COUNT; ++c) { - if (step->best_initial_dawgs_[c].code >= 0) { - fprintf(stderr, "recodebeam decodestep special case initial dawg %i \n", t); // JDWDEBUG - int index = BeamIndex(true, static_cast(c), 0); - RecodeHeap *dawg_heap = &step->beams_[index]; - PushHeapIfBetter(kBeamWidths[0], &step->best_initial_dawgs_[c], dawg_heap); - } - } - } -} - -void RecodeBeamSearch::DecodeSecondaryStep(const float *outputs, int t, double dict_ratio, - double cert_offset, double worst_dict_cert, - const UNICHARSET *charset, bool debug) { - fprintf(stderr, "recodebeam decodesecondarystep \n"); // JDWDEBUG - if (t == secondary_beam_.size()) { - secondary_beam_.push_back(new RecodeBeam); - } - RecodeBeam *step = secondary_beam_[t]; - step->Clear(); - if (t == 0) { - // The first step can only use singles and initials. - ContinueContext(nullptr, BeamIndex(false, NC_ANYTHING, 0), outputs, TN_TOP2, charset, - dict_ratio, cert_offset, worst_dict_cert, step); - if (dict_ != nullptr) { - ContinueContext(nullptr, BeamIndex(true, NC_ANYTHING, 0), outputs, TN_TOP2, charset, - dict_ratio, cert_offset, worst_dict_cert, step); - } - } else { - RecodeBeam *prev = secondary_beam_[t - 1]; - if (debug) { - int beam_index = BeamIndex(true, NC_ANYTHING, 0); - for (int i = prev->beams_[beam_index].size() - 1; i >= 0; --i) { - std::vector path; - ExtractPath(&prev->beams_[beam_index].get(i).data(), &path); - tprintf("Step %d: Dawg beam %d:\n", t, i); - DebugPath(charset, path); - } - beam_index = BeamIndex(false, NC_ANYTHING, 0); - for (int i = prev->beams_[beam_index].size() - 1; i >= 0; --i) { - std::vector path; - ExtractPath(&prev->beams_[beam_index].get(i).data(), &path); - tprintf("Step %d: Non-Dawg beam %d:\n", t, i); - DebugPath(charset, path); - } - } - int total_beam = 0; - // Work through the scores by group (top-2, top-n, the rest) while the beam - // is empty. This enables extending the context using only the top-n results - // first, which may have an empty intersection with the valid codes, so we - // fall back to the rest if the beam is empty. - for (int tn = 0; tn < TN_COUNT && total_beam == 0; ++tn) { - auto top_n = static_cast(tn); - for (int index = 0; index < kNumBeams; ++index) { - // Working backwards through the heaps doesn't guarantee that we see the - // best first, but it comes before a lot of the worst, so it is slightly - // more efficient than going forwards. - for (int i = prev->beams_[index].size() - 1; i >= 0; --i) { - ContinueContext(&prev->beams_[index].get(i).data(), index, outputs, top_n, charset, - dict_ratio, cert_offset, worst_dict_cert, step); - } - } - for (int index = 0; index < kNumBeams; ++index) { - if (ContinuationFromBeamsIndex(index) == NC_ANYTHING) { - total_beam += step->beams_[index].size(); - } - } - } - // Special case for the best initial dawg. Push it on the heap if good - // enough, but there is only one, so it doesn't blow up the beam. - for (int c = 0; c < NC_COUNT; ++c) { - if (step->best_initial_dawgs_[c].code >= 0) { - int index = BeamIndex(true, static_cast(c), 0); - RecodeHeap *dawg_heap = &step->beams_[index]; - PushHeapIfBetter(kBeamWidths[0], &step->best_initial_dawgs_[c], dawg_heap); - } - } - } -} - -// Adds to the appropriate beams the legal (according to recoder) -// continuations of context prev, which is of the given length, using the -// given network outputs to provide scores to the choices. Uses only those -// choices for which top_n_flags[index] == top_n_flag. -void RecodeBeamSearch::ContinueContext(const RecodeNode *prev, int index, const float *outputs, - TopNState top_n_flag, const UNICHARSET *charset, - double dict_ratio, double cert_offset, - double worst_dict_cert, RecodeBeam *step) { - // JDWDEBUG START - if (prev != nullptr) { - const char *ucc = charset->id_to_unichar_ext(prev->unichar_id); - fprintf(stderr, "recodebeam continuecontext unicharid,code,unichar,index,topn = %i %i %s %i %i \n", prev->unichar_id, prev->code, ucc, index, top_n_flag); - } - else { - fprintf(stderr, "recodebeam continuecontext top prev null index,topn = %i %i \n", index, top_n_flag); - } - // JDWDEBUG END - RecodedCharID prefix; - RecodedCharID full_code; - const RecodeNode *previous = prev; - int length = LengthFromBeamsIndex(index); - bool use_dawgs = IsDawgFromBeamsIndex(index); - NodeContinuation prev_cont = ContinuationFromBeamsIndex(index); - for (int p = length - 1; p >= 0; --p, previous = previous->prev) { - while (previous != nullptr && (previous->duplicate || previous->code == null_char_)) { - fprintf(stderr, "recodebeam continuecontext stepping back code= %i \n", previous->code); - previous = previous->prev; - } - if (previous != nullptr) { - prefix.Set(p, previous->code); - full_code.Set(p, previous->code); - } - } - if (prev != nullptr && !is_simple_text_) { - if (top_n_flags_[prev->code] == top_n_flag) { - if (prev_cont != NC_NO_DUP) { - float cert = NetworkIO::ProbToCertainty(outputs[prev->code]) + cert_offset; - fprintf(stderr, "recodebeam continuecontext before pushdupornodawgifbetter unicharid,dup= %i %i \n", prev->unichar_id, 1); - PushDupOrNoDawgIfBetter(length, true, prev->code, prev->unichar_id, cert, worst_dict_cert, - dict_ratio, use_dawgs, NC_ANYTHING, prev, step); - } - if (prev_cont == NC_ANYTHING && top_n_flag == TN_TOP2 && prev->code != null_char_) { - float cert = - NetworkIO::ProbToCertainty(outputs[prev->code] + outputs[null_char_]) + cert_offset; - fprintf(stderr, "recodebeam continuecontext before pushdupornodawgifbetter unicharid,dup= %i %i \n", prev->unichar_id, 1); - PushDupOrNoDawgIfBetter(length, true, prev->code, prev->unichar_id, cert, worst_dict_cert, - dict_ratio, use_dawgs, NC_NO_DUP, prev, step); - } - } - if (prev_cont == NC_ONLY_DUP) { - return; - } - if (prev->code != null_char_ && length > 0 && top_n_flags_[null_char_] == top_n_flag) { - // Allow nulls within multi code sequences, as the nulls within are not - // explicitly included in the code sequence. - float cert = NetworkIO::ProbToCertainty(outputs[null_char_]) + cert_offset; - fprintf(stderr, "recodebeam continuecontext before pushdupornodawgifbetter unicharid,dup= %i %i \n", INVALID_UNICHAR_ID, 0); - PushDupOrNoDawgIfBetter(length, false, null_char_, INVALID_UNICHAR_ID, cert, worst_dict_cert, - dict_ratio, use_dawgs, NC_ANYTHING, prev, step); - } - } - const std::vector *final_codes = recoder_.GetFinalCodes(prefix); - if (final_codes != nullptr) { - for (int code : *final_codes) { - if (top_n_flags_[code] != top_n_flag) { - continue; - } - if (prev != nullptr && prev->code == code && !is_simple_text_) { - continue; - } - float cert = NetworkIO::ProbToCertainty(outputs[code]) + cert_offset; - if (cert < kMinCertainty && code != null_char_) { - continue; - } - full_code.Set(length, code); - int unichar_id = recoder_.DecodeUnichar(full_code); - // Map the null char to INVALID. - if (length == 0 && code == null_char_) { - unichar_id = INVALID_UNICHAR_ID; - } - if (unichar_id != INVALID_UNICHAR_ID && charset != nullptr && - !charset->get_enabled(unichar_id)) { - continue; // disabled by whitelist/blacklist - } - ContinueUnichar(code, unichar_id, cert, worst_dict_cert, dict_ratio, use_dawgs, NC_ANYTHING, - prev, step); - if (top_n_flag == TN_TOP2 && code != null_char_) { - float prob = outputs[code] + outputs[null_char_]; - if (prev != nullptr && prev_cont == NC_ANYTHING && prev->code != null_char_ && - ((prev->code == top_code_ && code == second_code_) || - (code == top_code_ && prev->code == second_code_))) { - prob += outputs[prev->code]; - } - float cert = NetworkIO::ProbToCertainty(prob) + cert_offset; - ContinueUnichar(code, unichar_id, cert, worst_dict_cert, dict_ratio, use_dawgs, NC_ONLY_DUP, - prev, step); - } - } - } - const std::vector *next_codes = recoder_.GetNextCodes(prefix); - if (next_codes != nullptr) { - for (int code : *next_codes) { - if (top_n_flags_[code] != top_n_flag) { - continue; - } - if (prev != nullptr && prev->code == code && !is_simple_text_) { - continue; - } - float cert = NetworkIO::ProbToCertainty(outputs[code]) + cert_offset; - fprintf(stderr, "recodebeam continuecontext before pushdupornodawgifbetter unicharid,dup= %i %i \n", INVALID_UNICHAR_ID, 0); - PushDupOrNoDawgIfBetter(length + 1, false, code, INVALID_UNICHAR_ID, cert, worst_dict_cert, - dict_ratio, use_dawgs, NC_ANYTHING, prev, step); - if (top_n_flag == TN_TOP2 && code != null_char_) { - float prob = outputs[code] + outputs[null_char_]; - if (prev != nullptr && prev_cont == NC_ANYTHING && prev->code != null_char_ && - ((prev->code == top_code_ && code == second_code_) || - (code == top_code_ && prev->code == second_code_))) { - prob += outputs[prev->code]; - } - float cert = NetworkIO::ProbToCertainty(prob) + cert_offset; - fprintf(stderr, "recodebeam continuecontext before pushdupornodawgifbetter unicharid,dup= %i %i \n", INVALID_UNICHAR_ID, 0); - PushDupOrNoDawgIfBetter(length + 1, false, code, INVALID_UNICHAR_ID, cert, worst_dict_cert, - dict_ratio, use_dawgs, NC_ONLY_DUP, prev, step); - } - } - } -} - -// Continues for a new unichar, using dawg or non-dawg as per flag. -void RecodeBeamSearch::ContinueUnichar(int code, int unichar_id, float cert, float worst_dict_cert, - float dict_ratio, bool use_dawgs, NodeContinuation cont, - const RecodeNode *prev, RecodeBeam *step) { - fprintf(stderr, "recodebeam ContinueUnichar unicharid,code,cont= %i %i %i \n", unichar_id, code, cont); // JDWDEBUG - if (use_dawgs) { - if (cert > worst_dict_cert) { - ContinueDawg(code, unichar_id, cert, cont, prev, step); - } - } else { - RecodeHeap *nodawg_heap = &step->beams_[BeamIndex(false, cont, 0)]; - fprintf(stderr, "recodebeam ContinueUnichar before pushheapifbetter unicharid,dup= %i %i \n", unichar_id, 0 ); // JDWDEBUG - PushHeapIfBetter(kBeamWidths[0], code, unichar_id, TOP_CHOICE_PERM, false, false, false, false, - cert * dict_ratio, prev, nullptr, nodawg_heap); - if (dict_ != nullptr && ((unichar_id == UNICHAR_SPACE && cert > worst_dict_cert) || - !dict_->getUnicharset().IsSpaceDelimited(unichar_id))) { - // Any top choice position that can start a new word, ie a space or - // any non-space-delimited character, should also be considered - // by the dawg search, so push initial dawg to the dawg heap. - float dawg_cert = cert; - PermuterType permuter = TOP_CHOICE_PERM; - // Since we use the space either side of a dictionary word in the - // certainty of the word, (to properly handle weak spaces) and the - // space is coming from a non-dict word, we need special conditions - // to avoid degrading the certainty of the dict word that follows. - // With a space we don't multiply the certainty by dict_ratio, and we - // flag the space with NO_PERM to indicate that we should not use the - // predecessor nulls to generate the confidence for the space, as they - // have already been multiplied by dict_ratio, and we can't go back to - // insert more entries in any previous heaps. - if (unichar_id == UNICHAR_SPACE) { - permuter = NO_PERM; - } else { - dawg_cert *= dict_ratio; - } - fprintf(stderr, "recodebeam ContinueUnichar before pushinitialdawgifbetter unicharid,dup= %i %i \n", unichar_id, 0 ); // JDWDEBUG - PushInitialDawgIfBetter(code, unichar_id, permuter, false, false, dawg_cert, cont, prev, - step); - } - } -} - -// Adds a RecodeNode composed of the tuple (code, unichar_id, cert, prev, -// appropriate-dawg-args, cert) to the given heap (dawg_beam_) if unichar_id -// is a valid continuation of whatever is in prev. -void RecodeBeamSearch::ContinueDawg(int code, int unichar_id, float cert, NodeContinuation cont, - const RecodeNode *prev, RecodeBeam *step) { - fprintf(stderr, "recodebeam ContinueDawg unicharid,code,cont= %i %i %i \n", unichar_id, code, cont); // JDWDEBUG - RecodeHeap *dawg_heap = &step->beams_[BeamIndex(true, cont, 0)]; - RecodeHeap *nodawg_heap = &step->beams_[BeamIndex(false, cont, 0)]; - if (unichar_id == INVALID_UNICHAR_ID) { - fprintf(stderr, "recodebeam ContinueDawg before pushheapifbetter unicharid,dup= %i %i \n", unichar_id, 0 ); // JDWDEBUG - PushHeapIfBetter(kBeamWidths[0], code, unichar_id, NO_PERM, false, false, false, false, cert, - prev, nullptr, dawg_heap); - return; - } - // Avoid dictionary probe if score a total loss. - float score = cert; - if (prev != nullptr) { - score += prev->score; - } - if (dawg_heap->size() >= kBeamWidths[0] && score <= dawg_heap->PeekTop().data().score && - nodawg_heap->size() >= kBeamWidths[0] && score <= nodawg_heap->PeekTop().data().score) { - return; - } - const RecodeNode *uni_prev = prev; - // Prev may be a partial code, null_char, or duplicate, so scan back to the - // last valid unichar_id. - while (uni_prev != nullptr && - (uni_prev->unichar_id == INVALID_UNICHAR_ID || uni_prev->duplicate)) { - uni_prev = uni_prev->prev; - } - if (unichar_id == UNICHAR_SPACE) { - if (uni_prev != nullptr && uni_prev->end_of_word) { - // Space is good. Push initial state, to the dawg beam and a regular - // space to the top choice beam. - fprintf(stderr, "recodebeam ContinueDawg before PushInitialDawgIfBetter unicharid= %i \n", unichar_id); // JDWDEBUG - PushInitialDawgIfBetter(code, unichar_id, uni_prev->permuter, false, false, cert, cont, prev, - step); - fprintf(stderr, "recodebeam ContinueDawg before pushheapifbetter unicharid,dup= %i %i \n", unichar_id, 0 ); // JDWDEBUG - PushHeapIfBetter(kBeamWidths[0], code, unichar_id, uni_prev->permuter, false, false, false, - false, cert, prev, nullptr, nodawg_heap); - } - return; - } else if (uni_prev != nullptr && uni_prev->start_of_dawg && - uni_prev->unichar_id != UNICHAR_SPACE && - dict_->getUnicharset().IsSpaceDelimited(uni_prev->unichar_id) && - dict_->getUnicharset().IsSpaceDelimited(unichar_id)) { - return; // Can't break words between space delimited chars. - } - DawgPositionVector initial_dawgs; - auto *updated_dawgs = new DawgPositionVector; - DawgArgs dawg_args(&initial_dawgs, updated_dawgs, NO_PERM); - bool word_start = false; - if (uni_prev == nullptr) { - // Starting from beginning of line. - dict_->default_dawgs(&initial_dawgs, false); - word_start = true; - } else if (uni_prev->dawgs != nullptr) { - // Continuing a previous dict word. - dawg_args.active_dawgs = uni_prev->dawgs; - word_start = uni_prev->start_of_dawg; - } else { - return; // Can't continue if not a dict word. - } - auto permuter = static_cast( - dict_->def_letter_is_okay(&dawg_args, dict_->getUnicharset(), unichar_id, false)); - if (permuter != NO_PERM) { - PushHeapIfBetter(kBeamWidths[0], code, unichar_id, permuter, false, word_start, - dawg_args.valid_end, false, cert, prev, dawg_args.updated_dawgs, dawg_heap); - if (dawg_args.valid_end && !space_delimited_) { - // We can start another word right away, so push initial state as well, - // to the dawg beam, and the regular character to the top choice beam, - // since non-dict words can start here too. - fprintf(stderr, "recodebeam ContinueDawg before PushInitialDawgIfBetter unicharid= %i \n", unichar_id); // JDWDEBUG - PushInitialDawgIfBetter(code, unichar_id, permuter, word_start, true, cert, cont, prev, step); - fprintf(stderr, "recodebeam ContinueDawg before pushheapifbetter unicharid,dup= %i %i \n", unichar_id, 0 ); // JDWDEBUG - PushHeapIfBetter(kBeamWidths[0], code, unichar_id, permuter, false, word_start, true, false, - cert, prev, nullptr, nodawg_heap); - } - } else { - delete updated_dawgs; - } -} - -// Adds a RecodeNode composed of the tuple (code, unichar_id, -// initial-dawg-state, prev, cert) to the given heap if/ there is room or if -// better than the current worst element if already full. -void RecodeBeamSearch::PushInitialDawgIfBetter(int code, int unichar_id, PermuterType permuter, - bool start, bool end, float cert, - NodeContinuation cont, const RecodeNode *prev, - RecodeBeam *step) { - fprintf(stderr, "recodebeam PushInitialDawgIfBetter unicharid,code= %i %i \n", unichar_id, code); // JDWDEBUG - RecodeNode *best_initial_dawg = &step->best_initial_dawgs_[cont]; - float score = cert; - if (prev != nullptr) { - score += prev->score; - } - if (best_initial_dawg->code < 0 || score > best_initial_dawg->score) { - auto *initial_dawgs = new DawgPositionVector; - dict_->default_dawgs(initial_dawgs, false); - fprintf(stderr, "recodebeam PushInitialDawgIfBetter adding new node unicharid,code= %i %i \n", unichar_id, code); // JDWDEBUG - RecodeNode node(code, unichar_id, permuter, true, start, end, false, cert, score, prev, - initial_dawgs, ComputeCodeHash(code, false, prev)); - *best_initial_dawg = node; - } -} - -// Adds a RecodeNode composed of the tuple (code, unichar_id, permuter, -// false, false, false, false, cert, prev, nullptr) to heap if there is room -// or if better than the current worst element if already full. -/* static */ -void RecodeBeamSearch::PushDupOrNoDawgIfBetter(int length, bool dup, int code, int unichar_id, - float cert, float worst_dict_cert, float dict_ratio, - bool use_dawgs, NodeContinuation cont, - const RecodeNode *prev, RecodeBeam *step) { - fprintf(stderr, "recodebeam PushDupOrNoDawgIfBetter %i \n", unichar_id); // JDWDEBUG - int index = BeamIndex(use_dawgs, cont, length); - if (use_dawgs) { - if (cert > worst_dict_cert) { - PushHeapIfBetter(kBeamWidths[length], code, unichar_id, prev ? prev->permuter : NO_PERM, - false, false, false, dup, cert, prev, nullptr, &step->beams_[index]); - } - } else { - cert *= dict_ratio; - if (cert >= kMinCertainty || code == null_char_) { - PushHeapIfBetter(kBeamWidths[length], code, unichar_id, - prev ? prev->permuter : TOP_CHOICE_PERM, false, false, false, dup, cert, - prev, nullptr, &step->beams_[index]); - } - } -} - -// Adds a RecodeNode composed of the tuple (code, unichar_id, permuter, -// dawg_start, word_start, end, dup, cert, prev, d) to heap if there is room -// or if better than the current worst element if already full. -void RecodeBeamSearch::PushHeapIfBetter(int max_size, int code, int unichar_id, - PermuterType permuter, bool dawg_start, bool word_start, - bool end, bool dup, float cert, const RecodeNode *prev, - DawgPositionVector *d, RecodeHeap *heap) { - fprintf(stderr, "recodebeam PushHeapIfBetter #1 %i \n", unichar_id); // JDWDEBUG - float score = cert; - if (prev != nullptr) { - score += prev->score; - } - if (heap->size() < max_size || score > heap->PeekTop().data().score) { - uint64_t hash = ComputeCodeHash(code, dup, prev); - RecodeNode node(code, unichar_id, permuter, dawg_start, word_start, end, dup, cert, score, prev, - d, hash); - if (UpdateHeapIfMatched(&node, heap)) { - return; - } - // JDWTODO START - if (!AddToHeapIsAllowed(&node)) { - return; - } - // JDWTODO END - fprintf(stderr, "recodebeam PushHeapIfBetter #1 adding node unicharid,code= %i %i \n", unichar_id, code); // JDWDEBUG - RecodePair entry(score, node); - heap->Push(&entry); - ASSERT_HOST(entry.data().dawgs == nullptr); - if (heap->size() > max_size) { - heap->Pop(&entry); - } - } else { - delete d; - } -} - -// Adds a RecodeNode to heap if there is room -// or if better than the current worst element if already full. -void RecodeBeamSearch::PushHeapIfBetter(int max_size, RecodeNode *node, RecodeHeap *heap) { - fprintf(stderr, "recodebeam PushHeapIfBetter #1 %i \n", node->unichar_id); // JDWDEBUG - if (heap->size() < max_size || node->score > heap->PeekTop().data().score) { - if (UpdateHeapIfMatched(node, heap)) { - return; - } - fprintf(stderr, "recodebeam PushHeapIfBetter #2 adding node unicharid,code= %i %i \n", node->unichar_id, node->code); // JDWDEBUG - RecodePair entry(node->score, *node); - heap->Push(&entry); - ASSERT_HOST(entry.data().dawgs == nullptr); - if (heap->size() > max_size) { - heap->Pop(&entry); - } - } -} - -// Searches the heap for a matching entry, and updates the score with -// reshuffle if needed. Returns true if there was a match. -bool RecodeBeamSearch::UpdateHeapIfMatched(RecodeNode *new_node, RecodeHeap *heap) { - // TODO(rays) consider hash map instead of linear search. - // It might not be faster because the hash map would have to be updated - // every time a heap reshuffle happens, and that would be a lot of overhead. - fprintf(stderr, "recodebeam UpdateHeapIfMatched %i \n", new_node->unichar_id); // JDWDEBUG - std::vector &nodes = heap->heap(); - for (auto &i : nodes) { - RecodeNode &node = i.data(); - if (node.code == new_node->code && node.code_hash == new_node->code_hash && - node.permuter == new_node->permuter && node.start_of_dawg == new_node->start_of_dawg) { - if (new_node->score > node.score) { - // The new one is better. Update the entire node in the heap and - // reshuffle. - fprintf(stderr, "recodebeam UpdateHeapIfMatched doing update unicharid,code= %i %i \n", new_node->unichar_id, new_node->code); // JDWDEBUG - node = *new_node; - i.key() = node.score; - heap->Reshuffle(&i); - } - return true; - } - } - return false; -} - -// JDWTODO START -bool RecodeBeamSearch::AddToHeapIsAllowed(RecodeNode *new_node) { - if (!in_double_whammy_) - return true; - const RecodeNode *prev_node = new_node->prev; - if (prev_node != nullptr && prev_node->code == first_whammy_ && new_node->code == second_whammy_) { - fprintf(stderr, "recodebeam AddToHeapIsAllowed second whammy not allowed unicharid,code= + prevcode %i %i %i \n", new_node->unichar_id, new_node->code, prev_node->code); - return false; - } - if (prev_node != nullptr && prev_node->code == second_whammy_ && new_node->code == first_whammy_) { - fprintf(stderr, "recodebeam AddToHeapIsAllowed first whammy not allowed unicharid,code= + prevcode %i %i %i \n", new_node->unichar_id, new_node->code, prev_node->code); - return false; - } -if (prev_node != nullptr){ - fprintf(stderr, "recodebeam AddToHeapIsAllowed allowed unicharid,code= + prevcode %i %i %i \n", new_node->unichar_id, new_node->code, prev_node->code); -} -else { - fprintf(stderr, "recodebeam AddToHeapIsAllowed allowed unicharid,code= + prevcode %i %i null \n", new_node->unichar_id, new_node->code); -} - return true; -} -// JDWTODO END - -// Computes and returns the code-hash for the given code and prev. -uint64_t RecodeBeamSearch::ComputeCodeHash(int code, bool dup, const RecodeNode *prev) const { - uint64_t hash = prev == nullptr ? 0 : prev->code_hash; - if (!dup && code != null_char_) { - int num_classes = recoder_.code_range(); - uint64_t carry = (((hash >> 32) * num_classes) >> 32); - hash *= num_classes; - hash += carry; - hash += code; - } - return hash; -} - -// Backtracks to extract the best path through the lattice that was built -// during Decode. On return the best_nodes vector essentially contains the set -// of code, score pairs that make the optimal path with the constraint that -// the recoder can decode the code sequence back to a sequence of unichar-ids. -void RecodeBeamSearch::ExtractBestPaths(std::vector *best_nodes, - std::vector *second_nodes) const { - // Scan both beams to extract the best and second best paths. - fprintf(stderr, "recodebeam extractbestpaths \n"); // JDWDEBUG - const RecodeNode *best_node = nullptr; - const RecodeNode *second_best_node = nullptr; - const RecodeBeam *last_beam = beam_[beam_size_ - 1]; - for (int c = 0; c < NC_COUNT; ++c) { - if (c == NC_ONLY_DUP) { - continue; - } - auto cont = static_cast(c); - for (int is_dawg = 0; is_dawg < 2; ++is_dawg) { - int beam_index = BeamIndex(is_dawg, cont, 0); - int heap_size = last_beam->beams_[beam_index].size(); - for (int h = 0; h < heap_size; ++h) { - const RecodeNode *node = &last_beam->beams_[beam_index].get(h).data(); - if (is_dawg) { - // dawg_node may be a null_char, or duplicate, so scan back to the - // last valid unichar_id. - const RecodeNode *dawg_node = node; - while (dawg_node != nullptr && - (dawg_node->unichar_id == INVALID_UNICHAR_ID || dawg_node->duplicate)) { - dawg_node = dawg_node->prev; - } - if (dawg_node == nullptr || - (!dawg_node->end_of_word && dawg_node->unichar_id != UNICHAR_SPACE)) { - // Dawg node is not valid. - continue; - } - } - if (best_node == nullptr || node->score > best_node->score) { - second_best_node = best_node; - best_node = node; - fprintf(stderr, "recodebeam extractbestpaths bestnodebeam= %i \n", beam_index); // JDWDEBUG - } else if (second_best_node == nullptr || node->score > second_best_node->score) { - second_best_node = node; - fprintf(stderr, "recodebeam extractbestpaths secondbestnodebeam= %i \n", beam_index); // JDWDEBUG - } - } - } - } - if (second_nodes != nullptr) { - fprintf(stderr, "recodebeam extractbestpaths extract second best \n"); // JDWDEBUG - ExtractPath(second_best_node, second_nodes); - } - fprintf(stderr, "recodebeam extractbestpaths extract best \n"); // JDWDEBUG - ExtractPath(best_node, best_nodes); -} - -// Helper backtracks through the lattice from the given node, storing the -// path and reversing it. -void RecodeBeamSearch::ExtractPath(const RecodeNode *node, - std::vector *path) const { - path->clear(); - while (node != nullptr) { - fprintf(stderr, "recodebeam extractpath unicharid,code,cert,score= %i %i %f %f %i \n", node->unichar_id, node->code, node->certainty, node->score, node->duplicate); // JDWDEBUG - path->push_back(node); - node = node->prev; - } - std::reverse(path->begin(), path->end()); -} - -void RecodeBeamSearch::ExtractPath(const RecodeNode *node, std::vector *path, - int limiter) const { - int pathcounter = 0; - path->clear(); - while (node != nullptr && pathcounter < limiter) { - path->push_back(node); - node = node->prev; - ++pathcounter; - } - std::reverse(path->begin(), path->end()); -} - -// Helper prints debug information on the given lattice path. -void RecodeBeamSearch::DebugPath(const UNICHARSET *unicharset, - const std::vector &path) const { - for (int c = 0; c < path.size(); ++c) { - const RecodeNode &node = *path[c]; - tprintf("%d ", c); - node.Print(null_char_, *unicharset, 1); - } -} - -// Helper prints debug information on the given unichar path. -void RecodeBeamSearch::DebugUnicharPath(const UNICHARSET *unicharset, - const std::vector &path, - const std::vector &unichar_ids, - const std::vector &certs, - const std::vector &ratings, - const std::vector &xcoords) const { - int num_ids = unichar_ids.size(); - double total_rating = 0.0; - for (int c = 0; c < num_ids; ++c) { - int coord = xcoords[c]; - tprintf("%d %d=%s r=%g, c=%g, s=%d, e=%d, perm=%d\n", coord, unichar_ids[c], - unicharset->debug_str(unichar_ids[c]).c_str(), ratings[c], certs[c], - path[coord]->start_of_word, path[coord]->end_of_word, path[coord]->permuter); - total_rating += ratings[c]; - } - tprintf("Path total rating = %g\n", total_rating); -} - -} // namespace tesseract. diff --git a/src/lstm/recodebeam.cpp b/src/lstm/recodebeam.cpp index 96e5fbb276..40976bafc2 100644 --- a/src/lstm/recodebeam.cpp +++ b/src/lstm/recodebeam.cpp @@ -662,20 +662,46 @@ void RecodeBeamSearch::ComputeTopN(const float *outputs, int num_outputs, int to } } } + float top_key = 0.0F; + float second_key = 0.0F; + bool found_first_code = false; + bool found_second_code = false; while (!top_heap_.empty()) { TopPair entry; top_heap_.Pop(&entry); + if (in_possible_diplopia_ && entry.data() == first_diplopia_code_) + found_first_code = true; + if (in_possible_diplopia_ && entry.data() == second_diplopia_code_) + found_second_code = true; if (top_heap_.size() > 1) { top_n_flags_[entry.data()] = TN_TOPN; } else { top_n_flags_[entry.data()] = TN_TOP2; if (top_heap_.empty()) { top_code_ = entry.data(); + top_key = entry.key(); } else { second_code_ = entry.data(); + second_key = entry.key(); } } } + // need to identify if we are in a potential diplopia situation + // or if we already are, then determine if it is ended + if (in_possible_diplopia_) { + if (!found_first_code && !found_second_code){ + in_possible_diplopia_ = false; + first_diplopia_code_ = -1; + second_diplopia_code_ = -1; + } + } + if (!in_possible_diplopia_) { + if (top_code_ != null_char_ && second_code_ != null_char_ && top_key > 0.25F && second_key > 0.25F){ + in_possible_diplopia_ = true; + first_diplopia_code_ = top_code_; + second_diplopia_code_ = second_code_; + } + } top_n_flags_[null_char_] = TN_TOP2; } @@ -1143,6 +1169,10 @@ void RecodeBeamSearch::PushHeapIfBetter(int max_size, int code, int unichar_id, if (UpdateHeapIfMatched(&node, heap)) { return; } + // check to see if node is possible diplopia + if (!AddToHeapIsAllowed(&node)) { + return; + } RecodePair entry(score, node); heap->Push(&entry); ASSERT_HOST(entry.data().dawgs == nullptr); @@ -1194,6 +1224,20 @@ bool RecodeBeamSearch::UpdateHeapIfMatched(RecodeNode *new_node, RecodeHeap *hea return false; } +// Determines if node can be added to heap based on possible diplopia status +bool RecodeBeamSearch::AddToHeapIsAllowed(RecodeNode *new_node) { + if (!in_possible_diplopia_) + return true; + const RecodeNode *prev_node = new_node->prev; + if (prev_node != nullptr && prev_node->code == first_diplopia_code_ && new_node->code == second_diplopia_code_) { + return false; + } + if (prev_node != nullptr && prev_node->code == second_diplopia_code_ && new_node->code == first_diplopia_code_) { + return false; + } + return true; +} + // Computes and returns the code-hash for the given code and prev. uint64_t RecodeBeamSearch::ComputeCodeHash(int code, bool dup, const RecodeNode *prev) const { uint64_t hash = prev == nullptr ? 0 : prev->code_hash; From ae85bc4aab2b9df86da332014af6641994ffef1c Mon Sep 17 00:00:00 2001 From: woodjohndavid <57116722+woodjohndavid@users.noreply.github.com> Date: Sun, 27 Jun 2021 10:19:27 -0700 Subject: [PATCH 4/8] diplopia interim solution final --- src/lstm/recodebeam.cpp | 7 ++++++- src/lstm/recodebeam.h | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/lstm/recodebeam.cpp b/src/lstm/recodebeam.cpp index 40976bafc2..45e5efd5cd 100644 --- a/src/lstm/recodebeam.cpp +++ b/src/lstm/recodebeam.cpp @@ -40,6 +40,11 @@ const int RecodeBeamSearch::kBeamWidths[RecodedCharID::kMaxCodeLen + 1] = { static const char *kNodeContNames[] = {"Anything", "OnlyDup", "NoDup"}; +// the minimum diplopia key is the minimum score (key) from +// the network output to qualify as a likely 'real' character +// for the purposes of identifying possible diplopia +static const float kMinDiplopiaKey = 0.25; + // Prints debug details of the node. void RecodeNode::Print(int null_char, const UNICHARSET &unicharset, int depth) const { if (code == null_char) { @@ -696,7 +701,7 @@ void RecodeBeamSearch::ComputeTopN(const float *outputs, int num_outputs, int to } } if (!in_possible_diplopia_) { - if (top_code_ != null_char_ && second_code_ != null_char_ && top_key > 0.25F && second_key > 0.25F){ + if (top_code_ != null_char_ && second_code_ != null_char_ && top_key > kMinDiplopiaKey && second_key > kMinDiplopiaKey){ in_possible_diplopia_ = true; first_diplopia_code_ = top_code_; second_diplopia_code_ = second_code_; diff --git a/src/lstm/recodebeam.h b/src/lstm/recodebeam.h index 78ff911bd4..14076ea772 100644 --- a/src/lstm/recodebeam.h +++ b/src/lstm/recodebeam.h @@ -429,7 +429,7 @@ class TESS_API RecodeBeamSearch { // The encoded (class label) of the null/reject character. int null_char_; // Variables used in tracking possible diplopia case - // Refer to ComputeTopN routine for more information + // Refer to ComputeTopN routine for use of these variables bool in_possible_diplopia_; int first_diplopia_code_; int second_diplopia_code_; From a8af23f0f0a350ce3eff31eeab40aed5f7320ac9 Mon Sep 17 00:00:00 2001 From: woodjohndavid <57116722+woodjohndavid@users.noreply.github.com> Date: Tue, 29 Jun 2021 13:51:04 -0700 Subject: [PATCH 5/8] Style Changes Per stweil comments --- src/lstm/recodebeam.cpp | 9 ++++++--- src/lstm/recodebeam.h | 4 ++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/lstm/recodebeam.cpp b/src/lstm/recodebeam.cpp index 45e5efd5cd..8667841047 100644 --- a/src/lstm/recodebeam.cpp +++ b/src/lstm/recodebeam.cpp @@ -674,10 +674,12 @@ void RecodeBeamSearch::ComputeTopN(const float *outputs, int num_outputs, int to while (!top_heap_.empty()) { TopPair entry; top_heap_.Pop(&entry); - if (in_possible_diplopia_ && entry.data() == first_diplopia_code_) + if (in_possible_diplopia_ && entry.data() == first_diplopia_code_) { found_first_code = true; - if (in_possible_diplopia_ && entry.data() == second_diplopia_code_) + } + if (in_possible_diplopia_ && entry.data() == second_diplopia_code_) { found_second_code = true; + } if (top_heap_.size() > 1) { top_n_flags_[entry.data()] = TN_TOPN; } else { @@ -1231,8 +1233,9 @@ bool RecodeBeamSearch::UpdateHeapIfMatched(RecodeNode *new_node, RecodeHeap *hea // Determines if node can be added to heap based on possible diplopia status bool RecodeBeamSearch::AddToHeapIsAllowed(RecodeNode *new_node) { - if (!in_possible_diplopia_) + if (!in_possible_diplopia_) { return true; + } const RecodeNode *prev_node = new_node->prev; if (prev_node != nullptr && prev_node->code == first_diplopia_code_ && new_node->code == second_diplopia_code_) { return false; diff --git a/src/lstm/recodebeam.h b/src/lstm/recodebeam.h index 14076ea772..ed1daf21ca 100644 --- a/src/lstm/recodebeam.h +++ b/src/lstm/recodebeam.h @@ -426,13 +426,13 @@ class TESS_API RecodeBeamSearch { // True if the input is simple text, ie adjacent equal chars are not to be // eliminated. bool is_simple_text_; - // The encoded (class label) of the null/reject character. - int null_char_; // Variables used in tracking possible diplopia case // Refer to ComputeTopN routine for use of these variables bool in_possible_diplopia_; int first_diplopia_code_; int second_diplopia_code_; + // The encoded (class label) of the null/reject character. + int null_char_; }; } // namespace tesseract. From bf6d32ef847b6ba96d7f345173f4853775fd5ab7 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Sat, 22 Jan 2022 13:06:06 +0100 Subject: [PATCH 6/8] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Suggested-by: Robert Pösel Signed-off-by: Stefan Weil Co-authored-by: Robert Pösel --- src/lstm/recodebeam.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/lstm/recodebeam.cpp b/src/lstm/recodebeam.cpp index 8667841047..bc0e5e7a69 100644 --- a/src/lstm/recodebeam.cpp +++ b/src/lstm/recodebeam.cpp @@ -43,7 +43,7 @@ static const char *kNodeContNames[] = {"Anything", "OnlyDup", "NoDup"}; // the minimum diplopia key is the minimum score (key) from // the network output to qualify as a likely 'real' character // for the purposes of identifying possible diplopia -static const float kMinDiplopiaKey = 0.25; +static const float kMinDiplopiaKey = 0.25f; // Prints debug details of the node. void RecodeNode::Print(int null_char, const UNICHARSET &unicharset, int depth) const { @@ -190,7 +190,7 @@ RecodeBeamSearch::combineSegmentedTimesteps( void RecodeBeamSearch::calculateCharBoundaries(std::vector *starts, std::vector *ends, std::vector *char_bounds_, int maxWidth) { - char_bounds_->push_back((*starts)[0]); + char_bounds_->push_back((*starts)[0]); for (int i = 0; i < ends->size(); ++i) { int middle = ((*starts)[i + 1] - (*ends)[i]) / 2; char_bounds_->push_back((*ends)[i] + middle); @@ -667,8 +667,8 @@ void RecodeBeamSearch::ComputeTopN(const float *outputs, int num_outputs, int to } } } - float top_key = 0.0F; - float second_key = 0.0F; + float top_key = 0.0f; + float second_key = 0.0f; bool found_first_code = false; bool found_second_code = false; while (!top_heap_.empty()) { @@ -696,14 +696,14 @@ void RecodeBeamSearch::ComputeTopN(const float *outputs, int num_outputs, int to // need to identify if we are in a potential diplopia situation // or if we already are, then determine if it is ended if (in_possible_diplopia_) { - if (!found_first_code && !found_second_code){ + if (!found_first_code && !found_second_code) { in_possible_diplopia_ = false; first_diplopia_code_ = -1; second_diplopia_code_ = -1; } } if (!in_possible_diplopia_) { - if (top_code_ != null_char_ && second_code_ != null_char_ && top_key > kMinDiplopiaKey && second_key > kMinDiplopiaKey){ + if (top_code_ != null_char_ && second_code_ != null_char_ && top_key > kMinDiplopiaKey && second_key > kMinDiplopiaKey) { in_possible_diplopia_ = true; first_diplopia_code_ = top_code_; second_diplopia_code_ = second_code_; From 5b445e103c0d2fa0718a7487fc486846deb59d9e Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Sat, 22 Jan 2022 13:42:57 +0100 Subject: [PATCH 7/8] Apply suggestions from code review Signed-off-by: Stefan Weil --- src/lstm/recodebeam.cpp | 12 ++++++------ src/lstm/recodebeam.h | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/lstm/recodebeam.cpp b/src/lstm/recodebeam.cpp index bc0e5e7a69..13c2962ff1 100644 --- a/src/lstm/recodebeam.cpp +++ b/src/lstm/recodebeam.cpp @@ -40,9 +40,9 @@ const int RecodeBeamSearch::kBeamWidths[RecodedCharID::kMaxCodeLen + 1] = { static const char *kNodeContNames[] = {"Anything", "OnlyDup", "NoDup"}; -// the minimum diplopia key is the minimum score (key) from +// The minimum diplopia key is the minimum score (key) from // the network output to qualify as a likely 'real' character -// for the purposes of identifying possible diplopia +// for the purposes of identifying possible diplopia. static const float kMinDiplopiaKey = 0.25f; // Prints debug details of the node. @@ -693,8 +693,8 @@ void RecodeBeamSearch::ComputeTopN(const float *outputs, int num_outputs, int to } } } - // need to identify if we are in a potential diplopia situation - // or if we already are, then determine if it is ended + // Need to identify if we are in a potential diplopia situation + // or if we already are, then determine if it is ended. if (in_possible_diplopia_) { if (!found_first_code && !found_second_code) { in_possible_diplopia_ = false; @@ -1176,7 +1176,7 @@ void RecodeBeamSearch::PushHeapIfBetter(int max_size, int code, int unichar_id, if (UpdateHeapIfMatched(&node, heap)) { return; } - // check to see if node is possible diplopia + // Check to see if node is possible diplopia. if (!AddToHeapIsAllowed(&node)) { return; } @@ -1231,7 +1231,7 @@ bool RecodeBeamSearch::UpdateHeapIfMatched(RecodeNode *new_node, RecodeHeap *hea return false; } -// Determines if node can be added to heap based on possible diplopia status +// Determines if node can be added to heap based on possible diplopia status. bool RecodeBeamSearch::AddToHeapIsAllowed(RecodeNode *new_node) { if (!in_possible_diplopia_) { return true; diff --git a/src/lstm/recodebeam.h b/src/lstm/recodebeam.h index ed1daf21ca..2dfb18a74b 100644 --- a/src/lstm/recodebeam.h +++ b/src/lstm/recodebeam.h @@ -378,7 +378,7 @@ class TESS_API RecodeBeamSearch { // with reshuffle if needed. Returns true if there was a match. bool UpdateHeapIfMatched(RecodeNode *new_node, RecodeHeap *heap); // Determines if new node can be added to the heap for the current beam. - // Returns false if we are in possible diplopia situation + // Returns false if we are in possible diplopia situation. bool AddToHeapIsAllowed(RecodeNode *new_node); // Computes and returns the code-hash for the given code and prev. uint64_t ComputeCodeHash(int code, bool dup, const RecodeNode *prev) const; @@ -426,8 +426,8 @@ class TESS_API RecodeBeamSearch { // True if the input is simple text, ie adjacent equal chars are not to be // eliminated. bool is_simple_text_; - // Variables used in tracking possible diplopia case - // Refer to ComputeTopN routine for use of these variables + // Variables used in tracking possible diplopia case. + // Refer to ComputeTopN routine for use of these variables. bool in_possible_diplopia_; int first_diplopia_code_; int second_diplopia_code_; From f8f7a3f6862c95de7a56deeac0e533f0023f43cc Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Sat, 22 Jan 2022 13:45:04 +0100 Subject: [PATCH 8/8] Apply suggestions from code review Signed-off-by: Stefan Weil --- src/lstm/recodebeam.cpp | 3 --- src/lstm/recodebeam.h | 6 +++--- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/lstm/recodebeam.cpp b/src/lstm/recodebeam.cpp index 13c2962ff1..4413181a8d 100644 --- a/src/lstm/recodebeam.cpp +++ b/src/lstm/recodebeam.cpp @@ -70,9 +70,6 @@ RecodeBeamSearch::RecodeBeamSearch(const UnicharCompress &recoder, int null_char , beam_size_(0) , top_code_(-1) , second_code_(-1) - , in_possible_diplopia_(false) - , first_diplopia_code_(-1) - , second_diplopia_code_(-1) , dict_(dict) , space_delimited_(true) , is_simple_text_(simple_text) diff --git a/src/lstm/recodebeam.h b/src/lstm/recodebeam.h index 2dfb18a74b..8ae8e4f52e 100644 --- a/src/lstm/recodebeam.h +++ b/src/lstm/recodebeam.h @@ -428,9 +428,9 @@ class TESS_API RecodeBeamSearch { bool is_simple_text_; // Variables used in tracking possible diplopia case. // Refer to ComputeTopN routine for use of these variables. - bool in_possible_diplopia_; - int first_diplopia_code_; - int second_diplopia_code_; + bool in_possible_diplopia_ = false; + int first_diplopia_code_ = -1; + int second_diplopia_code_ = -1; // The encoded (class label) of the null/reject character. int null_char_; };