Skip to content

Commit

Permalink
Fix possible prediction duplicate (#74)
Browse files Browse the repository at this point in the history
When searching for predictions in the pinyin dictionary, we did not check
for duplicates (nor compare against the model's predictions).

Fix #70
  • Loading branch information
wengxt authored May 21, 2024
1 parent 1118bfd commit 01ac0e0
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 6 deletions.
27 changes: 22 additions & 5 deletions src/libime/pinyin/pinyinprediction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,20 @@
* SPDX-License-Identifier: LGPL-2.1-or-later
*/
#include "pinyinprediction.h"
#include "libime/core/languagemodel.h"
#include "libime/core/prediction.h"
#include "libime/pinyin/pinyindictionary.h"
#include <algorithm>
#include <cstddef>
#include <fcitx-utils/macros.h>
#include <fcitx-utils/misc.h>
#include <fcitx-utils/stringutils.h>
#include <iterator>
#include <memory>
#include <string>
#include <string_view>
#include <tuple>
#include <unordered_set>
#include <utility>
#include <vector>

Expand Down Expand Up @@ -69,16 +74,18 @@ PinyinPrediction::predict(const State &state,
});
std::make_heap(intermedidateResult.begin(), intermedidateResult.end(), cmp);

State prevState = model()->nullState(), outState;
State prevState = model()->nullState();
State outState;
std::vector<WordNode> nodes;
if (sentence.size() >= 1) {
std::unordered_set<std::string> dup;
if (!sentence.empty()) {
nodes.reserve(sentence.size());
for (const auto &word : fcitx::MakeIterRange(
sentence.begin(), std::prev(sentence.end()))) {
auto idx = model()->index(word);
nodes.emplace_back(word, idx);
model()->score(prevState, nodes.back(), outState);
prevState = std::move(outState);
prevState = outState;
}
// We record the last score for the sentence word to adjust the partial
// score. E.g. for 无, model may contain 压力 and dict contain 聊 score
Expand All @@ -88,26 +95,36 @@ PinyinPrediction::predict(const State &state,
float adjust = model()->score(prevState, nodes.back(), outState);
for (auto &result : intermedidateResult) {
std::get<float>(result) += adjust;
dup.insert(std::get<std::string>(result));
}
}

d->dict_->matchWordsPrefix(
lastEncodedPinyin.data(), lastEncodedPinyin.size(),
[this, &sentence, &prevState, &cmp, &intermedidateResult,
[this, &sentence, &prevState, &cmp, &intermedidateResult, &dup,
maxSize](std::string_view, std::string_view hz, float cost) {
if (sentence.back().size() < hz.size() &&
fcitx::stringutils::startsWith(hz, sentence.back())) {

std::string newWord(hz.substr(sentence.back().size()));
if (dup.count(newWord)) {
return true;
}

std::tuple<std::string, float, PinyinPredictionSource> newItem{
std::string(hz.substr(sentence.back().size())),
std::move(newWord),
cost + model()->singleWordScore(prevState, hz),
PinyinPredictionSource::Dictionary};

dup.insert(std::get<std::string>(newItem));
intermedidateResult.push_back(std::move(newItem));
std::push_heap(intermedidateResult.begin(),
intermedidateResult.end(), cmp);
while (intermedidateResult.size() > maxSize) {
std::pop_heap(intermedidateResult.begin(),
intermedidateResult.end(), cmp);
dup.erase(
std::get<std::string>(intermedidateResult.back()));
intermedidateResult.pop_back();
}
}
Expand Down
10 changes: 9 additions & 1 deletion src/libime/pinyin/pinyinprediction.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,17 @@
#ifndef _FCITX_LIBIME_PINYIN_PREDICTION_H_
#define _FCITX_LIBIME_PINYIN_PREDICTION_H_

#include "libime/pinyin/pinyindictionary.h"
#include "libimepinyin_export.h"
#include <cstddef>
#include <fcitx-utils/macros.h>
#include <libime/core/languagemodel.h>
#include <libime/core/prediction.h>
#include <libime/pinyin/pinyindictionary.h>
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

namespace libime {

Expand Down
15 changes: 15 additions & 0 deletions test/testpinyinprediction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@
#include "libime/pinyin/pinyinencoder.h"
#include "libime/pinyin/pinyinprediction.h"
#include "testdir.h"
#include <algorithm>
#include <fcitx-utils/log.h>
#include <string>

using namespace libime;

Expand Down Expand Up @@ -45,5 +47,18 @@ int main() {
auto noPyResult = prediction.predict({"", "喜欢", "中国"}, 20);
FCITX_ASSERT(result.size() > noPyResult.size())
<< result << " " << noPyResult;

// Check that a word which exists in multiple sub-dictionaries does not
// generate duplicate results.
py = PinyinEncoder::encodeFullPinyin("guan'xi");
FCITX_ASSERT(
dict.lookupWord(PinyinDictionary::SystemDict, "guan'xi'ren", "关系人"));
dict.addWord(PinyinDictionary::UserDict, "guan'xi'ren", "关系人");
result = prediction.predict(model.nullState(), {"关系"},
{py.data(), py.size()}, 49);
FCITX_ASSERT(
std::count_if(result.begin(), result.end(), [](const auto &item) {
return std::get<std::string>(item) == "";
}) == 1);
return 0;
}

0 comments on commit 01ac0e0

Please sign in to comment.