Skip to content

Commit

Permalink
Merge branch 'google:master' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
shitamo authored Jun 3, 2024
2 parents d0507e5 + 864c543 commit afaeb37
Show file tree
Hide file tree
Showing 23 changed files with 138 additions and 118 deletions.
101 changes: 53 additions & 48 deletions src/base/util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -791,27 +791,29 @@ bool Util::IsEnglishTransliteration(absl::string_view value) {
// script type
// TODO(yukawa, team): Make a mechanism to keep this classifier up-to-date
// based on the original data from Unicode.org.
Util::ScriptType Util::GetScriptType(char32_t w) {
if (INRANGE(w, 0x0030, 0x0039) || // ascii number
INRANGE(w, 0xFF10, 0xFF19)) { // full width number
Util::ScriptType Util::GetScriptType(char32_t codepoint) {
if (INRANGE(codepoint, 0x0030, 0x0039) || // ascii number
INRANGE(codepoint, 0xFF10, 0xFF19)) { // full width number
return NUMBER;
} else if (INRANGE(w, 0x0041, 0x005A) || // ascii upper
INRANGE(w, 0x0061, 0x007A) || // ascii lower
INRANGE(w, 0xFF21, 0xFF3A) || // fullwidth ascii upper
INRANGE(w, 0xFF41, 0xFF5A)) { // fullwidth ascii lower
} else if (INRANGE(codepoint, 0x0041, 0x005A) || // ascii upper
INRANGE(codepoint, 0x0061, 0x007A) || // ascii lower
INRANGE(codepoint, 0xFF21, 0xFF3A) || // fullwidth ascii upper
INRANGE(codepoint, 0xFF41, 0xFF5A)) { // fullwidth ascii lower
return ALPHABET;
} else if (w == 0x3005 || // IDEOGRAPHIC ITERATION MARK "々"
INRANGE(w, 0x3400,
} else if (codepoint == 0x3005 || // IDEOGRAPHIC ITERATION MARK "々"
INRANGE(codepoint, 0x3400,
0x4DBF) || // CJK Unified Ideographs Extension A
INRANGE(w, 0x4E00, 0x9FFF) || // CJK Unified Ideographs
INRANGE(w, 0xF900, 0xFAFF) || // CJK Compatibility Ideographs
INRANGE(w, 0x20000,
INRANGE(codepoint, 0x4E00, 0x9FFF) || // CJK Unified Ideographs
INRANGE(codepoint, 0xF900,
0xFAFF) || // CJK Compatibility Ideographs
INRANGE(codepoint, 0x20000,
0x2A6DF) || // CJK Unified Ideographs Extension B
INRANGE(w, 0x2A700,
INRANGE(codepoint, 0x2A700,
0x2B73F) || // CJK Unified Ideographs Extension C
INRANGE(w, 0x2B740,
INRANGE(codepoint, 0x2B740,
0x2B81F) || // CJK Unified Ideographs Extension D
INRANGE(w, 0x2F800, 0x2FA1F)) { // CJK Compatibility Ideographs
INRANGE(codepoint, 0x2F800,
0x2FA1F)) { // CJK Compatibility Ideographs
// As of Unicode 6.0.2, each block has the following characters assigned.
// [U+3400, U+4DB5]: CJK Unified Ideographs Extension A
// [U+4E00, U+9FCB]: CJK Unified Ideographs
Expand All @@ -821,51 +823,53 @@ Util::ScriptType Util::GetScriptType(char32_t w) {
// [U+2B740, U+2B81D]: CJK Unified Ideographs Extension D
// [U+2F800, U+2FA1D]: CJK Compatibility Ideographs
return KANJI;
} else if (INRANGE(w, 0x3041, 0x309F) || // hiragana
w == 0x1B001) { // HIRAGANA LETTER ARCHAIC YE
} else if (INRANGE(codepoint, 0x3041, 0x309F) || // hiragana
codepoint == 0x1B001) { // HIRAGANA LETTER ARCHAIC YE
return HIRAGANA;
} else if (INRANGE(w, 0x30A1, 0x30FF) || // full width katakana
INRANGE(w, 0x31F0,
} else if (INRANGE(codepoint, 0x30A1, 0x30FF) || // full width katakana
INRANGE(codepoint, 0x31F0,
0x31FF) || // Katakana Phonetic Extensions for Ainu
INRANGE(w, 0xFF65, 0xFF9F) || // half width katakana
w == 0x1B000) { // KATAKANA LETTER ARCHAIC E
INRANGE(codepoint, 0xFF65, 0xFF9F) || // half width katakana
codepoint == 0x1B000) { // KATAKANA LETTER ARCHAIC E
return KATAKANA;
} else if (INRANGE(w, 0x02300, 0x023F3) || // Miscellaneous Technical
INRANGE(w, 0x02700, 0x027BF) || // Dingbats
INRANGE(w, 0x1F000, 0x1F02F) || // Mahjong tiles
INRANGE(w, 0x1F030, 0x1F09F) || // Domino tiles
INRANGE(w, 0x1F0A0, 0x1F0FF) || // Playing cards
INRANGE(w, 0x1F100,
} else if (INRANGE(codepoint, 0x02300, 0x023F3) || // Miscellaneous Technical
INRANGE(codepoint, 0x02700, 0x027BF) || // Dingbats
INRANGE(codepoint, 0x1F000, 0x1F02F) || // Mahjong tiles
INRANGE(codepoint, 0x1F030, 0x1F09F) || // Domino tiles
INRANGE(codepoint, 0x1F0A0, 0x1F0FF) || // Playing cards
INRANGE(codepoint, 0x1F100,
0x1F2FF) || // Enclosed Alphanumeric Supplement
INRANGE(w, 0x1F200, 0x1F2FF) || // Enclosed Ideographic Supplement
INRANGE(w, 0x1F300,
INRANGE(codepoint, 0x1F200,
0x1F2FF) || // Enclosed Ideographic Supplement
INRANGE(codepoint, 0x1F300,
0x1F5FF) || // Miscellaneous Symbols And Pictographs
INRANGE(w, 0x1F600, 0x1F64F) || // Emoticons
INRANGE(w, 0x1F680, 0x1F6FF) || // Transport And Map Symbols
INRANGE(w, 0x1F700, 0x1F77F) || // Alchemical Symbols
w == 0x26CE) { // Ophiuchus
INRANGE(codepoint, 0x1F600, 0x1F64F) || // Emoticons
INRANGE(codepoint, 0x1F680,
0x1F6FF) || // Transport And Map Symbols
INRANGE(codepoint, 0x1F700, 0x1F77F) || // Alchemical Symbols
codepoint == 0x26CE) { // Ophiuchus
return EMOJI;
}

return UNKNOWN_SCRIPT;
}

Util::FormType Util::GetFormType(char32_t w) {
Util::FormType Util::GetFormType(char32_t codepoint) {
// 'Unicode Standard Annex #11: EAST ASIAN WIDTH'
// http://www.unicode.org/reports/tr11/

// Characters marked as 'Na' in
// http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
if (INRANGE(w, 0x0020, 0x007F) || // ascii
INRANGE(w, 0x27E6, 0x27ED) || // narrow mathematical symbols
INRANGE(w, 0x2985, 0x2986)) { // narrow white parentheses
if (INRANGE(codepoint, 0x0020, 0x007F) || // ascii
INRANGE(codepoint, 0x27E6, 0x27ED) || // narrow mathematical symbols
INRANGE(codepoint, 0x2985, 0x2986)) { // narrow white parentheses
return HALF_WIDTH;
}

// Other characters marked as 'Na' in
// http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
if (INRANGE(w, 0x00A2, 0x00AF)) {
switch (w) {
if (INRANGE(codepoint, 0x00A2, 0x00AF)) {
switch (codepoint) {
case 0x00A2: // CENT SIGN
case 0x00A3: // POUND SIGN
case 0x00A5: // YEN SIGN
Expand All @@ -878,13 +882,13 @@ Util::FormType Util::GetFormType(char32_t w) {

// Characters marked as 'H' in
// http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
if (w == 0x20A9 || // WON SIGN
INRANGE(w, 0xFF61, 0xFF9F) || // half-width katakana
INRANGE(w, 0xFFA0, 0xFFBE) || // half-width hangul
INRANGE(w, 0xFFC2, 0xFFCF) || // half-width hangul
INRANGE(w, 0xFFD2, 0xFFD7) || // half-width hangul
INRANGE(w, 0xFFDA, 0xFFDC) || // half-width hangul
INRANGE(w, 0xFFE8, 0xFFEE)) { // half-width symbols
if (codepoint == 0x20A9 || // WON SIGN
INRANGE(codepoint, 0xFF61, 0xFF9F) || // half-width katakana
INRANGE(codepoint, 0xFFA0, 0xFFBE) || // half-width hangul
INRANGE(codepoint, 0xFFC2, 0xFFCF) || // half-width hangul
INRANGE(codepoint, 0xFFD2, 0xFFD7) || // half-width hangul
INRANGE(codepoint, 0xFFDA, 0xFFDC) || // half-width hangul
INRANGE(codepoint, 0xFFE8, 0xFFEE)) { // half-width symbols
return HALF_WIDTH;
}

Expand Down Expand Up @@ -970,9 +974,10 @@ Util::ScriptType Util::GetScriptTypeWithoutSymbols(absl::string_view str) {
// return true if all script_type in str is "type"
bool Util::IsScriptType(absl::string_view str, Util::ScriptType type) {
for (ConstChar32Iterator iter(str); !iter.Done(); iter.Next()) {
const char32_t w = iter.Get();
const char32_t codepoint = iter.Get();
// Exception: U+30FC (KATAKANA-HIRAGANA PROLONGED SOUND MARK) is categorized
// as HIRAGANA as well.
if (type != GetScriptType(w) && (w != 0x30FC || type != HIRAGANA)) {
if (type != GetScriptType(codepoint) &&
(codepoint != 0x30FC || type != HIRAGANA)) {
return false;
}
}
Expand Down
18 changes: 9 additions & 9 deletions src/base/util.h
Original file line number Diff line number Diff line change
Expand Up @@ -188,8 +188,8 @@ class Util {
SCRIPT_TYPE_SIZE,
};

// return script type of w
static ScriptType GetScriptType(char32_t w);
// Returns the script type of `codepoint`.
static ScriptType GetScriptType(char32_t codepoint);

// Returns the script type of the first character in `str`.
// This function finds the first UTF-8 character and returns its script type.
Expand All @@ -198,7 +198,7 @@ class Util {
static ScriptType GetFirstScriptType(absl::string_view str,
size_t *mblen = nullptr);

// return script type of string. all chars in str must be
// Returns the script type of a string. All chars in str must be
// KATAKANA/HIRAGANA/KANJI/NUMBER or ALPHABET.
// If str has mixed scripts, this function returns UNKNOWN_SCRIPT
static ScriptType GetScriptType(absl::string_view str);
Expand All @@ -207,10 +207,10 @@ class Util {
// in the |str|.
static ScriptType GetScriptTypeWithoutSymbols(absl::string_view str);

// return true if all script_type in str is "type"
// Returns true if every character in `str` has the script type `type`.
static bool IsScriptType(absl::string_view str, ScriptType type);

// return true if the string contains script_type char
// Returns true if the string contains script_type char
static bool ContainsScriptType(absl::string_view str, ScriptType type);

// See 'Unicode Standard Annex #11: EAST ASIAN WIDTH'
Expand All @@ -223,12 +223,12 @@ class Util {
FORM_TYPE_SIZE,
};

// return Form type of single character.
// Returns Form type of single character.
// This function never returns UNKNOWN_FORM.
static FormType GetFormType(char32_t w);
static FormType GetFormType(char32_t codepoint);

// return FormType of string.
// return UNKNOWN_FORM if |str| contains both HALF_WIDTH and FULL_WIDTH.
// Returns FormType of string.
// Returns UNKNOWN_FORM if |str| contains both HALF_WIDTH and FULL_WIDTH.
static FormType GetFormType(absl::string_view str);

// Returns true if all characters of `str` are ASCII (U+00 - U+7F).
Expand Down
7 changes: 4 additions & 3 deletions src/converter/converter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -177,11 +177,12 @@ bool ExtractLastTokenWithScriptType(const absl::string_view text,
std::vector<char32_t> reverse_last_token;
Util::ScriptType last_script_type_found = Util::GetScriptType(iter.Get());
for (; !iter.Done(); iter.Next()) {
const char32_t w = iter.Get();
if ((w == ' ') || (Util::GetScriptType(w) != last_script_type_found)) {
const char32_t codepoint = iter.Get();
if ((codepoint == ' ') ||
(Util::GetScriptType(codepoint) != last_script_type_found)) {
break;
}
reverse_last_token.push_back(w);
reverse_last_token.push_back(codepoint);
}

*last_script_type = last_script_type_found;
Expand Down
3 changes: 1 addition & 2 deletions src/converter/immutable_converter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1926,8 +1926,7 @@ void ImmutableConverter::InsertCandidates(const ConversionRequest &request,
prev = node;
}

const size_t expand_size =
std::max<size_t>(1, std::min<size_t>(512, max_candidates_size));
const size_t expand_size = std::clamp<size_t>(max_candidates_size, 1, 512);

const bool is_single_segment =
(type == SINGLE_SEGMENT || type == FIRST_INNER_SEGMENT);
Expand Down
6 changes: 3 additions & 3 deletions src/dictionary/user_dictionary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ class UserDictionary::UserDictionaryReloader {
UserDictionaryReloader(const UserDictionaryReloader &) = delete;
UserDictionaryReloader &operator=(const UserDictionaryReloader &) = delete;

~UserDictionaryReloader() = default;
~UserDictionaryReloader() { Wait(); }

// When the user dictionary exists AND the modification time has been updated,
// reloads the dictionary. Returns true when reloader thread is started.
Expand Down Expand Up @@ -342,7 +342,7 @@ UserDictionary::UserDictionary(std::unique_ptr<const UserPosInterface> user_pos,
Reload();
}

UserDictionary::~UserDictionary() = default;
UserDictionary::~UserDictionary() { WaitForReloader(); }

bool UserDictionary::HasKey(absl::string_view key) const {
// TODO(noriyukit): Currently, we don't support HasKey() for user dictionary
Expand Down Expand Up @@ -568,7 +568,7 @@ void UserDictionary::WaitForReloader() { reloader_->Wait(); }
void UserDictionary::Swap(std::unique_ptr<TokensIndex> new_tokens) {
DCHECK(new_tokens);
absl::WriterMutexLock l(&mutex_);
tokens_.swap(new_tokens);
tokens_ = std::move(new_tokens);
}

bool UserDictionary::Load(
Expand Down
7 changes: 7 additions & 0 deletions src/dictionary/user_dictionary_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -805,6 +805,13 @@ TEST_F(UserDictionaryTest, AsyncLoadTest) {
}
dic->WaitForReloader();
}

// Fix b//341758719. Waits for the reload inside the destructor.
{
std::unique_ptr<UserDictionary> dic(CreateDictionary());
dic->SetUserDictionaryName(filename);
dic->Reload();
}
}

TEST_F(UserDictionaryTest, TestSuppressionDictionary) {
Expand Down
2 changes: 1 addition & 1 deletion src/gui/config_dialog/config_dialog.cc
Original file line number Diff line number Diff line change
Expand Up @@ -551,7 +551,7 @@ void ConfigDialog::ConvertFromProto(const config::Config &config) {
SET_CHECKBOX(realtimeConversionCheckBox, use_realtime_conversion);

suggestionsSizeSpinBox->setValue(
std::max(1, std::min<int>(9, config.suggestions_size())));
std::clamp<int>(config.suggestions_size(), 1, 9));

// tab5
SetSendStatsCheckBox();
Expand Down
2 changes: 1 addition & 1 deletion src/gui/dictionary_tool/find_dialog.cc
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ void FindDialog::FindBackward() {
void FindDialog::Find(FindDialog::Direction direction) {
const QString &query = QuerylineEdit->text();
const int start_row = std::max(0, table_->currentRow());
int start_column = std::min(1, std::max(0, table_->currentColumn()));
int start_column = std::clamp(table_->currentColumn(), 0, 1);
int matched_column = -1;
int matched_row = -1;

Expand Down
4 changes: 3 additions & 1 deletion src/prediction/dictionary_prediction_aggregator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1761,7 +1761,9 @@ void DictionaryPredictionAggregator::AggregateTypingCorrectedPrediction(
// bias = hyp_score - base_score, so larger is better.
// bias is computed in log10 domain, so we need to use the different
// scale factor. 500 * log(10) = ~1150.
result.wcost -= 1150 * query.bias;
const int adjustment = -1150 * query.bias;
result.typing_correction_adjustment = adjustment;
result.wcost += adjustment;
results->emplace_back(std::move(result));
}
}
Expand Down
8 changes: 4 additions & 4 deletions src/prediction/dictionary_prediction_aggregator_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -291,13 +291,13 @@ void SetUpInputForSuggestionWithHistory(absl::string_view key,
void GenerateKeyEvents(absl::string_view text,
std::vector<commands::KeyEvent> *keys) {
keys->clear();
for (const char32_t w : Util::Utf8ToUtf32(text)) {
for (const char32_t codepoint : Util::Utf8ToUtf32(text)) {
commands::KeyEvent key;
if (w <= 0x7F) { // IsAscii, w is unsigned.
key.set_key_code(w);
if (codepoint <= 0x7F) { // IsAscii, codepoint is unsigned.
key.set_key_code(codepoint);
} else {
key.set_key_code('?');
*key.mutable_key_string() = Util::CodepointToUtf8(w);
*key.mutable_key_string() = Util::CodepointToUtf8(codepoint);
}
keys->push_back(key);
}
Expand Down
4 changes: 2 additions & 2 deletions src/prediction/dictionary_predictor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,8 @@ bool DictionaryPredictor::PredictForRequest(const ConversionRequest &request,
const TypingCorrectionMixingParams typing_correction_mixing_params =
MaybePopulateTypingCorrectedResults(request, *segments, &results);

MaybeRescoreResults(request, *segments, absl::MakeSpan(results));

return AddPredictionToCandidates(request, segments,
typing_correction_mixing_params,
absl::MakeSpan(results));
Expand All @@ -334,8 +336,6 @@ void DictionaryPredictor::RewriteResultsForPrediction(
SetPredictionCost(request.request_type(), segments, results);
}

MaybeRescoreResults(request, segments, absl::MakeSpan(*results));

if (!is_mixed_conversion) {
const size_t input_key_len =
Util::CharsLen(segments.conversion_segment(0).key());
Expand Down
2 changes: 1 addition & 1 deletion src/prediction/predictor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ bool DefaultPredictor::PredictForRequest(const ConversionRequest &request,

int size = kPredictionSize;
if (request.request_type() == ConversionRequest::SUGGESTION) {
size = std::min(9, std::max<int>(1, request.config().suggestions_size()));
size = std::clamp<int>(request.config().suggestions_size(), 1, 9);
}

bool result = false;
Expand Down
31 changes: 21 additions & 10 deletions src/prediction/result.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,12 @@ struct Result {
// Context "insensitive" candidate cost.
int wcost = 0;
// Context "sensitive" candidate cost.
// TODO(noriyukit): The cost is basically calculated by the underlying LM, but
// currently it is updated by other modules and heuristics at many locations;
// e.g., see SetPredictionCostForMixedConversion() in
// dictionary_predictor.cc. Ideally, such cost adjustments should be kept
// separately from the original LM cost to perform rescoring in a rigorous
// manner.
int cost = 0;
int lid = 0;
int rid = 0;
Expand All @@ -127,23 +133,28 @@ struct Result {
int cost_before_rescoring = 0;
// If removed is true, this result is not used for a candidate.
bool removed = false;
// confidence score of typing correction. Larger is more confident.
// Confidence score of typing correction. Larger is more confident.
float typing_correction_score = 0.0;
// Adjustment for `wcost` made by the typing correction. This value can be
// zero, positive (penalty) or negative (bonus), and it is added to `wcost`.
int typing_correction_adjustment = 0;
#ifndef NDEBUG
std::string log;
#endif // NDEBUG

template <typename S>
friend void AbslStringify(S &sink, const Result &r) {
absl::Format(&sink,
"key: %s, value: %s, types: %d, wcost: %d, cost: %d, lid: %d, "
"rid: %d, attrs: %d, bdd: %s, srcinfo: %d, origkey: %s, "
"consumed_key_size: %d, penalty: %d, removed: %v",
r.key, r.value, r.types, r.wcost, r.cost, r.lid, r.rid,
r.candidate_attributes,
absl::StrJoin(r.inner_segment_boundary, ","), r.source_info,
r.non_expanded_original_key, r.consumed_key_size, r.penalty,
r.removed);
absl::Format(
&sink,
"key: %s, value: %s, types: %d, wcost: %d, cost: %d, cost_before: %d, "
"lid: %d, "
"rid: %d, attrs: %d, bdd: %s, srcinfo: %d, origkey: %s, "
"consumed_key_size: %d, penalty: %d, tc_adjustment: %d, removed: %v",
r.key, r.value, r.types, r.wcost, r.cost, r.cost_before_rescoring,
r.lid, r.rid, r.candidate_attributes,
absl::StrJoin(r.inner_segment_boundary, ","), r.source_info,
r.non_expanded_original_key, r.consumed_key_size, r.penalty,
r.typing_correction_adjustment, r.removed);
#ifndef NDEBUG
sink.Append(", log:\n");
for (absl::string_view line : absl::StrSplit(r.log, '\n')) {
Expand Down
Loading

0 comments on commit afaeb37

Please sign in to comment.