Merge branch 'google:master' into master

shitamo · Jun 3, 2024 · afaeb37 · afaeb37
2 parents d0507e5 + 864c543
commit afaeb37
Show file tree

Hide file tree

Showing 23 changed files with 138 additions and 118 deletions.
diff --git a/src/base/util.cc b/src/base/util.cc
@@ -791,27 +791,29 @@ bool Util::IsEnglishTransliteration(absl::string_view value) {
 // script type
 // TODO(yukawa, team): Make a mechanism to keep this classifier up-to-date
 //   based on the original data from Unicode.org.
-Util::ScriptType Util::GetScriptType(char32_t w) {
-  if (INRANGE(w, 0x0030, 0x0039) ||  // ascii number
-      INRANGE(w, 0xFF10, 0xFF19)) {  // full width number
+Util::ScriptType Util::GetScriptType(char32_t codepoint) {
+  if (INRANGE(codepoint, 0x0030, 0x0039) ||  // ascii number
+      INRANGE(codepoint, 0xFF10, 0xFF19)) {  // full width number
     return NUMBER;
-  } else if (INRANGE(w, 0x0041, 0x005A) ||  // ascii upper
-             INRANGE(w, 0x0061, 0x007A) ||  // ascii lower
-             INRANGE(w, 0xFF21, 0xFF3A) ||  // fullwidth ascii upper
-             INRANGE(w, 0xFF41, 0xFF5A)) {  // fullwidth ascii lower
+  } else if (INRANGE(codepoint, 0x0041, 0x005A) ||  // ascii upper
+             INRANGE(codepoint, 0x0061, 0x007A) ||  // ascii lower
+             INRANGE(codepoint, 0xFF21, 0xFF3A) ||  // fullwidth ascii upper
+             INRANGE(codepoint, 0xFF41, 0xFF5A)) {  // fullwidth ascii lower
     return ALPHABET;
-  } else if (w == 0x3005 ||  // IDEOGRAPHIC ITERATION MARK "々"
-             INRANGE(w, 0x3400,
+  } else if (codepoint == 0x3005 ||  // IDEOGRAPHIC ITERATION MARK "々"
+             INRANGE(codepoint, 0x3400,
                      0x4DBF) ||  // CJK Unified Ideographs Extension A
-             INRANGE(w, 0x4E00, 0x9FFF) ||  // CJK Unified Ideographs
-             INRANGE(w, 0xF900, 0xFAFF) ||  // CJK Compatibility Ideographs
-             INRANGE(w, 0x20000,
+             INRANGE(codepoint, 0x4E00, 0x9FFF) ||  // CJK Unified Ideographs
+             INRANGE(codepoint, 0xF900,
+                     0xFAFF) ||  // CJK Compatibility Ideographs
+             INRANGE(codepoint, 0x20000,
                      0x2A6DF) ||  // CJK Unified Ideographs Extension B
-             INRANGE(w, 0x2A700,
+             INRANGE(codepoint, 0x2A700,
                      0x2B73F) ||  // CJK Unified Ideographs Extension C
-             INRANGE(w, 0x2B740,
+             INRANGE(codepoint, 0x2B740,
                      0x2B81F) ||  // CJK Unified Ideographs Extension D
-             INRANGE(w, 0x2F800, 0x2FA1F)) {  // CJK Compatibility Ideographs
+             INRANGE(codepoint, 0x2F800,
+                     0x2FA1F)) {  // CJK Compatibility Ideographs
     // As of Unicode 6.0.2, each block has the following characters assigned.
     // [U+3400, U+4DB5]:   CJK Unified Ideographs Extension A
     // [U+4E00, U+9FCB]:   CJK Unified Ideographs
@@ -821,51 +823,53 @@ Util::ScriptType Util::GetScriptType(char32_t w) {
     // [U+2B740, U+2B81D]: CJK Unified Ideographs Extension D
     // [U+2F800, U+2FA1D]: CJK Compatibility Ideographs
     return KANJI;
-  } else if (INRANGE(w, 0x3041, 0x309F) ||  // hiragana
-             w == 0x1B001) {                // HIRAGANA LETTER ARCHAIC YE
+  } else if (INRANGE(codepoint, 0x3041, 0x309F) ||  // hiragana
+             codepoint == 0x1B001) {  // HIRAGANA LETTER ARCHAIC YE
     return HIRAGANA;
-  } else if (INRANGE(w, 0x30A1, 0x30FF) ||  // full width katakana
-             INRANGE(w, 0x31F0,
+  } else if (INRANGE(codepoint, 0x30A1, 0x30FF) ||  // full width katakana
+             INRANGE(codepoint, 0x31F0,
                      0x31FF) ||  // Katakana Phonetic Extensions for Ainu
-             INRANGE(w, 0xFF65, 0xFF9F) ||  // half width katakana
-             w == 0x1B000) {                // KATAKANA LETTER ARCHAIC E
+             INRANGE(codepoint, 0xFF65, 0xFF9F) ||  // half width katakana
+             codepoint == 0x1B000) {                // KATAKANA LETTER ARCHAIC E
     return KATAKANA;
-  } else if (INRANGE(w, 0x02300, 0x023F3) ||  // Miscellaneous Technical
-             INRANGE(w, 0x02700, 0x027BF) ||  // Dingbats
-             INRANGE(w, 0x1F000, 0x1F02F) ||  // Mahjong tiles
-             INRANGE(w, 0x1F030, 0x1F09F) ||  // Domino tiles
-             INRANGE(w, 0x1F0A0, 0x1F0FF) ||  // Playing cards
-             INRANGE(w, 0x1F100,
+  } else if (INRANGE(codepoint, 0x02300, 0x023F3) ||  // Miscellaneous Technical
+             INRANGE(codepoint, 0x02700, 0x027BF) ||  // Dingbats
+             INRANGE(codepoint, 0x1F000, 0x1F02F) ||  // Mahjong tiles
+             INRANGE(codepoint, 0x1F030, 0x1F09F) ||  // Domino tiles
+             INRANGE(codepoint, 0x1F0A0, 0x1F0FF) ||  // Playing cards
+             INRANGE(codepoint, 0x1F100,
                      0x1F2FF) ||  // Enclosed Alphanumeric Supplement
-             INRANGE(w, 0x1F200, 0x1F2FF) ||  // Enclosed Ideographic Supplement
-             INRANGE(w, 0x1F300,
+             INRANGE(codepoint, 0x1F200,
+                     0x1F2FF) ||  // Enclosed Ideographic Supplement
+             INRANGE(codepoint, 0x1F300,
                      0x1F5FF) ||  // Miscellaneous Symbols And Pictographs
-             INRANGE(w, 0x1F600, 0x1F64F) ||  // Emoticons
-             INRANGE(w, 0x1F680, 0x1F6FF) ||  // Transport And Map Symbols
-             INRANGE(w, 0x1F700, 0x1F77F) ||  // Alchemical Symbols
-             w == 0x26CE) {                   // Ophiuchus
+             INRANGE(codepoint, 0x1F600, 0x1F64F) ||  // Emoticons
+             INRANGE(codepoint, 0x1F680,
+                     0x1F6FF) ||  // Transport And Map Symbols
+             INRANGE(codepoint, 0x1F700, 0x1F77F) ||  // Alchemical Symbols
+             codepoint == 0x26CE) {                   // Ophiuchus
     return EMOJI;
   }
 
   return UNKNOWN_SCRIPT;
 }
 
-Util::FormType Util::GetFormType(char32_t w) {
+Util::FormType Util::GetFormType(char32_t codepoint) {
   // 'Unicode Standard Annex #11: EAST ASIAN WIDTH'
   // http://www.unicode.org/reports/tr11/
 
   // Characters marked as 'Na' in
   // http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
-  if (INRANGE(w, 0x0020, 0x007F) ||  // ascii
-      INRANGE(w, 0x27E6, 0x27ED) ||  // narrow mathematical symbols
-      INRANGE(w, 0x2985, 0x2986)) {  // narrow white parentheses
+  if (INRANGE(codepoint, 0x0020, 0x007F) ||  // ascii
+      INRANGE(codepoint, 0x27E6, 0x27ED) ||  // narrow mathematical symbols
+      INRANGE(codepoint, 0x2985, 0x2986)) {  // narrow white parentheses
     return HALF_WIDTH;
   }
 
   // Other characters marked as 'Na' in
   // http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
-  if (INRANGE(w, 0x00A2, 0x00AF)) {
-    switch (w) {
+  if (INRANGE(codepoint, 0x00A2, 0x00AF)) {
+    switch (codepoint) {
       case 0x00A2:  // CENT SIGN
       case 0x00A3:  // POUND SIGN
       case 0x00A5:  // YEN SIGN
@@ -878,13 +882,13 @@ Util::FormType Util::GetFormType(char32_t w) {
 
   // Characters marked as 'H' in
   // http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
-  if (w == 0x20A9 ||                 // WON SIGN
-      INRANGE(w, 0xFF61, 0xFF9F) ||  // half-width katakana
-      INRANGE(w, 0xFFA0, 0xFFBE) ||  // half-width hangul
-      INRANGE(w, 0xFFC2, 0xFFCF) ||  // half-width hangul
-      INRANGE(w, 0xFFD2, 0xFFD7) ||  // half-width hangul
-      INRANGE(w, 0xFFDA, 0xFFDC) ||  // half-width hangul
-      INRANGE(w, 0xFFE8, 0xFFEE)) {  // half-width symbols
+  if (codepoint == 0x20A9 ||                 // WON SIGN
+      INRANGE(codepoint, 0xFF61, 0xFF9F) ||  // half-width katakana
+      INRANGE(codepoint, 0xFFA0, 0xFFBE) ||  // half-width hangul
+      INRANGE(codepoint, 0xFFC2, 0xFFCF) ||  // half-width hangul
+      INRANGE(codepoint, 0xFFD2, 0xFFD7) ||  // half-width hangul
+      INRANGE(codepoint, 0xFFDA, 0xFFDC) ||  // half-width hangul
+      INRANGE(codepoint, 0xFFE8, 0xFFEE)) {  // half-width symbols
     return HALF_WIDTH;
   }
 
@@ -970,9 +974,10 @@ Util::ScriptType Util::GetScriptTypeWithoutSymbols(absl::string_view str) {
 // return true if all script_type in str is "type"
 bool Util::IsScriptType(absl::string_view str, Util::ScriptType type) {
   for (ConstChar32Iterator iter(str); !iter.Done(); iter.Next()) {
-    const char32_t w = iter.Get();
+    const char32_t codepoint = iter.Get();
     // Exception: 30FC (PROLONGEDSOUND MARK is categorized as HIRAGANA as well)
-    if (type != GetScriptType(w) && (w != 0x30FC || type != HIRAGANA)) {
+    if (type != GetScriptType(codepoint) &&
+        (codepoint != 0x30FC || type != HIRAGANA)) {
       return false;
     }
   }

diff --git a/src/base/util.h b/src/base/util.h
@@ -188,8 +188,8 @@ class Util {
     SCRIPT_TYPE_SIZE,
   };
 
-  // return script type of w
-  static ScriptType GetScriptType(char32_t w);
+  // Returns the script type of `codepoint`.
+  static ScriptType GetScriptType(char32_t codepoint);
 
   // Returns the script type of the first character in `str`.
   // This function finds the first UTF-8 chars and returns its script type.
@@ -198,7 +198,7 @@ class Util {
   static ScriptType GetFirstScriptType(absl::string_view str,
                                        size_t *mblen = nullptr);
 
-  // return script type of string. all chars in str must be
+  // Returns the script type of a string. All chars in str must be
   // KATAKANA/HIRAGANA/KANJI/NUMBER or ALPHABET.
   // If str has mixed scripts, this function returns UNKNOWN_SCRIPT
   static ScriptType GetScriptType(absl::string_view str);
@@ -207,10 +207,10 @@ class Util {
   // in the |str|.
   static ScriptType GetScriptTypeWithoutSymbols(absl::string_view str);
 
-  // return true if all script_type in str is "type"
+  // Returns true if every character in `str` has the script type `type`.
   static bool IsScriptType(absl::string_view str, ScriptType type);
 
-  // return true if the string contains script_type char
+  // Returns true if `str` contains a character whose script type is `type`.
   static bool ContainsScriptType(absl::string_view str, ScriptType type);
 
   // See 'Unicode Standard Annex #11: EAST ASIAN WIDTH'
@@ -223,12 +223,12 @@ class Util {
     FORM_TYPE_SIZE,
   };
 
-  // return Form type of single character.
+  // Returns Form type of single character.
   // This function never returns UNKNOWN_FORM.
-  static FormType GetFormType(char32_t w);
+  static FormType GetFormType(char32_t codepoint);
 
-  // return FormType of string.
-  // return UNKNOWN_FORM if |str| contains both HALF_WIDTH and FULL_WIDTH.
+  // Returns FormType of string.
+  // Returns UNKNOWN_FORM if |str| contains both HALF_WIDTH and FULL_WIDTH.
   static FormType GetFormType(absl::string_view str);
 
   // Returns true if all characters of `str` are ASCII (U+00 - U+7F).

diff --git a/src/converter/converter.cc b/src/converter/converter.cc
@@ -177,11 +177,12 @@ bool ExtractLastTokenWithScriptType(const absl::string_view text,
   std::vector<char32_t> reverse_last_token;
   Util::ScriptType last_script_type_found = Util::GetScriptType(iter.Get());
   for (; !iter.Done(); iter.Next()) {
-    const char32_t w = iter.Get();
-    if ((w == ' ') || (Util::GetScriptType(w) != last_script_type_found)) {
+    const char32_t codepoint = iter.Get();
+    if ((codepoint == ' ') ||
+        (Util::GetScriptType(codepoint) != last_script_type_found)) {
       break;
     }
-    reverse_last_token.push_back(w);
+    reverse_last_token.push_back(codepoint);
   }
 
   *last_script_type = last_script_type_found;

diff --git a/src/converter/immutable_converter.cc b/src/converter/immutable_converter.cc
@@ -1926,8 +1926,7 @@ void ImmutableConverter::InsertCandidates(const ConversionRequest &request,
     prev = node;
   }
 
-  const size_t expand_size =
-      std::max<size_t>(1, std::min<size_t>(512, max_candidates_size));
+  const size_t expand_size = std::clamp<size_t>(max_candidates_size, 1, 512);
 
   const bool is_single_segment =
       (type == SINGLE_SEGMENT || type == FIRST_INNER_SEGMENT);

diff --git a/src/dictionary/user_dictionary.cc b/src/dictionary/user_dictionary.cc
@@ -259,7 +259,7 @@ class UserDictionary::UserDictionaryReloader {
   UserDictionaryReloader(const UserDictionaryReloader &) = delete;
   UserDictionaryReloader &operator=(const UserDictionaryReloader &) = delete;
 
-  ~UserDictionaryReloader() = default;
+  ~UserDictionaryReloader() { Wait(); }
 
   // When the user dictionary exists AND the modification time has been updated,
   // reloads the dictionary.  Returns true when reloader thread is started.
@@ -342,7 +342,7 @@ UserDictionary::UserDictionary(std::unique_ptr<const UserPosInterface> user_pos,
   Reload();
 }
 
-UserDictionary::~UserDictionary() = default;
+UserDictionary::~UserDictionary() { WaitForReloader(); }
 
 bool UserDictionary::HasKey(absl::string_view key) const {
   // TODO(noriyukit): Currently, we don't support HasKey() for user dictionary
@@ -568,7 +568,7 @@ void UserDictionary::WaitForReloader() { reloader_->Wait(); }
 void UserDictionary::Swap(std::unique_ptr<TokensIndex> new_tokens) {
   DCHECK(new_tokens);
   absl::WriterMutexLock l(&mutex_);
-  tokens_.swap(new_tokens);
+  tokens_ = std::move(new_tokens);
 }
 
 bool UserDictionary::Load(

diff --git a/src/dictionary/user_dictionary_test.cc b/src/dictionary/user_dictionary_test.cc
@@ -805,6 +805,13 @@ TEST_F(UserDictionaryTest, AsyncLoadTest) {
     }
     dic->WaitForReloader();
   }
+
+  // Fix for b/341758719: the destructor waits for the reload to finish.
+  {
+    std::unique_ptr<UserDictionary> dic(CreateDictionary());
+    dic->SetUserDictionaryName(filename);
+    dic->Reload();
+  }
 }
 
 TEST_F(UserDictionaryTest, TestSuppressionDictionary) {

diff --git a/src/gui/config_dialog/config_dialog.cc b/src/gui/config_dialog/config_dialog.cc
@@ -551,7 +551,7 @@ void ConfigDialog::ConvertFromProto(const config::Config &config) {
   SET_CHECKBOX(realtimeConversionCheckBox, use_realtime_conversion);
 
   suggestionsSizeSpinBox->setValue(
-      std::max(1, std::min<int>(9, config.suggestions_size())));
+      std::clamp<int>(config.suggestions_size(), 1, 9));
 
   // tab5
   SetSendStatsCheckBox();

diff --git a/src/gui/dictionary_tool/find_dialog.cc b/src/gui/dictionary_tool/find_dialog.cc
@@ -101,7 +101,7 @@ void FindDialog::FindBackward() {
 void FindDialog::Find(FindDialog::Direction direction) {
   const QString &query = QuerylineEdit->text();
   const int start_row = std::max(0, table_->currentRow());
-  int start_column = std::min(1, std::max(0, table_->currentColumn()));
+  int start_column = std::clamp(table_->currentColumn(), 0, 1);
   int matched_column = -1;
   int matched_row = -1;
 

diff --git a/src/prediction/dictionary_prediction_aggregator.cc b/src/prediction/dictionary_prediction_aggregator.cc
@@ -1761,7 +1761,9 @@ void DictionaryPredictionAggregator::AggregateTypingCorrectedPrediction(
       // bias = hyp_score - base_score, so larger is better.
       // bias is computed in log10 domain, so we need to use the different
       // scale factor. 500 * log(10) = ~1150.
-      result.wcost -= 1150 * query.bias;
+      const int adjustment = -1150 * query.bias;
+      result.typing_correction_adjustment = adjustment;
+      result.wcost += adjustment;
       results->emplace_back(std::move(result));
     }
   }

diff --git a/src/prediction/dictionary_prediction_aggregator_test.cc b/src/prediction/dictionary_prediction_aggregator_test.cc
@@ -291,13 +291,13 @@ void SetUpInputForSuggestionWithHistory(absl::string_view key,
 void GenerateKeyEvents(absl::string_view text,
                        std::vector<commands::KeyEvent> *keys) {
   keys->clear();
-  for (const char32_t w : Util::Utf8ToUtf32(text)) {
+  for (const char32_t codepoint : Util::Utf8ToUtf32(text)) {
     commands::KeyEvent key;
-    if (w <= 0x7F) {  // IsAscii, w is unsigned.
-      key.set_key_code(w);
+    if (codepoint <= 0x7F) {  // IsAscii, codepoint is unsigned.
+      key.set_key_code(codepoint);
     } else {
       key.set_key_code('?');
-      *key.mutable_key_string() = Util::CodepointToUtf8(w);
+      *key.mutable_key_string() = Util::CodepointToUtf8(codepoint);
     }
     keys->push_back(key);
   }

diff --git a/src/prediction/dictionary_predictor.cc b/src/prediction/dictionary_predictor.cc
@@ -311,6 +311,8 @@ bool DictionaryPredictor::PredictForRequest(const ConversionRequest &request,
   const TypingCorrectionMixingParams typing_correction_mixing_params =
       MaybePopulateTypingCorrectedResults(request, *segments, &results);
 
+  MaybeRescoreResults(request, *segments, absl::MakeSpan(results));
+
   return AddPredictionToCandidates(request, segments,
                                    typing_correction_mixing_params,
                                    absl::MakeSpan(results));
@@ -334,8 +336,6 @@ void DictionaryPredictor::RewriteResultsForPrediction(
     SetPredictionCost(request.request_type(), segments, results);
   }
 
-  MaybeRescoreResults(request, segments, absl::MakeSpan(*results));
-
   if (!is_mixed_conversion) {
     const size_t input_key_len =
         Util::CharsLen(segments.conversion_segment(0).key());

diff --git a/src/prediction/predictor.cc b/src/prediction/predictor.cc
@@ -233,7 +233,7 @@ bool DefaultPredictor::PredictForRequest(const ConversionRequest &request,
 
   int size = kPredictionSize;
   if (request.request_type() == ConversionRequest::SUGGESTION) {
-    size = std::min(9, std::max<int>(1, request.config().suggestions_size()));
+    size = std::clamp<int>(request.config().suggestions_size(), 1, 9);
   }
 
   bool result = false;

diff --git a/src/prediction/result.h b/src/prediction/result.h
@@ -103,6 +103,12 @@ struct Result {
   // Context "insensitive" candidate cost.
   int wcost = 0;
   // Context "sensitive" candidate cost.
+  // TODO(noriyukit): The cost is basically calculated by the underlying LM, but
+  // currently it is updated by other modules and heuristics at many locations;
+  // e.g., see SetPredictionCostForMixedConversion() in
+  // dictionary_predictor.cc. Ideally, such cost adjustments should be kept
+  // separately from the original LM cost to perform rescoring in a rigorous
+  // manner.
   int cost = 0;
   int lid = 0;
   int rid = 0;
@@ -127,23 +133,28 @@ struct Result {
   int cost_before_rescoring = 0;
   // If removed is true, this result is not used for a candidate.
   bool removed = false;
-  // confidence score of typing correction. Larger is more confident.
+  // Confidence score of typing correction. Larger is more confident.
   float typing_correction_score = 0.0;
+  // Adjustment for `wcost` made by the typing correction. This value can be
+  // zero, positive (penalty) or negative (bonus), and it is added to `wcost`.
+  int typing_correction_adjustment = 0;
 #ifndef NDEBUG
   std::string log;
 #endif  // NDEBUG
 
   template <typename S>
   friend void AbslStringify(S &sink, const Result &r) {
-    absl::Format(&sink,
-                 "key: %s, value: %s, types: %d, wcost: %d, cost: %d, lid: %d, "
-                 "rid: %d, attrs: %d, bdd: %s, srcinfo: %d, origkey: %s, "
-                 "consumed_key_size: %d, penalty: %d, removed: %v",
-                 r.key, r.value, r.types, r.wcost, r.cost, r.lid, r.rid,
-                 r.candidate_attributes,
-                 absl::StrJoin(r.inner_segment_boundary, ","), r.source_info,
-                 r.non_expanded_original_key, r.consumed_key_size, r.penalty,
-                 r.removed);
+    absl::Format(
+        &sink,
+        "key: %s, value: %s, types: %d, wcost: %d, cost: %d, cost_before: %d, "
+        "lid: %d, "
+        "rid: %d, attrs: %d, bdd: %s, srcinfo: %d, origkey: %s, "
+        "consumed_key_size: %d, penalty: %d, tc_adjustment: %d, removed: %v",
+        r.key, r.value, r.types, r.wcost, r.cost, r.cost_before_rescoring,
+        r.lid, r.rid, r.candidate_attributes,
+        absl::StrJoin(r.inner_segment_boundary, ","), r.source_info,
+        r.non_expanded_original_key, r.consumed_key_size, r.penalty,
+        r.typing_correction_adjustment, r.removed);
 #ifndef NDEBUG
     sink.Append(", log:\n");
     for (absl::string_view line : absl::StrSplit(r.log, '\n')) {