Skip to content

Commit

Permalink
Merge branch 'google:master' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
shitamo authored Jul 11, 2024
2 parents a5141ed + d2fc9c7 commit fce359c
Show file tree
Hide file tree
Showing 12 changed files with 79 additions and 39 deletions.
4 changes: 4 additions & 0 deletions src/bazel/stubs.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -92,3 +92,7 @@ def cc_stub(name, tags = None, target_compatible_with = None, **kwargs):
lexan = struct(
resource_files = cc_stub,
)

windows = struct(
resource_files = cc_stub,
)
27 changes: 27 additions & 0 deletions src/data/dictionary_manual/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# data/dictionary_manual

This directory contains word entries to be added to the main word dictionary.

## TSV files (e.g. places.tsv, words.tsv)

Entries are added to the main dictionary with the following adjustments.

* The POS (e.g. 名詞) is converted to the POS-ID (e.g. 1843).
* The cost is set to the medium cost of all words with the same POS.

Those adjustments are processed by
[dictionary/gen_aux_dictionary.py](https://github.com/google/mozc/blob/master/src/dictionary/gen_aux_dictionary.py)

If the same entries are already in the main word dictionary, entries in this
directory are not used. If you need more control, you might want to use
`aux_dictionary.tsv` and `dictionary_filter.tsv`.

* https://github.com/google/mozc/blob/master/src/data/dictionary_oss/aux_dictionary.tsv
* https://github.com/google/mozc/blob/master/src/data/dictionary_oss/dictionary_filter.tsv

## domain.txt

This file has the same format as the main word dictionary and is used as a part
of it.

We recommend using the TSV files rather than this file, since the values for POS
and cost change with each dictionary update.
13 changes: 9 additions & 4 deletions src/data/dictionary_oss/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,15 @@

## aux_dictionary.tsv

AUX dictionary is a mechanism to add word entries to `data/dictionary_oss/dictionary0x.txt`
without running the internal training pipeline.

Pull requests under this directory are acceptable.
AUX dictionary is a mechanism to add word entries to
`data/dictionary_oss/dictionary0x.txt` without running the internal training
pipeline.

> [!NOTE]
>
> We recommend updating the TSV files in
> [data/dictionary_manual](https://github.com/google/mozc/blob/master/src/data/dictionary_manual/) as an
> easier way. Please consider using `data/dictionary_manual/` first.
### Format of aux_dictionary.tsv

Expand Down
4 changes: 2 additions & 2 deletions src/renderer/win32/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ load(
"mozc_cc_library",
"mozc_cc_test",
)
load("//bazel:stubs.bzl", "lexan")
load("//bazel:stubs.bzl", "windows")

package(
default_visibility = ["//renderer:__pkg__"],
Expand All @@ -59,7 +59,7 @@ mozc_cc_binary(
],
)

lexan.resource_files(
windows.resource_files(
name = "win32_renderer_main_resources",
manifests = [
"mozc_renderer.exe.manifest",
Expand Down
35 changes: 18 additions & 17 deletions src/rewriter/date_rewriter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -963,7 +963,8 @@ std::vector<std::string> GetConversions(const DateRewriter::DateData &data,
} // namespace

bool DateRewriter::RewriteDate(Segment *segment,
const absl::string_view extra_format) {
const absl::string_view extra_format,
size_t &num_done_out) {
const std::string &key = segment->key();
auto rit = std::find_if(std::begin(kDateData), std::end(kDateData),
[&key](auto data) { return key == data.key; });
Expand Down Expand Up @@ -1005,10 +1006,12 @@ bool DateRewriter::RewriteDate(Segment *segment,
const size_t min_idx = std::min<size_t>(3, end_idx);
const size_t insert_idx = std::clamp(cand_idx + 1, min_idx, end_idx);
segment->insert_candidates(insert_idx, std::move(candidates));
num_done_out = 1;
return true;
}

size_t DateRewriter::RewriteEra(Segments::range segments_range) {
bool DateRewriter::RewriteEra(Segments::range segments_range,
size_t &num_done_out) {
// Rewrite:
// * If the first segment ends with the `kNenKey`, or
// * If the second segment starts with the `kNenKey`.
Expand All @@ -1019,29 +1022,29 @@ size_t DateRewriter::RewriteEra(Segments::range segments_range) {
key.remove_suffix(kNenKey.size());
} else if (segments_range.size() < 2 ||
!absl::StartsWith(segments_range[1].key(), kNenKey)) {
return 0;
return false;
}

if (Util::GetScriptType(key) != Util::NUMBER) {
return 0;
return false;
}

const size_t len = Util::CharsLen(key);
if (len < 3 || len > 4) {
LOG(WARNING) << "Too long year";
return 0;
return false;
}

std::string year_str = japanese_util::FullWidthAsciiToHalfWidthAscii(key);

uint32_t year = 0;
if (!absl::SimpleAtoi(year_str, &year)) {
return 0;
return false;
}

std::vector<std::string> results;
if (!AdToEra(year, 0, /* unknown month */ &results)) {
return 0;
return false;
}

constexpr absl::string_view kDescription = "和暦";
Expand All @@ -1061,10 +1064,12 @@ size_t DateRewriter::RewriteEra(Segments::range segments_range) {
constexpr int kInsertPosition = 2;
segment.insert_candidates(kInsertPosition, std::move(candidates));

return has_suffix ? 1 : 2;
num_done_out = has_suffix ? 1 : 2;
return true;
}

bool DateRewriter::RewriteAd(Segments::range segments_range) {
bool DateRewriter::RewriteAd(Segments::range segments_range,
size_t &num_done_out) {
// Rewrite:
// * If the first segment ends with the `kNenKey`, or
// * If the second segment starts with the `kNenKey`.
Expand Down Expand Up @@ -1100,6 +1105,7 @@ bool DateRewriter::RewriteAd(Segments::range segments_range) {
// Insert position is the last of candidates
const int position = static_cast<int>(segment->candidates_size());
segment->insert_candidates(position, std::move(candidates));
num_done_out = has_suffix ? 1 : 2;
return true;
}

Expand Down Expand Up @@ -1526,14 +1532,9 @@ bool DateRewriter::Rewrite(const ConversionRequest &request,
return true;
}

if (RewriteAd(rest_segments) || RewriteDate(seg, extra_format)) {
modified = true;
num_done = 1;
continue;
}

num_done = RewriteEra(rest_segments);
if (num_done) {
if (RewriteAd(rest_segments, num_done) ||
RewriteDate(seg, extra_format, num_done) ||
RewriteEra(rest_segments, num_done)) {
modified = true;
continue;
}
Expand Down
11 changes: 7 additions & 4 deletions src/rewriter/date_rewriter.h
Original file line number Diff line number Diff line change
Expand Up @@ -150,10 +150,13 @@ class DateRewriter : public RewriterInterface {
static constexpr char kExtraFormatKey[] = "DATE_FORMAT";

private:
static bool RewriteDate(Segment *segment, absl::string_view extra_format);
// Returns the number of segments processed.
static size_t RewriteEra(Segments::range segments_range);
static bool RewriteAd(Segments::range segments_range);
// If the rewrite is done, returns `true` and sets the `num_done_out` to the
// number of segments processed. The `num_done_out` is not modified if the
// rewrite is not done.
static bool RewriteDate(Segment *segment, absl::string_view extra_format,
size_t &num_done_out);
static bool RewriteEra(Segments::range segments_range, size_t &num_done_out);
static bool RewriteAd(Segments::range segments_range, size_t &num_done_out);
bool ResizeSegmentsForRewriteAd(const ConversionRequest &request,
Segments::const_range segments_range,
Segments *segments) const;
Expand Down
4 changes: 2 additions & 2 deletions src/server/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ load(
"BRANDING",
"MACOS_BUNDLE_ID_PREFIX",
)
load("//bazel:stubs.bzl", "lexan")
load("//bazel:stubs.bzl", "windows")

package(default_visibility = ["//:__subpackages__"])

Expand All @@ -55,7 +55,7 @@ mozc_cc_binary(
),
)

lexan.resource_files(
windows.resource_files(
name = "mozc_server_resources",
manifests = [
"mozc_server.exe.manifest",
Expand Down
4 changes: 2 additions & 2 deletions src/win32/base/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ load(
"mozc_cc_test",
"mozc_cc_win32_library",
)
load("//bazel:stubs.bzl", "lexan") # @unused
load("//bazel:stubs.bzl", "windows") # @unused
load(
"//bazel/win32:build_defs.bzl",
"features_gdi",
Expand Down Expand Up @@ -116,7 +116,7 @@ mozc_cc_test(
)

filegroup(
# TODO(yuryu): lexan.resource_files doesn't seem to accept cc_library.
# TODO(yuryu): windows.resource_files doesn't seem to accept cc_library.
# This is a workaround for now.
name = "display_name_resource_h_for_rc",
srcs = ["display_name_resource.h"],
Expand Down
4 changes: 2 additions & 2 deletions src/win32/broker/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ load(
"mozc_cc_binary",
"mozc_cc_library",
)
load("//bazel:stubs.bzl", "lexan")
load("//bazel:stubs.bzl", "windows")

package(
default_visibility = ["//visibility:private"],
Expand All @@ -58,7 +58,7 @@ mozc_cc_binary(
],
)

lexan.resource_files(
windows.resource_files(
name = "mozc_broker_resource",
manifests = [
"mozc_broker.exe.manifest",
Expand Down
4 changes: 2 additions & 2 deletions src/win32/cache_service/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ load(
"mozc_cc_binary",
"mozc_cc_library",
)
load("//bazel:stubs.bzl", "lexan")
load("//bazel:stubs.bzl", "windows")

package(default_visibility = ["//:__subpackages__"])

Expand All @@ -54,7 +54,7 @@ mozc_cc_binary(
],
)

lexan.resource_files(
windows.resource_files(
name = "mozc_cache_service_resources",
manifests = [
"mozc_cache_service.exe.manifest",
Expand Down
4 changes: 2 additions & 2 deletions src/win32/custom_action/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ load(
"MOZC_TAGS",
"mozc_cc_binary",
)
load("//bazel:stubs.bzl", "lexan")
load("//bazel:stubs.bzl", "windows")

mozc_cc_binary(
name = "custom_action",
Expand Down Expand Up @@ -71,7 +71,7 @@ mozc_cc_binary(
],
)

lexan.resource_files(
windows.resource_files(
name = "custom_action_resource",
rc_files = ["custom_action.rc"],
resources = ["resource.h"],
Expand Down
4 changes: 2 additions & 2 deletions src/win32/tip/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ load(
"mozc_cc_library",
"mozc_cc_test",
)
load("//bazel:stubs.bzl", "lexan")
load("//bazel:stubs.bzl", "windows")
load(
"//bazel/win32:build_defs.bzl",
"features_gdi",
Expand Down Expand Up @@ -69,7 +69,7 @@ mozc_cc_binary(
],
)

lexan.resource_files(
windows.resource_files(
name = "mozc_tip_resource",
rc_files = [
"tip_resource.rc",
Expand Down

0 comments on commit fce359c

Please sign in to comment.