From 81d7c7f5d68662d75effc24ffc2ce1ff04266dff Mon Sep 17 00:00:00 2001 From: Yingfeng Date: Wed, 11 Sep 2024 23:54:17 +0800 Subject: [PATCH] Support Korea morphological analyzer through mecab (#1860) ### What problem does this PR solve? Use libmecab embedded within ijma, and the dictionary is generated according to the instructions of https://bitbucket.org/eunjeon/mecab-ko-dic Issue link:#1228 ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- docs/references/http_api_reference.mdx | 1 + docs/references/pysdk_api_reference.md | 1 + src/common/analyzer/analyzer_pool.cpp | 22 +++++ src/common/analyzer/analyzer_pool.cppm | 1 + src/common/analyzer/ijma.cppm | 3 +- src/common/analyzer/korea_analyzer.cpp | 73 +++++++++++++++++ src/common/analyzer/korea_analyzer.cppm | 80 +++++++++++++++++++ .../ijma/{src/libmecab => include}/mecab.h | 0 third_party/ijma/include/mecab_wrapper.h | 38 +++++++++ third_party/ijma/src/mecab_wrapper.cpp | 67 ++++++++++++++++ 10 files changed, 285 insertions(+), 1 deletion(-) create mode 100644 src/common/analyzer/korea_analyzer.cpp create mode 100644 src/common/analyzer/korea_analyzer.cppm rename third_party/ijma/{src/libmecab => include}/mecab.h (100%) create mode 100644 third_party/ijma/include/mecab_wrapper.h create mode 100644 third_party/ijma/src/mecab_wrapper.cpp diff --git a/docs/references/http_api_reference.mdx b/docs/references/http_api_reference.mdx index eab0a2532a..deb082c662 100644 --- a/docs/references/http_api_reference.mdx +++ b/docs/references/http_api_reference.mdx @@ -926,6 +926,7 @@ curl --request POST \ - `"chinese"`: Simplified Chinese - `"tradition"`: Traditional Chinese - `"japanese"`: Japanese + - `"korea"`: Korea - `"ngram"`: [N-gram](https://en.wikipedia.org/wiki/N-gram) - Parameter settings for a secondary index: - `"type"`: `"secondary"` diff --git a/docs/references/pysdk_api_reference.md b/docs/references/pysdk_api_reference.md index 8069cd154b..c5661742e6 100644 --- 
a/docs/references/pysdk_api_reference.md +++ b/docs/references/pysdk_api_reference.md @@ -721,6 +721,7 @@ An `IndexInfo` structure contains three fields,`column_name`, `index_type`, and - `"chinese"`: Simplified Chinese - `"tradition"`: Traditional Chinese - `"japanese"`: Japanese + - `"korea"`: Korea - `"ngram"`: [N-gram](https://en.wikipedia.org/wiki/N-gram) - Parameter settings for a secondary index: No parameters are required. For now, use an empty list `[]`. diff --git a/src/common/analyzer/analyzer_pool.cpp b/src/common/analyzer/analyzer_pool.cpp index bd908aec6a..52625b7f37 100644 --- a/src/common/analyzer/analyzer_pool.cpp +++ b/src/common/analyzer/analyzer_pool.cpp @@ -27,6 +27,7 @@ import stemmer; import chinese_analyzer; import traditional_chinese_analyzer; import japanese_analyzer; +import korea_analyzer; import standard_analyzer; import ngram_analyzer; import logger; @@ -150,6 +151,27 @@ Tuple, Status> AnalyzerPool::GetAnalyzer(const std::string_v } return {MakeUnique(*reinterpret_cast(prototype)), Status::OK()}; } + case Str2Int(KOREA.data()): { + Analyzer *prototype = cache_[KOREA].get(); + if (prototype == nullptr) { + String path; + Config *config = InfinityContext::instance().config(); + if (config == nullptr) { + // InfinityContext has not been initialized. 
+ path = "/var/infinity/resource"; + } else { + path = config->ResourcePath(); + } + UniquePtr analyzer = MakeUnique(std::move(path)); + Status load_status = analyzer->Load(); + if (!load_status.ok()) { + return {nullptr, load_status}; + } + prototype = analyzer.get(); + cache_[KOREA] = std::move(analyzer); + } + return {MakeUnique(*reinterpret_cast(prototype)), Status::OK()}; + } case Str2Int(STANDARD.data()): { UniquePtr analyzer = MakeUnique(); Language lang = STEM_LANG_ENGLISH; diff --git a/src/common/analyzer/analyzer_pool.cppm b/src/common/analyzer/analyzer_pool.cppm index e268164e8e..10ccc3c4c6 100644 --- a/src/common/analyzer/analyzer_pool.cppm +++ b/src/common/analyzer/analyzer_pool.cppm @@ -36,6 +36,7 @@ public: static constexpr std::string_view CHINESE = "chinese"; static constexpr std::string_view TRADITIONALCHINESE = "tradition"; static constexpr std::string_view JAPANESE = "japanese"; + static constexpr std::string_view KOREA = "korea"; static constexpr std::string_view STANDARD = "standard"; static constexpr std::string_view NGRAM = "ngram"; diff --git a/src/common/analyzer/ijma.cppm b/src/common/analyzer/ijma.cppm index 73f8134dca..3a91d761bf 100644 --- a/src/common/analyzer/ijma.cppm +++ b/src/common/analyzer/ijma.cppm @@ -18,6 +18,7 @@ module; #pragma clang diagnostic ignored "-Wunused-variable" #pragma clang diagnostic ignored "-Wunused-but-set-variable" #include +#include #pragma clang diagnostic pop export module ijma; @@ -26,6 +27,6 @@ export namespace jma { using jma::Analyzer; using jma::Knowledge; +using jma::MeCab; using jma::Sentence; - } // namespace jma diff --git a/src/common/analyzer/korea_analyzer.cpp b/src/common/analyzer/korea_analyzer.cpp new file mode 100644 index 0000000000..00a24a9dfd --- /dev/null +++ b/src/common/analyzer/korea_analyzer.cpp @@ -0,0 +1,73 @@ +// Copyright(C) 2024 InfiniFlow, Inc. All rights reserved. 
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+module;
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wunused-variable"
+#pragma clang diagnostic ignored "-Wunused-but-set-variable"
+#include
+#pragma clang diagnostic pop
+
+#include
+#include
+#include
+import stl;
+import term;
+import analyzer;
+import common_analyzer;
+import logger;
+import status;
+import ijma;
+
+module korea_analyzer;
+
+namespace fs = std::filesystem;
+
+namespace infinity {
+static const String KNOWLEDGE_PATH = "mecab/ko-dic";
+
+KoreaAnalyzer::KoreaAnalyzer(const String &base_path) {
+    cjk_ = true;
+    own_mecab_ = true; // this instance deletes the MeCab object that Load() creates
+    knowledge_path_ = "-d " + (fs::path(base_path) / KNOWLEDGE_PATH).string();
+}
+
+// Copies borrow the source's MeCab instance rather than reloading the dictionary, so the source
+// must outlive them. The pointer used to be left null here, which made Parse() on a copy crash.
+KoreaAnalyzer::KoreaAnalyzer(const KoreaAnalyzer &other) {
+    cjk_ = true;
+    knowledge_path_ = other.knowledge_path_;
+    mecab_ = other.mecab_;
+    own_mecab_ = false;
+    SetCaseSensitive(false);
+}
+
+KoreaAnalyzer::~KoreaAnalyzer() {
+    if (own_mecab_) // borrowed instances are released by the analyzer that loaded them
+        delete mecab_;
+}
+
+// Creates the MeCab instance; a std::logic_error from its constructor maps to InvalidAnalyzerFile.
+Status KoreaAnalyzer::Load() {
+    try {
+        mecab_ = new jma::MeCab(knowledge_path_);
+    } catch (const std::logic_error &) {
+        return Status::InvalidAnalyzerFile("Failed to load Korea analyzer");
+    }
+    SetCaseSensitive(false);
+    return Status::OK();
+}
+
+} // namespace infinity
\ No newline at end of file
diff --git a/src/common/analyzer/korea_analyzer.cppm b/src/common/analyzer/korea_analyzer.cppm
new file mode 100644
index 0000000000..717c85fed3
--- /dev/null
+++ 
b/src/common/analyzer/korea_analyzer.cppm
@@ -0,0 +1,80 @@
+// Copyright(C) 2024 InfiniFlow, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+module;
+
+export module korea_analyzer;
+
+import stl;
+import ijma;
+import term;
+import common_analyzer;
+import status;
+
+namespace infinity {
+
+// Korean analyzer: feeds text to a jma::MeCab instance and exposes each node it yields as a token.
+export class KoreaAnalyzer : public CommonLanguageAnalyzer {
+public:
+    KoreaAnalyzer(const String &path);
+
+    KoreaAnalyzer(const KoreaAnalyzer &other);
+
+    ~KoreaAnalyzer();
+
+    Status Load();
+
+protected:
+    // Hands the sentence to MeCab, sets local_offset_ back to -1 and clears the current token.
+    void Parse(const String &input) override {
+        mecab_->SetSentence(input);
+        local_offset_ = -1;
+        ResetToken();
+    }
+
+    // Publishes the next MeCab token through token_/len_/offset_; returns false once none is left.
+    bool NextToken() override {
+        if (!DoNext()) {
+            ResetToken();
+            return false;
+        }
+        mecab_->GetToken(token_str_);
+        token_ = token_str_.c_str();
+        len_ = token_str_.size();
+        offset_ = local_offset_;
+        is_index_ = true;
+        return true;
+    }
+
+    bool IsAlpha() override { return mecab_->IsAlpha(); }
+
+    bool IsSpecialChar() override { return false; }
+
+private:
+    // Steps MeCab forward one node. NOTE(review): this also steps onto the end-of-sentence node and reports it - confirm.
+    bool DoNext() {
+        if (mecab_->IsEnd()) {
+            return false;
+        }
+        mecab_->Next();
+        ++local_offset_;
+        return true;
+    }
+
+    String knowledge_path_;      // MeCab option string passed to jma::MeCab by Load()
+    jma::MeCab *mecab_{nullptr}; // MeCab wrapper used for tokenizing; null until it is set
+    bool own_mecab_;             // whether the destructor deletes mecab_
+    String token_str_;           // storage that token_ points into
+};
+} // namespace infinity
diff --git a/third_party/ijma/src/libmecab/mecab.h b/third_party/ijma/include/mecab.h
similarity index 100%
rename from third_party/ijma/src/libmecab/mecab.h
rename to third_party/ijma/include/mecab.h
diff
--git a/third_party/ijma/include/mecab_wrapper.h b/third_party/ijma/include/mecab_wrapper.h
new file mode 100644
index 0000000000..bdf88c9522
--- /dev/null
+++ b/third_party/ijma/include/mecab_wrapper.h
@@ -0,0 +1,38 @@
+#pragma once
+
+#include
+#include
+
+#include "mecab.h"
+
+namespace jma {
+
+class MeCab {
+public:
+    MeCab(const std::string &option);
+
+    ~MeCab();
+
+    bool Parse(std::vector &out, const char *str, size_t str_len = 0);
+
+    void SetSentence(const std::string &str);
+
+    bool IsAlpha() const;
+
+    bool IsSpecial() const;
+
+    bool IsEnd() const;
+
+    void Next();
+
+    void GetToken(std::string &out);
+
+    std::string GetFeature();
+
+private:
+    ::MeCab::Tagger *tagger_{nullptr};
+    const ::MeCab::Node *node_{nullptr};
+    char buf_[1024]; // not used by GetToken(); kept so the class layout is unchanged
+};
+
+} // namespace jma
\ No newline at end of file
diff --git a/third_party/ijma/src/mecab_wrapper.cpp b/third_party/ijma/src/mecab_wrapper.cpp
new file mode 100644
index 0000000000..74cc3e7edd
--- /dev/null
+++ b/third_party/ijma/src/mecab_wrapper.cpp
@@ -0,0 +1,67 @@
+#include "mecab_wrapper.h"
+#include "mecab.h"
+#include <stdexcept>
+
+namespace jma {
+
+// createTagger() yields null when it cannot build a tagger; throw here instead of crashing on first use.
+MeCab::MeCab(const std::string &option) : tagger_(::MeCab::createTagger(option.c_str())) {
+    if (tagger_ == nullptr)
+        throw std::logic_error("MeCab: failed to create tagger");
+}
+
+MeCab::~MeCab() { delete tagger_; }
+
+// Runs tagger_->parse() on `str` and appends the space-separated pieces of its output to `out`.
+bool MeCab::Parse(std::vector &out, const char *str, size_t str_len) {
+    if (str_len == 0)
+        str_len = strlen(str);
+    const char *p = tagger_->parse(str, str_len);
+    if (p == 0)
+        return false;
+    while (*p) {
+        if (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t') {
+            p++;
+            continue;
+        }
+        const char *q = strchr(p, ' ');
+        if (q == 0) {
+            out.push_back(p);
+            break;
+        }
+        out.push_back(std::string(p, q));
+        p = q + 1;
+    }
+    return true;
+}
+
+// Parses `str` into nodes and keeps the node parseToNode() returns as the cursor.
+void MeCab::SetSentence(const std::string &str) { node_ = tagger_->parseToNode(str.c_str(), str.size()); }
+
+// True when the surface is at least 2 bytes long and the feature string starts with "SL".
+// NOTE(review): the length guard tests the surface, not the feature - confirm 1-byte tokens should be rejected.
+bool MeCab::IsAlpha() const {
+    const char *p = node_->feature;
+    if (node_->length < 2)
+        return false;
+    return p[0] == 'S' && p[1] == 'L';
+}
+
+bool MeCab::IsSpecial() const {
+    const char *p = node_->feature;
+    return p[0] == 'S' && p[1] == 'C';
+}
+
+// True when there is no current node or the current node is the end-of-sentence node.
+bool MeCab::IsEnd() const { return node_ == nullptr || node_->stat == MECAB_EOS_NODE; }
+
+// Requires a non-null cursor, i.e. IsEnd() returned false.
+void MeCab::Next() { node_ = node_->next; }
+
+// Copies exactly `length` bytes of the surface. The earlier strcpy into the 1024-byte buf_ copied
+// everything up to the next NUL, which can exceed the buffer on long input.
+void MeCab::GetToken(std::string &out) { out.assign(node_->surface, node_->length); }
+
+std::string MeCab::GetFeature() { return node_->feature; }
+
+} // namespace jma