From 3b1bf262c71be8fc725cb1793dc1295e29ed6e6b Mon Sep 17 00:00:00 2001 From: bab2min Date: Mon, 28 Oct 2024 00:48:46 +0900 Subject: [PATCH 1/5] Add more saisiot morphemes --- ModelGenerator/morphemes.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ModelGenerator/morphemes.txt b/ModelGenerator/morphemes.txt index cc511e22..4b0cb495 100644 --- a/ModelGenerator/morphemes.txt +++ b/ModelGenerator/morphemes.txt @@ -6370,7 +6370,7 @@ 와인 NNG 270 은혜 NNG 270 공평 NNG 270 -횟수 NNG 270 +횟수 NNG 270 complex 회/NNG ᆺ/Z_SIOT 수/NNG 010112 반짝이 VV 270 complex 반짝/MAG 이/XSV 0223 서랍 NNG 270 허무 NNG 270 @@ -14689,7 +14689,7 @@ LG화학 NNP 82 조흥은행 NNP 75 노라 EC 75 영양가 NNG 75 -툇마루 NNG 75 +툇마루 NNG 75 complex 퇴/NNG ᆺ/Z_SIOT 마루/NNG 010113 오묘 XR 75 의미심장 XR 75 주인집 NNG 75 @@ -16670,7 +16670,7 @@ LG화학 NNP 82 막중 XR 61 엄중 XR 61 경박 XR 61 -셋방 NNG 61 +셋방 NNG 61 complex 세/NNG ᆺ/Z_SIOT 방/NNG 010112 애무 NNG 61 천진 NNG 61 맞아들이 VV 61 complex 맞/VV 어/EC 들이/VV 011224 @@ -23962,7 +23962,7 @@ SK그룹 NNP 33 판매자 NNG 33 차두리 NNP 33 자필 NNG 33 -곳간 NNG 33 +곳간 NNG 33 complex 고/NNG ᆺ/Z_SIOT 간/NNB 010112 에베레스트 NNP 33 국전 NNG 33 온존 NNG 33 From 3c4445dc9f54d84a8ed47649b3358db86b7fbac9 Mon Sep 17 00:00:00 2001 From: bab2min Date: Mon, 28 Oct 2024 00:48:54 +0900 Subject: [PATCH 2/5] Update model file --- models/base/sj.morph | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/models/base/sj.morph b/models/base/sj.morph index 68234535..128626ed 100644 --- a/models/base/sj.morph +++ b/models/base/sj.morph @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:125fb05ad20c0d8d7ebb45591b8acaadcea0e740197aceff1ee2d14e8c8195e4 -size 3586754 +oid sha256:8f92b96709467b4941a8d98efda10a803f7a7457bb5c7d9d18b8466ec3ededb6 +size 3586826 From c1e6fe71dec72b954de4913b440d4c34177695c7 Mon Sep 17 00:00:00 2001 From: bab2min Date: Mon, 28 Oct 2024 00:50:01 +0900 Subject: [PATCH 3/5] Add more test cases for `splitSaisiot` & `mergeSaisiot` --- test/test_cpp.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/test/test_cpp.cpp b/test/test_cpp.cpp index eed7774b..30414ab2 100644 --- a/test/test_cpp.cpp +++ b/test/test_cpp.cpp @@ -994,6 +994,7 @@ TEST(KiwiCpp, ZSiot) auto resNone = kiwi.analyze(s, Match::allWithNormalizing); auto resSplit = kiwi.analyze(s, Match::allWithNormalizing | Match::splitSaisiot); auto resMerge = kiwi.analyze(s, Match::allWithNormalizing | Match::mergeSaisiot); + EXPECT_FALSE(std::any_of(resNone.first.begin(), resNone.first.end(), [](const TokenInfo& token) { return token.tag == POSTag::z_siot; })); EXPECT_EQ(resSplit.first.size(), 3); EXPECT_EQ(resSplit.first[0].tag, POSTag::nng); EXPECT_EQ(resSplit.first[1].tag, POSTag::z_siot); @@ -1001,6 +1002,16 @@ TEST(KiwiCpp, ZSiot) EXPECT_EQ(resMerge.first.size(), 1); EXPECT_EQ(resMerge.first[0].tag, POSTag::nng); } + + for (auto s : {u"발렛 파킹", u"미닛"}) + { + auto resNone = kiwi.analyze(s, Match::allWithNormalizing); + auto resSplit = kiwi.analyze(s, Match::allWithNormalizing | Match::splitSaisiot); + auto resMerge = kiwi.analyze(s, Match::allWithNormalizing | Match::mergeSaisiot); + EXPECT_EQ(resNone.second, resSplit.second); + EXPECT_EQ(resNone.second, resMerge.second); + EXPECT_FALSE(std::any_of(resSplit.first.begin(), resSplit.first.end(), [](const TokenInfo& token) { return token.tag == POSTag::z_siot; })); + } } TEST(KiwiCpp, AnalyzeWithWordPosition) From 3ee5513bf5ded011727cccf30280c78cb0788bd6 Mon Sep 17 00:00:00 2001 From: bab2min Date: Mon, 28 Oct 2024 00:51:24 +0900 Subject: [PATCH 4/5] Fix false positives of `Z_SIOT` --- src/Kiwi.cpp | 1 + src/PathEvaluator.hpp | 46 +++++++++++++++++++++++++++---------------- 2 files changed, 30 insertions(+), 17 deletions(-) diff --git a/src/Kiwi.cpp b/src/Kiwi.cpp index 08b96512..61f7fb16 100644 --- a/src/Kiwi.cpp +++ b/src/Kiwi.cpp @@ -1065,6 +1065,7 @@ namespace kiwi false, !!(matchOptions & Match::splitComplex), !!(matchOptions & Match::splitSaisiot), + !!(matchOptions & Match::mergeSaisiot), blocklist ); insertPathIntoResults(ret, spStatesByRet, res, topN, matchOptions, integrateAllomorph, positionTable, wordPositions, pretokenizedGroup, nodeInWhichPretokenized); diff --git a/src/PathEvaluator.hpp b/src/PathEvaluator.hpp index fc15d9ad..a912ca0f 100644 --- a/src/PathEvaluator.hpp +++ b/src/PathEvaluator.hpp @@ -119,6 +119,7 @@ namespace kiwi bool openEnd, bool splitComplex = false, bool splitSaisiot = false, + bool mergeSaisiot = false, const std::unordered_set* blocklist = nullptr ); @@ -136,6 +137,7 @@ namespace kiwi const Vector& prevSpStates, bool splitComplex = false, bool splitSaisiot = false, + bool mergeSaisiot = false, const std::unordered_set* blocklist = nullptr ); @@ -525,7 +527,7 @@ namespace kiwi // fill the rest information of resultOut newPath.wid = lastSeqId; - if (curMorph->chunks.empty() || curMorph->complex) + if (curMorph->chunks.empty() || curMorph->complex || curMorph->saisiot) { newPath.combineSocket = curMorph->combineSocket; newPath.ownFormId = ownFormId; @@ -570,7 +572,7 @@ namespace kiwi // fill the rest information of resultOut newPath.wid = lastSeqId; - if (curMorph->chunks.empty() || curMorph->complex) + if (curMorph->chunks.empty() || curMorph->complex || curMorph->saisiot) { newPath.combineSocket = curMorph->combineSocket; newPath.ownFormId = ownFormId; @@ -622,7 +624,7 @@ namespace kiwi // fill the rest information of resultOut newPath.wid = lastSeqId; - if (curMorph->chunks.empty() || curMorph->complex) + if (curMorph->chunks.empty() || curMorph->complex || curMorph->saisiot) { newPath.combineSocket = curMorph->combineSocket; newPath.ownFormId = ownFormId; @@ -659,7 +661,7 @@ namespace kiwi const Morpheme* lastMorph; Wid firstWid; - if (curMorph->chunks.empty() || curMorph->complex) + if (curMorph->chunks.empty() || curMorph->complex || curMorph->saisiot) { lastMorph = curMorph->getCombined() ? curMorph->getCombined() : curMorph; firstWid = curMorph->lmMorphemeId; @@ -691,8 +693,10 @@ namespace kiwi { for (auto& prevPath : cache[prev - startNode]) { - // 사이시옷 뒤에 명사가 아닌 태그가 오는 경우 제외 - if (prevPath.morpheme->tag == POSTag::z_siot && !isNNClass(curMorph->tag)) + // 사이시옷 뒤에 명사가 아닌 태그가 오거나 공백이 있는 경우 제외 + if (prevPath.morpheme->tag == POSTag::z_siot && ( + !isNNClass(curMorph->tag) || prev->endPos < node->startPos + )) { continue; } @@ -701,7 +705,7 @@ namespace kiwi if (prevPath.combineSocket) { // merge with only the same socket - if (prevPath.combineSocket != curMorph->combineSocket || (curMorph->chunks.empty() || curMorph->complex)) + if (prevPath.combineSocket != curMorph->combineSocket || (curMorph->chunks.empty() || curMorph->complex || curMorph->saisiot)) { continue; } @@ -747,7 +751,7 @@ namespace kiwi } auto cLmState = prevPath.lmState; - if (curMorph->combineSocket && (curMorph->chunks.empty() || curMorph->complex)) + if (curMorph->combineSocket && (curMorph->chunks.empty() || curMorph->complex || curMorph->saisiot)) { // no-op } @@ -760,7 +764,7 @@ namespace kiwi } float ll = cLmState.next(langMdl, firstWid); candScore += ll; - if (!(curMorph->chunks.empty() || curMorph->complex)) + if (!(curMorph->chunks.empty() || curMorph->complex || curMorph->saisiot)) { for (size_t i = 1; i < curMorph->chunks.size(); ++i) { @@ -833,6 +837,7 @@ namespace kiwi const Vector& prevSpStates, bool splitComplex, bool splitSaisiot, + bool mergeSaisiot, const std::unordered_set* blocklist ) { @@ -893,6 +898,11 @@ namespace kiwi // 사이시옷(zSiot)을 위한 지름길 if (curMorph->tag == POSTag::z_siot) { + if (!(splitSaisiot || mergeSaisiot)) + { + continue; + } + for (auto* prev = node->getPrev(); prev; prev = prev->getSibling()) { for (auto& p : cache[prev - startNode]) @@ -912,7 +922,7 @@ namespace kiwi } // if the morpheme has chunk set - if (!(curMorph->chunks.empty()|| curMorph->complex)) + if (!(curMorph->chunks.empty() || curMorph->complex || curMorph->saisiot)) { // '하다/하게/하지'가 '다/게/지'로 축약된 경우인데 앞에 공백이 있는 경우는 탐색후보에서 제외 if (node->prev && node[-(int)node->prev].endPos < node->startPos @@ -1019,13 +1029,13 @@ namespace kiwi float scoreDiff = cur->accScore - prev->accScore; float typoCostDiff = cur->accTypoCost - prev->accTypoCost; auto morpheme = cur->morpheme; - size_t numNewTokens = (morpheme->chunks.empty() || morpheme->complex) ? 1 : morpheme->chunks.size(); + const size_t numNewTokens = (morpheme->chunks.empty() || morpheme->complex || morpheme->saisiot) ? 1 : morpheme->chunks.size(); auto& gNode = graph[csearcher(cur)]; scoreDiff += typoCostDiff * typoCostWeight; scoreDiff /= numNewTokens; typoCostDiff /= numNewTokens; - if (morpheme->chunks.empty() || morpheme->complex) + if (morpheme->chunks.empty() || morpheme->complex || morpheme->saisiot) { ret.emplace_back( unifyMorpheme(morpheme), @@ -1093,6 +1103,7 @@ namespace kiwi bool openEnd, bool splitComplex, bool splitSaisiot, + bool mergeSaisiot, const std::unordered_set* blocklist ) { @@ -1148,24 +1159,24 @@ namespace kiwi { evalPath(kw, startNode, node, topN, cache, ownFormList, i, ownFormId, node->form->candidate, - false, uniqStates, splitComplex, splitSaisiot, blocklist); + false, uniqStates, splitComplex, splitSaisiot, mergeSaisiot, blocklist); if (all_of(node->form->candidate.begin(), node->form->candidate.end(), [](const Morpheme* m) { - return m->combineSocket || (!m->chunks.empty() && !m->complex); + return m->combineSocket || !(m->chunks.empty() || m->complex || m->saisiot); })) { ownFormList.emplace_back(node->form->form); ownFormId = ownFormList.size(); evalPath(kw, startNode, node, topN, cache, ownFormList, i, ownFormId, unknownNodeLCands, - true, uniqStates, splitComplex, splitSaisiot, blocklist); + true, uniqStates, splitComplex, splitSaisiot, mergeSaisiot, blocklist); }; } else { evalPath(kw, startNode, node, topN, cache, ownFormList, i, ownFormId, unknownNodeCands, - true, uniqStates, splitComplex, splitSaisiot, blocklist); + true, uniqStates, splitComplex, splitSaisiot, mergeSaisiot, blocklist); } #ifdef DEBUG_PRINT @@ -1186,13 +1197,14 @@ namespace kiwi for (auto& p : cache[prev - startNode]) { if (p.combineSocket) continue; - if (!p.morpheme->chunks.empty() && !p.morpheme->complex) + if (!(p.morpheme->chunks.empty() || p.morpheme->complex || p.morpheme->saisiot)) { if (p.morpheme->chunks.size() <= (p.morpheme->combineSocket ? 2 : 1)) { if (!FeatureTestor::isMatched(nullptr, p.morpheme->vowel)) continue; } } + if (p.morpheme->tag == POSTag::z_siot) continue; float c = p.accScore + (openEnd ? 0 : p.lmState.next(kw->langMdl, eosId)); if (p.spState.singleQuote) c -= 2; From 2c162c7a8828c6c99eba80bdeb0b00fe1a667913 Mon Sep 17 00:00:00 2001 From: bab2min Date: Mon, 28 Oct 2024 00:54:12 +0900 Subject: [PATCH 5/5] bump to v0.20.0 --- CMakeLists.txt | 2 +- bindings/java/kr/pe/bab2min/Kiwi.java | 2 +- include/kiwi/Form.h | 2 +- include/kiwi/Kiwi.h | 2 +- include/kiwi/Macro.h | 4 ++-- include/kiwi/SwTokenizer.h | 2 +- include/kiwi/Types.h | 2 +- include/kiwi/TypoTransformer.h | 2 +- include/kiwi/capi.h | 2 +- 9 files changed, 10 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 37635539..380ac334 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.12) -project(kiwi VERSION 0.19.1 DESCRIPTION "Kiwi, Korean Intelligent Word Identifier") +project(kiwi VERSION 0.20.0 DESCRIPTION "Kiwi, Korean Intelligent Word Identifier") set ( CMAKE_CXX_STANDARD 14 ) set ( CMAKE_VERBOSE_MAKEFILE true ) diff --git a/bindings/java/kr/pe/bab2min/Kiwi.java b/bindings/java/kr/pe/bab2min/Kiwi.java index 197f9060..00472dad 100644 --- a/bindings/java/kr/pe/bab2min/Kiwi.java +++ b/bindings/java/kr/pe/bab2min/Kiwi.java @@ -12,7 +12,7 @@ public class Kiwi implements AutoCloseable { private long _inst; - final private static String _version = "0.19.1"; + final private static String _version = "0.20.0"; public static class Match { final static public int none = 0, diff --git a/include/kiwi/Form.h b/include/kiwi/Form.h index b024764c..60acc9cb 100644 --- a/include/kiwi/Form.h +++ b/include/kiwi/Form.h @@ -2,7 +2,7 @@ * @file Form.h * @author bab2min (bab2min@gmail.com) * @brief 형태 및 형태소에 관한 정보를 담는 구조체들이 선언된 헤더 - * @version 0.19.0 + * @version 0.20.0 * @date 2024-07-01 * * diff --git a/include/kiwi/Kiwi.h b/include/kiwi/Kiwi.h index b4f846b1..b469bf82 100644 --- a/include/kiwi/Kiwi.h +++ b/include/kiwi/Kiwi.h @@ -2,7 +2,7 @@ * @file Kiwi.h * @author bab2min (bab2min@gmail.com) * @brief Kiwi C++ API를 담고 있는 헤더 파일 - * @version 0.19.0 + * @version 0.20.0 * @date 2024-07-01 * * diff --git a/include/kiwi/Macro.h b/include/kiwi/Macro.h index c27bd4a7..5e3b99e1 100644 --- a/include/kiwi/Macro.h +++ b/include/kiwi/Macro.h @@ -4,7 +4,7 @@ #define KIWI_STR(x) KIWI_STR_HELPER(x) #define KIWI_VERSION_MAJOR 0 -#define KIWI_VERSION_MINOR 19 -#define KIWI_VERSION_PATCH 1 +#define KIWI_VERSION_MINOR 20 +#define KIWI_VERSION_PATCH 0 #define KIWI_VERSION_STRING KIWI_STR(KIWI_VERSION_MAJOR) "." KIWI_STR(KIWI_VERSION_MINOR) "." KIWI_STR(KIWI_VERSION_PATCH) diff --git a/include/kiwi/SwTokenizer.h b/include/kiwi/SwTokenizer.h index 56e8f61a..c2b28113 100644 --- a/include/kiwi/SwTokenizer.h +++ b/include/kiwi/SwTokenizer.h @@ -2,7 +2,7 @@ * @file SwTokenizer.h * @author bab2min (bab2min@gmail.com) * @brief Subword Tokenizer - * @version 0.19.0 + * @version 0.20.0 * @date 2024-07-01 * * diff --git a/include/kiwi/Types.h b/include/kiwi/Types.h index f8769c0f..e6258499 100644 --- a/include/kiwi/Types.h +++ b/include/kiwi/Types.h @@ -2,7 +2,7 @@ * @file Types.h * @author bab2min (bab2min@gmail.com) * @brief Kiwi C++ API에 쓰이는 주요 타입들을 모아놓은 헤더 파일 - * @version 0.19.0 + * @version 0.20.0 * @date 2024-07-01 * * diff --git a/include/kiwi/TypoTransformer.h b/include/kiwi/TypoTransformer.h index cf2a9757..6c216a10 100644 --- a/include/kiwi/TypoTransformer.h +++ b/include/kiwi/TypoTransformer.h @@ -2,7 +2,7 @@ * @file TypoTransformer.h * @author bab2min (bab2min@gmail.com) * @brief 오타 교정에 사용되는 TypoTransformer 및 관련 클래스들을 정의합니다. - * @version 0.19.0 + * @version 0.20.0 * @date 2024-09-15 * * diff --git a/include/kiwi/capi.h b/include/kiwi/capi.h index e73e9343..b1f53b28 100644 --- a/include/kiwi/capi.h +++ b/include/kiwi/capi.h @@ -2,7 +2,7 @@ * @file capi.h * @author bab2min (bab2min@gmail.com) * @brief Kiwi C API를 담고 있는 헤더 파일 - * @version 0.19.0 + * @version 0.20.0 * @date 2024-07-01 * *