From 9db25a0c1b4219a57c6976f84800d2522297840c Mon Sep 17 00:00:00 2001 From: Pedro Turik Firmino Date: Fri, 24 Jan 2025 11:40:06 -0300 Subject: [PATCH 1/5] Adds support for the use of external dictionaries for segmenters backed by lindera --- charabia/Cargo.toml | 5 ++++- charabia/src/segmenter/japanese.rs | 23 ++++++++++++++++++++++- charabia/src/segmenter/korean.rs | 17 ++++++++++++++++- 3 files changed, 42 insertions(+), 3 deletions(-) diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml index 3384d2d..03b200d 100644 --- a/charabia/Cargo.toml +++ b/charabia/Cargo.toml @@ -46,10 +46,13 @@ hebrew = [] japanese = ["japanese-segmentation-unidic", "japanese-transliteration"] japanese-segmentation-ipadic = ["lindera/ipadic", "lindera/compress"] japanese-segmentation-unidic = ["lindera/unidic", "lindera/compress"] +japanese-segmentation-external = ["lindera/compress"] japanese-transliteration = ["dep:wana_kana"] # allow korean specialized tokenization -korean = ["lindera/ko-dic", "lindera/compress"] +korean = ["korean_segmentation_kodic", "lindera/compress"] +korean_segmentation_kodic = ["lindera/ko-dic"] +korean_segmentation_external = [] # allow thai specialized tokenization thai = [] diff --git a/charabia/src/segmenter/japanese.rs b/charabia/src/segmenter/japanese.rs index a130373..0758c2a 100644 --- a/charabia/src/segmenter/japanese.rs +++ b/charabia/src/segmenter/japanese.rs @@ -1,6 +1,13 @@ +use std::{env, path::PathBuf}; + +#[cfg(any( + feature = "japanese-segmentation-ipadic", + feature = "japanese-segmentation-unidic" +))] +use lindera::DictionaryKind; #[cfg(feature = "japanese-segmentation-ipadic")] use lindera::Penalty; -use lindera::{DictionaryConfig, DictionaryKind, Mode, Tokenizer, TokenizerConfig}; +use lindera::{DictionaryConfig, Mode, Tokenizer, TokenizerConfig}; use once_cell::sync::Lazy; use crate::segmenter::Segmenter; @@ -14,6 +21,12 @@ static LINDERA: Lazy = Lazy::new(|| { #[cfg(all(feature = "japanese-segmentation-ipadic", feature = 
"japanese-segmentation-unidic"))] compile_error!("Feature japanese-segmentation-ipadic and japanese-segmentation-unidic are mutually exclusive and cannot be enabled together"); + #[cfg(all( + feature = "japanese-segmentation-external", + any(feature = "japanese-segmentation-ipadic", feature = "japanese-segmentation-unidic") + ))] + compile_error!("Feature japanese-segmentation-external and either japanese-segmentation-unidic or japanese-segmentation-ipadic are mutually exclusive and cannot be enabled together"); + #[cfg(feature = "japanese-segmentation-ipadic")] let config = TokenizerConfig { dictionary: DictionaryConfig { kind: Some(DictionaryKind::IPADIC), path: None }, @@ -26,6 +39,13 @@ static LINDERA: Lazy = Lazy::new(|| { mode: Mode::Normal, ..TokenizerConfig::default() }; + #[cfg(feature = "japanese-segmentation-external")] + let config = TokenizerConfig { + dictionary: DictionaryConfig { kind: None, path: Some(PathBuf::from(env::var("MEILISEARCH_JAPANESE_EXTERNAL_DICTIONARY").expect("japanese-segmentation-external feature requires MEILISEARCH_JAPANESE_EXTERNAL_DICTIONARY env var to be set"))) }, + mode: Mode::Normal, + ..TokenizerConfig::default() + }; + Tokenizer::from_config(config).unwrap() }); @@ -37,6 +57,7 @@ impl Segmenter for JapaneseSegmenter { } #[cfg(test)] +#[cfg(not(feature = "japanese-segmentation-external"))] mod test { use crate::segmenter::test::test_segmenter; diff --git a/charabia/src/segmenter/korean.rs b/charabia/src/segmenter/korean.rs index 76b6086..42166cc 100644 --- a/charabia/src/segmenter/korean.rs +++ b/charabia/src/segmenter/korean.rs @@ -1,4 +1,9 @@ -use lindera::{DictionaryConfig, DictionaryKind, Mode, Penalty, Tokenizer, TokenizerConfig}; +use std::{env, path::PathBuf}; + +#[cfg(not(feature = "korean_segmentation_external"))] +use lindera::DictionaryKind; + +use lindera::{DictionaryConfig, Mode, Penalty, Tokenizer, TokenizerConfig}; use once_cell::sync::Lazy; use crate::segmenter::Segmenter; @@ -9,11 +14,20 @@ use 
crate::segmenter::Segmenter; pub struct KoreanSegmenter; static LINDERA: Lazy = Lazy::new(|| { + #[cfg(not(feature = "korean_segmentation_external"))] let config = TokenizerConfig { dictionary: DictionaryConfig { kind: Some(DictionaryKind::KoDic), path: None }, mode: Mode::Decompose(Penalty::default()), ..TokenizerConfig::default() }; + + #[cfg(feature = "korean_segmentation_external")] + let config = TokenizerConfig { + dictionary: DictionaryConfig { kind: None, path: Some(PathBuf::from(env::var("MEILISEARCH_KOREAN_EXTERNAL_DICTIONARY").expect("korean-segmentation-external feature requires MEILISEARCH_KOREAN_EXTERNAL_DICTIONARY env var to be set"))) }, + mode: Mode::Decompose(Penalty::default()), + ..TokenizerConfig::default() + }; + Tokenizer::from_config(config).unwrap() }); @@ -25,6 +39,7 @@ impl Segmenter for KoreanSegmenter { } #[cfg(test)] +#[cfg(not(feature = "korean_segmentation_external"))] mod test { use crate::segmenter::test::test_segmenter; From 11dbe1fec4f1e4a688a15f0686ae81faf5df219a Mon Sep 17 00:00:00 2001 From: Pedro Turik Firmino Date: Fri, 24 Jan 2025 11:44:32 -0300 Subject: [PATCH 2/5] rename feature with incorrect case --- charabia/Cargo.toml | 2 +- charabia/src/segmenter/korean.rs | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml index 03b200d..e4cd023 100644 --- a/charabia/Cargo.toml +++ b/charabia/Cargo.toml @@ -52,7 +52,7 @@ japanese-transliteration = ["dep:wana_kana"] # allow korean specialized tokenization korean = ["korean_segmentation_kodic", "lindera/compress"] korean_segmentation_kodic = ["lindera/ko-dic"] -korean_segmentation_external = [] +korean-segmentation-external = [] # allow thai specialized tokenization thai = [] diff --git a/charabia/src/segmenter/korean.rs b/charabia/src/segmenter/korean.rs index 42166cc..d48548e 100644 --- a/charabia/src/segmenter/korean.rs +++ b/charabia/src/segmenter/korean.rs @@ -1,6 +1,6 @@ use std::{env, path::PathBuf}; 
-#[cfg(not(feature = "korean_segmentation_external"))] +#[cfg(not(feature = "korean-segmentation-external"))] use lindera::DictionaryKind; use lindera::{DictionaryConfig, Mode, Penalty, Tokenizer, TokenizerConfig}; @@ -14,14 +14,14 @@ use crate::segmenter::Segmenter; pub struct KoreanSegmenter; static LINDERA: Lazy = Lazy::new(|| { - #[cfg(not(feature = "korean_segmentation_external"))] + #[cfg(not(feature = "korean-segmentation-external"))] let config = TokenizerConfig { dictionary: DictionaryConfig { kind: Some(DictionaryKind::KoDic), path: None }, mode: Mode::Decompose(Penalty::default()), ..TokenizerConfig::default() }; - #[cfg(feature = "korean_segmentation_external")] + #[cfg(feature = "korean-segmentation-external")] let config = TokenizerConfig { dictionary: DictionaryConfig { kind: None, path: Some(PathBuf::from(env::var("MEILISEARCH_KOREAN_EXTERNAL_DICTIONARY").expect("korean-segmentation-external feature requires MEILISEARCH_KOREAN_EXTERNAL_DICTIONARY env var to be set"))) }, mode: Mode::Decompose(Penalty::default()), @@ -39,7 +39,7 @@ impl Segmenter for KoreanSegmenter { } #[cfg(test)] -#[cfg(not(feature = "korean_segmentation_external"))] +#[cfg(not(feature = "korean-segmentation-external"))] mod test { use crate::segmenter::test::test_segmenter; From 1dce5b7b63d73c9d7a42ebde2379e6bc8cd8a725 Mon Sep 17 00:00:00 2001 From: Pedro Turik Firmino Date: Fri, 24 Jan 2025 11:48:23 -0300 Subject: [PATCH 3/5] same --- charabia/Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml index e4cd023..092e5f2 100644 --- a/charabia/Cargo.toml +++ b/charabia/Cargo.toml @@ -50,8 +50,8 @@ japanese-segmentation-external = ["lindera/compress"] japanese-transliteration = ["dep:wana_kana"] # allow korean specialized tokenization -korean = ["korean_segmentation_kodic", "lindera/compress"] -korean_segmentation_kodic = ["lindera/ko-dic"] +korean = ["korean-segmentation-external", "lindera/compress"] 
+korean-segmentation-kodic = ["lindera/ko-dic"] korean-segmentation-external = [] # allow thai specialized tokenization From cdb2221daf521d5b427fe323cf69b48272c898ea Mon Sep 17 00:00:00 2001 From: Pedro Turik Firmino Date: Fri, 24 Jan 2025 14:07:03 -0300 Subject: [PATCH 4/5] organize imports --- charabia/Cargo.toml | 2 +- charabia/src/segmenter/japanese.rs | 7 ++----- charabia/src/segmenter/korean.rs | 1 + 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml index 092e5f2..b428f71 100644 --- a/charabia/Cargo.toml +++ b/charabia/Cargo.toml @@ -50,7 +50,7 @@ japanese-segmentation-external = ["lindera/compress"] japanese-transliteration = ["dep:wana_kana"] # allow korean specialized tokenization -korean = ["korean-segmentation-external", "lindera/compress"] +korean = ["korean-segmentation-kodic", "lindera/compress"] korean-segmentation-kodic = ["lindera/ko-dic"] korean-segmentation-external = [] diff --git a/charabia/src/segmenter/japanese.rs b/charabia/src/segmenter/japanese.rs index 0758c2a..45993bd 100644 --- a/charabia/src/segmenter/japanese.rs +++ b/charabia/src/segmenter/japanese.rs @@ -1,9 +1,6 @@ +#[cfg(feature = "japanese-segmentation-external")] use std::{env, path::PathBuf}; - -#[cfg(any( - feature = "japanese-segmentation-ipadic", - feature = "japanese-segmentation-unidic" -))] +#[cfg(not(feature = "japanese-segmentation-external"))] use lindera::DictionaryKind; #[cfg(feature = "japanese-segmentation-ipadic")] use lindera::Penalty; diff --git a/charabia/src/segmenter/korean.rs b/charabia/src/segmenter/korean.rs index d48548e..bb980c7 100644 --- a/charabia/src/segmenter/korean.rs +++ b/charabia/src/segmenter/korean.rs @@ -1,3 +1,4 @@ +#[cfg(feature = "korean-segmentation-external")] use std::{env, path::PathBuf}; #[cfg(not(feature = "korean-segmentation-external"))] From 211f654fa04828ea79e838f0f0acf54ebb266fc1 Mon Sep 17 00:00:00 2001 From: Pedro Turik Firmino Date: Fri, 24 Jan 2025 14:15:33 -0300
Subject: [PATCH 5/5] formatting --- charabia/src/segmenter/japanese.rs | 1 + charabia/src/segmenter/korean.rs | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/charabia/src/segmenter/japanese.rs b/charabia/src/segmenter/japanese.rs index 45993bd..ac3b0bd 100644 --- a/charabia/src/segmenter/japanese.rs +++ b/charabia/src/segmenter/japanese.rs @@ -1,5 +1,6 @@ #[cfg(feature = "japanese-segmentation-external")] use std::{env, path::PathBuf}; + #[cfg(not(feature = "japanese-segmentation-external"))] use lindera::DictionaryKind; #[cfg(feature = "japanese-segmentation-ipadic")] diff --git a/charabia/src/segmenter/korean.rs b/charabia/src/segmenter/korean.rs index bb980c7..264fb4a 100644 --- a/charabia/src/segmenter/korean.rs +++ b/charabia/src/segmenter/korean.rs @@ -3,7 +3,6 @@ use std::{env, path::PathBuf}; #[cfg(not(feature = "korean-segmentation-external"))] use lindera::DictionaryKind; - use lindera::{DictionaryConfig, Mode, Penalty, Tokenizer, TokenizerConfig}; use once_cell::sync::Lazy;