diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml index 3384d2d..b428f71 100644 --- a/charabia/Cargo.toml +++ b/charabia/Cargo.toml @@ -46,10 +46,13 @@ hebrew = [] japanese = ["japanese-segmentation-unidic", "japanese-transliteration"] japanese-segmentation-ipadic = ["lindera/ipadic", "lindera/compress"] japanese-segmentation-unidic = ["lindera/unidic", "lindera/compress"] +japanese-segmentation-external = ["lindera/compress"] japanese-transliteration = ["dep:wana_kana"] # allow korean specialized tokenization -korean = ["lindera/ko-dic", "lindera/compress"] +korean = ["korean-segmentation-kodic", "lindera/compress"] +korean-segmentation-kodic = ["lindera/ko-dic"] +korean-segmentation-external = [] # allow thai specialized tokenization thai = [] diff --git a/charabia/src/segmenter/japanese.rs b/charabia/src/segmenter/japanese.rs index a130373..ac3b0bd 100644 --- a/charabia/src/segmenter/japanese.rs +++ b/charabia/src/segmenter/japanese.rs @@ -1,6 +1,11 @@ +#[cfg(feature = "japanese-segmentation-external")] +use std::{env, path::PathBuf}; + +#[cfg(not(feature = "japanese-segmentation-external"))] +use lindera::DictionaryKind; #[cfg(feature = "japanese-segmentation-ipadic")] use lindera::Penalty; -use lindera::{DictionaryConfig, DictionaryKind, Mode, Tokenizer, TokenizerConfig}; +use lindera::{DictionaryConfig, Mode, Tokenizer, TokenizerConfig}; use once_cell::sync::Lazy; use crate::segmenter::Segmenter; @@ -14,6 +19,12 @@ static LINDERA: Lazy = Lazy::new(|| { #[cfg(all(feature = "japanese-segmentation-ipadic", feature = "japanese-segmentation-unidic"))] compile_error!("Feature japanese-segmentation-ipadic and japanese-segmentation-unidic are mutually exclusive and cannot be enabled together"); + #[cfg(all( + feature = "japanese-segmentation-external", + any(feature = "japanese-segmentation-ipadic", feature = "japanese-segmentation-unidic") + ))] + compile_error!("Feature japanese-segmentation-external and either japanese-segmentation-unidic or japanese-segmentation-ipadic are mutually exclusive and cannot be enabled together"); + #[cfg(feature = "japanese-segmentation-ipadic")] let config = TokenizerConfig { dictionary: DictionaryConfig { kind: Some(DictionaryKind::IPADIC), path: None }, @@ -26,6 +37,13 @@ static LINDERA: Lazy = Lazy::new(|| { mode: Mode::Normal, ..TokenizerConfig::default() }; + #[cfg(feature = "japanese-segmentation-external")] + let config = TokenizerConfig { + dictionary: DictionaryConfig { kind: None, path: Some(PathBuf::from(env::var("MEILISEARCH_JAPANESE_EXTERNAL_DICTIONARY").expect("japanese-segmentation-external feature requires MEILISEARCH_JAPANESE_EXTERNAL_DICTIONARY env var to be set"))) }, + mode: Mode::Normal, + ..TokenizerConfig::default() + }; + Tokenizer::from_config(config).unwrap() }); @@ -37,6 +55,7 @@ impl Segmenter for JapaneseSegmenter { } #[cfg(test)] +#[cfg(not(feature = "japanese-segmentation-external"))] mod test { use crate::segmenter::test::test_segmenter; diff --git a/charabia/src/segmenter/korean.rs b/charabia/src/segmenter/korean.rs index 76b6086..264fb4a 100644 --- a/charabia/src/segmenter/korean.rs +++ b/charabia/src/segmenter/korean.rs @@ -1,4 +1,9 @@ -use lindera::{DictionaryConfig, DictionaryKind, Mode, Penalty, Tokenizer, TokenizerConfig}; +#[cfg(feature = "korean-segmentation-external")] +use std::{env, path::PathBuf}; + +#[cfg(not(feature = "korean-segmentation-external"))] +use lindera::DictionaryKind; +use lindera::{DictionaryConfig, Mode, Penalty, Tokenizer, TokenizerConfig}; use once_cell::sync::Lazy; use crate::segmenter::Segmenter; @@ -9,11 +14,20 @@ use crate::segmenter::Segmenter; pub struct KoreanSegmenter; static LINDERA: Lazy = Lazy::new(|| { + #[cfg(not(feature = "korean-segmentation-external"))] let config = TokenizerConfig { dictionary: DictionaryConfig { kind: Some(DictionaryKind::KoDic), path: None }, mode: Mode::Decompose(Penalty::default()), ..TokenizerConfig::default() }; + + #[cfg(feature = "korean-segmentation-external")] + let config = TokenizerConfig { + dictionary: DictionaryConfig { kind: None, path: Some(PathBuf::from(env::var("MEILISEARCH_KOREAN_EXTERNAL_DICTIONARY").expect("korean-segmentation-external feature requires MEILISEARCH_KOREAN_EXTERNAL_DICTIONARY env var to be set"))) }, + mode: Mode::Decompose(Penalty::default()), + ..TokenizerConfig::default() + }; + Tokenizer::from_config(config).unwrap() }); @@ -25,6 +39,7 @@ impl Segmenter for KoreanSegmenter { } #[cfg(test)] +#[cfg(not(feature = "korean-segmentation-external"))] mod test { use crate::segmenter::test::test_segmenter;