Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds support for the use of external dictionaries for segmenters backed by lindera #326

Draft
wants to merge 5 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion charabia/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,13 @@ hebrew = []
japanese = ["japanese-segmentation-unidic", "japanese-transliteration"]
japanese-segmentation-ipadic = ["lindera/ipadic", "lindera/compress"]
japanese-segmentation-unidic = ["lindera/unidic", "lindera/compress"]
japanese-segmentation-external = ["lindera/compress"]
japanese-transliteration = ["dep:wana_kana"]

# allow korean specialized tokenization
korean = ["lindera/ko-dic", "lindera/compress"]
korean = ["korean-segmentation-kodic", "lindera/compress"]
korean-segmentation-kodic = ["lindera/ko-dic"]
korean-segmentation-external = []

# allow thai specialized tokenization
thai = []
Expand Down
21 changes: 20 additions & 1 deletion charabia/src/segmenter/japanese.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
#[cfg(feature = "japanese-segmentation-external")]
use std::{env, path::PathBuf};

#[cfg(not(feature = "japanese-segmentation-external"))]
use lindera::DictionaryKind;
#[cfg(feature = "japanese-segmentation-ipadic")]
use lindera::Penalty;
use lindera::{DictionaryConfig, DictionaryKind, Mode, Tokenizer, TokenizerConfig};
use lindera::{DictionaryConfig, Mode, Tokenizer, TokenizerConfig};
use once_cell::sync::Lazy;

use crate::segmenter::Segmenter;
Expand All @@ -14,6 +19,12 @@ static LINDERA: Lazy<Tokenizer> = Lazy::new(|| {
#[cfg(all(feature = "japanese-segmentation-ipadic", feature = "japanese-segmentation-unidic"))]
compile_error!("Feature japanese-segmentation-ipadic and japanese-segmentation-unidic are mutually exclusive and cannot be enabled together");

#[cfg(all(
feature = "japanese-segmentation-external",
any(feature = "japanese-segmentation-ipadic", feature = "japanese-segmentation-unidic")
))]
compile_error!("Feature japanese-segmentation-external and either japanese-segmentation-unidic or japanese-segmentation-ipadic are mutually exclusive and cannot be enabled together");

#[cfg(feature = "japanese-segmentation-ipadic")]
let config = TokenizerConfig {
dictionary: DictionaryConfig { kind: Some(DictionaryKind::IPADIC), path: None },
Expand All @@ -26,6 +37,13 @@ static LINDERA: Lazy<Tokenizer> = Lazy::new(|| {
mode: Mode::Normal,
..TokenizerConfig::default()
};
#[cfg(feature = "japanese-segmentation-external")]
let config = TokenizerConfig {
dictionary: DictionaryConfig { kind: None, path: Some(PathBuf::from(env::var("MEILISEARCH_JAPANESE_EXTERNAL_DICTIONARY").expect("japanese-segmentation-external feature requires MEILISEARCH_JAPANESE_EXTERNAL_DICTIONARY env var to be set"))) },
mode: Mode::Normal,
..TokenizerConfig::default()
};

Tokenizer::from_config(config).unwrap()
});

Expand All @@ -37,6 +55,7 @@ impl Segmenter for JapaneseSegmenter {
}

#[cfg(test)]
#[cfg(not(feature = "japanese-segmentation-external"))]
mod test {
use crate::segmenter::test::test_segmenter;

Expand Down
17 changes: 16 additions & 1 deletion charabia/src/segmenter/korean.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
use lindera::{DictionaryConfig, DictionaryKind, Mode, Penalty, Tokenizer, TokenizerConfig};
#[cfg(feature = "korean-segmentation-external")]
use std::{env, path::PathBuf};

#[cfg(not(feature = "korean-segmentation-external"))]
use lindera::DictionaryKind;
use lindera::{DictionaryConfig, Mode, Penalty, Tokenizer, TokenizerConfig};
use once_cell::sync::Lazy;

use crate::segmenter::Segmenter;
Expand All @@ -9,11 +14,20 @@ use crate::segmenter::Segmenter;
pub struct KoreanSegmenter;

/// Lindera tokenizer used for Korean segmentation, built through `Lazy::new`.
///
/// The dictionary source is selected at compile time:
/// - without the `korean-segmentation-external` feature, `DictionaryKind::KoDic` is requested;
/// - with the `korean-segmentation-external` feature, the dictionary path is read from the
///   `MEILISEARCH_KOREAN_EXTERNAL_DICTIONARY` environment variable.
///
/// # Panics
///
/// Panics during initialization if the external feature is enabled and the environment
/// variable is not set, or if `Tokenizer::from_config` fails for the resulting configuration.
static LINDERA: Lazy<Tokenizer> = Lazy::new(|| {
    #[cfg(not(feature = "korean-segmentation-external"))]
    let dictionary = DictionaryConfig { kind: Some(DictionaryKind::KoDic), path: None };

    // `var_os` is used rather than `var` so that a path that is set but not valid Unicode is
    // still accepted instead of panicking with a misleading "not set" message.
    #[cfg(feature = "korean-segmentation-external")]
    let dictionary = DictionaryConfig {
        kind: None,
        path: Some(PathBuf::from(
            env::var_os("MEILISEARCH_KOREAN_EXTERNAL_DICTIONARY").expect("korean-segmentation-external feature requires MEILISEARCH_KOREAN_EXTERNAL_DICTIONARY env var to be set"),
        )),
    };

    // Only the dictionary differs between the two feature sets; the rest is shared.
    let config = TokenizerConfig {
        dictionary,
        mode: Mode::Decompose(Penalty::default()),
        ..TokenizerConfig::default()
    };

    Tokenizer::from_config(config)
        .expect("failed to build the lindera tokenizer for Korean segmentation")
});

Expand All @@ -25,6 +39,7 @@ impl Segmenter for KoreanSegmenter {
}

#[cfg(test)]
#[cfg(not(feature = "korean-segmentation-external"))]
mod test {
use crate::segmenter::test::test_segmenter;

Expand Down
Loading