diff --git a/include/analyzers/analyzer.h b/include/analyzers/analyzer.h index d37ac4589..812f3dbe5 100644 --- a/include/analyzers/analyzer.h +++ b/include/analyzers/analyzer.h @@ -71,6 +71,14 @@ class analyzer static std::unique_ptr default_filter_chain(const cpptoml::table& config); + /** + * @param config The config group used to create the analyzer from + * @return the default filter chain for unigram words for this version + * of MeTA, based on a config object + */ + static std::unique_ptr + default_unigram_chain(const cpptoml::table& config); + /** * @param global The original config object with all parameters * @param config The config group used to create the filters from diff --git a/include/analyzers/tokenizers/icu_tokenizer.h b/include/analyzers/tokenizers/icu_tokenizer.h index b825a4d17..7391c2844 100644 --- a/include/analyzers/tokenizers/icu_tokenizer.h +++ b/include/analyzers/tokenizers/icu_tokenizer.h @@ -39,9 +39,16 @@ namespace tokenizers * Optional config parameters: * * ~~~ toml - * language = "en" # lowercase two-letter or three-letter ISO-639 code - * country = "US" # uppercase two-letter ISO-3116 code. If specified, the - * # config must also specify the language. + * # lowercase two-letter or three-letter ISO-639 code + * language = "en" + * + * # uppercase two-letter ISO-3166 code. If specified, the config must also + * # specify the language. + * country = "US" + * + * # whether to suppress the generation of "<s>" or "</s>"; useful for + * # information retrieval with unigrams. Default is false. + * suppress-tags = true * ~~~ */ class icu_tokenizer : public util::clonable @@ -49,14 +56,17 @@ class icu_tokenizer : public util::clonable public: /** * Creates an icu_tokenizer. + * @param suppress_tags Whether to suppress "<s>" and "</s>" generation */ - icu_tokenizer(); + explicit icu_tokenizer(bool suppress_tags = false); /** * Creates an icu_tokenizer with a specific segmenter. * @param segmenter The segmenter to use.
+ * @param suppress_tags Whether to suppress "<s>" and "</s>" generation */ - icu_tokenizer(utf::segmenter segmenter); + explicit icu_tokenizer(utf::segmenter segmenter, + bool suppress_tags = false); /** * Copies an icu_tokenizer. diff --git a/src/analyzers/analyzer.cpp b/src/analyzers/analyzer.cpp index c40626912..a52e792ce 100644 --- a/src/analyzers/analyzer.cpp +++ b/src/analyzers/analyzer.cpp @@ -45,24 +45,42 @@ io::parser analyzer::create_parser(const corpus::document& doc, io::parser::input_type::File}; } +namespace +{ std::unique_ptr - analyzer::default_filter_chain(const cpptoml::table& config) + add_default_filters(std::unique_ptr tokenizer, + const cpptoml::table& config) { - auto stopwords = config.get_as("stop-words"); std::unique_ptr result; - result = make_unique(); - result = make_unique(std::move(result)); + result = make_unique(std::move(tokenizer)); result = make_unique(std::move(result)); result = make_unique(std::move(result), 2, 35); result = make_unique(std::move(result), *stopwords); result = make_unique(std::move(result)); + return result; +} +} + +std::unique_ptr + analyzer::default_filter_chain(const cpptoml::table& config) +{ + auto tokenizer = make_unique(); + auto result = add_default_filters(std::move(tokenizer), config); result = make_unique(std::move(result)); return result; } +std::unique_ptr + analyzer::default_unigram_chain(const cpptoml::table& config) +{ + // suppress "<s>", "</s>" + auto tokenizer = make_unique(true); + return add_default_filters(std::move(tokenizer), config); +} + std::unique_ptr analyzer::load_filter(std::unique_ptr src, const cpptoml::table& config) @@ -83,6 +101,8 @@ std::unique_ptr { if (*check == "default-chain") return default_filter_chain(global); + else if (*check == "default-unigram-chain") + return default_unigram_chain(global); else throw analyzer_exception{"unknown filter option: " + *check}; } diff --git a/src/analyzers/tokenizers/icu_tokenizer.cpp b/src/analyzers/tokenizers/icu_tokenizer.cpp index
66c632873..b0c392743 100644 --- a/src/analyzers/tokenizers/icu_tokenizer.cpp +++ b/src/analyzers/tokenizers/icu_tokenizer.cpp @@ -29,9 +29,13 @@ const std::string icu_tokenizer::id = "icu-tokenizer"; class icu_tokenizer::impl { public: - impl() = default; + impl(bool suppress_tags) : suppress_tags_{suppress_tags} + { + // nothing + } - impl(utf::segmenter segmenter) : segmenter_{std::move(segmenter)} + explicit impl(utf::segmenter segmenter, bool suppress_tags) + : suppress_tags_{suppress_tags}, segmenter_{std::move(segmenter)} { // nothing } @@ -55,7 +59,8 @@ class icu_tokenizer::impl segmenter_.set_content(content); for (const auto& sentence : segmenter_.sentences()) { - tokens_.emplace_back(""); + if (!suppress_tags_) + tokens_.emplace_back(""); for (const auto& word : segmenter_.words(sentence)) { auto wrd = segmenter_.content(word); @@ -70,7 +75,8 @@ class icu_tokenizer::impl tokens_.emplace_back(std::move(wrd)); } - tokens_.emplace_back(""); + if (!suppress_tags_) + tokens_.emplace_back(""); } } @@ -95,6 +101,9 @@ class icu_tokenizer::impl } private: + /// Whether or not to suppress "" or "" generation + const bool suppress_tags_; + /// UTF segmenter to use for this tokenizer utf::segmenter segmenter_; @@ -102,10 +111,13 @@ class icu_tokenizer::impl std::deque tokens_; }; -icu_tokenizer::icu_tokenizer() = default; +icu_tokenizer::icu_tokenizer(bool suppress_tags) : impl_{suppress_tags} +{ + // nothing +} -icu_tokenizer::icu_tokenizer(utf::segmenter segmenter) - : impl_{std::move(segmenter)} +icu_tokenizer::icu_tokenizer(utf::segmenter segmenter, bool suppress_tags) + : impl_{std::move(segmenter), suppress_tags} { // nothing } @@ -140,6 +152,10 @@ std::unique_ptr { auto language = config.get_as("language"); auto country = config.get_as("country"); + bool suppress_tags = false; + + if (auto stags = config.get_as("suppress-tags")) + suppress_tags = *stags; using exception = token_stream::token_stream_exception; @@ -149,12 +165,14 @@ std::unique_ptr if (language) 
{ if (country) - return make_unique(utf::segmenter{*language, *country}); + return make_unique( + utf::segmenter{*language, *country}, suppress_tags); else - return make_unique(utf::segmenter{*language}); + return make_unique(utf::segmenter{*language}, + suppress_tags); } - return make_unique(); + return make_unique(suppress_tags); } } }