Skip to content

Commit

Permalink
Add support for suppressing <s> and </s> tokens.
Browse files Browse the repository at this point in the history
This is mostly useful for information retrieval experiments using unigram
words. This change also adds another filter chain preset,
"default-unigram-chain" which is appropriate for indexing unigram words
for information retrieval purposes.
  • Loading branch information
Chase Geigle committed Jun 13, 2015
1 parent 799fab0 commit 68a1943
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 19 deletions.
8 changes: 8 additions & 0 deletions include/analyzers/analyzer.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,14 @@ class analyzer
static std::unique_ptr<token_stream>
default_filter_chain(const cpptoml::table& config);

/**
* @param config The config group used to create the analyzer from
* @return the default filter chain for unigram words for this version
* of MeTA, based on a config object
*/
static std::unique_ptr<token_stream>
default_unigram_chain(const cpptoml::table& config);

/**
* @param global The original config object with all parameters
* @param config The config group used to create the filters from
Expand Down
20 changes: 15 additions & 5 deletions include/analyzers/tokenizers/icu_tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,24 +39,34 @@ namespace tokenizers
* Optional config parameters:
*
* ~~~ toml
 * # lowercase two-letter or three-letter ISO-639 code
 * language = "en"
 *
 * # uppercase two-letter ISO-3116 code. If specified, the config must also
 * # specify the language.
 * country = "US"
*
* # whether to suppress the generation of "<s>" or "</s>"; useful for
* # information retrieval with unigrams. Default is false.
* suppress-tags = true
* ~~~
*/
class icu_tokenizer : public util::clonable<token_stream, icu_tokenizer>
{
public:
/**
* Creates an icu_tokenizer.
* @param suppress_tags Whether to suppress "<s>" and "</s"> generation
*/
icu_tokenizer();
explicit icu_tokenizer(bool suppress_tags = false);

/**
* Creates an icu_tokenizer with a specific segmenter.
* @param segmenter The segmenter to use.
* @param suppress_tags Whether to suppress "<s>" and "</s>" generation
*/
icu_tokenizer(utf::segmenter segmenter);
explicit icu_tokenizer(utf::segmenter segmenter,
bool suppress_tags = false);

/**
* Copies an icu_tokenizer.
Expand Down
28 changes: 24 additions & 4 deletions src/analyzers/analyzer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,24 +45,42 @@ io::parser analyzer::create_parser(const corpus::document& doc,
io::parser::input_type::File};
}

namespace
{
/**
 * Shared tail of the default filter chains: wraps the given tokenizer with
 * lowercasing, alpha filtering, length filtering, stopword removal, and
 * Porter2 stemming.
 *
 * @param tokenizer The token stream to be filtered
 * @param config The config group used to create the filters from
 * @return the fully filtered token stream
 */
std::unique_ptr<token_stream>
add_default_filters(std::unique_ptr<token_stream> tokenizer,
                    const cpptoml::table& config)
{
    auto stopwords = config.get_as<std::string>("stop-words");
    // dereferencing an empty option below would be undefined behavior, so
    // fail loudly if the config is missing the stopword list
    if (!stopwords)
        throw analyzer_exception{
            "the default filter chains require a stop-words config entry"};

    std::unique_ptr<token_stream> result;
    result = make_unique<filters::lowercase_filter>(std::move(tokenizer));
    result = make_unique<filters::alpha_filter>(std::move(result));
    result = make_unique<filters::length_filter>(std::move(result), 2, 35);
    result = make_unique<filters::list_filter>(std::move(result), *stopwords);
    result = make_unique<filters::porter2_stemmer>(std::move(result));
    return result;
}
}

/**
 * Builds the default filter chain: ICU tokenization (keeping the "<s>" and
 * "</s>" sentence markers), the standard filter stack, and finally removal
 * of any sentences left empty by the filtering.
 */
std::unique_ptr<token_stream>
analyzer::default_filter_chain(const cpptoml::table& config)
{
    std::unique_ptr<token_stream> chain
        = make_unique<tokenizers::icu_tokenizer>();
    chain = add_default_filters(std::move(chain), config);
    chain = make_unique<filters::empty_sentence_filter>(std::move(chain));
    return chain;
}

/**
 * Builds the default filter chain for unigram words: identical to the
 * default chain except that the tokenizer is told to suppress the "<s>"
 * and "</s>" tags, which unigram indexing has no use for.
 */
std::unique_ptr<token_stream>
analyzer::default_unigram_chain(const cpptoml::table& config)
{
    auto tok = make_unique<tokenizers::icu_tokenizer>(true);
    return add_default_filters(std::move(tok), config);
}

std::unique_ptr<token_stream>
analyzer::load_filter(std::unique_ptr<token_stream> src,
const cpptoml::table& config)
Expand All @@ -83,6 +101,8 @@ std::unique_ptr<token_stream>
{
if (*check == "default-chain")
return default_filter_chain(global);
else if (*check == "default-unigram-chain")
return default_unigram_chain(global);
else
throw analyzer_exception{"unknown filter option: " + *check};
}
Expand Down
38 changes: 28 additions & 10 deletions src/analyzers/tokenizers/icu_tokenizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,13 @@ const std::string icu_tokenizer::id = "icu-tokenizer";
class icu_tokenizer::impl
{
public:
impl() = default;
impl(bool suppress_tags) : suppress_tags_{suppress_tags}
{
// nothing
}

impl(utf::segmenter segmenter) : segmenter_{std::move(segmenter)}
explicit impl(utf::segmenter segmenter, bool suppress_tags)
: suppress_tags_{suppress_tags}, segmenter_{std::move(segmenter)}
{
// nothing
}
Expand All @@ -55,7 +59,8 @@ class icu_tokenizer::impl
segmenter_.set_content(content);
for (const auto& sentence : segmenter_.sentences())
{
tokens_.emplace_back("<s>");
if (!suppress_tags_)
tokens_.emplace_back("<s>");
for (const auto& word : segmenter_.words(sentence))
{
auto wrd = segmenter_.content(word);
Expand All @@ -70,7 +75,8 @@ class icu_tokenizer::impl

tokens_.emplace_back(std::move(wrd));
}
tokens_.emplace_back("</s>");
if (!suppress_tags_)
tokens_.emplace_back("</s>");
}
}

Expand All @@ -95,17 +101,23 @@ class icu_tokenizer::impl
}

private:
/// Whether or not to suppress "<s>" or "</s>" generation
const bool suppress_tags_;

/// UTF segmenter to use for this tokenizer
utf::segmenter segmenter_;

/// Buffered tokens
std::deque<std::string> tokens_;
};

// The stale zero-arg definition from before this change is removed: it
// would be ambiguous with (and redundant to) the defaulted-arg ctor below.
icu_tokenizer::icu_tokenizer(bool suppress_tags) : impl_{suppress_tags}
{
    // nothing
}

icu_tokenizer::icu_tokenizer(utf::segmenter segmenter, bool suppress_tags)
    : impl_{std::move(segmenter), suppress_tags}
{
    // nothing
}
Expand Down Expand Up @@ -140,6 +152,10 @@ std::unique_ptr<token_stream>
{
auto language = config.get_as<std::string>("language");
auto country = config.get_as<std::string>("country");
bool suppress_tags = false;

if (auto stags = config.get_as<bool>("suppress-tags"))
suppress_tags = *stags;

using exception = token_stream::token_stream_exception;

Expand All @@ -149,12 +165,14 @@ std::unique_ptr<token_stream>
if (language)
{
if (country)
return make_unique<icu_tokenizer>(utf::segmenter{*language, *country});
return make_unique<icu_tokenizer>(
utf::segmenter{*language, *country}, suppress_tags);
else
return make_unique<icu_tokenizer>(utf::segmenter{*language});
return make_unique<icu_tokenizer>(utf::segmenter{*language},
suppress_tags);
}

return make_unique<icu_tokenizer>();
return make_unique<icu_tokenizer>(suppress_tags);
}
}
}
Expand Down

0 comments on commit 68a1943

Please sign in to comment.