Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add an option to not encode sentencepiece during training/decoding al… #1003

Merged
merged 4 commits into from
Aug 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
## [Unreleased]

### Added
- Added `--no-spm-encode` option, allowing the model to use vocabulary IDs directly to train/decode.
- Added `--custom-fallbacks` option that allows specifying a list of option sets that are traversed for subsequent fallbacks upon divergence
- Added `--overwrite-checkpoint` option that (when set to false) can be used to dump checkpoints with iteration numbers.
- Implementations of COMET-20 (reference-based) and BLEURT-20 for inference with conversion scripts.
Expand Down
6 changes: 6 additions & 0 deletions src/common/config_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,9 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) {
"Maximum lines to train SentencePiece vocabulary, selected with sampling from all data. "
"When set to 0 all lines are going to be used.",
2000000);
cli.add<bool>("--no-spm-encode",
"Assume the input has already had sentencepiece applied before decoding. "
"Expects spm pieces, like the ones produced by spm_encode's default format.");
#endif
// scheduling options

Expand Down Expand Up @@ -752,6 +755,9 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) {
#ifdef USE_SENTENCEPIECE
cli.add<bool>("--no-spm-decode",
"Keep the output segmented into SentencePiece subwords");
cli.add<bool>("--no-spm-encode",
"Assume the input has already had sentencepiece applied before decoding. "
"Expects spm pieces, like the ones produced by spm_encode's default format.");
#endif

addSuboptionsInputLength(cli);
Expand Down
32 changes: 22 additions & 10 deletions src/data/sentencepiece_vocab.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ class SentencePieceVocab : public IVocab {
// Keeps sentences segmented into subword units
bool keepEncoded_{false};

// Assume sentencepiece has already been applied and we are expecting spm pieces as input
bool noEncode_{false};

// Contains control characters added to vocab due to byte-fallback
std::vector<Word> controlChars_;

Expand Down Expand Up @@ -127,7 +130,8 @@ class SentencePieceVocab : public IVocab {
: options_(options),
batchIndex_(batchIndex),
generator_((uint32_t)Config::seed),
keepEncoded_(options->get<bool>("no-spm-decode", false)) {
keepEncoded_(options->get<bool>("no-spm-decode", false)),
noEncode_(options->get<bool>("no-spm-encode", false)) {
if(options_->has("sentencepiece-alphas")) {
auto alphas = options_->get<std::vector<float>>("sentencepiece-alphas");
if(alphas.size() <= batchIndex)
Expand Down Expand Up @@ -221,16 +225,24 @@ class SentencePieceVocab : public IVocab {
}

Words encode(const std::string& line, bool addEOS, bool inference) const override {
std::vector<int> spmIds;
if(inference || alpha_ == 0)
spm_->Encode(line, &spmIds);
else
spm_->SampleEncode(line, -1, alpha_, &spmIds);

Words words;
words.reserve(spmIds.size() + addEOS);
for (auto&& spmId : spmIds)
words.push_back(Word::fromWordIndex(spmId));
if (noEncode_) {
auto lineTokens = utils::split(line, " ");
words.reserve(lineTokens.size() + addEOS);
for (auto&& token : lineTokens) {
words.push_back((*this)[token]);
}
} else {
std::vector<int> spmIds;
if(inference || alpha_ == 0)
spm_->Encode(line, &spmIds);
else
spm_->SampleEncode(line, -1, alpha_, &spmIds);

words.reserve(spmIds.size() + addEOS);
for (auto&& spmId : spmIds)
words.push_back(Word::fromWordIndex(spmId));
}

if(addEOS)
words.push_back(getEosId());
Expand Down
Loading