From e878823aae4d3966dcfbdc6497795b7d69a64159 Mon Sep 17 00:00:00 2001 From: XapaJIaMnu Date: Fri, 28 Jul 2023 13:55:22 +0100 Subject: [PATCH 1/3] Add an option to not encode sentencepiece during training/decoding allowing passing of spmIDs directly --- src/common/config_parser.cpp | 6 ++++++ src/data/sentencepiece_vocab.cpp | 22 +++++++++++++++++----- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 4cc23f2ca..6044e66f7 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -387,6 +387,9 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { "Maximum lines to train SentencePiece vocabulary, selected with sampling from all data. " "When set to 0 all lines are going to be used.", 2000000); + cli.add("--no-spm-encode", + "Assume the input has already had sentencepiece applied before decoding. " + "Expects spm vocabulary IDs, like the ones produced by spm_encode --output_format id"); #endif // scheduling options @@ -698,6 +701,9 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) { #ifdef USE_SENTENCEPIECE cli.add("--no-spm-decode", "Keep the output segmented into SentencePiece subwords"); + cli.add("--no-spm-encode", + "Assume the input has already had sentencepiece applied before decoding. " + "Expects spm vocabulary IDs, like the ones produced by spm_encode --output_format id"); #endif addSuboptionsInputLength(cli); diff --git a/src/data/sentencepiece_vocab.cpp b/src/data/sentencepiece_vocab.cpp index 8f774c2bb..2054f06b4 100644 --- a/src/data/sentencepiece_vocab.cpp +++ b/src/data/sentencepiece_vocab.cpp @@ -39,6 +39,9 @@ class SentencePieceVocab : public IVocab { // Keeps sentences segmented into subword units bool keepEncoded_{false}; + // Assume sentencepiece has already been applied and we are expecting spm vocabulary IDs as input. + bool noEncode_{false}; + // Contains control characters added to vocab due to byte-fallback std::vector controlChars_; @@ -127,7 +130,8 @@ class SentencePieceVocab : public IVocab { : options_(options), batchIndex_(batchIndex), generator_((uint32_t)Config::seed), - keepEncoded_(options->get("no-spm-decode", false)) { + keepEncoded_(options->get("no-spm-decode", false)), + noEncode_(options->get("no-spm-encode", false)) { if(options_->has("sentencepiece-alphas")) { auto alphas = options_->get>("sentencepiece-alphas"); if(alphas.size() <= batchIndex) @@ -222,10 +226,18 @@ class SentencePieceVocab : public IVocab { Words encode(const std::string& line, bool addEOS, bool inference) const override { std::vector spmIds; - if(inference || alpha_ == 0) - spm_->Encode(line, &spmIds); - else - spm_->SampleEncode(line, -1, alpha_, &spmIds); + if (noEncode_) { + auto lineTokens = utils::split(line, " "); + spmIds.reserve(lineTokens.size()); + for (auto&& token : lineTokens) { + spmIds.push_back((int)strtol(token.c_str(), nullptr, 10)); + } + } else { + if(inference || alpha_ == 0) + spm_->Encode(line, &spmIds); + else + spm_->SampleEncode(line, -1, alpha_, &spmIds); + } Words words; words.reserve(spmIds.size() + addEOS); for (auto&& spmId : spmIds) From 29d5d6060ecad0b6122600b1ab482f5c44dc599d Mon Sep 17 00:00:00 2001 From: XapaJIaMnu Date: Fri, 28 Jul 2023 14:16:11 +0100 Subject: [PATCH 2/3] Update changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3e325e25e..5d7a3a96b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). ## [Unreleased] - +- Added `--no-spm-encode` option, allowing the model to use vocabulary IDs directly to train/decode. ## [1.12.0] - 2023-02-20 ### Added From 5d7d080b9a4c7ba36b15ef4487d4c3f4f9bc3925 Mon Sep 17 00:00:00 2001 From: XapaJIaMnu Date: Fri, 28 Jul 2023 17:50:32 +0100 Subject: [PATCH 3/3] numbers -> pieces --- src/common/config_parser.cpp | 4 ++-- src/data/sentencepiece_vocab.cpp | 17 +++++++++-------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 6044e66f7..fcee35a88 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -389,7 +389,7 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { 2000000); cli.add("--no-spm-encode", "Assume the input has already had sentencepiece applied before decoding. " - "Expects spm vocabulary IDs, like the ones produced by spm_encode --output_format id"); + "Expects spm pieces, like the ones produced by spm_encode's default format."); #endif // scheduling options @@ -703,7 +703,7 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) { "Keep the output segmented into SentencePiece subwords"); cli.add("--no-spm-encode", "Assume the input has already had sentencepiece applied before decoding. " - "Expects spm vocabulary IDs, like the ones produced by spm_encode --output_format id"); + "Expects spm pieces, like the ones produced by spm_encode's default format."); #endif addSuboptionsInputLength(cli); diff --git a/src/data/sentencepiece_vocab.cpp b/src/data/sentencepiece_vocab.cpp index 2054f06b4..548b95a46 100644 --- a/src/data/sentencepiece_vocab.cpp +++ b/src/data/sentencepiece_vocab.cpp @@ -39,7 +39,7 @@ class SentencePieceVocab : public IVocab { // Keeps sentences segmented into subword units bool keepEncoded_{false}; - // Assume sentencepiece has already been applied and we are expecting spm vocabulary IDs as input. + // Assume sentencepiece has already been applied and we are expecting spm pieces as input bool noEncode_{false}; // Contains control characters added to vocab due to byte-fallback @@ -225,23 +225,24 @@ class SentencePieceVocab : public IVocab { } Words encode(const std::string& line, bool addEOS, bool inference) const override { - std::vector spmIds; + Words words; if (noEncode_) { auto lineTokens = utils::split(line, " "); - spmIds.reserve(lineTokens.size()); + words.reserve(lineTokens.size() + addEOS); for (auto&& token : lineTokens) { - spmIds.push_back((int)strtol(token.c_str(), nullptr, 10)); + words.push_back((*this)[token]); } } else { + std::vector spmIds; if(inference || alpha_ == 0) spm_->Encode(line, &spmIds); else spm_->SampleEncode(line, -1, alpha_, &spmIds); - } - Words words; words.reserve(spmIds.size() + addEOS); - for (auto&& spmId : spmIds) - words.push_back(Word::fromWordIndex(spmId)); + words.reserve(spmIds.size() + addEOS); + for (auto&& spmId : spmIds) + words.push_back(Word::fromWordIndex(spmId)); + } if(addEOS) words.push_back(getEosId());