From bb6b841200fca68dc7b08a0ddf7522c64cbc1143 Mon Sep 17 00:00:00 2001 From: ASenart <50336379+ASenart@users.noreply.github.com> Date: Tue, 26 Dec 2023 10:09:40 +0100 Subject: [PATCH] Support of new option offset for ignoring token score of special tokens (#1592) Co-authored-by: asenellart --- include/ctranslate2/scoring.h | 4 +++- python/cpp/translator.cc | 9 ++++++++- src/models/language_model.cc | 3 ++- src/models/sequence_to_sequence.cc | 3 ++- src/scoring.cc | 5 +++-- 5 files changed, 18 insertions(+), 6 deletions(-) diff --git a/include/ctranslate2/scoring.h b/include/ctranslate2/scoring.h index d1afd35c8..d48e6395a 100644 --- a/include/ctranslate2/scoring.h +++ b/include/ctranslate2/scoring.h @@ -12,6 +12,7 @@ namespace ctranslate2 { struct ScoringOptions { // Truncate the inputs after this many tokens (set 0 to disable truncation). size_t max_input_length = 1024; + dim_t offset = 0; }; struct ScoringResult { @@ -38,6 +39,7 @@ namespace ctranslate2 { layers::DecoderState& state, const std::vector>& sequences, const Vocabulary& vocabulary, - const dim_t preferred_size_multiple = 1); + const dim_t preferred_size_multiple = 1, + const dim_t offset=0); } diff --git a/python/cpp/translator.cc b/python/cpp/translator.cc index d920469fe..b46d7ab9e 100644 --- a/python/cpp/translator.cc +++ b/python/cpp/translator.cc @@ -228,10 +228,12 @@ namespace ctranslate2 { size_t max_batch_size, const std::string& batch_type_str, size_t max_input_length, + dim_t offset, bool asynchronous) { const auto batch_type = str_to_batch_type(batch_type_str); ScoringOptions options; options.max_input_length = max_input_length; + options.offset = offset; std::shared_lock lock(_mutex); assert_model_is_ready(); @@ -252,6 +254,7 @@ namespace ctranslate2 { size_t read_batch_size, const std::string& batch_type_str, size_t max_input_length, + dim_t offset, bool with_tokens_score, const TokenizeFn& source_tokenize_fn, const TokenizeFn& target_tokenize_fn, @@ -263,7 +266,7 @@ namespace ctranslate2 { const auto batch_type = str_to_batch_type(batch_type_str); ScoringOptions options; options.max_input_length = max_input_length; - + options.offset = offset; std::shared_lock lock(_mutex); assert_model_is_ready(); @@ -592,6 +595,7 @@ namespace ctranslate2 { py::arg("max_batch_size")=0, py::arg("batch_type")="examples", py::arg("max_input_length")=1024, + py::arg("offset") = 0, py::arg("asynchronous")=false, py::call_guard(), R"pbdoc( @@ -606,6 +610,7 @@ namespace ctranslate2 { minimized. batch_type: Whether :obj:`max_batch_size` is the number of "examples" or "tokens". max_input_length: Truncate inputs after this many tokens (0 to disable). + offset: Ignore the first n tokens in target in score calculation. asynchronous: Run the scoring asynchronously. Returns: @@ -621,6 +626,7 @@ namespace ctranslate2 { py::arg("read_batch_size")=0, py::arg("batch_type")="examples", py::arg("max_input_length")=1024, + py::arg("offset")=0, py::arg("with_tokens_score")=false, py::arg("source_tokenize_fn")=nullptr, py::arg("target_tokenize_fn")=nullptr, @@ -649,6 +655,7 @@ namespace ctranslate2 { batch_type: Whether :obj:`max_batch_size` and :obj:`read_batch_size` are the number of "examples" or "tokens". max_input_length: Truncate inputs after this many tokens (0 to disable). + offset: Ignore the first n tokens in target in score calculation. with_tokens_score: Include the token-level scores in the output file. source_tokenize_fn: Function to tokenize source lines. target_tokenize_fn: Function to tokenize target lines. diff --git a/src/models/language_model.cc b/src/models/language_model.cc index 466e42594..01ae7c8a4 100644 --- a/src/models/language_model.cc +++ b/src/models/language_model.cc @@ -122,7 +122,8 @@ namespace ctranslate2 { state, ids, vocabulary, - _model->preferred_size_multiple()); + _model->preferred_size_multiple(), + options.offset); } bool DecoderReplica::skip_scoring(const std::vector& tokens, diff --git a/src/models/sequence_to_sequence.cc b/src/models/sequence_to_sequence.cc index a7e64611f..ed4bb214b 100644 --- a/src/models/sequence_to_sequence.cc +++ b/src/models/sequence_to_sequence.cc @@ -256,7 +256,8 @@ namespace ctranslate2 { state, target_ids, _model->get_target_vocabulary(), - _model->preferred_size_multiple()); + _model->preferred_size_multiple(), + options.offset); } bool EncoderDecoderReplica::skip_scoring(const std::vector& source, diff --git a/src/scoring.cc b/src/scoring.cc index 1ccb64d91..23004b5aa 100644 --- a/src/scoring.cc +++ b/src/scoring.cc @@ -7,7 +7,8 @@ namespace ctranslate2 { layers::DecoderState& state, const std::vector>& sequences, const Vocabulary& vocabulary, - const dim_t preferred_size_multiple) { + const dim_t preferred_size_multiple, + const dim_t offset) { const dim_t batch_size = sequences.size(); const Device device = decoder.device(); @@ -57,7 +58,7 @@ namespace ctranslate2 { auto& result = results[b]; result.tokens.reserve(output_length); result.tokens_score.reserve(output_length); - for (dim_t t = 0; t < output_length; ++t) { + for (dim_t t = offset; t < output_length; ++t) { result.tokens.emplace_back(vocabulary.to_token(output_sequences[b][t])); result.tokens_score.emplace_back(scores.at({b, t})); }