From bb6b841200fca68dc7b08a0ddf7522c64cbc1143 Mon Sep 17 00:00:00 2001
From: ASenart <50336379+ASenart@users.noreply.github.com>
Date: Tue, 26 Dec 2023 10:09:40 +0100
Subject: [PATCH] Support of new option offset for ignoring token score of
 special tokens (#1592)

Co-authored-by: asenellart <antoine.senellart@systrangroup.com>
---
 include/ctranslate2/scoring.h      | 4 +++-
 python/cpp/translator.cc           | 9 ++++++++-
 src/models/language_model.cc       | 3 ++-
 src/models/sequence_to_sequence.cc | 3 ++-
 src/scoring.cc                     | 5 +++--
 5 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/include/ctranslate2/scoring.h b/include/ctranslate2/scoring.h
index d1afd35c8..d48e6395a 100644
--- a/include/ctranslate2/scoring.h
+++ b/include/ctranslate2/scoring.h
@@ -12,6 +12,7 @@ namespace ctranslate2 {
   struct ScoringOptions {
     // Truncate the inputs after this many tokens (set 0 to disable truncation).
     size_t max_input_length = 1024;
+    dim_t offset = 0;  // Skip this many leading target tokens when collecting token scores (0 keeps all).
   };
 
   struct ScoringResult {
@@ -38,6 +39,7 @@ namespace ctranslate2 {
                   layers::DecoderState& state,
                   const std::vector<std::vector<size_t>>& sequences,
                   const Vocabulary& vocabulary,
-                  const dim_t preferred_size_multiple = 1);
+                  const dim_t preferred_size_multiple = 1,
+                  const dim_t offset=0);
 
 }
diff --git a/python/cpp/translator.cc b/python/cpp/translator.cc
index d920469fe..b46d7ab9e 100644
--- a/python/cpp/translator.cc
+++ b/python/cpp/translator.cc
@@ -228,10 +228,12 @@ namespace ctranslate2 {
                   size_t max_batch_size,
                   const std::string& batch_type_str,
                   size_t max_input_length,
+                  dim_t offset,
                   bool asynchronous) {
         const auto batch_type = str_to_batch_type(batch_type_str);
         ScoringOptions options;
         options.max_input_length = max_input_length;
+        options.offset = offset;
 
         std::shared_lock lock(_mutex);
         assert_model_is_ready();
@@ -252,6 +254,7 @@ namespace ctranslate2 {
                                 size_t read_batch_size,
                                 const std::string& batch_type_str,
                                 size_t max_input_length,
+                                dim_t offset,
                                 bool with_tokens_score,
                                 const TokenizeFn& source_tokenize_fn,
                                 const TokenizeFn& target_tokenize_fn,
@@ -263,7 +266,7 @@ namespace ctranslate2 {
         const auto batch_type = str_to_batch_type(batch_type_str);
         ScoringOptions options;
         options.max_input_length = max_input_length;
-
+        options.offset = offset;
         std::shared_lock lock(_mutex);
         assert_model_is_ready();
 
@@ -592,6 +595,7 @@ namespace ctranslate2 {
              py::arg("max_batch_size")=0,
              py::arg("batch_type")="examples",
              py::arg("max_input_length")=1024,
+             py::arg("offset") = 0,
              py::arg("asynchronous")=false,
              py::call_guard<py::gil_scoped_release>(),
              R"pbdoc(
@@ -606,6 +610,7 @@ namespace ctranslate2 {
                      minimized.
                    batch_type: Whether :obj:`max_batch_size` is the number of "examples" or "tokens".
                    max_input_length: Truncate inputs after this many tokens (0 to disable).
+                   offset: Number of leading target tokens to ignore in the score calculation.
                    asynchronous: Run the scoring asynchronously.
 
                  Returns:
@@ -621,6 +626,7 @@ namespace ctranslate2 {
              py::arg("read_batch_size")=0,
              py::arg("batch_type")="examples",
              py::arg("max_input_length")=1024,
+             py::arg("offset")=0,
              py::arg("with_tokens_score")=false,
              py::arg("source_tokenize_fn")=nullptr,
              py::arg("target_tokenize_fn")=nullptr,
@@ -649,6 +655,7 @@ namespace ctranslate2 {
                    batch_type: Whether :obj:`max_batch_size` and :obj:`read_batch_size` are the
                      number of "examples" or "tokens".
                    max_input_length: Truncate inputs after this many tokens (0 to disable).
+                   offset: Number of leading target tokens to ignore in the score calculation.
                    with_tokens_score: Include the token-level scores in the output file.
                    source_tokenize_fn: Function to tokenize source lines.
                    target_tokenize_fn: Function to tokenize target lines.
diff --git a/src/models/language_model.cc b/src/models/language_model.cc
index 466e42594..01ae7c8a4 100644
--- a/src/models/language_model.cc
+++ b/src/models/language_model.cc
@@ -122,7 +122,8 @@ namespace ctranslate2 {
                              state,
                              ids,
                              vocabulary,
-                             _model->preferred_size_multiple());
+                             _model->preferred_size_multiple(),
+                             options.offset);
     }
 
     bool DecoderReplica::skip_scoring(const std::vector<std::string>& tokens,
diff --git a/src/models/sequence_to_sequence.cc b/src/models/sequence_to_sequence.cc
index a7e64611f..ed4bb214b 100644
--- a/src/models/sequence_to_sequence.cc
+++ b/src/models/sequence_to_sequence.cc
@@ -256,7 +256,8 @@ namespace ctranslate2 {
                              state,
                              target_ids,
                              _model->get_target_vocabulary(),
-                             _model->preferred_size_multiple());
+                             _model->preferred_size_multiple(),
+                             options.offset);
     }
 
     bool EncoderDecoderReplica::skip_scoring(const std::vector<std::string>& source,
diff --git a/src/scoring.cc b/src/scoring.cc
index 1ccb64d91..23004b5aa 100644
--- a/src/scoring.cc
+++ b/src/scoring.cc
@@ -7,7 +7,8 @@ namespace ctranslate2 {
                   layers::DecoderState& state,
                   const std::vector<std::vector<size_t>>& sequences,
                   const Vocabulary& vocabulary,
-                  const dim_t preferred_size_multiple) {
+                  const dim_t preferred_size_multiple,
+                  const dim_t offset) {
     const dim_t batch_size = sequences.size();
     const Device device = decoder.device();
 
@@ -57,7 +58,7 @@ namespace ctranslate2 {
       auto& result = results[b];
       result.tokens.reserve(output_length);
       result.tokens_score.reserve(output_length);
-      for (dim_t t = 0; t < output_length; ++t) {
+      for (dim_t t = (offset > 0 ? offset : 0); t < output_length; ++t) {  // clamp: a negative offset would index out of bounds
         result.tokens.emplace_back(vocabulary.to_token(output_sequences[b][t]));
         result.tokens_score.emplace_back(scores.at<float>({b, t}));
       }