From f2d803967c756310e9615d0c3b2bc3143ad91a75 Mon Sep 17 00:00:00 2001 From: Thariq Nugrohotomo <87131717+thariq-nugrohotomo@users.noreply.github.com> Date: Mon, 30 Oct 2023 16:33:18 +0700 Subject: [PATCH] Decoder start token should be `eos` instead of `cls`. When I pass `cls` or `bos` as the initial decoder token, the output (first decoded token) is rarely correct. But once I use `eos`, the output is correct, or at least similar to the output returned by `model.generate()`. In the official code from Microsoft, they fall back to `eos` if the token is not specified: https://github.com/microsoft/unilm/blob/6f60612e7cc86a2a1ae85c47231507a587ab4e01/trocr/generator.py#L84 --- ...OCR_on_IAM_Handwriting_Database_using_native_PyTorch.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/TrOCR/Fine_tune_TrOCR_on_IAM_Handwriting_Database_using_native_PyTorch.ipynb b/TrOCR/Fine_tune_TrOCR_on_IAM_Handwriting_Database_using_native_PyTorch.ipynb index 58779c88..24f13343 100644 --- a/TrOCR/Fine_tune_TrOCR_on_IAM_Handwriting_Database_using_native_PyTorch.ipynb +++ b/TrOCR/Fine_tune_TrOCR_on_IAM_Handwriting_Database_using_native_PyTorch.ipynb @@ -1736,7 +1736,7 @@ }, "source": [ "# set special tokens used for creating the decoder_input_ids from the labels\n", - "model.config.decoder_start_token_id = processor.tokenizer.cls_token_id\n", + "model.config.decoder_start_token_id = processor.tokenizer.eos_token_id\n", "model.config.pad_token_id = processor.tokenizer.pad_token_id\n", "# make sure vocab size is set correctly\n", "model.config.vocab_size = model.config.decoder.vocab_size\n", @@ -1938,4 +1938,4 @@ ] } ] -} \ No newline at end of file +}