From f2d803967c756310e9615d0c3b2bc3143ad91a75 Mon Sep 17 00:00:00 2001 From: Thariq Nugrohotomo <87131717+thariq-nugrohotomo@users.noreply.github.com> Date: Mon, 30 Oct 2023 16:33:18 +0700 Subject: [PATCH] Decoder start token should be `eos` instead of `cls`. When I pass `cls` or `bos` as the initial decoder token, the output (first decoded token) is rarely correct. But once I use `eos`, the output is correct, or at least similar to the output returned by `model.generate()`. In the official code from Microsoft, they fall back to `eos` if the token is not specified: https://github.com/microsoft/unilm/blob/6f60612e7cc86a2a1ae85c47231507a587ab4e01/trocr/generator.py#L84 --- ...OCR_on_IAM_Handwriting_Database_using_native_PyTorch.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/TrOCR/Fine_tune_TrOCR_on_IAM_Handwriting_Database_using_native_PyTorch.ipynb b/TrOCR/Fine_tune_TrOCR_on_IAM_Handwriting_Database_using_native_PyTorch.ipynb index 58779c88..24f13343 100644 --- a/TrOCR/Fine_tune_TrOCR_on_IAM_Handwriting_Database_using_native_PyTorch.ipynb +++ b/TrOCR/Fine_tune_TrOCR_on_IAM_Handwriting_Database_using_native_PyTorch.ipynb @@ -1736,7 +1736,7 @@ }, "source": [ "# set special tokens used for creating the decoder_input_ids from the labels\n", - "model.config.decoder_start_token_id = processor.tokenizer.cls_token_id\n", + "model.config.decoder_start_token_id = processor.tokenizer.eos_token_id\n", "model.config.pad_token_id = processor.tokenizer.pad_token_id\n", "# make sure vocab size is set correctly\n", "model.config.vocab_size = model.config.decoder.vocab_size\n", @@ -1938,4 +1938,4 @@ ] } ] -} \ No newline at end of file +}