From 9f251b64f69f9fa0ce92513a90c6adc228bfd66c Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Tue, 19 Sep 2023 11:47:17 +0000
Subject: [PATCH] update for non-legacy tokenizer

---
 data/dummy_conversation.json | 10 +++++++++-
 fastchat/train/train.py      |  3 +++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/data/dummy_conversation.json b/data/dummy_conversation.json
index 2e3a97843..2bf582ac6 100644
--- a/data/dummy_conversation.json
+++ b/data/dummy_conversation.json
@@ -4412,6 +4412,14 @@
   {
     "id": "identity_210",
     "conversations": [
+      {
+        "from": "human",
+        "value": "What is up?"
+      },
+      {
+        "from": "gpt",
+        "value": "Hello! How can I help you today?"
+      },
       {
         "from": "human",
         "value": "Are you davinci-003?"
@@ -10491,4 +10499,4 @@
       }
     ]
   }
-]
\ No newline at end of file
+]
diff --git a/fastchat/train/train.py b/fastchat/train/train.py
index 965eb0beb..3491fcfe0 100644
--- a/fastchat/train/train.py
+++ b/fastchat/train/train.py
@@ -133,6 +133,9 @@ def preprocess(
             # "-2" is hardcoded for the LLaMA tokenizer to make the offset correct.
             instruction_len = len(tokenizer(parts[0]).input_ids) - 2
 
+            if i != 0 and not tokenizer.legacy:
+                instruction_len -= 1
+
             # Ignore the user instructions
             target[cur_len : cur_len + instruction_len] = IGNORE_TOKEN_ID
             cur_len += turn_len
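
A minimal standalone sketch of the offset logic the train.py hunk applies, assuming the slow Hugging Face LLaMA tokenizer (which exposes the `legacy` flag referenced above); the model path, separator string, and helper name are illustrative and not part of the patch:

# Sketch of the instruction-masking offset mirrored from the patched preprocess().
# Assumptions: transformers slow LLaMA tokenizer; the model path is illustrative.
from transformers import AutoTokenizer

sep = " ASSISTANT: "  # Vicuna-style separator between user and assistant text

tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-hf",  # any LLaMA-family tokenizer works the same way
    use_fast=False,
    legacy=False,
)

def instruction_len_for_turn(turn: str, turn_index: int) -> int:
    """Return how many target tokens of this turn belong to the user instruction."""
    parts = turn.split(sep)
    parts[0] += sep
    # "-2" is hardcoded for the LLaMA tokenizer to make the offset correct
    # (same comment as in fastchat/train/train.py).
    length = len(tokenizer(parts[0]).input_ids) - 2
    # With legacy=False the tokenizer does not insert an extra space token after
    # special tokens, so turns that follow "</s>" in the full conversation come
    # out one token shorter; the patch compensates with an extra "-1".
    if turn_index != 0 and not tokenizer.legacy:
        length -= 1
    return length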