From 9f251b64f69f9fa0ce92513a90c6adc228bfd66c Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Tue, 19 Sep 2023 11:47:17 +0000
Subject: [PATCH] update for non-legacy tokenizer

---
 data/dummy_conversation.json | 10 +++++++++-
 fastchat/train/train.py      |  3 +++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/data/dummy_conversation.json b/data/dummy_conversation.json
index 2e3a97843..2bf582ac6 100644
--- a/data/dummy_conversation.json
+++ b/data/dummy_conversation.json
@@ -4412,6 +4412,14 @@
   {
     "id": "identity_210",
     "conversations": [
+      {
+        "from": "human",
+        "value": "What is up?"
+      },
+      {
+        "from": "gpt",
+        "value": "Hello! How can I help you today?"
+      },
       {
         "from": "human",
         "value": "Are you davinci-003?"
@@ -10491,4 +10499,4 @@
       }
     ]
   }
-]
\ No newline at end of file
+]
diff --git a/fastchat/train/train.py b/fastchat/train/train.py
index 965eb0beb..3491fcfe0 100644
--- a/fastchat/train/train.py
+++ b/fastchat/train/train.py
@@ -133,6 +133,9 @@ def preprocess(
             # "-2" is hardcoded for the LLaMA tokenizer to make the offset correct.
             instruction_len = len(tokenizer(parts[0]).input_ids) - 2
 
+            if i != 0 and not tokenizer.legacy:
+                instruction_len -= 1
+
             # Ignore the user instructions
             target[cur_len : cur_len + instruction_len] = IGNORE_TOKEN_ID
             cur_len += turn_len
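
A minimal standalone sketch of the offset logic the train.py hunk applies, assuming the slow Hugging Face LLaMA tokenizer (which exposes the `legacy` flag referenced above); the model path, separator string, and helper name are illustrative and not part of the patch:

# Sketch of the instruction-masking offset mirrored from the patched preprocess().
# Assumptions: transformers slow LLaMA tokenizer; the model path is illustrative.
from transformers import AutoTokenizer

sep = " ASSISTANT: "  # Vicuna-style separator between user and assistant text

tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-hf",  # any LLaMA-family tokenizer works the same way
    use_fast=False,
    legacy=False,
)

def instruction_len_for_turn(turn: str, turn_index: int) -> int:
    """Return how many target tokens of this turn belong to the user instruction."""
    parts = turn.split(sep)
    parts[0] += sep
    # "-2" is hardcoded for the LLaMA tokenizer to make the offset correct
    # (same comment as in fastchat/train/train.py).
    length = len(tokenizer(parts[0]).input_ids) - 2
    # With legacy=False the tokenizer does not insert an extra space token after
    # special tokens, so turns that follow "</s>" in the full conversation come
    # out one token shorter; the patch compensates with an extra "-1".
    if turn_index != 0 and not tokenizer.legacy:
        length -= 1
    return length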