Skip to content

Commit

Permalink
update for non-legacy tokenizer
Browse files — browse the repository at this point in the history
  • Loading branch information
merrymercy committed Sep 19, 2023
1 parent bb8a8aa commit 9f251b6
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 1 deletion.
10 changes: 9 additions & 1 deletion data/dummy_conversation.json
Original file line number Diff line number Diff line change
Expand Up @@ -4412,6 +4412,14 @@
{
"id": "identity_210",
"conversations": [
{
"from": "human",
"value": "What is up?"
},
{
"from": "gpt",
"value": "Hello! How can I help you today?"
},
{
"from": "human",
"value": "Are you davinci-003?"
Expand Down Expand Up @@ -10491,4 +10499,4 @@
}
]
}
]
]
3 changes: 3 additions & 0 deletions fastchat/train/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,9 @@ def preprocess(
# "-2" is hardcoded for the LLaMA tokenizer to make the offset correct.
instruction_len = len(tokenizer(parts[0]).input_ids) - 2

if i != 0 and not tokenizer.legacy:
instruction_len -= 1

# Ignore the user instructions
target[cur_len : cur_len + instruction_len] = IGNORE_TOKEN_ID
cur_len += turn_len
Expand Down

0 comments on commit 9f251b6

Please sign in to comment.