Skip to content

Commit

Permalink
update for non-legacy tokenizer
Browse files — browse the repository at this point in the history
  • Loading branch information
merrymercy committed Sep 19, 2023
1 parent bb8a8aa commit 9f251b6
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 1 deletion.
10 changes: 9 additions & 1 deletion data/dummy_conversation.json
Original file line number Diff line number Diff line change
Expand Up @@ -4412,6 +4412,14 @@
{
"id": "identity_210",
"conversations": [
{
"from": "human",
"value": "What is up?"
},
{
"from": "gpt",
"value": "Hello! How can I help you today?"
},
{
"from": "human",
"value": "Are you davinci-003?"
Expand Down Expand Up @@ -10491,4 +10499,4 @@
}
]
}
]
]
3 changes: 3 additions & 0 deletions fastchat/train/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,9 @@ def preprocess(
# "-2" is hardcoded for the LLaMA tokenizer to make the offset correct.
instruction_len = len(tokenizer(parts[0]).input_ids) - 2

if i != 0 and not tokenizer.legacy:
instruction_len -= 1

# Ignore the user instructions
target[cur_len : cur_len + instruction_len] = IGNORE_TOKEN_ID
cur_len += turn_len
Expand Down

0 comments on commit 9f251b6

Please sign in to comment.