From 4ec17a7cdf07db4ec4dd6b6e01ba9b88d61b4f9f Mon Sep 17 00:00:00 2001 From: zhurunhua <1281592874@qq.com> Date: Sun, 21 Jul 2024 19:46:01 +0800 Subject: [PATCH] [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931) * cannot access local variable 'default_conversation' where it is not associated with a value set default value for 'default_conversation' * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- applications/Colossal-LLaMA/prepare_sft_dataset.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/applications/Colossal-LLaMA/prepare_sft_dataset.py b/applications/Colossal-LLaMA/prepare_sft_dataset.py index a857d6c0c696..fe57907601f6 100644 --- a/applications/Colossal-LLaMA/prepare_sft_dataset.py +++ b/applications/Colossal-LLaMA/prepare_sft_dataset.py @@ -10,7 +10,7 @@ import os from multiprocessing import cpu_count -from colossal_llama.dataset.conversation import LLaMA2_Conv +from colossal_llama.dataset.conversation import LLaMA2_Conv, LLaMA3_Conv from colossal_llama.dataset.spliced_and_tokenized_dataset import supervised_tokenize_sft from datasets import dataset_dict, load_dataset from transformers import AddedToken, AutoTokenizer @@ -75,6 +75,8 @@ def main(): # Prepare to the tokenizer. tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_dir) + default_conversation = LLaMA3_Conv + # Fix </s> split issue: https://github.com/huggingface/transformers/issues/23833 if args.llama_version == 2: tokenizer.add_tokens(AddedToken("</s>", normalized=False, special=True), special_tokens=True)