Commit
Merge pull request #34 from bofenghuang/next
Update to version 2.2
Showing 75 changed files with 6,531 additions and 71,140 deletions.
data/chat/converted_alpaca_data_cleaned_fr_52k.jsonl (51,708 changes: 0 additions & 51,708 deletions)
This file was deleted.
data/chat/converted_dolly_bactrian_fr_15k.jsonl (15,003 changes: 0 additions & 15,003 deletions)
This file was deleted.
Large diffs for several other changed files are not rendered by default; at least one more file was deleted.
Three of the changed files are empty.
@@ -0,0 +1,76 @@
#!/usr/bin/env bash
# Copyright 2023 Bofeng Huang

# Train chat models using QLoRA (int4)

export WANDB_PROJECT="llm-sft-chat"
export OMP_NUM_THREADS="1"
export TOKENIZERS_PARALLELISM="false"
export BITSANDBYTES_NOWELCOME="1"
# export CUDA_VISIBLE_DEVICES="0"

# Model
model_name_or_path=tiiuae/falcon-7b
# model_name_or_path=tiiuae/falcon-40b

# Dataset
# Customize dataset here
train_file=data/chat/oasst_20230412_fr_top1.jsonl
model_max_length=2048

# Outdir
run_name=falcon-7b-sft-chat-qlora
output_dir=outputs/$run_name

# Might need to adjust the batch size and other hyperparameters by yourself
per_device_train_batch_size=8
per_device_eval_batch_size=4
gradient_accumulation_steps=8

# Further optimization
# DeepSpeed Stage 2
# --deepspeed vigogne/configs/ds_config_zero2_no_offload.json \

torchrun \
    vigogne/cli/train_sft.py \
    --model_name_or_path $model_name_or_path \
    --tokenizer_use_fast false \
    --tokenizer_padding_side "right" \
    --add_special_tokens '{"bos_token":">>ABSTRACT<<","pad_token":"<|endoftext|>"}' \
    --train_file $train_file \
    --output_dir $output_dir \
    --overwrite_output_dir \
    --run_name $run_name \
    --processor_style "vigogne_chat_v3" \
    --model_max_length $model_max_length \
    --eval_split_ratio "0.01" \
    --preprocessing_num_workers "8" \
    --dataloader_num_workers "1" \
    --adapter "qlora" \
    --load_in_4bit \
    --optim "paged_adamw_32bit" \
    --lora_r "64" \
    --lora_alpha "16" \
    --lora_dropout "0.05" \
    --lora_target_all_linear_layers \
    --do_merge_lora \
    --num_train_epochs "3" \
    --per_device_train_batch_size $per_device_train_batch_size \
    --per_device_eval_batch_size $per_device_eval_batch_size \
    --gradient_accumulation_steps $gradient_accumulation_steps \
    --learning_rate "1e-4" \
    --warmup_ratio "0.03" \
    --lr_scheduler_type "cosine" \
    --weight_decay "0" \
    --fp16 \
    --gradient_checkpointing \
    --ddp_find_unused_parameters false \
    --log_level "info" \
    --logging_steps "1" \
    --logging_first_step \
    --save_strategy "steps" \
    --save_steps "100" \
    --save_total_limit "3" \
    --evaluation_strategy "steps" \
    --eval_steps "100" \
    --report_to "tensorboard" "wandb"
examples/train/train_sft_chat_lora_int8.sh → examples/train/llama2/train_sft_chat.sh (53 changes: 30 additions & 23 deletions)
@@ -1,58 +1,65 @@
#!/usr/bin/env bash
# Copyright 2023 Bofeng Huang

export WANDB_PROJECT="llm-sft-chat-fr"
# Train chat models using full fine-tuning + DeepSpeed Stage 3

export WANDB_PROJECT="llm-sft-chat"
export OMP_NUM_THREADS="1"
export TOKENIZERS_PARALLELISM="false"
export BITSANDBYTES_NOWELCOME="1"
export CUDA_VISIBLE_DEVICES="0,1,2,3"

train_file=/path/to/train/chat/file.jsonl
# Model
model_name_or_path=meta-llama/Llama-2-7b-hf

mode=chat
# Dataset
# Customize dataset here
train_file=data/chat/oasst_20230412_fr_top1.jsonl
model_max_length=2048

model_name_or_path=meta-llama/Llama-2-7b-hf
output_dir=outputs/llama-2-7b-sft-chat-lora-int8
# Outdir
run_name=llama-2-7b-sft-chat-fullfinetune
output_dir=outputs/$run_name

per_device_train_batch_size=8
# Might need to adjust the batch size and other hyperparameters by yourself
per_device_train_batch_size=4
per_device_eval_batch_size=2
gradient_accumulation_steps=4

# Might need to adjust the batch size and other hyperparameters by yourself
torchrun \
    --nproc_per_node 4 \
    vigogne/train/train_sft.py \
    vigogne/cli/train_sft.py \
    --deepspeed vigogne/configs/ds_config_zero3_no_offload.json \
    --model_name_or_path $model_name_or_path \
    --tokenizer_use_fast false \
    --tokenizer_padding_side "right" \
    --train_file $train_file \
    --output_dir $output_dir \
    --overwrite_output_dir \
    --mode $mode \
    --run_name $run_name \
    --processor_style "vigogne_chat_v3" \
    --model_max_length $model_max_length \
    --eval_split_ratio "0.01" \
    --preprocessing_num_workers "8" \
    --dataloader_num_workers "1" \
    --pack_into_block \
    --block_size "2048" \
    --load_in_8bit \
    --lora_r "64" \
    --lora_alpha "16" \
    --lora_dropout "0.05" \
    --target_modules "q_proj" "v_proj" "k_proj" "o_proj" "gate_proj" "up_proj" "down_proj" \
    --num_train_epochs "3" \
    --per_device_train_batch_size $per_device_train_batch_size \
    --per_device_eval_batch_size $per_device_eval_batch_size \
    --gradient_accumulation_steps $gradient_accumulation_steps \
    --num_train_epochs "3" \
    --learning_rate "1e-4" \
    --optim "adamw_bnb_8bit" \
    --learning_rate "2.5e-5" \
    --warmup_ratio "0.03" \
    --lr_scheduler_type "cosine" \
    --weight_decay "0" \
    --torch_compile \
    --fp16 \
    --gradient_checkpointing \
    --ddp_find_unused_parameters false \
    --log_level "info" \
    --logging_steps "10" \
    --logging_first_step true \
    --logging_steps "1" \
    --logging_first_step \
    --save_strategy "steps" \
    --save_steps "100" \
    --save_total_limit "3" \
    --report_to "tensorboard" "wandb" \
    --do_train
    --evaluation_strategy "steps" \
    --eval_steps "100" \
    --report_to "tensorboard" "wandb"
...les/train/train_sft_instruct_lora_int8.sh → examples/train/llama2/train_sft_chat_lora.sh (60 changes: 39 additions & 21 deletions)
@@ -1,58 +1,76 @@
#!/usr/bin/env bash
# Copyright 2023 Bofeng Huang

export WANDB_PROJECT="llm-sft-instruct-fr"
# Train chat models using LoRA

export WANDB_PROJECT="llm-sft-chat"
export OMP_NUM_THREADS="1"
export TOKENIZERS_PARALLELISM="false"
export BITSANDBYTES_NOWELCOME="1"
export CUDA_VISIBLE_DEVICES="0,1,2,3"
# export CUDA_VISIBLE_DEVICES="0"

train_file=/path/to/train/instruct/file.jsonl
# Model
model_name_or_path=meta-llama/Llama-2-7b-hf

mode=instruct
# Dataset
# Customize dataset here
train_file=data/chat/oasst_20230412_fr_top1.jsonl
model_max_length=2048

model_name_or_path=meta-llama/Llama-2-7b-hf
output_dir=outputs/llama-2-7b-sft-instruct-lora-int8
# Outdir
run_name=llama-2-7b-sft-chat-lora
output_dir=outputs/$run_name

# Might need to adjust the batch size and other hyperparameters by yourself
per_device_train_batch_size=8
gradient_accumulation_steps=4
per_device_eval_batch_size=4
gradient_accumulation_steps=8

# Further optimization
# DeepSpeed Stage 2
# --deepspeed vigogne/configs/ds_config_zero2_no_offload.json \
# LLM.int8()
# --load_in_8bit \
# 8bit optimizer
# --optim "adamw_bnb_8bit" \

# Might need to adjust the batch size and other hyperparameters by yourself
torchrun \
    --nproc_per_node 4 \
    vigogne/train/train_sft.py \
    vigogne/cli/train_sft.py \
    --model_name_or_path $model_name_or_path \
    --tokenizer_use_fast false \
    --tokenizer_padding_side "right" \
    --train_file $train_file \
    --output_dir $output_dir \
    --overwrite_output_dir \
    --mode $mode \
    --run_name $run_name \
    --processor_style "vigogne_chat_v3" \
    --model_max_length $model_max_length \
    --eval_split_ratio "0.01" \
    --preprocessing_num_workers "8" \
    --dataloader_num_workers "1" \
    --pack_into_block \
    --block_size "2048" \
    --load_in_8bit \
    --adapter "lora" \
    --lora_r "64" \
    --lora_alpha "16" \
    --lora_dropout "0.05" \
    --target_modules "q_proj" "v_proj" "k_proj" "o_proj" "gate_proj" "up_proj" "down_proj" \
    --lora_target_modules "q_proj" "v_proj" "k_proj" "o_proj" "gate_proj" "up_proj" "down_proj" \
    --do_merge_lora \
    --num_train_epochs "3" \
    --per_device_train_batch_size $per_device_train_batch_size \
    --per_device_eval_batch_size $per_device_eval_batch_size \
    --gradient_accumulation_steps $gradient_accumulation_steps \
    --num_train_epochs "3" \
    --learning_rate "1e-4" \
    --warmup_ratio "0.03" \
    --lr_scheduler_type "cosine" \
    --weight_decay "0" \
    --torch_compile \
    --fp16 \
    --gradient_checkpointing \
    --ddp_find_unused_parameters false \
    --log_level "info" \
    --logging_steps "10" \
    --logging_first_step true \
    --logging_steps "1" \
    --logging_first_step \
    --save_strategy "steps" \
    --save_steps "100" \
    --save_total_limit "3" \
    --report_to "tensorboard" "wandb" \
    --do_train
    --evaluation_strategy "steps" \
    --eval_steps "100" \
    --report_to "tensorboard" "wandb"