diff --git a/dbgpt_hub/configs/ds_config.json b/dbgpt_hub/configs/stage2.json
similarity index 99%
rename from dbgpt_hub/configs/ds_config.json
rename to dbgpt_hub/configs/stage2.json
index 04158ad..e96d4d9 100644
--- a/dbgpt_hub/configs/ds_config.json
+++ b/dbgpt_hub/configs/stage2.json
@@ -20,4 +20,4 @@
         "overlap_comm": false,
         "contiguous_gradients": true
     }
-}
\ No newline at end of file
+}
diff --git a/dbgpt_hub/configs/stage3.json b/dbgpt_hub/configs/stage3.json
new file mode 100644
index 0000000..ea17347
--- /dev/null
+++ b/dbgpt_hub/configs/stage3.json
@@ -0,0 +1,32 @@
+{
+    "fp16": {
+        "enabled": "auto",
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+    "zero_optimization": {
+        "stage": 3,
+        "offload_optimizer": {
+            "device": "cpu",
+            "pin_memory": true
+        },
+        "offload_param": {
+            "device": "cpu",
+            "pin_memory": true
+        },
+        "overlap_comm": true,
+        "contiguous_gradients": true,
+        "stage3_max_live_parameters": 1e9,
+        "stage3_max_reuse_distance": 1e9,
+        "stage3_prefetch_bucket_size": 5e8,
+        "stage3_param_persistence_threshold": 1e6,
+        "sub_group_size": 1e12,
+        "stage3_gather_16bit_weights_on_model_save": true
+    },
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "gradient_accumulation_steps": "auto"
+}
diff --git a/dbgpt_hub/predict/predict.py b/dbgpt_hub/predict/predict.py
index c382980..e4653b7 100644
--- a/dbgpt_hub/predict/predict.py
+++ b/dbgpt_hub/predict/predict.py
@@ -1,6 +1,7 @@
 import os
 import json
 import sys
+from tqdm import tqdm

 ROOT_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 sys.path.append(ROOT_PATH)
@@ -26,7 +27,7 @@ def inference(model: ChatModel, predict_data: List[Dict], **input_kwargs):
     res = []
     # test
     # for item in predict_data[:20]:
-    for item in predict_data:
+    for item in tqdm(predict_data, desc="Inference Progress", unit="item"):
         response, _ = model.chat(query=item["input"], history=[], **input_kwargs)
         res.append(response)
     return res
diff --git a/dbgpt_hub/scripts/train_sft.sh b/dbgpt_hub/scripts/train_sft.sh
index cbe2187..ca1112e 100644
--- a/dbgpt_hub/scripts/train_sft.sh
+++ b/dbgpt_hub/scripts/train_sft.sh
@@ -1,5 +1,7 @@
-wandb offline
+wandb offline # disable wandb logging
 # V100, single GPU
+current_date=$(date +"%Y%m%d_%H%M%S")
+train_log="dbgpt_hub/output/train_${current_date}.log"
 CUDA_VISIBLE_DEVICES=0 python dbgpt_hub/train/sft_train.py \
     --quantization_bit 4 \
     --model_name_or_path /home/model/Baichuan2-13B-Chat \
@@ -22,13 +24,13 @@ CUDA_VISIBLE_DEVICES=0 python dbgpt_hub/train/sft_train.py \
     --save_steps 10 \
     --learning_rate 5e-5 \
     --num_train_epochs 0.2 \
-    --plot_loss
+    --plot_loss 2>&1 | tee ${train_log}
 # --bf16  # bf16 is not supported on V100
 # test: num_train_epochs set to 0.1

 # Multi-GPU, launched with DeepSpeed, A100
 # deepspeed --num_gpus 2 dbgpt_hub/train/sft_train.py \
-# --deepspeed dbgpt_hub/configs/ds_config.json \
+# --deepspeed dbgpt_hub/configs/stage2.json \
 # --quantization_bit 4 \
 # --model_name_or_path /home/model_files/Llama-2-13b-chat-hf \
 # --do_train \
@@ -51,4 +53,27 @@ CUDA_VISIBLE_DEVICES=0 python dbgpt_hub/train/sft_train.py \
 # --learning_rate 2e-4 \
 # --num_train_epochs 0.1 \
 # --plot_loss \
-# --bf16
\ No newline at end of file
+# --bf16 2>&1 | tee ${train_log}
+
+
+# Multi-GPU, DeepSpeed, full fine-tuning
+# deepspeed --include localhost:4,5,6,7 dbgpt_hub/train/sft_train.py \
+# --dataset example_text2sql_train \
+# --model_name_or_path CodeLlama-7b-Instruct-hf \
+# --do_train \
+# --finetuning_type full \
+# --max_source_length 2048 \
+# --max_target_length 512 \
+# --template llama2 \
+# --output_dir dbgpt_hub/output/adapter/code-llama-7b-2048_epoch4_full \
+# --overwrite_cache \
+# --overwrite_output_dir \
+# --per_device_train_batch_size 4 \
+# --gradient_accumulation_steps 16 \
+# --lr_scheduler_type cosine_with_restarts \
+# --logging_steps 50 \
+# --learning_rate 2e-5 \
+# --num_train_epochs 4 \
+# --plot_loss \
+# --bf16 True \
+# --deepspeed dbgpt_hub/configs/stage3.json 2>&1 | tee ${train_log}
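
To verify the new stage3.json before a training run, a quick parse-and-assert check can be used (a suggested sanity script, assuming the repo root as the working directory; it is not part of the patch itself):

import json

# Load the new ZeRO stage-3 config and check the fields this patch introduces.
with open("dbgpt_hub/configs/stage3.json") as f:
    cfg = json.load(f)

zero = cfg["zero_optimization"]
assert zero["stage"] == 3
assert zero["offload_optimizer"]["device"] == "cpu"
assert zero["offload_param"]["device"] == "cpu"
print("stage3.json parsed OK; top-level keys:", sorted(cfg))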
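
Note on the predict.py change: tqdm only wraps the existing iterable, so the loop body is untouched. Below is a minimal self-contained sketch of the same progress-bar pattern; DummyModel is a hypothetical stand-in for ChatModel, and only tqdm needs to be installed:

from typing import Dict, List, Tuple

from tqdm import tqdm


class DummyModel:
    # Hypothetical stand-in for ChatModel; only the chat() call shape matters here.
    def chat(self, query: str, history: list) -> Tuple[str, list]:
        return f"echo: {query}", history


def inference(model: DummyModel, predict_data: List[Dict]) -> List[str]:
    res = []
    # tqdm renders a live progress bar; desc and unit only affect the display.
    for item in tqdm(predict_data, desc="Inference Progress", unit="item"):
        response, _ = model.chat(query=item["input"], history=[])
        res.append(response)
    return res


if __name__ == "__main__":
    print(inference(DummyModel(), [{"input": "q1"}, {"input": "q2"}]))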