From d60822a9c3b49582184e38e8b508f515bb878b4f Mon Sep 17 00:00:00 2001 From: luchun <71970539+zhanghy-sketchzh@users.noreply.github.com> Date: Thu, 2 Nov 2023 10:15:07 +0800 Subject: [PATCH 1/7] Update predict.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 增加tqdm显示 --- dbgpt_hub/predict/predict.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dbgpt_hub/predict/predict.py b/dbgpt_hub/predict/predict.py index c382980..e4653b7 100644 --- a/dbgpt_hub/predict/predict.py +++ b/dbgpt_hub/predict/predict.py @@ -1,6 +1,7 @@ import os import json import sys +from tqdm import tqdm ROOT_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) sys.path.append(ROOT_PATH) @@ -26,7 +27,7 @@ def inference(model: ChatModel, predict_data: List[Dict], **input_kwargs): res = [] # test # for item in predict_data[:20]: - for item in predict_data: + for item in tqdm(predict_data, desc="Inference Progress", unit="item"): response, _ = model.chat(query=item["input"], history=[], **input_kwargs) res.append(response) return res From 61caf9443ff1461aef0cb916b2f358e01b74a109 Mon Sep 17 00:00:00 2001 From: luchun <71970539+zhanghy-sketchzh@users.noreply.github.com> Date: Thu, 2 Nov 2023 10:27:53 +0800 Subject: [PATCH 2/7] Update train_sft.sh --- dbgpt_hub/scripts/train_sft.sh | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/dbgpt_hub/scripts/train_sft.sh b/dbgpt_hub/scripts/train_sft.sh index cbe2187..7ff9230 100644 --- a/dbgpt_hub/scripts/train_sft.sh +++ b/dbgpt_hub/scripts/train_sft.sh @@ -1,5 +1,7 @@ -wandb offline +wandb offline # Close wandb # v100 ,单卡 +current_date=$(date +"%Y%m%d_%H%M%S") +train_log="outputs/train_${current_date}.log" CUDA_VISIBLE_DEVICES=0 python dbgpt_hub/train/sft_train.py \ --quantization_bit 4 \ --model_name_or_path /home/model/Baichuan2-13B-Chat \ @@ -22,7 +24,7 @@ CUDA_VISIBLE_DEVICES=0 python dbgpt_hub/train/sft_train.py \ --save_steps 10 \ --learning_rate 5e-5 \ --num_train_epochs 0.2 \ - --plot_loss + --plot_loss 2>&1 | tee ${train_log} # --bf16#v100不支持bf16 # test num_train_epochs set to 0.1 @@ -51,4 +53,27 @@ CUDA_VISIBLE_DEVICES=0 python dbgpt_hub/train/sft_train.py \ # --learning_rate 2e-4 \ # --num_train_epochs 0.1 \ # --plot_loss \ -# --bf16 \ No newline at end of file +# --bf16 2>&1 | tee ${train_log} + + +# 多卡,deepseed,全量微调 +# deepspeed --include localhost:4,5,6,7 dbgpt_hub/train/sft_train.py \ +# --dataset example_text2sql_train \ +# --model_name_or_path CodeLlama-7b-Instruct-hf \ +# --do_train \ +# --finetuning_type full \ +# --max_source_length 2048 \ +# --max_target_length 512 \ +# --template llama2 \ +# --output_dir dbgpt_hub/output/adapter/code_llama-7b-2048_epoch4_full \ +# --overwrite_cache \ +# --overwrite_output_dir \ +# --per_device_train_batch_size 4 \ +# --gradient_accumulation_steps 16 \ +# --lr_scheduler_type cosine_with_restarts \ +# --logging_steps 50 \ +# --learning_rate 2e-5 \ +# --num_train_epochs 4 \ +# --plot_loss \ +# --bf16 True\ +# --deepspeed dbgpt_hub/configs/stage3.json 2>&1 | tee ${train_log} From b42fd01045afafc2fd66613bf79ae1f82c860689 Mon Sep 17 00:00:00 2001 From: luchun <71970539+zhanghy-sketchzh@users.noreply.github.com> Date: Thu, 2 Nov 2023 10:28:50 +0800 Subject: [PATCH 3/7] Update train_sft.sh --- dbgpt_hub/scripts/train_sft.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dbgpt_hub/scripts/train_sft.sh b/dbgpt_hub/scripts/train_sft.sh index 7ff9230..24f36c7 100644 --- a/dbgpt_hub/scripts/train_sft.sh +++ b/dbgpt_hub/scripts/train_sft.sh @@ -1,7 +1,7 @@ wandb offline # Close wandb # v100 ,单卡 current_date=$(date +"%Y%m%d_%H%M%S") -train_log="outputs/train_${current_date}.log" +train_log="dbgpt_hub/output/train_${current_date}.log" CUDA_VISIBLE_DEVICES=0 python dbgpt_hub/train/sft_train.py \ --quantization_bit 4 \ --model_name_or_path /home/model/Baichuan2-13B-Chat \ @@ -65,7 +65,7 @@ CUDA_VISIBLE_DEVICES=0 python dbgpt_hub/train/sft_train.py \ # --max_source_length 2048 \ # --max_target_length 512 \ # --template llama2 \ -# --output_dir dbgpt_hub/output/adapter/code_llama-7b-2048_epoch4_full \ +# --output_dir dbgpt_hub/output/adapter/code-llama-7b-2048_epoch4_full \ # --overwrite_cache \ # --overwrite_output_dir \ # --per_device_train_batch_size 4 \ From 940a31c905d7d7afb8f80b077c2155e0ab131412 Mon Sep 17 00:00:00 2001 From: luchun <71970539+zhanghy-sketchzh@users.noreply.github.com> Date: Thu, 2 Nov 2023 10:30:37 +0800 Subject: [PATCH 4/7] Create satge3.json --- dbgpt_hub/configs/satge3.json | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 dbgpt_hub/configs/satge3.json diff --git a/dbgpt_hub/configs/satge3.json b/dbgpt_hub/configs/satge3.json new file mode 100644 index 0000000..ea17347 --- /dev/null +++ b/dbgpt_hub/configs/satge3.json @@ -0,0 +1,32 @@ +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "stage3_max_live_parameters" : 1e9, + "stage3_max_reuse_distance" : 1e9, + "stage3_prefetch_bucket_size" : 5e8, + "stage3_param_persistence_threshold" : 1e6, + "sub_group_size" : 1e12, + "stage3_gather_16bit_weights_on_model_save": true + }, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "gradient_accumulation_steps": "auto" +} From fff1bf655f0ef2ed936ec283bfcc59af1a00681d Mon Sep 17 00:00:00 2001 From: luchun <71970539+zhanghy-sketchzh@users.noreply.github.com> Date: Thu, 2 Nov 2023 10:31:20 +0800 Subject: [PATCH 5/7] Rename ds_config.json to stage2.json --- dbgpt_hub/configs/{ds_config.json => stage2.json} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename dbgpt_hub/configs/{ds_config.json => stage2.json} (99%) diff --git a/dbgpt_hub/configs/ds_config.json b/dbgpt_hub/configs/stage2.json similarity index 99% rename from dbgpt_hub/configs/ds_config.json rename to dbgpt_hub/configs/stage2.json index 04158ad..e96d4d9 100644 --- a/dbgpt_hub/configs/ds_config.json +++ b/dbgpt_hub/configs/stage2.json @@ -20,4 +20,4 @@ "overlap_comm": false, "contiguous_gradients": true } - } \ No newline at end of file + } From 3c6edc0d8ce9c33b8052374c9e5e850692e174f6 Mon Sep 17 00:00:00 2001 From: luchun <71970539+zhanghy-sketchzh@users.noreply.github.com> Date: Thu, 2 Nov 2023 10:31:35 +0800 Subject: [PATCH 6/7] Rename satge3.json to stage3.json --- dbgpt_hub/configs/{satge3.json => stage3.json} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename dbgpt_hub/configs/{satge3.json => stage3.json} (100%) diff --git a/dbgpt_hub/configs/satge3.json b/dbgpt_hub/configs/stage3.json similarity index 100% rename from dbgpt_hub/configs/satge3.json rename to dbgpt_hub/configs/stage3.json From 9d95c65445444ad8e94f01f8a8d639ddd78c3cd4 Mon Sep 17 00:00:00 2001 From: luchun <71970539+zhanghy-sketchzh@users.noreply.github.com> Date: Thu, 2 Nov 2023 10:33:49 +0800 Subject: [PATCH 7/7] Update train_sft.sh --- dbgpt_hub/scripts/train_sft.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbgpt_hub/scripts/train_sft.sh b/dbgpt_hub/scripts/train_sft.sh index 24f36c7..ca1112e 100644 --- a/dbgpt_hub/scripts/train_sft.sh +++ b/dbgpt_hub/scripts/train_sft.sh @@ -30,7 +30,7 @@ CUDA_VISIBLE_DEVICES=0 python dbgpt_hub/train/sft_train.py \ # 多卡,deepseed启动,A100 # deepspeed --num_gpus 2 dbgpt_hub/train/sft_train.py \ -# --deepspeed dbgpt_hub/configs/ds_config.json \ +# --deepspeed dbgpt_hub/configs/stage2.json \ # --quantization_bit 4 \ # --model_name_or_path /home/model_files/Llama-2-13b-chat-hf \ # --do_train \