Merge pull request #114 from eosphoros-ai/zhanghy-sketchzh-patch-1

[Update] Update the progress bar during prediction, add stage3 full fine-tuning, and add SFT logs
eosphoros-ai · Nov 2, 2023 · 0fc03d9 · 0fc03d9
2 parents 840521c + 9d95c65
commit 0fc03d9
Show file tree

Hide file tree

Showing 4 changed files with 64 additions and 6 deletions.
diff --git a/dbgpt_hub/configs/ds_config.json → dbgpt_hub/configs/stage2.json b/dbgpt_hub/configs/ds_config.json → dbgpt_hub/configs/stage2.json
@@ -20,4 +20,4 @@
       "overlap_comm": false,
       "contiguous_gradients": true
     }
-  }
+  }
diff --git a/dbgpt_hub/configs/stage3.json b/dbgpt_hub/configs/stage3.json
@@ -0,0 +1,32 @@
+{
+  "fp16": {
+      "enabled": "auto",
+      "loss_scale": 0,
+      "loss_scale_window": 1000,
+      "initial_scale_power": 16,
+      "hysteresis": 2,
+      "min_loss_scale": 1
+  },
+  "zero_optimization": {
+      "stage": 3,
+      "offload_optimizer": {
+          "device": "cpu",
+          "pin_memory": true
+      },
+      "offload_param": {
+          "device": "cpu",
+          "pin_memory": true
+      },
+      "overlap_comm": true,
+      "contiguous_gradients": true,
+      "stage3_max_live_parameters" : 1e9,
+      "stage3_max_reuse_distance" : 1e9,
+      "stage3_prefetch_bucket_size" : 5e8,
+      "stage3_param_persistence_threshold" : 1e6,
+      "sub_group_size" : 1e12,
+      "stage3_gather_16bit_weights_on_model_save": true
+  },
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "gradient_accumulation_steps": "auto"
+}
diff --git a/dbgpt_hub/predict/predict.py b/dbgpt_hub/predict/predict.py
@@ -1,6 +1,7 @@
 import os
 import json
 import sys
+from tqdm import tqdm
 
 ROOT_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 sys.path.append(ROOT_PATH)
@@ -26,7 +27,7 @@ def inference(model: ChatModel, predict_data: List[Dict], **input_kwargs):
     res = []
     # test
     # for item in predict_data[:20]:
-    for item in predict_data:
+    for item in tqdm(predict_data, desc="Inference Progress", unit="item"):
         response, _ = model.chat(query=item["input"], history=[], **input_kwargs)
         res.append(response)
     return res

diff --git a/dbgpt_hub/scripts/train_sft.sh b/dbgpt_hub/scripts/train_sft.sh
@@ -1,5 +1,7 @@
-wandb offline
+wandb offline # Close wandb
 # v100 ,单卡
+current_date=$(date +"%Y%m%d_%H%M%S")
+train_log="dbgpt_hub/output/train_${current_date}.log"
 CUDA_VISIBLE_DEVICES=0 python dbgpt_hub/train/sft_train.py \
     --quantization_bit 4 \
     --model_name_or_path /home/model/Baichuan2-13B-Chat \
@@ -22,13 +24,13 @@ CUDA_VISIBLE_DEVICES=0 python dbgpt_hub/train/sft_train.py \
     --save_steps 10 \
     --learning_rate 5e-5 \
     --num_train_epochs 0.2 \
-    --plot_loss 
+    --plot_loss 2>&1 | tee ${train_log}
     # --bf16#v100不支持bf16
     # test  num_train_epochs set to 0.1
 
 # 多卡，deepspeed启动，A100
 # deepspeed --num_gpus 2  dbgpt_hub/train/sft_train.py \
-#     --deepspeed dbgpt_hub/configs/ds_config.json \
+#     --deepspeed dbgpt_hub/configs/stage2.json \
 #     --quantization_bit 4 \
 #     --model_name_or_path /home/model_files/Llama-2-13b-chat-hf \
 #     --do_train \
@@ -51,4 +53,27 @@ CUDA_VISIBLE_DEVICES=0 python dbgpt_hub/train/sft_train.py \
 #     --learning_rate 2e-4 \
 #     --num_train_epochs 0.1 \
 #     --plot_loss \
-#     --bf16
+#     --bf16 2>&1 | tee ${train_log}
+
+
+# 多卡，deepspeed，全量微调
+# deepspeed --include localhost:4,5,6,7  dbgpt_hub/train/sft_train.py \
+#     --dataset example_text2sql_train \
+#     --model_name_or_path CodeLlama-7b-Instruct-hf \
+#     --do_train \
+#     --finetuning_type full \
+#     --max_source_length 2048 \
+#     --max_target_length 512 \
+#     --template llama2 \
+#     --output_dir dbgpt_hub/output/adapter/code-llama-7b-2048_epoch4_full \
+#     --overwrite_cache \
+#     --overwrite_output_dir \
+#     --per_device_train_batch_size 4 \
+#     --gradient_accumulation_steps 16 \
+#     --lr_scheduler_type cosine_with_restarts \
+#     --logging_steps 50 \
+#     --learning_rate 2e-5 \
+#     --num_train_epochs 4 \
+#     --plot_loss \
+#     --bf16 True\
+#     --deepspeed dbgpt_hub/configs/stage3.json 2>&1 | tee ${train_log}
-Original file line number
+Diff line change
@@ Expand Up / @@ -20,4 +20,4 @@ @@
           "overlap_comm": false,
           "contiguous_gradients": true
         }
-      }
+      }