Skip to content

Commit

Permalink
Merge pull request #114 from eosphoros-ai/zhanghy-sketchzh-patch-1
Browse files Browse the repository at this point in the history
[Update] Add a progress bar during prediction, add a stage-3 full fine-tuning config, and add SFT training logs
  • Loading branch information
wangzaistone authored Nov 2, 2023
2 parents 840521c + 9d95c65 commit 0fc03d9
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,4 @@
"overlap_comm": false,
"contiguous_gradients": true
}
}
}
32 changes: 32 additions & 0 deletions dbgpt_hub/configs/stage3.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
{
  "fp16": {
    "enabled": "auto",
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "initial_scale_power": 16,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "bf16": {
    "enabled": "auto"
  },
  "zero_optimization": {
    "stage": 3,
    "offload_optimizer": {
      "device": "cpu",
      "pin_memory": true
    },
    "offload_param": {
      "device": "cpu",
      "pin_memory": true
    },
    "overlap_comm": true,
    "contiguous_gradients": true,
    "stage3_max_live_parameters": 1e9,
    "stage3_max_reuse_distance": 1e9,
    "stage3_prefetch_bucket_size": 5e8,
    "stage3_param_persistence_threshold": 1e6,
    "sub_group_size": 1e12,
    "stage3_gather_16bit_weights_on_model_save": true
  },
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "gradient_accumulation_steps": "auto"
}
3 changes: 2 additions & 1 deletion dbgpt_hub/predict/predict.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import json
import sys
from tqdm import tqdm

ROOT_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(ROOT_PATH)
Expand All @@ -26,7 +27,7 @@ def inference(model: ChatModel, predict_data: List[Dict], **input_kwargs):
res = []
# test
# for item in predict_data[:20]:
for item in predict_data:
for item in tqdm(predict_data, desc="Inference Progress", unit="item"):
response, _ = model.chat(query=item["input"], history=[], **input_kwargs)
res.append(response)
return res
Expand Down
33 changes: 29 additions & 4 deletions dbgpt_hub/scripts/train_sft.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
wandb offline
wandb offline # Disable wandb cloud syncing (runs are logged locally only)
# V100, single GPU
current_date=$(date +"%Y%m%d_%H%M%S")
train_log="dbgpt_hub/output/train_${current_date}.log"
CUDA_VISIBLE_DEVICES=0 python dbgpt_hub/train/sft_train.py \
--quantization_bit 4 \
--model_name_or_path /home/model/Baichuan2-13B-Chat \
Expand All @@ -22,13 +24,13 @@ CUDA_VISIBLE_DEVICES=0 python dbgpt_hub/train/sft_train.py \
--save_steps 10 \
--learning_rate 5e-5 \
--num_train_epochs 0.2 \
--plot_loss
--plot_loss 2>&1 | tee ${train_log}
# --bf16  # V100 does not support bf16
# For a quick test run, set num_train_epochs to a small value such as 0.1

# Multi-GPU, launched with DeepSpeed, A100
# deepspeed --num_gpus 2 dbgpt_hub/train/sft_train.py \
# --deepspeed dbgpt_hub/configs/ds_config.json \
# --deepspeed dbgpt_hub/configs/stage2.json \
# --quantization_bit 4 \
# --model_name_or_path /home/model_files/Llama-2-13b-chat-hf \
# --do_train \
Expand All @@ -51,4 +53,27 @@ CUDA_VISIBLE_DEVICES=0 python dbgpt_hub/train/sft_train.py \
# --learning_rate 2e-4 \
# --num_train_epochs 0.1 \
# --plot_loss \
# --bf16
# --bf16 2>&1 | tee ${train_log}


# Multi-GPU, DeepSpeed, full-parameter fine-tuning
# deepspeed --include localhost:4,5,6,7 dbgpt_hub/train/sft_train.py \
# --dataset example_text2sql_train \
# --model_name_or_path CodeLlama-7b-Instruct-hf \
# --do_train \
# --finetuning_type full \
# --max_source_length 2048 \
# --max_target_length 512 \
# --template llama2 \
# --output_dir dbgpt_hub/output/adapter/code-llama-7b-2048_epoch4_full \
# --overwrite_cache \
# --overwrite_output_dir \
# --per_device_train_batch_size 4 \
# --gradient_accumulation_steps 16 \
# --lr_scheduler_type cosine_with_restarts \
# --logging_steps 50 \
# --learning_rate 2e-5 \
# --num_train_epochs 4 \
# --plot_loss \
# --bf16 True \
# --deepspeed dbgpt_hub/configs/stage3.json 2>&1 | tee ${train_log}

0 comments on commit 0fc03d9

Please sign in to comment.