Error message:
[2025-02-24 15:47:13] [TM][WARNING] [forward] Request failed for 1740383233880208906, code 6
[2025-02-24 15:47:14] [TM][WARNING] [RejectInvalidRequests] Skipping invalid infer request for id 1740383234397829963, code = 6
[2025-02-24 15:47:14] [TM][WARNING] [forward] Request failed for 1740383234397829963, code 6
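For context, these TurboMind warnings carry a request error code; in lmdeploy, code 6 appears to correspond to kTooLong, i.e. the request (history + prompt) exceeds the configured session length. The script below sets --lmdeploy_session_len 2048 while allowing --max_length 4096, so any prompt longer than the session length would be rejected exactly like this. A minimal sketch of a consistent setting, assuming the session must cover prompt plus completion (the shell variables here are illustrative, not swift flags):

# Illustrative only: derive a session length that covers prompt + completion.
MAX_LENGTH=4096               # matches --max_length below
MAX_COMPLETION_LENGTH=512     # matches --max_completion_length below
SESSION_LEN=$((MAX_LENGTH + MAX_COMPLETION_LENGTH))
echo "--lmdeploy_session_len should be >= $SESSION_LEN"  # here: 4608, vs. the current 2048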
Script:
#!/bin/bash
export NNODES=4
export MASTER_PORT=${MASTER_PORT:-34229}
export GPUS=${GPUS:-8}
export NPROC_PER_NODE_NODE0=6
export NPROC_PER_NODE_OTHERS=6
export NCCL_IB_DISABLE=0
export NCCL_P2P_DISABLE=0
export NCCL_SHM_DISABLE=0
export TF_CPP_MIN_LOG_LEVEL=3
export LAUNCHER=pytorch
# --- Determine NODE_RANK (Priority: MLP_ROLE_INDEX > NODE_RANK) ---
if [ -n "$MLP_ROLE_INDEX" ]; then
    export NODE_RANK=$MLP_ROLE_INDEX
    echo "Using MLP_ROLE_INDEX for NODE_RANK: $NODE_RANK"
elif [ -n "$NODE_RANK" ]; then
    echo "Using NODE_RANK: $NODE_RANK"
else
    echo "Error: NODE_RANK or MLP_ROLE_INDEX environment variable must be set (e.g., export NODE_RANK=0)."
    exit 1
fi

# --- Configure NPROC_PER_NODE based on NODE_RANK ---
if [ "$NODE_RANK" -eq 0 ]; then
    export NPROC_PER_NODE=$NPROC_PER_NODE_NODE0
    echo "NPROC_PER_NODE set to: $NPROC_PER_NODE (for Node 0)"
else
    export NPROC_PER_NODE=$NPROC_PER_NODE_OTHERS
    echo "NPROC_PER_NODE set to: $NPROC_PER_NODE (for Node Rank > 0)"
fi
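# Note (assumption inferred from the flags below, not verified): with 8 GPUs
# per node, NPROC_PER_NODE=6 leaves 2 GPUs free, matching --num_infer_workers 2,
# so the lmdeploy inference workers would not share devices with training ranks.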
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
# MASTER_ADDR mirrors the scheduler-injected MLP_WORKER_0_HOST used by torchrun below,
# so the echo below does not print an unset variable.
export MASTER_ADDR=${MASTER_ADDR:-$MLP_WORKER_0_HOST}
echo "NNODES: $NNODES"
echo "NODE_RANK: $NODE_RANK"
echo "MASTER_ADDR: $MASTER_ADDR"
echo "MASTER_PORT: $MASTER_PORT"
echo "NPROC_PER_NODE: $NPROC_PER_NODE"
echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"
echo "----------------------------------------"
# --- Distributed Training Command using torchrun ---
# MLP_WORKER_0_HOST and MLP_WORKER_0_PORT are injected by the cluster scheduler.
torchrun \
    --nnodes=$NNODES \
    --node_rank=$NODE_RANK \
    --master_addr=$MLP_WORKER_0_HOST \
    --nproc_per_node=$NPROC_PER_NODE \
    --master_port=$MLP_WORKER_0_PORT \
    swift/cli/rlhf.py \
    --rlhf_type grpo \
    --reward_funcs traj_acc_think format \
    --model_type internvl2_5 \
    --use_lmdeploy true \
    --lmdeploy_session_len 2048 \
    --lmdeploy_cache_max_entry_count 0.8 \
    --train_type full \
    --torch_dtype bfloat16 \
    --max_completion_length 512 \
    --num_train_epochs 3 \
    --per_device_train_batch_size 2 \
    --per_device_eval_batch_size 2 \
    --learning_rate 1e-6 \
    --gradient_accumulation_steps 16 \
    --eval_steps 25 \
    --save_steps 25 \
    --save_total_limit 10 \
    --logging_steps 1 \
    --max_length 4096 \
    --output_dir output_multi_node_think_zero \
    --warmup_ratio 0.05 \
    --dataloader_num_workers 32 \
    --dataset_num_proc 32 \
    --num_generations 12 \
    --log_completions True \
    --temperature 0.9 \
    --top_p 0.9 \
    --top_k 50 \
    --async_generate true \
    --deepspeed zero1 \
    --num_iterations 2 \
    --num_infer_workers 2 \
    --attn_impl flash_attn \
    --system 'examples/train/grpo/prompt_think.txt'
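For reference, a minimal per-node launch sketch, assuming the scheduler does not inject MLP_ROLE_INDEX; the address 10.0.0.1 and the script name train_grpo.sh are placeholders, not values from the report:

# On node 0 (master):
export NODE_RANK=0
export MLP_WORKER_0_HOST=10.0.0.1   # address of node 0
export MLP_WORKER_0_PORT=34229
bash train_grpo.sh

# On nodes 1-3: identical, except NODE_RANK=1, 2, 3 respectively.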