115 changes: 115 additions & 0 deletions llm/auto_parallel/galvatron/scripts/profile_all2all.sh

Large diffs are not rendered by default.

26 changes: 26 additions & 0 deletions llm/auto_parallel/galvatron/scripts/profile_allreduce.sh
@@ -0,0 +1,26 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

export NCCL_IB_HCA=mlx5_bond_1,mlx5_bond_4,mlx5_bond_3,mlx5_bond_2,mlx5_bond_7,mlx5_bond_6,mlx5_bond_8,mlx5_bond_5
export NCCL_IB_DISABLE=0
echo "Running python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir output/profile_allreduce ./paddlenlp/experimental/galvatron/profiler/profile_allreduce.py --output_dir "./output" --profile_time 0 --tp_deg 8 --save_file_name ./llm/auto_parallel/galvatron-llama-submit/configs/allreduce_bandwidth_1nodes_8gpus_per_node.json "
python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir output/profile_allreduce ./paddlenlp/experimental/galvatron/profiler/profile_allreduce.py --output_dir "./output" --profile_time 0 --tp_deg 8 --save_file_name ./llm/auto_parallel/galvatron-llama-submit/configs/allreduce_bandwidth_1nodes_8gpus_per_node.json
sleep 1
echo "Running python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir output/profile_allreduce ./paddlenlp/experimental/galvatron/profiler/profile_allreduce.py --output_dir "./output" --profile_time 0 --tp_deg 4 --save_file_name ./llm/auto_parallel/galvatron-llama-submit/configs/allreduce_bandwidth_1nodes_8gpus_per_node.json "
python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir output/profile_allreduce ./paddlenlp/experimental/galvatron/profiler/profile_allreduce.py --output_dir "./output" --profile_time 0 --tp_deg 4 --save_file_name ./llm/auto_parallel/galvatron-llama-submit/configs/allreduce_bandwidth_1nodes_8gpus_per_node.json
sleep 1
echo "Running python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir output/profile_allreduce ./paddlenlp/experimental/galvatron/profiler/profile_allreduce.py --output_dir "./output" --profile_time 0 --tp_deg 2 --save_file_name ./llm/auto_parallel/galvatron-llama-submit/configs/allreduce_bandwidth_1nodes_8gpus_per_node.json "
python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir output/profile_allreduce ./paddlenlp/experimental/galvatron/profiler/profile_allreduce.py --output_dir "./output" --profile_time 0 --tp_deg 2 --save_file_name ./llm/auto_parallel/galvatron-llama-submit/configs/allreduce_bandwidth_1nodes_8gpus_per_node.json
sleep 1
rm -r ./profiler_log
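The three runs above differ only in the value of --tp_deg. As a minimal sketch (assuming the launcher flags stay exactly as written, including the "xxx" node-IP placeholder), they can be collapsed into one loop:

# Sketch only: same commands as above, driven by a loop over the tensor-parallel degree.
SCRIPT=./paddlenlp/experimental/galvatron/profiler/profile_allreduce.py
SAVE=./llm/auto_parallel/galvatron-llama-submit/configs/allreduce_bandwidth_1nodes_8gpus_per_node.json
for tp_deg in 8 4 2; do
    python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 \
        --log_dir output/profile_allreduce "${SCRIPT}" \
        --output_dir ./output --profile_time 0 --tp_deg "${tp_deg}" \
        --save_file_name "${SAVE}"
    sleep 1
done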
116 changes: 116 additions & 0 deletions llm/auto_parallel/galvatron/scripts/profile_allreduce_sp.sh

Large diffs are not rendered by default.

160 changes: 160 additions & 0 deletions llm/auto_parallel/galvatron/scripts/profile_computation.sh
@@ -0,0 +1,160 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -x
unset CUDA_VISIBLE_DEVICES

unset PADDLE_ELASTIC_JOB_ID
unset PADDLE_TRAINER_ENDPOINTS
unset DISTRIBUTED_TRAINER_ENDPOINTS
unset FLAGS_START_PORT
unset PADDLE_ELASTIC_TIMEOUT
unset PADDLE_TRAINERS_NUM
unset PADDLE_TRAINER_ID
unset PADDLE_WORKERS_IP_PORT_LIST
unset PADDLE_TRAINERS
unset PADDLE_NUM_GRADIENT_SERVERS

# Activate the Python environment that provides PaddlePaddle / PaddleNLP (placeholder path).
source <path_to_your_own_python>

task_name="qwen"
dir_name="profile_computation"
rm -rf output/$dir_name/$task_name/
rm -rf "output/$dir_name/$task_name""_log"

export SOT_LOG_LEVEL=4
export PYTHONPATH=../../../:$PYTHONPATH

TRAINER="./train_qwen.py"
LAUNCHER="python -u -m paddle.distributed.launch"
LAUNCHER="${LAUNCHER} --gpus 7" # 设置需要使用的GPU
LAUNCHER="${LAUNCHER} --log_dir output/$dir_name/$task_name""_log ${TRAINER} --output_dir "./output""

export LAUNCHER=$LAUNCHER

# [max_steps] [logging_steps] [enable_auto_parallel]
TRAIN_ARGS="
--weight_decay 0.01 \
--warmup_ratio 0.01 \
--max_grad_norm 1.0 \
--learning_rate 3e-05 \
--min_learning_rate 3e-06 \
--max_steps 25 \
--logging_steps 1 \
--continue_training 0 \
--do_train true \
--do_eval false \
--do_predict false \
--disable_tqdm true \
--skip_profile_timer false \
--skip_memory_metrics 0 \
--save_total_limit 2 \
--device gpu \
--dataloader_num_workers 1 \
--distributed_dataloader 0 \
--enable_auto_parallel 1 \
"

# [seq_length] [num_hidden_layers]
MODEL_ARGS="
--model_name_or_path "llama" \
--tokenizer_name_or_path "llama" \
--num_hidden_layers 2 \
--intermediate_size 25600 \
--vocab_size 32000 \
--hidden_size 5120 \
--seq_length 1024 \
--num_attention_heads 64 \
--num_key_value_heads 8 \
"

# [mbsz, accumulation_steps] [recompute] [amp]
CONFIG_ARGS="
--per_device_train_batch_size 1 \
--gradient_accumulation_steps 4 \
--recompute true \
--recompute_use_reentrant true \
--recompute_granularity full \
--pp_recompute_interval 0 \
--bf16 true \
--fp16_opt_level "O2" \
--amp_master_grad true \
--amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
--amp_custom_white_list "lookup_table" "lookup_table_v2" \
"

# [dp_deg, dp_type] [tp_deg, megatron-sp] [pp_deg, 1F1B] [parallel_configs]
PARALLEL_ARGS=(
--to_static 0
--sharding_parallel_degree 1
--sharding "stage2"
--tensor_parallel_degree 2
--sequence_parallel true
--pipeline_parallel_degree 2
--virtual_pp_degree 1
--pipeline_schedule_mode "1F1B"
--sep_parallel_degree 1
--pipeline_parallel_config "enable_send_recv_overlap"
--data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate"
--sharding_parallel_config "enable_overlap enable_release_grads"
--tensor_parallel_config "enable_mp_async_allreduce replace_with_parallel_cross_entropy"
)

# [fused] [flash_attention]
DEFAULT_OPTIMIZER="
--fuse_attention_ffn true \
--fuse_attention_qkv true \
--fused_linear_param_grad_add 1 \
--fuse_sequence_parallel_allreduce true \
--use_flash_attention true \
--use_fused_rope true \
--use_fused_rms_norm false \
--enable_linear_fused_grad_add true \
"

# [data]
DATA_ARGS="
--input_dir ./data \
--split 949,50,1 \
--max_seq_length 16384"

# [runtime profiler]
RUNTIME_PROFILE_ARGS="
--profile_time_flag 1 \
--profile_forward_only 1 \
--save_time_flag 1 \
"

# [model profiler] [sequence type]
MODEL_PROFILER_ARGS="
--profile_type computation \
--profile_mode sequence \
--profile_fixed_batch_size 1 \
--layernum_min 1 \
--layernum_max 2 \
--profile_min_seq_length 4096 \
--profile_max_seq_length 16384 \
--profile_seq_length_step 4096 \
--num_layertype 1 \
"

python ./profile.py \
$MODEL_ARGS \
$TRAIN_ARGS \
$CONFIG_ARGS \
"${PARALLEL_ARGS[@]}" \
$DEFAULT_OPTIMIZER \
$DATA_ARGS \
$RUNTIME_PROFILE_ARGS \
$MODEL_PROFILER_ARGS
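With MODEL_PROFILER_ARGS as above, the computation profiler sweeps sequence lengths from 4096 to 16384 in steps of 4096 and repeats each point with 1 and 2 hidden layers; presumably the per-layer time is then isolated by differencing the two layer counts inside ./profile.py. The implied grid, for illustration only:

# Illustration of the (seq_length, num_hidden_layers) grid implied by
# MODEL_PROFILER_ARGS; the real sweep is driven inside ./profile.py.
for seq_len in $(seq 4096 4096 16384); do   # profile_min/max_seq_length, step 4096
    for layernum in 1 2; do                 # layernum_min .. layernum_max
        echo "profile point: seq_length=${seq_len} num_hidden_layers=${layernum}"
    done
done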
32 changes: 32 additions & 0 deletions llm/auto_parallel/galvatron/scripts/profile_hardware.sh
@@ -0,0 +1,32 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

launch="${interpreter} -u -m paddle.distributed.launch"
launch="${launch} --master $master:$port --nnodes $nnodes --rank $rank --gpus 0,1,2,3,4,5,6,7"

export INTERPRETER=${interpreter}
export LAUNCHER=${launch}
export NCCL_IB_HCA=mlx5_bond_1,mlx5_bond_4,mlx5_bond_3,mlx5_bond_2,mlx5_bond_7,mlx5_bond_6,mlx5_bond_8,mlx5_bond_5
export NCCL_IB_DISABLE=0

PROFILE_HARDWARE_ARGS=(
--num_nodes $nnodes
--num_gpus_per_node 8
--backend 'paddle'
--max_pp_deg 8
--max_tp_deg 8
)

${interpreter} profile_hardware.py \
"${PROFILE_HARDWARE_ARGS[@]}"
161 changes: 161 additions & 0 deletions llm/auto_parallel/galvatron/scripts/profile_memory.sh
@@ -0,0 +1,161 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -x
unset CUDA_VISIBLE_DEVICES

unset PADDLE_ELASTIC_JOB_ID
unset PADDLE_TRAINER_ENDPOINTS
unset DISTRIBUTED_TRAINER_ENDPOINTS
unset FLAGS_START_PORT
unset PADDLE_ELASTIC_TIMEOUT
unset PADDLE_TRAINERS_NUM
unset PADDLE_TRAINER_ID
unset PADDLE_WORKERS_IP_PORT_LIST
unset PADDLE_TRAINERS
unset PADDLE_NUM_GRADIENT_SERVERS

# Activate the Python environment that provides PaddlePaddle / PaddleNLP (placeholder path).
source <path_to_your_own_python>

task_name="qwen_profile_memory"
dir_name="profile_memory"
rm -rf output/$dir_name/$task_name/
rm -rf "output/$dir_name/$task_name""_log"

export SOT_LOG_LEVEL=4
export PYTHONPATH=../../../:$PYTHONPATH

TRAINER="./train_qwen.py"
LAUNCHER="python -u -m paddle.distributed.launch"
LAUNCHER="${LAUNCHER} --gpus 0,1,2,3,4,5,6,7" # 设置需要使用的GPU
LAUNCHER="${LAUNCHER} --log_dir output/$dir_name/$task_name""_log ${TRAINER} --output_dir "./output""

export LAUNCHER=$LAUNCHER
export PROFILE_WORLD_SIZE=8

# [max_steps] [logging_steps] [enable_auto_parallel]
TRAIN_ARGS="
--weight_decay 0.01 \
--warmup_ratio 0.01 \
--max_grad_norm 1.0 \
--learning_rate 3e-05 \
--min_learning_rate 3e-06 \
--max_steps 10 \
--logging_steps 1 \
--continue_training 0 \
--do_train true \
--do_eval false \
--do_predict false \
--disable_tqdm true \
--skip_profile_timer false \
--skip_memory_metrics 0 \
--save_total_limit 2 \
--device gpu \
--dataloader_num_workers 1 \
--distributed_dataloader 0 \
--enable_auto_parallel 1 \
"

# [seq_length] [num_hidden_layers]
MODEL_ARGS="
--model_name_or_path "llama" \
--tokenizer_name_or_path "llama" \
--num_hidden_layers 2 \
--intermediate_size 25600 \
--vocab_size 32000 \
--hidden_size 5120 \
--seq_length 1024 \
--num_attention_heads 64 \
--num_key_value_heads 8 \
"

# [mbsz, accumulation_steps] [recompute] [amp]
CONFIG_ARGS="
--per_device_train_batch_size 8 \
--gradient_accumulation_steps 1 \
--recompute false \
--recompute_use_reentrant true \
--recompute_granularity full \
--pp_recompute_interval 0 \
--bf16 true \
--fp16_opt_level "O2" \
--amp_master_grad true \
--amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
--amp_custom_white_list "lookup_table" "lookup_table_v2" \
"

# [dp_deg, dp_type] [tp_deg, megatron-sp] [pp_deg, 1F1B] [parallel_configs]
PARALLEL_ARGS=(
--to_static 0
--sharding_parallel_degree 1
--sharding "stage2"
--tensor_parallel_degree 2
--sequence_parallel true
--pipeline_parallel_degree 2
--virtual_pp_degree 1
--pipeline_schedule_mode "1F1B"
--sep_parallel_degree 1
--pipeline_parallel_config "enable_send_recv_overlap"
--data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate"
--sharding_parallel_config "enable_overlap"
--tensor_parallel_config "enable_mp_async_allreduce"
)

# [fused] [flash_attention]
DEFAULT_OPTIMIZER_ARGS="
--fuse_attention_ffn true \
--fuse_attention_qkv true \
--fused_linear_param_grad_add 1 \
--fuse_sequence_parallel_allreduce true \
--use_flash_attention true \
--use_fused_rope true \
--use_fused_rms_norm true \
--enable_linear_fused_grad_add true \
"

# [data]
DATA_ARGS="
--input_dir ./data \
--split 949,50,1 \
--max_seq_length 16384"

# [runtime profiler]
RUNTIME_PROFILE_ARGS="
--profile_memory_flag 1 \
--save_memory_flag 1 \
"

# [model profiler] [static type]
MODEL_PROFILER_ARGS="
--profile_type memory \
--profile_mode static \
--profile_fixed_batch_size 8 \
--layernum_min 1 \
--layernum_max 2 \
--profile_fixed_seq_length_list 16384 \
--num_layertype 1 \
--max_tp_deg 8 \
--max_per_device_train_batch_size 4 \
"

python ./profile.py \
$MODEL_ARGS \
$TRAIN_ARGS \
$CONFIG_ARGS \
"${PARALLEL_ARGS[@]}" \
$DEFAULT_OPTIMIZER_ARGS \
$DATA_ARGS \
$RUNTIME_PROFILE_ARGS \
$MODEL_PROFILER_ARGS
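Like profile_computation.sh, this script sources a <path_to_your_own_python> placeholder that must be replaced before launching. A hypothetical preparation and launch, assuming the script is run from the galvatron directory (it refers to ./profile.py and ./train_qwen.py by relative path; the environment path below is only an example):

# Hypothetical setup: in profile_memory.sh, replace
#   source <path_to_your_own_python>
# with the activation script of an environment that has PaddlePaddle and
# PaddleNLP installed, e.g. source /opt/py310/bin/activate, then run:
cd llm/auto_parallel/galvatron
bash scripts/profile_memory.sh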
