diff --git a/training/metax/gpt3_6.7B-paddle/README.md b/training/metax/gpt3_6.7B-paddle/README.md
new file mode 100644
index 000000000..09e623705
--- /dev/null
+++ b/training/metax/gpt3_6.7B-paddle/README.md
@@ -0,0 +1,57 @@
+
+# Paddle Version Run Guide
+
+## Data Download
+
+```shell
+mkdir GPT-3-data # data directory; point training/run_benchmarks/config/test_conf.py at this path later
+cd GPT-3-data
+wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy
+wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz
+```
+
+## Running with FlagPerf
+
+```shell
+cd FlagPerf/training
+sudo -E python3 ./run_benchmarks/run.py
+```
+
+
+### GPU Configuration and Run Information
+#### Environment Configuration
+- ##### Hardware
+  - Machine and accelerator model: MetaX 曦云® C500 64G
+  - CPU model: Montage Jintide(R) C8458P
+  - Multi-node network type and bandwidth: InfiniBand, 2x200 Gb/s
+- ##### Software
+  - OS version: Ubuntu 20.04
+  - OS kernel version: 5.4.0-26-generic
+  - Accelerator driver version: 2.2.0
+  - Docker version: 24.0.7
+  - Training framework version: paddle-2.6.0
+  - Dependency versions: sentencepiece
+
+#### Run Results
+
+* Common metrics
+
+| Metric | Value | Notes |
+| -------------- | ------------------------------ | ------------------------------------------- |
+| Task category | text classification, text generation | |
+| Model | gpt3 | |
+| Dataset | gpt_en_dataset | |
+| Config file | config | |
+| Precision | precision, see "Performance metrics" | one of fp32/amp/fp16 |
+| Hyperparameter changes | fix_hp, see "Performance metrics" | special hyperparameters needed to saturate the hardware for throughput evaluation |
+| Parallel strategy | parallel_strategy, see "Performance metrics" | DP, TP, PP, SP |
+| Hardware device | MetaX C500 (64G * 8) | |
+| Device memory usage | memory(actual/total), see "Performance metrics" | commonly called "device memory"; unit: GiB |
+| Throughput | throughput, see "Performance metrics" | training throughput |
+
+* Performance metrics (GPT3-6.7B)
+
+| Configuration | config | precision | fix_hp | parallel_strategy | throughput |
+| ------- | ------- | --------- | ------ | ---------------- | ------------ |
+| C500 single node, 8 cards (1x8*64G) | config_TP1PP1SH2SP8C50040Gx1x8 | fp16, level="O2" | per_device_bs=2, accumulate=64 (global bs = 2M tokens) | flash_attention=True, recompute=True, use_fused_rms_norm=False, sharding="stage2", sharding_degree=8 | |
+| C500 single node, 8 cards (1x8*64G) | config_TP2PP1SH1SP4C50040Gx1x8 | fp16, level="O2" | per_device_bs=2, accumulate=128 (global bs = 2M tokens) | flash_attention=True, recompute=True, use_fused_rms_norm=False, sharding="stage1", sharding_degree=4, tensor_parallel_degree=2 | |
diff --git a/training/metax/gpt3_6.7B-paddle/config/config_TP1PP1SH2SP8C50040Gx1x8.py b/training/metax/gpt3_6.7B-paddle/config/config_TP1PP1SH2SP8C50040Gx1x8.py
new file mode 100644
index 000000000..745a043b0
--- /dev/null
+++ b/training/metax/gpt3_6.7B-paddle/config/config_TP1PP1SH2SP8C50040Gx1x8.py
@@ -0,0 +1,43 @@
+# model info
+model_name_or_path = "gpt3-6.7B-en"
+tokenizer_name_or_path = "gpt3-6.7B-en"
+continue_training = 0
+split = "998,1,1"
+max_seq_length = 2048
+
+# training info
+dataloader_num_workers = 1
+max_steps = 100
+save_steps = 10000
+eval_steps = 5000000
+learning_rate = 1.2e-4
+min_learning_rate = 1.2e-5
+warmup_steps = 188
+decay_steps = 130 * 1024 * 1024
+lr_scheduler_type = "cosine"
+adam_beta1 = 0.9
+adam_beta2 = 0.95
+adam_epsilon = 1e-08
+max_grad_norm = 1.0
+target_loss = 1.0
+target_ppl = 0.6
+logging_steps = 1
+log_freq = 1
+seed = 42
+
+# for parallel
+per_device_train_batch_size = 2
+per_device_eval_batch_size = 1
+tensor_parallel_degree = 1
+pipeline_parallel_degree = 1
+sharding_parallel_degree = 8
+gradient_accumulation_steps = 64
+use_flash_attention = 1
+fuse_attention_qkv = 0
+use_fused_rms_norm = 0
+fp16 = True
+fp16_opt_level = "O2"
+scale_loss = 1024
+sharding = "stage2"
+recompute = True
+recompute_granularity = "full"
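The "global bs = 2M tokens" note in the README's performance table follows from the values in config_TP1PP1SH2SP8C50040Gx1x8.py above. Below is a minimal sketch of that arithmetic, assuming (as is conventional) that the cards left over after tensor and pipeline parallelism act as sharded data-parallel replicas; the snippet is illustrative only and is not a file in this patch.

```python
# Sanity check: derive the global batch size implied by
# config_TP1PP1SH2SP8C50040Gx1x8.py (values copied from the config above).
num_devices = 8                      # 1 node x 8 cards
tensor_parallel_degree = 1
pipeline_parallel_degree = 1
per_device_train_batch_size = 2
gradient_accumulation_steps = 64
max_seq_length = 2048

# Cards not consumed by TP/PP serve as (sharded) data-parallel replicas.
data_parallel_degree = num_devices // (tensor_parallel_degree * pipeline_parallel_degree)

global_batch_sequences = (per_device_train_batch_size
                          * gradient_accumulation_steps
                          * data_parallel_degree)           # 2 * 64 * 8 = 1024
global_batch_tokens = global_batch_sequences * max_seq_length

assert global_batch_tokens == 2 * 1024 * 1024               # the README's "2M tokens"
print(f"{global_batch_sequences} sequences = {global_batch_tokens} tokens per step")
```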
diff --git a/training/metax/gpt3_6.7B-paddle/config/config_TP2PP1SH1SP4C50040Gx1x8.py b/training/metax/gpt3_6.7B-paddle/config/config_TP2PP1SH1SP4C50040Gx1x8.py
new file mode 100644
index 000000000..95cfd5346
--- /dev/null
+++ b/training/metax/gpt3_6.7B-paddle/config/config_TP2PP1SH1SP4C50040Gx1x8.py
@@ -0,0 +1,43 @@
+# model info
+model_name_or_path = "gpt3-6.7B-en"
+tokenizer_name_or_path = "gpt3-6.7B-en"
+continue_training = 0
+split = "998,1,1"
+max_seq_length = 2048
+
+# training info
+dataloader_num_workers = 1
+max_steps = 100
+save_steps = 10000
+eval_steps = 5000000
+learning_rate = 1.2e-4
+min_learning_rate = 1.2e-5
+warmup_steps = 188
+decay_steps = 130 * 1024 * 1024
+lr_scheduler_type = "cosine"
+adam_beta1 = 0.9
+adam_beta2 = 0.95
+adam_epsilon = 1e-08
+max_grad_norm = 1.0
+target_loss = 1.0
+target_ppl = 0.6
+logging_steps = 1
+log_freq = 1
+seed = 42
+
+# for parallel
+per_device_train_batch_size = 2
+per_device_eval_batch_size = 1
+tensor_parallel_degree = 2
+pipeline_parallel_degree = 1
+sharding_parallel_degree = 4
+gradient_accumulation_steps = 128
+use_flash_attention = 1
+fuse_attention_qkv = 0
+use_fused_rms_norm = 0
+fp16 = True
+fp16_opt_level = "O2"
+scale_loss = 1024
+sharding = "stage1"
+recompute = True
+recompute_granularity = "full"
diff --git a/training/metax/gpt3_6.7B-paddle/config/config_TP2PP1SH2SP4C50040Gx1x8.py b/training/metax/gpt3_6.7B-paddle/config/config_TP2PP1SH2SP4C50040Gx1x8.py
new file mode 100644
index 000000000..11500166a
--- /dev/null
+++ b/training/metax/gpt3_6.7B-paddle/config/config_TP2PP1SH2SP4C50040Gx1x8.py
@@ -0,0 +1,43 @@
+# model info
+model_name_or_path = "gpt3-6.7B-en"
+tokenizer_name_or_path = "gpt3-6.7B-en"
+continue_training = 0
+split = "998,1,1"
+max_seq_length = 2048
+
+# training info
+dataloader_num_workers = 1
+max_steps = 100
+save_steps = 10000
+eval_steps = 5000000
+learning_rate = 1.2e-4
+min_learning_rate = 1.2e-5
+warmup_steps = 188
+decay_steps = 130 * 1024 * 1024
+lr_scheduler_type = "cosine"
+adam_beta1 = 0.9
+adam_beta2 = 0.95
+adam_epsilon = 1e-08
+max_grad_norm = 1.0
+target_loss = 1.0
+target_ppl = 0.6
+logging_steps = 1
+log_freq = 1
+seed = 42
+
+# for parallel
+per_device_train_batch_size = 2
+per_device_eval_batch_size = 1
+tensor_parallel_degree = 2
+pipeline_parallel_degree = 1
+sharding_parallel_degree = 4
+gradient_accumulation_steps = 128
+use_flash_attention = 1
+fuse_attention_qkv = 0
+use_fused_rms_norm = 0
+fp16 = True
+fp16_opt_level = "O2"
+scale_loss = 1024
+sharding = "stage2"
+recompute = True
+recompute_granularity = "full"
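The two TP2PP1 configs above differ only in the sharding stage: config_TP2PP1SH1SP4 uses sharding="stage1" (optimizer state sharded across the 4 sharding ranks), while config_TP2PP1SH2SP4 uses "stage2" (gradients sharded as well). A back-of-envelope sketch of why that matters for model-state memory at 6.7B parameters under fp16 with Adam follows; the byte counts are the usual ZeRO-style approximations, not measured values from these runs, and the snippet is not a file in this patch.

```python
# Rough per-device model-state memory for a 6.7B-parameter model with TP=2 and
# 4-way sharding, comparing sharding stage1 vs stage2. Illustrative estimate only:
# activations, buffers, and fragmentation are ignored.
params = 6.7e9
tp, sharding_degree = 2, 4
p = params / tp                      # parameters held per TP rank

fp16_weights = 2 * p                 # 2 bytes per fp16 parameter
fp16_grads   = 2 * p                 # 2 bytes per fp16 gradient
optim_state  = (4 + 4 + 4) * p       # fp32 master weights + two Adam moments

GiB = 1024 ** 3
stage1 = (fp16_weights + fp16_grads + optim_state / sharding_degree) / GiB
stage2 = (fp16_weights + (fp16_grads + optim_state) / sharding_degree) / GiB

# Roughly 21.8 GiB vs 17.2 GiB per device under these assumptions.
print(f"stage1 ≈ {stage1:.1f} GiB/device, stage2 ≈ {stage2:.1f} GiB/device")
```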
diff --git a/training/metax/gpt3_6.7B-paddle/config/config_TP2PP4SH1SP1C50040Gx1x8.py b/training/metax/gpt3_6.7B-paddle/config/config_TP2PP4SH1SP1C50040Gx1x8.py
new file mode 100644
index 000000000..62ac76a3b
--- /dev/null
+++ b/training/metax/gpt3_6.7B-paddle/config/config_TP2PP4SH1SP1C50040Gx1x8.py
@@ -0,0 +1,43 @@
+# model info
+model_name_or_path = "gpt3-6.7B-en"
+tokenizer_name_or_path = "gpt3-6.7B-en"
+continue_training = 0
+split = "998,1,1"
+max_seq_length = 2048
+
+# training info
+dataloader_num_workers = 1
+max_steps = 100
+save_steps = 10000
+eval_steps = 5000000
+learning_rate = 1.2e-4
+min_learning_rate = 1.2e-5
+warmup_steps = 188
+decay_steps = 130 * 1024 * 1024
+lr_scheduler_type = "cosine"
+adam_beta1 = 0.9
+adam_beta2 = 0.95
+adam_epsilon = 1e-08
+max_grad_norm = 1.0
+target_loss = 1.0
+target_ppl = 0.6
+logging_steps = 1
+log_freq = 1
+seed = 42
+
+# for parallel
+per_device_train_batch_size = 2
+per_device_eval_batch_size = 1024
+tensor_parallel_degree = 2
+pipeline_parallel_degree = 4
+sharding_parallel_degree = 1
+gradient_accumulation_steps = 512
+use_flash_attention = 1
+fuse_attention_qkv = 0
+use_fused_rms_norm = 0
+fp16 = True
+fp16_opt_level = "O2"
+scale_loss = 1024
+sharding = "stage1"
+recompute = True
+recompute_granularity = "full"
diff --git a/training/metax/gpt3_6.7B-paddle/config/environment_variables.sh b/training/metax/gpt3_6.7B-paddle/config/environment_variables.sh
new file mode 100644
index 000000000..6d1dc2885
--- /dev/null
+++ b/training/metax/gpt3_6.7B-paddle/config/environment_variables.sh
@@ -0,0 +1,7 @@
+# =================================================
+# Export variables
+# =================================================
+export SET_DEVICE_NUMA_PREFERRED=1
+export MACA_SMALL_PAGESIZE_ENABLE=1
+export PYTORCH_ENABLE_SAME_RAND_A100=1
+
diff --git a/training/metax/gpt3_6.7B-paddle/config/requirements.txt b/training/metax/gpt3_6.7B-paddle/config/requirements.txt
new file mode 100644
index 000000000..c134f3063
--- /dev/null
+++ b/training/metax/gpt3_6.7B-paddle/config/requirements.txt
@@ -0,0 +1,2 @@
+regex
+tool_helpers
diff --git a/training/metax/gpt3_6.7B-paddle/extern/.gitkeep b/training/metax/gpt3_6.7B-paddle/extern/.gitkeep
new file mode 100644
index 000000000..e69de29bb
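Across the four configs added in this patch, the parallel degrees and accumulation steps vary together: the product of TP, PP, and sharding degrees always tiles the 1x8 machine exactly, and gradient accumulation is scaled so the global batch stays at 2M tokens. The sketch below cross-checks that invariant; values are copied from the configs above, and the snippet is illustrative, not a file in this patch.

```python
# Cross-check the parallel topology of the four configs added in this patch:
# every config must tile the 8 cards exactly and keep the 2M-token global batch.
configs = {
    "config_TP1PP1SH2SP8C50040Gx1x8": dict(tp=1, pp=1, sharding=8, accum=64),
    "config_TP2PP1SH1SP4C50040Gx1x8": dict(tp=2, pp=1, sharding=4, accum=128),
    "config_TP2PP1SH2SP4C50040Gx1x8": dict(tp=2, pp=1, sharding=4, accum=128),
    "config_TP2PP4SH1SP1C50040Gx1x8": dict(tp=2, pp=4, sharding=1, accum=512),
}
per_device_bs, seq_len, num_devices = 2, 2048, 8

for name, c in configs.items():
    # TP ranks x PP stages x sharding groups must equal the card count.
    assert c["tp"] * c["pp"] * c["sharding"] == num_devices, name
    dp = c["sharding"]               # sharding groups double as the data-parallel axis
    tokens = per_device_bs * c["accum"] * dp * seq_len
    assert tokens == 2 * 1024 * 1024, name
print("all four configs: 8-card tiling and 2M-token global batch confirmed")
```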