diff --git a/training/metax/gpt3_6.7B-paddle/README.md b/training/metax/gpt3_6.7B-paddle/README.md
new file mode 100644
index 000000000..09e623705
--- /dev/null
+++ b/training/metax/gpt3_6.7B-paddle/README.md
@@ -0,0 +1,57 @@
+
+# Paddle Version Run Guide
+
+## Data Download
+
+```shell
+mkdir GPT-3-data # data directory; point training/run_benchmarks/config/test_conf.py at this path later
+cd GPT-3-data
+wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy
+wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz
+```
+
+## Running with FlagPerf
+
+```shell
+cd FlagPerf/training
+sudo -E python3 ./run_benchmarks/run.py
+```
+
+
+### GPU Configuration and Run Information
+#### Environment Configuration
+- ##### Hardware
+  - Machine and accelerator model: MetaX 曦云® C500 64G
+  - CPU model: Montage Jintide(R) C8458P
+  - Multi-node network type and bandwidth: InfiniBand, 2x200 Gb/s
+- ##### Software
+  - OS version: Ubuntu 20.04
+  - OS kernel version: 5.4.0-26-generic
+  - Accelerator driver version: 2.2.0
+  - Docker version: 24.0.7
+  - Training framework version: paddle-2.6.0
+  - Dependency versions: sentencepiece
+
+#### Run Results
+
+* Common metrics
+
+| Metric | Value | Notes |
+| -------------- | ------------------------------ | ------------------------------------------- |
+| Task category | text classification, text generation | |
+| Model | gpt3 | |
+| Dataset | gpt_en_dataset | |
+| Config file | config | |
+| Precision | precision, see "Performance metrics" | one of fp32/amp/fp16 |
+| Hyperparameter changes | fix_hp, see "Performance metrics" | special hyperparameters needed to saturate the hardware for throughput evaluation |
+| Parallel strategy | parallel_strategy, see "Performance metrics" | DP, TP, PP, SP |
+| Hardware device | MetaX C500 (64G * 8) | |
+| Device memory usage | memory(actual/total), see "Performance metrics" | commonly called "device memory"; unit: GiB |
+| Throughput | throughput, see "Performance metrics" | training throughput |
+
+* Performance metrics (GPT3-6.7B)
+
+| Configuration | config | precision | fix_hp | parallel_strategy | throughput |
+| ------- | ------- | --------- | ------ | ---------------- | ------------ |
+| C500 single node, 8 cards (1x8*64G) | config_TP1PP1SH2SP8C50040Gx1x8 | fp16, level="O2" | per_device_bs=2, accumulate=64 (global bs = 2M tokens) | flash_attention=True, recompute=True, use_fused_rms_norm=False, sharding="stage2", sharding_degree=8 | |
+| C500 single node, 8 cards (1x8*64G) | config_TP2PP1SH1SP4C50040Gx1x8 | fp16, level="O2" | per_device_bs=2, accumulate=128 (global bs = 2M tokens) | flash_attention=True, recompute=True, use_fused_rms_norm=False, sharding="stage1", sharding_degree=4, tensor_parallel_degree=2 | |
diff --git a/training/metax/gpt3_6.7B-paddle/config/config_TP1PP1SH2SP8C50040Gx1x8.py b/training/metax/gpt3_6.7B-paddle/config/config_TP1PP1SH2SP8C50040Gx1x8.py
new file mode 100644
index 000000000..745a043b0
--- /dev/null
+++ b/training/metax/gpt3_6.7B-paddle/config/config_TP1PP1SH2SP8C50040Gx1x8.py
@@ -0,0 +1,43 @@
+# model info
+model_name_or_path = "gpt3-6.7B-en"
+tokenizer_name_or_path = "gpt3-6.7B-en"
+continue_training = 0
+split = "998,1,1"
+max_seq_length = 2048
+
+# training info
+dataloader_num_workers = 1
+max_steps = 100
+save_steps = 10000
+eval_steps = 5000000
+learning_rate = 1.2e-4
+min_learning_rate = 1.2e-5
+warmup_steps = 188
+decay_steps = 130 * 1024 * 1024
+lr_scheduler_type = "cosine"
+adam_beta1 = 0.9
+adam_beta2 = 0.95
+adam_epsilon = 1e-08
+max_grad_norm = 1.0
+target_loss = 1.0
+target_ppl = 0.6
+logging_steps = 1
+log_freq = 1
+seed = 42
+
+# for parallel
+per_device_train_batch_size = 2
+per_device_eval_batch_size = 1
+tensor_parallel_degree = 1
+pipeline_parallel_degree = 1
+sharding_parallel_degree = 8
+gradient_accumulation_steps = 64
+use_flash_attention = 1
+fuse_attention_qkv = 0
+use_fused_rms_norm = 0
+fp16 = True
+fp16_opt_level = "O2"
+scale_loss = 1024
+sharding = "stage2"
+recompute = True
+recompute_granularity = "full"
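The "global bs = 2M tokens" note in the README's performance table follows from the values in config_TP1PP1SH2SP8C50040Gx1x8.py above. Below is a minimal sketch of that arithmetic, assuming (as is conventional) that the cards left over after tensor and pipeline parallelism act as sharded data-parallel replicas; the snippet is illustrative only and is not a file in this patch.

```python
# Sanity check: derive the global batch size implied by
# config_TP1PP1SH2SP8C50040Gx1x8.py (values copied from the config above).
num_devices = 8                      # 1 node x 8 cards
tensor_parallel_degree = 1
pipeline_parallel_degree = 1
per_device_train_batch_size = 2
gradient_accumulation_steps = 64
max_seq_length = 2048

# Cards not consumed by TP/PP serve as (sharded) data-parallel replicas.
data_parallel_degree = num_devices // (tensor_parallel_degree * pipeline_parallel_degree)

global_batch_sequences = (per_device_train_batch_size
                          * gradient_accumulation_steps
                          * data_parallel_degree)           # 2 * 64 * 8 = 1024
global_batch_tokens = global_batch_sequences * max_seq_length

assert global_batch_tokens == 2 * 1024 * 1024               # the README's "2M tokens"
print(f"{global_batch_sequences} sequences = {global_batch_tokens} tokens per step")
```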
diff --git a/training/metax/gpt3_6.7B-paddle/config/config_TP2PP1SH1SP4C50040Gx1x8.py b/training/metax/gpt3_6.7B-paddle/config/config_TP2PP1SH1SP4C50040Gx1x8.py
new file mode 100644
index 000000000..95cfd5346
--- /dev/null
+++ b/training/metax/gpt3_6.7B-paddle/config/config_TP2PP1SH1SP4C50040Gx1x8.py
@@ -0,0 +1,43 @@
+# model info
+model_name_or_path = "gpt3-6.7B-en"
+tokenizer_name_or_path = "gpt3-6.7B-en"
+continue_training = 0
+split = "998,1,1"
+max_seq_length = 2048
+
+# training info
+dataloader_num_workers = 1
+max_steps = 100
+save_steps = 10000
+eval_steps = 5000000
+learning_rate = 1.2e-4
+min_learning_rate = 1.2e-5
+warmup_steps = 188
+decay_steps = 130 * 1024 * 1024
+lr_scheduler_type = "cosine"
+adam_beta1 = 0.9
+adam_beta2 = 0.95
+adam_epsilon = 1e-08
+max_grad_norm = 1.0
+target_loss = 1.0
+target_ppl = 0.6
+logging_steps = 1
+log_freq = 1
+seed = 42
+
+# for parallel
+per_device_train_batch_size = 2
+per_device_eval_batch_size = 1
+tensor_parallel_degree = 2
+pipeline_parallel_degree = 1
+sharding_parallel_degree = 4
+gradient_accumulation_steps = 128
+use_flash_attention = 1
+fuse_attention_qkv = 0
+use_fused_rms_norm = 0
+fp16 = True
+fp16_opt_level = "O2"
+scale_loss = 1024
+sharding = "stage1"
+recompute = True
+recompute_granularity = "full"
diff --git a/training/metax/gpt3_6.7B-paddle/config/config_TP2PP1SH2SP4C50040Gx1x8.py b/training/metax/gpt3_6.7B-paddle/config/config_TP2PP1SH2SP4C50040Gx1x8.py
new file mode 100644
index 000000000..11500166a
--- /dev/null
+++ b/training/metax/gpt3_6.7B-paddle/config/config_TP2PP1SH2SP4C50040Gx1x8.py
@@ -0,0 +1,43 @@
+# model info
+model_name_or_path = "gpt3-6.7B-en"
+tokenizer_name_or_path = "gpt3-6.7B-en"
+continue_training = 0
+split = "998,1,1"
+max_seq_length = 2048
+
+# training info
+dataloader_num_workers = 1
+max_steps = 100
+save_steps = 10000
+eval_steps = 5000000
+learning_rate = 1.2e-4
+min_learning_rate = 1.2e-5
+warmup_steps = 188
+decay_steps = 130 * 1024 * 1024
+lr_scheduler_type = "cosine"
+adam_beta1 = 0.9
+adam_beta2 = 0.95
+adam_epsilon = 1e-08
+max_grad_norm = 1.0
+target_loss = 1.0
+target_ppl = 0.6
+logging_steps = 1
+log_freq = 1
+seed = 42
+
+# for parallel
+per_device_train_batch_size = 2
+per_device_eval_batch_size = 1
+tensor_parallel_degree = 2
+pipeline_parallel_degree = 1
+sharding_parallel_degree = 4
+gradient_accumulation_steps = 128
+use_flash_attention = 1
+fuse_attention_qkv = 0
+use_fused_rms_norm = 0
+fp16 = True
+fp16_opt_level = "O2"
+scale_loss = 1024
+sharding = "stage2"
+recompute = True
+recompute_granularity = "full"
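The two TP2PP1 configs above differ only in the sharding stage: config_TP2PP1SH1SP4 uses sharding="stage1" (optimizer state sharded across the 4 sharding ranks), while config_TP2PP1SH2SP4 uses "stage2" (gradients sharded as well). A back-of-envelope sketch of why that matters for model-state memory at 6.7B parameters under fp16 with Adam follows; the byte counts are the usual ZeRO-style approximations, not measured values from these runs, and the snippet is not a file in this patch.

```python
# Rough per-device model-state memory for a 6.7B-parameter model with TP=2 and
# 4-way sharding, comparing sharding stage1 vs stage2. Illustrative estimate only:
# activations, buffers, and fragmentation are ignored.
params = 6.7e9
tp, sharding_degree = 2, 4
p = params / tp                      # parameters held per TP rank

fp16_weights = 2 * p                 # 2 bytes per fp16 parameter
fp16_grads   = 2 * p                 # 2 bytes per fp16 gradient
optim_state  = (4 + 4 + 4) * p       # fp32 master weights + two Adam moments

GiB = 1024 ** 3
stage1 = (fp16_weights + fp16_grads + optim_state / sharding_degree) / GiB
stage2 = (fp16_weights + (fp16_grads + optim_state) / sharding_degree) / GiB

# Roughly 21.8 GiB vs 17.2 GiB per device under these assumptions.
print(f"stage1 ≈ {stage1:.1f} GiB/device, stage2 ≈ {stage2:.1f} GiB/device")
```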
diff --git a/training/metax/gpt3_6.7B-paddle/config/config_TP2PP4SH1SP1C50040Gx1x8.py b/training/metax/gpt3_6.7B-paddle/config/config_TP2PP4SH1SP1C50040Gx1x8.py
new file mode 100644
index 000000000..62ac76a3b
--- /dev/null
+++ b/training/metax/gpt3_6.7B-paddle/config/config_TP2PP4SH1SP1C50040Gx1x8.py
@@ -0,0 +1,43 @@
+# model info
+model_name_or_path = "gpt3-6.7B-en"
+tokenizer_name_or_path = "gpt3-6.7B-en"
+continue_training = 0
+split = "998,1,1"
+max_seq_length = 2048
+
+# training info
+dataloader_num_workers = 1
+max_steps = 100
+save_steps = 10000
+eval_steps = 5000000
+learning_rate = 1.2e-4
+min_learning_rate = 1.2e-5
+warmup_steps = 188
+decay_steps = 130 * 1024 * 1024
+lr_scheduler_type = "cosine"
+adam_beta1 = 0.9
+adam_beta2 = 0.95
+adam_epsilon = 1e-08
+max_grad_norm = 1.0
+target_loss = 1.0
+target_ppl = 0.6
+logging_steps = 1
+log_freq = 1
+seed = 42
+
+# for parallel
+per_device_train_batch_size = 2
+per_device_eval_batch_size = 1024
+tensor_parallel_degree = 2
+pipeline_parallel_degree = 4
+sharding_parallel_degree = 1
+gradient_accumulation_steps = 512
+use_flash_attention = 1
+fuse_attention_qkv = 0
+use_fused_rms_norm = 0
+fp16 = True
+fp16_opt_level = "O2"
+scale_loss = 1024
+sharding = "stage1"
+recompute = True
+recompute_granularity = "full"
diff --git a/training/metax/gpt3_6.7B-paddle/config/environment_variables.sh b/training/metax/gpt3_6.7B-paddle/config/environment_variables.sh
new file mode 100644
index 000000000..6d1dc2885
--- /dev/null
+++ b/training/metax/gpt3_6.7B-paddle/config/environment_variables.sh
@@ -0,0 +1,7 @@
+# =================================================
+# Export variables
+# =================================================
+export SET_DEVICE_NUMA_PREFERRED=1
+export MACA_SMALL_PAGESIZE_ENABLE=1
+export PYTORCH_ENABLE_SAME_RAND_A100=1
+
diff --git a/training/metax/gpt3_6.7B-paddle/config/requirements.txt b/training/metax/gpt3_6.7B-paddle/config/requirements.txt
new file mode 100644
index 000000000..c134f3063
--- /dev/null
+++ b/training/metax/gpt3_6.7B-paddle/config/requirements.txt
@@ -0,0 +1,2 @@
+regex
+tool_helpers
diff --git a/training/metax/gpt3_6.7B-paddle/extern/.gitkeep b/training/metax/gpt3_6.7B-paddle/extern/.gitkeep
new file mode 100644
index 000000000..e69de29bb
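Across the four configs added in this patch, the parallel degrees and accumulation steps vary together: the product of TP, PP, and sharding degrees always tiles the 1x8 machine exactly, and gradient accumulation is scaled so the global batch stays at 2M tokens. The sketch below cross-checks that invariant; values are copied from the configs above, and the snippet is illustrative, not a file in this patch.

```python
# Cross-check the parallel topology of the four configs added in this patch:
# every config must tile the 8 cards exactly and keep the 2M-token global batch.
configs = {
    "config_TP1PP1SH2SP8C50040Gx1x8": dict(tp=1, pp=1, sharding=8, accum=64),
    "config_TP2PP1SH1SP4C50040Gx1x8": dict(tp=2, pp=1, sharding=4, accum=128),
    "config_TP2PP1SH2SP4C50040Gx1x8": dict(tp=2, pp=1, sharding=4, accum=128),
    "config_TP2PP4SH1SP1C50040Gx1x8": dict(tp=2, pp=4, sharding=1, accum=512),
}
per_device_bs, seq_len, num_devices = 2, 2048, 8

for name, c in configs.items():
    # TP ranks x PP stages x sharding groups must equal the card count.
    assert c["tp"] * c["pp"] * c["sharding"] == num_devices, name
    dp = c["sharding"]               # sharding groups double as the data-parallel axis
    tokens = per_device_bs * c["accum"] * dp * seq_len
    assert tokens == 2 * 1024 * 1024, name
print("all four configs: 8-card tiling and 2M-token global batch confirmed")
```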