diff --git a/base/benchmarks/computation-BF16/main.py b/base/benchmarks/computation-BF16/main.py
index 9a0afeebd..1bcf255f8 100644
--- a/base/benchmarks/computation-BF16/main.py
+++ b/base/benchmarks/computation-BF16/main.py
@@ -1,3 +1,8 @@
+# Copyright (c) 2024 BAAI. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License")
+#!/usr/bin/env python3
+# -*- coding: UTF-8 -*-
 import torch
 import torch.distributed as dist
 import os
diff --git a/base/benchmarks/computation-FP16/main.py b/base/benchmarks/computation-FP16/main.py
index 452851b06..5ceefca7e 100644
--- a/base/benchmarks/computation-FP16/main.py
+++ b/base/benchmarks/computation-FP16/main.py
@@ -1,3 +1,8 @@
+# Copyright (c) 2024 BAAI. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License")
+#!/usr/bin/env python3
+# -*- coding: UTF-8 -*-
 import torch
 import torch.distributed as dist
 import os
diff --git a/base/benchmarks/computation-FP32/main.py b/base/benchmarks/computation-FP32/main.py
index 36ed84e53..cd2161fd5 100644
--- a/base/benchmarks/computation-FP32/main.py
+++ b/base/benchmarks/computation-FP32/main.py
@@ -1,3 +1,8 @@
+# Copyright (c) 2024 BAAI. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License")
+#!/usr/bin/env python3
+# -*- coding: UTF-8 -*-
 import torch
 import torch.distributed as dist
 import os
diff --git a/base/benchmarks/computation-FP64/main.py b/base/benchmarks/computation-FP64/main.py
index edd41cf88..d25f4a7c9 100644
--- a/base/benchmarks/computation-FP64/main.py
+++ b/base/benchmarks/computation-FP64/main.py
@@ -1,3 +1,8 @@
+# Copyright (c) 2024 BAAI. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License")
+#!/usr/bin/env python3
+# -*- coding: UTF-8 -*-
 import torch
 import torch.distributed as dist
 import os
diff --git a/base/benchmarks/computation-TF32/main.py b/base/benchmarks/computation-TF32/main.py
index f950d12dd..c94cd588c 100644
--- a/base/benchmarks/computation-TF32/main.py
+++ b/base/benchmarks/computation-TF32/main.py
@@ -1,3 +1,8 @@
+# Copyright (c) 2024 BAAI. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License")
+#!/usr/bin/env python3
+# -*- coding: UTF-8 -*-
 import torch
 import torch.distributed as dist
 import os
diff --git a/base/benchmarks/drivers/utils.py b/base/benchmarks/drivers/utils.py
index 256b346a3..dfb9b31c9 100644
--- a/base/benchmarks/drivers/utils.py
+++ b/base/benchmarks/drivers/utils.py
@@ -1,3 +1,8 @@
+# Copyright (c) 2024 BAAI. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License")
+#!/usr/bin/env python3
+# -*- coding: UTF-8 -*-
 import torch
 
 
@@ -36,4 +41,4 @@ def get_memory_capacity(vendor, rank):
         return torch.cuda.get_device_properties(rank).total_memory
     else:
         print("unspecified vendor {}, return -1.0".format(vendor))
-        return -1.0
\ No newline at end of file
+        return -1.0
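The `get_memory_capacity` helper touched above dispatches on a vendor string and returns the device memory in bytes (or `-1.0` for an unknown vendor). A minimal usage sketch; the import path and the `"nvidia"` vendor value are assumptions inferred from the file location and the visible branch, and a CUDA-enabled torch build is required:

```python
# Sketch: query device 0's memory via the helper from
# base/benchmarks/drivers/utils.py (import path assumed; run from base/benchmarks).
import torch
from drivers.utils import get_memory_capacity

if torch.cuda.is_available():
    total = get_memory_capacity("nvidia", 0)   # bytes, or -1.0 if vendor unknown
    print(f"{total / 2**30:.0f} GiB on device 0")
```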
diff --git a/base/benchmarks/main_memory-bandwidth/main.py b/base/benchmarks/main_memory-bandwidth/main.py
index 3cea75f9c..8b6830cd7 100644
--- a/base/benchmarks/main_memory-bandwidth/main.py
+++ b/base/benchmarks/main_memory-bandwidth/main.py
@@ -1,3 +1,8 @@
+# Copyright (c) 2024 BAAI. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License")
+#!/usr/bin/env python3
+# -*- coding: UTF-8 -*-
 import torch
 import torch.distributed as dist
 import os
diff --git a/base/container_main.py b/base/container_main.py
index 51bd2665f..629988653 100644
--- a/base/container_main.py
+++ b/base/container_main.py
@@ -1,3 +1,8 @@
+# Copyright (c) 2024 BAAI. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License")
+#!/usr/bin/env python3
+# -*- coding: UTF-8 -*-
 import time
 from loguru import logger
 import os
diff --git a/base/toolkits/computation-BF16/nvidia/gemm.cu b/base/toolkits/computation-BF16/nvidia/gemm.cu
index 400b6bca6..43fb61441 100644
--- a/base/toolkits/computation-BF16/nvidia/gemm.cu
+++ b/base/toolkits/computation-BF16/nvidia/gemm.cu
@@ -1,3 +1,7 @@
+// Copyright (c) 2024 BAAI. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License")
+
 #include
 #include
 #include
diff --git a/base/toolkits/computation-FP16/nvidia/gemm.cu b/base/toolkits/computation-FP16/nvidia/gemm.cu
index e1d7f7e0a..69b5da611 100644
--- a/base/toolkits/computation-FP16/nvidia/gemm.cu
+++ b/base/toolkits/computation-FP16/nvidia/gemm.cu
@@ -1,3 +1,7 @@
+// Copyright (c) 2024 BAAI. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License")
+
 #include
 #include
 #include
diff --git a/base/toolkits/computation-FP32/nvidia/gemm.cu b/base/toolkits/computation-FP32/nvidia/gemm.cu
index 2c44e60fb..6c6e1a27a 100644
--- a/base/toolkits/computation-FP32/nvidia/gemm.cu
+++ b/base/toolkits/computation-FP32/nvidia/gemm.cu
@@ -1,3 +1,7 @@
+// Copyright (c) 2024 BAAI. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License")
+
 #include
 #include
 #include
diff --git a/base/toolkits/computation-FP64/nvidia/gemm.cu b/base/toolkits/computation-FP64/nvidia/gemm.cu
index d3228c6d5..78e0b49a6 100644
--- a/base/toolkits/computation-FP64/nvidia/gemm.cu
+++ b/base/toolkits/computation-FP64/nvidia/gemm.cu
@@ -1,3 +1,6 @@
+// Copyright (c) 2024 BAAI. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License")
 #include
 #include
 #include
diff --git a/base/toolkits/computation-INT8/nvidia/gemm.cu b/base/toolkits/computation-INT8/nvidia/gemm.cu
index 3571b1408..ec179a9a9 100644
--- a/base/toolkits/computation-INT8/nvidia/gemm.cu
+++ b/base/toolkits/computation-INT8/nvidia/gemm.cu
@@ -1,3 +1,7 @@
+// Copyright (c) 2024 BAAI. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License")
+
 #include
 #include
 #include
diff --git a/base/toolkits/computation-TF32/nvidia/gemm.cu b/base/toolkits/computation-TF32/nvidia/gemm.cu
index 60c79b7f4..eeadf82bc 100644
--- a/base/toolkits/computation-TF32/nvidia/gemm.cu
+++ b/base/toolkits/computation-TF32/nvidia/gemm.cu
@@ -1,3 +1,7 @@
+// Copyright (c) 2024 BAAI. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License")
+
 #include
 #include
 #include
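The gemm.cu toolkits re-licensed above time dense matrix multiplies at each precision. For orientation, a PyTorch sketch of the same measurement idea for BF16 (this is not the toolkit's CUDA implementation; the matrix size and iteration counts are illustrative):

```python
# Sketch: time C = A @ B in BF16 and report TFLOPS = 2*M*N*K / seconds.
import time
import torch

m = n = k = 8192
a = torch.randn(m, k, dtype=torch.bfloat16, device="cuda")
b = torch.randn(k, n, dtype=torch.bfloat16, device="cuda")

for _ in range(10):                      # warm-up so clocks and caches settle
    _ = a @ b
torch.cuda.synchronize()

iters = 100
start = time.perf_counter()
for _ in range(iters):
    _ = a @ b
torch.cuda.synchronize()                 # matmuls are async; wait before timing
elapsed = (time.perf_counter() - start) / iters

print(f"{2 * m * n * k / elapsed / 1e12:.1f} TFLOPS")
```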
diff --git a/base/toolkits/main_memory-bandwidth/nvidia/bandwidth.cu b/base/toolkits/main_memory-bandwidth/nvidia/bandwidth.cu
index dd5e1a0ad..deb81dd3c 100644
--- a/base/toolkits/main_memory-bandwidth/nvidia/bandwidth.cu
+++ b/base/toolkits/main_memory-bandwidth/nvidia/bandwidth.cu
@@ -1,3 +1,7 @@
+// Copyright (c) 2024 BAAI. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License")
+
 #include
 #include
 
diff --git a/inference/requirements.txt b/inference/requirements.txt
new file mode 100644
index 000000000..8e6ceeab2
--- /dev/null
+++ b/inference/requirements.txt
@@ -0,0 +1 @@
+loguru==0.7.2
diff --git a/training/benchmarks/llama3_8B/megatron/README.md b/training/benchmarks/llama3_8B/megatron/README.md
new file mode 100644
index 000000000..b9f6da3c4
--- /dev/null
+++ b/training/benchmarks/llama3_8B/megatron/README.md
@@ -0,0 +1,52 @@
+# Model Information
+- Introduction
+
+LLaMA3 is a new generation of large language models developed by Meta. The first batch of publicly released models includes two sizes: 8B and 70B parameters. We used Megatron Core-v0.6.0 and implemented the LLaMA3-8B pre-training computation task based on the Megatron framework, following the algorithm architecture and configurations of LLaMA3.
+
+- Technical Blog (no paper had been published as of Apr. 28th, 2024)
+  [LLAMA3](https://llama.meta.com/llama3/)
+
+- Model code source
+
+Meta has open-sourced only the Llama3 model weights, tokenizer, and similar assets, not the pre-training code, pre-training implementation, or pre-training dataset. Given this, this benchmark case evaluates AI hardware by pre-training the LLaMA3-8B architecture designed in [meta-llama/Meta-Llama-3-8B · Hugging Face](https://huggingface.co/meta-llama/Meta-Llama-3-8B), built on the open-source Megatron framework and the open-source wudao dataset. The test case code was written by FlagPerf. Files that must be downloaded or prepared are listed in the **Data Preparation** section; external software and information dependencies are listed in the **Dependencies** section.
+
+
+# Data Preparation
+
+### Model configuration and tokenizer
+
+This is a pre-training case, so only the tokenizer needs to be downloaded, not the model code or weights (if you are unsure which files make up the tokenizer, you may download all llama3-8B files, even though the model weight files are not used). Request the tokenizer from the official llama3 channel and download the version used by 8B, then create a llama3_8b_hf directory under data_dir and process or store the files in the layout huggingface expects. To understand data_dir, read the FlagPerf training documentation, or directly modify the value inside the CASES variable in FlagPerf/training/run_benchmarks/config/test_conf.py.
+
+In keeping with the Llama3 open-source license, users unfamiliar with the background may (although this is not recommended) refer to the [unsloth/llama-3-8b at main (huggingface.co)](https://huggingface.co/unsloth/llama-3-8b/tree/main) repository for an example layout of the tokenizer files (the model weight part is not needed; if unsure, keep all files). The name of the llama3_8b_hf directory created under data_dir may be changed; if it is, the change must be mirrored in configuration files such as FlagPerf/training/nvidia/llama3_8B-megatron/config/config_A100x1x8.py. The default is llama3_8b_hf.
+
+
+### Dataset preparation
+
+This case uses BAAI's wudao dataset; the tokenizer must be requested from the official llama3 channel for preprocessing. Because the content of the test dataset is not a core factor in AI hardware benchmark results, you can refer to, or directly use, the preprocessed dataset released by Alibaba's Lingjun team: [Pai-Megatron-Patch/examples/llama3/README.md at main · alibaba/Pai-Megatron-Patch (github.com)](https://github.com/alibaba/Pai-Megatron-Patch/blob/main/examples/llama3/README.md#%E6%95%B0%E6%8D%AE%E9%9B%86%E5%92%8C%E6%A8%A1%E5%9E%8B%E4%B8%8B%E8%BD%BD)
+
+Locate and run the following two commands from that README:
+
+`wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/llama3-datasets/wudao_llama3bpe_content_document.bin`
+
+`wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/llama3-datasets/wudao_llama3bpe_content_document.idx`
+
+Place the two resulting files (.bin and .idx; see the Megatron-LM repository for an explanation of these file formats) under data_dir.
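Before launching, it can help to confirm the tokenizer directory loads at all. A sketch that mirrors how tokenizer.patch later in this PR constructs the tokenizer; the absolute path is an assumption combining the data_dir from test_conf.py with the default directory name:

```python
# Sanity-check sketch: load the prepared tokenizer the same way the patched
# Megatron code will (path assumed: <data_dir>/llama3_8b_hf).
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(
    "/data/llama3_8b_pretrain/llama3_8b_hf",
    use_fast=False,
    trust_remote_code=True,
)
ids = tok.encode("hello llama3")
print(ids, tok.decode(ids))
```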
+# Dependencies
+
+### llama3 (model architecture, tokenizer)
+
+META LLAMA 3 COMMUNITY LICENSE AGREEMENT
+Meta Llama 3 Version Release Date: April 18, 2024
+“Agreement” means the terms and conditions for use, reproduction, distribution and modification of the Llama Materials set forth herein.
+“Documentation” means the specifications, manuals and documentation accompanying Meta Llama 3 distributed by Meta at https://llama.meta.com/get-started/.
+“Licensee” or “you” means you, or your employer or any other person or entity (if you are entering into this Agreement on such person or entity’s behalf), of the age required under applicable laws, rules or regulations to provide legal consent and that has legal authority to bind your employer or such other person or entity if you are entering in this Agreement on their behalf.
+“Meta Llama 3” means the foundational large language models and software and algorithms, including machine-learning model code, trained model weights, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing distributed by Meta at https://llama.meta.com/llama-downloads.
+“Llama Materials” means, collectively, Meta’s proprietary Meta Llama 3 and Documentation (and any portion thereof) made available under this Agreement.
+“Meta” or “we” means Meta Platforms Ireland Limited (if you are located in or, if you are an entity, your principal place of business is in the EEA or Switzerland) and Meta Platforms, Inc. (if you are located outside of the EEA or Switzerland).
+By clicking “I Accept” below or by using or distributing any portion or element of the Llama Materials, you agree to be bound by this Agreement.
+1. License Rights and Redistribution.
+a. Grant of Rights. You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Meta’s intellectual property or other rights owned by Meta embodied in the Llama Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the Llama Materials.
+b. Redistribution and Use.
+i. If you distribute or make available the Llama Materials (or any derivative works thereof), or a product or service that uses any of them, including another AI model, you shall (A) provide a copy of this Agreement with any such Llama Materials; and (B) prominently display “Built with Meta Llama 3” on a related website, user interface, blogpost, about page, or product documentation. If you use the Llama Materials to create, train, fine tune, or otherwise improve an AI model, which is distributed or made available, you shall also include “Llama 3” at the beginning of any such AI model name.
+ii. If you receive Llama Materials, or any derivative works thereof, from a Licensee as part of an integrated end user product, then Section 2 of this Agreement will not apply to you.
+iii. You must retain in all copies of the Llama Materials that you distribute the following attribution notice within a “Notice” text file distributed as a part of such copies: “Meta Llama 3 is licensed under the Meta Llama 3 Community License, Copyright © Meta Platforms, Inc. All Rights Reserved.”
+iv. Your use of the Llama Materials must comply with applicable laws and regulations (including trade compliance laws and regulations) and adhere to the Acceptable Use Policy for the Llama Materials (available at https://llama.meta.com/llama3/use-policy), which is hereby incorporated by reference into this Agreement.
+v. You will not use the Llama Materials or any output or results of the Llama Materials to improve any other large language model (excluding Meta Llama 3 or derivative works thereof).
+2. Additional Commercial Terms. If, on the Meta Llama 3 version release date, the monthly active users of the products or services made available by or for Licensee, or Licensee’s affiliates, is greater than 700 million monthly active users in the preceding calendar month, you must request a license from Meta, which Meta may grant to you in its sole discretion, and you are not authorized to exercise any of the rights under this Agreement unless or until Meta otherwise expressly grants you such rights.
+3. Disclaimer of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE LLAMA MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN “AS IS” BASIS, WITHOUT WARRANTIES OF ANY KIND, AND META DISCLAIMS ALL WARRANTIES OF ANY KIND, BOTH EXPRESS AND IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE LLAMA MATERIALS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE LLAMA MATERIALS AND ANY OUTPUT AND RESULTS.
+4. Limitation of Liability. IN NO EVENT WILL META OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF META OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
+5. Intellectual Property.
+a. No trademark licenses are granted under this Agreement, and in connection with the Llama Materials, neither Meta nor Licensee may use any name or mark owned by or associated with the other or any of its affiliates, except as required for reasonable and customary use in describing and redistributing the Llama Materials or as set forth in this Section 5(a). Meta hereby grants you a license to use “Llama 3” (the “Mark”) solely as required to comply with the last sentence of Section 1.b.i. You will comply with Meta’s brand guidelines (currently accessible at https://about.meta.com/brand/resources/meta/company-brand/). All goodwill arising out of your use of the Mark will inure to the benefit of Meta.
+b. Subject to Meta’s ownership of Llama Materials and derivatives made by or for Meta, with respect to any derivative works and modifications of the Llama Materials that are made by you, as between you and Meta, you are and will be the owner of such derivative works and modifications.
+c. If you institute litigation or other proceedings against Meta or any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Llama Materials or Meta Llama 3 outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by you, then any licenses granted to you under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Meta from and against any claim by any third party arising out of or related to your use or distribution of the Llama Materials.
+6. Term and Termination. The term of this Agreement will commence upon your acceptance of this Agreement or access to the Llama Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Meta may terminate this Agreement if you are in breach of any term or condition of this Agreement. Upon termination of this Agreement, you shall delete and cease use of the Llama Materials. Sections 3, 4 and 7 shall survive the termination of this Agreement.
+7. Governing Law and Jurisdiction. This Agreement will be governed and construed under the laws of the State of California without regard to choice of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement. The courts of California shall have exclusive jurisdiction of any dispute arising out of this Agreement.
+
+### wudao dataset (dataset content)
+
+[BAAI-WuDao/WuDaoMM: WuDaoMM this is a data project (github.com)](https://github.com/BAAI-WuDao/WuDaoMM?tab=readme-ov-file)
+
+### Lingjun team (megatron-format data files, for reference)
+
+[alibaba/Pai-Megatron-Patch: The official repo of Pai-Megatron-Patch for LLM & VLM large scale training developed by Alibaba Cloud. (github.com)](https://github.com/alibaba/Pai-Megatron-Patch/tree/main)
+
+### megatron-core (computation task implementation)
+
+[NVIDIA/Megatron-LM: Ongoing research training transformer models at scale (github.com)](https://github.com/NVIDIA/Megatron-LM/tree/main?tab=License-1-ov-file#readme)
diff --git a/training/benchmarks/llama3_8B/megatron/arguments.patch b/training/benchmarks/llama3_8B/megatron/arguments.patch
new file mode 100644
index 000000000..9bda43d3d
--- /dev/null
+++ b/training/benchmarks/llama3_8B/megatron/arguments.patch
@@ -0,0 +1,7 @@
+# Copyright (c) 2024 BAAI. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License")
+#!/usr/bin/env python3
+# -*- coding: UTF-8 -*-
+1411a1412
+>     'Llama3Tokenizer',
diff --git a/training/benchmarks/llama3_8B/megatron/pretrain_llama3.sh b/training/benchmarks/llama3_8B/megatron/pretrain_llama3.sh
new file mode 100755
index 000000000..4eedec946
--- /dev/null
+++ b/training/benchmarks/llama3_8B/megatron/pretrain_llama3.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+# Copyright (c) 2024 BAAI. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License")
+#!/usr/bin/env python3
+# -*- coding: UTF-8 -*-
+# args
+
+DATA_DIR=$1
+GPUS_PER_NODE=$2
+NNODES=$3
+NODE_RANK=$4
+MASTER_ADDR=$5
+MASTER_PORT=$6
+MEGAPATH=$7
+MBS=$8
+ITERS=$9
+TP=${10}
+PP=${11}
+TDIR=${12}
+ADAPT=${13}
+
+# Runs the "LLaMA3-8B" parameter model
+export PYTHONPATH=$PYTHONPATH:$MEGAPATH
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+# algorithm args
+
+MODEL_ARGS=" \
+    --num-layers 32 \
+    --hidden-size 4096 \
+    --ffn-hidden-size 14336 \
+    --num-attention-heads 32 \
+    --group-query-attention \
+    --num-query-groups 8 \
+    --seq-length 8192 \
+    --max-position-embeddings 8192 \
+    --swiglu \
+    --normalization RMSNorm \
+    --norm-epsilon 1.0e-5 \
+    --global-batch-size 512 \
+    --attention-softmax-in-fp32"
+
+OPT_ARGS=" \
+    --lr 1.0e-5 \
+    --train-iters $ITERS \
+    --lr-decay-iters $ITERS \
+    --lr-decay-style cosine \
+    --min-lr 1.0e-6 \
+    --weight-decay 0.1 \
+    --clip-grad 1.0 \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.95 \
+    --init-method-std 0.008"
+
+ALGO_ARGS="$MODEL_ARGS $OPT_ARGS"
+
+# data args
+
+DATA_ARGS=" \
+    --data-path $DATA_DIR/wudao_llama3bpe_content_document \
+    --tokenizer-type Llama3Tokenizer \
+    --tokenizer-model $TDIR \
+    --split 100,0,0
+"
+
+# training args
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+TRAINING_ARGS=" \
+    --micro-batch-size $MBS \
+    --tensor-model-parallel-size $TP \
+    --pipeline-model-parallel-size $PP \
+    --sequence-parallel \
+    --bf16
+"
+
+# vendor args
+
+VENDOR_ARGS=" \
+    --transformer-impl transformer_engine \
+    --use-distributed-optimizer \
+    --use-mcore-models \
+    --use-flash-attn
+"
+
+OUTPUT_ARGS=" --log-interval 1"
+
+source $ADAPT
+run_cmd="torchrun $DISTRIBUTED_ARGS $MEGAPATH/pretrain_gpt.py \
+    $ALGO_ARGS \
+    $DATA_ARGS \
+    $TRAINING_ARGS \
+    $VENDOR_ARGS \
+    $OUTPUT_ARGS"
+
+echo ${run_cmd}
+eval ${run_cmd}
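A worked check (not part of the PR) of the batch arithmetic the script above fixes via `--global-batch-size 512`, using the defaults from config_A100x1x8.py (MBS=1, TP=1, PP=2 on one 8-GPU node):

```python
# Batch arithmetic implied by pretrain_llama3.sh with the 1x8 defaults.
world_size = 8
tp, pp, mbs, gbs, seqlen = 1, 2, 1, 512, 8192

dp = world_size // (tp * pp)      # data-parallel size = 4
grad_accum = gbs // (mbs * dp)    # 128 micro-steps accumulated per iteration
tokens_per_iter = gbs * seqlen    # 4,194,304, i.e. the "4M tokens" per step

print(dp, grad_accum, tokens_per_iter)
```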
diff --git a/training/benchmarks/llama3_8B/megatron/run_pretraining.py b/training/benchmarks/llama3_8B/megatron/run_pretraining.py
new file mode 100644
index 000000000..620c77d82
--- /dev/null
+++ b/training/benchmarks/llama3_8B/megatron/run_pretraining.py
@@ -0,0 +1,113 @@
+# Copyright (c) 2024 BAAI. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License")
+#!/usr/bin/env python3
+# -*- coding: UTF-8 -*-
+import subprocess
+from argparse import ArgumentParser
+import os
+import sys
+from importlib import import_module
+
+
+def parse_args():
+    '''we parse ddp related args, check system config args, and running env
+       args such as --data_dir_xxx. Then pass all useful args to the real
+       training script.
+    '''
+    parser = ArgumentParser(description="flagscale main python")
+    parser.add_argument("--nproc_per_node", type=int, required=True)
+    parser.add_argument("--nnodes", type=int, required=True)
+    parser.add_argument("--node_rank", type=int, required=True)
+    parser.add_argument("--master_addr", type=str, required=True)
+    parser.add_argument("--master_port", type=int, required=True)
+    parser.add_argument("--vendor", type=str, required=True)
+    parser.add_argument("--data_dir", type=str, required=True)
+    parser.add_argument("--log_dir", type=str, required=True)
+    parser.add_argument("--flagperf_config_file", type=str, required=True)
+    args, unknown_args = parser.parse_known_args()
+    args.unknown_args = unknown_args
+    return args
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    print(args)
+
+    sys.path.append(os.path.dirname(args.flagperf_config_file))
+    config_file = os.path.basename(args.flagperf_config_file).split('.')[0]
+    config_dir_path = os.path.dirname(args.flagperf_config_file)
+
+    module = import_module(config_file)
+
+    localbs = getattr(module, 'localbs')
+    train_steps = getattr(module, 'train_steps')
+    theoryflops = getattr(module, 'theoryflops')
+    megapath = getattr(module, 'megatron_path')
+    tensor_parallel = getattr(module, 'tensor_parallel')
+    pipeline_parallel = getattr(module, 'pipeline_parallel')
+    tokenizer = getattr(module, 'tokenizer_path')
+
+    nnodes = args.nnodes
+    nproc_per_node = args.nproc_per_node
+    node_rank = args.node_rank
+    master_addr = args.master_addr
+    master_port = args.master_port
+    data_dir = args.data_dir
+    tokenizer_dir = os.path.join(args.data_dir, tokenizer)
+
+    task_log_file = os.path.join(args.log_dir, "megatron.log.txt")
+
+    # merge llama3 patch
+
+    origin_file = os.path.join(megapath, "megatron/training/arguments.py")
+    exec_cmd = "patch --silent --forward " + origin_file + " arguments.patch -o tmp.py;mv tmp.py " + origin_file
+    exec_cmd = exec_cmd + ";"
+
+    origin_file = os.path.join(megapath, "megatron/training/tokenizer/tokenizer.py")
+    exec_cmd = exec_cmd + "patch --silent --forward " + origin_file + " tokenizer.patch -o tmp.py;mv tmp.py " + origin_file
+    exec_cmd = exec_cmd + ";"
+
+    # bash pretrain_llama3.sh
+
+    exec_cmd = exec_cmd + "bash pretrain_llama3.sh"
+
+    # args
+
+    exec_cmd = exec_cmd + " " + data_dir
+    exec_cmd = exec_cmd + " " + str(nproc_per_node)
+    exec_cmd = exec_cmd + " " + str(nnodes)
+    exec_cmd = exec_cmd + " " + str(node_rank)
+    exec_cmd = exec_cmd + " " + str(master_addr)
+    exec_cmd = exec_cmd + " " + str(master_port)
+    exec_cmd = exec_cmd + " " + megapath
+    exec_cmd = exec_cmd + " " + str(localbs)
+    exec_cmd = exec_cmd + " " + str(train_steps)
+    exec_cmd = exec_cmd + " " + str(tensor_parallel)
+    exec_cmd = exec_cmd + " " + str(pipeline_parallel)
+    exec_cmd = exec_cmd + " " + str(tokenizer_dir)
+    exec_cmd = exec_cmd + " " + os.path.join(config_dir_path, "training_adapter.sh")
+    print(exec_cmd)
+
+    with open(task_log_file, "w") as f:
+        p = subprocess.Popen(exec_cmd,
+                             shell=True,
+                             stdout=f,
+                             stderr=subprocess.STDOUT)
+        p.wait()
+
+    time_per_step = -1.0
+    with open(task_log_file) as f:
+        for line in f.readlines():
+            if "elapsed time per iteration (ms): " in line:
+                info = line.split("|")[2]
+                steptime = info.split(":")[1]
+                time_per_step = float(steptime) / 1000
+
+    whole_tps = 512 * 8192 / time_per_step
+    chip_tps = whole_tps / (args.nproc_per_node * args.nnodes)
+    print("System tokens per second: ", whole_tps)
+    print("Tokens/p/s: ", chip_tps)
+    print("MFU: ", chip_tps * 8000000000.0 * 6 / theoryflops)
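A worked example (not part of the PR) of the throughput and MFU arithmetic at the end of run_pretraining.py, plugging in the published A100 1x8 result to show the numbers are self-consistent:

```python
# Reproduce the script's metrics from the published 3698.7 tokens/p/s.
theoryflops = 312e12                         # A100 BF16 dense peak, from config
params = 8e9                                 # the 8000000000.0 in the script
time_per_step = 512 * 8192 / (3698.7 * 8)    # seconds per 4M-token iteration

whole_tps = 512 * 8192 / time_per_step       # system tokens per second
chip_tps = whole_tps / 8                     # per-GPU throughput
mfu = chip_tps * params * 6 / theoryflops    # ~6N FLOPs per trained token

print(f"{chip_tps:.1f} tok/p/s, MFU = {mfu:.1%}")   # 3698.7 tok/p/s, MFU = 56.9%
```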
diff --git a/training/benchmarks/llama3_8B/megatron/tokenizer.patch b/training/benchmarks/llama3_8B/megatron/tokenizer.patch
new file mode 100644
index 000000000..3c131e2cf
--- /dev/null
+++ b/training/benchmarks/llama3_8B/megatron/tokenizer.patch
@@ -0,0 +1,52 @@
+# Copyright (c) 2024 BAAI. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License")
+#!/usr/bin/env python3
+# -*- coding: UTF-8 -*-
+43a44,45
+>     elif args.tokenizer_type == 'Llama3Tokenizer':
+>         tokenizer = _Llama3Tokenizer(args.tokenizer_model)
+439a442,484
+>
+> class _Llama3Tokenizer(MegatronTokenizer):
+>     def __init__(self, model_file):
+>         super().__init__("placeholder")
+>         from transformers import AutoTokenizer
+>         self.tokenizer = AutoTokenizer.from_pretrained(
+>             model_file,
+>             padding_side="right",
+>             use_fast=False,
+>             trust_remote_code=True
+>         )
+>         self.extra_vocab_size = 256
+>
+>     @property
+>     def vocab_size(self):
+>         return self.tokenizer.vocab_size + self.extra_vocab_size
+>
+>     @property
+>     def vocab(self):
+>         return self.tokenizer.encoder
+>
+>     @property
+>     def inv_vocab(self):
+>         return self.tokenizer.decoder
+>
+>     def tokenize(self, text):
+>         return self.tokenizer.encode(text)
+>
+>     def detokenize(self, token_ids):
+>         return self.tokenizer.decode(token_ids)
+>
+>     @property
+>     def eod(self):
+>         return self.tokenizer.eos_token_id
+>
+>     @property
+>     def eos_token(self):
+>         return self.tokenizer.eos_token
+>
+>     @property
+>     def pad_token_id(self):
+>         return self.tokenizer.pad_token_id
+>
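On the `extra_vocab_size = 256` in the patch above: for Llama-3, Hugging Face's `tokenizer.vocab_size` reports only the 128000 base BPE entries, while the full vocabulary including the reserved special tokens is 128256, so the wrapper adds the difference back when Megatron sizes its embedding table. A quick check (a sketch; assumes the local llama3_8b_hf files, and the 128000/128256 values are expected rather than guaranteed across tokenizer versions):

```python
# Sketch: compare base vs. full vocabulary sizes of the prepared tokenizer.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("llama3_8b_hf", use_fast=False,
                                    trust_remote_code=True)
print(tok.vocab_size)          # expected 128000 (base BPE entries)
print(len(tok))                # expected 128256 (incl. reserved specials)
print(tok.vocab_size + 256)    # what the patched wrapper reports to Megatron
```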
diff --git a/training/nvidia/docker_image/megatron_core060/Dockerfile b/training/nvidia/docker_image/megatron_core060/Dockerfile
new file mode 100644
index 000000000..c5d46ae5a
--- /dev/null
+++ b/training/nvidia/docker_image/megatron_core060/Dockerfile
@@ -0,0 +1,5 @@
+FROM nvcr.io/nvidia/pytorch:24.02-py3
+RUN /bin/bash -c "pip config set global.index-url https://mirror.baidu.com/pypi/simple"
+RUN /bin/bash -c "uname -a"
+RUN /bin/bash -c alias python3=python
+RUN /bin/bash -c "git clone -b core_v0.6.0 https://github.com/NVIDIA/Megatron-LM.git"
diff --git a/training/nvidia/docker_image/megatron_core060/megatron_core060_install.sh b/training/nvidia/docker_image/megatron_core060/megatron_core060_install.sh
new file mode 100644
index 000000000..a9bf588e2
--- /dev/null
+++ b/training/nvidia/docker_image/megatron_core060/megatron_core060_install.sh
@@ -0,0 +1 @@
+#!/bin/bash
diff --git a/training/nvidia/llama3_8B-megatron/README.md b/training/nvidia/llama3_8B-megatron/README.md
new file mode 100644
index 000000000..3cf5170f6
--- /dev/null
+++ b/training/nvidia/llama3_8B-megatron/README.md
@@ -0,0 +1,61 @@
+
+### Nvidia GPU configuration and run information reference
+#### Environment
+- ##### Hardware
+
+  - Machine model: NVIDIA DGX A800 (80G)
+  - Accelerator model: NVIDIA_A800-SXM4-80GB
+  - CPU model: AMD EPYC7742-64core@1.5G
+  - Multi-node network type and bandwidth: InfiniBand, 200Gb/s
+
+- ##### Software
+
+  - OS version: Ubuntu 20.04
+  - OS kernel version: 5.4.0-126-generic
+  - Accelerator driver version: 470.141.10
+  - Docker version: 20.10.18
+  - Training framework version: megatron-core tag:core_v0.6.0
+  - Dependency versions: sentencepiece==0.2.0, transformers==4.40.1
+
+- ##### Parallelization strategy
+
+  - Parallel techniques: mixed tensor, pipeline, and data parallelism; see the "Run Results" section for the concrete scheme
+  - Implemented by: megatron-core
+  - Implementation details: PP2DP4TP1
+
+- ##### Optimizations
+
+  - flash attention 2
+  - recompute-activations
+  - transformer-engine impl
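A rough sketch (not repo code) of what PP2DP4TP1 means on one 8-GPU node: the ranks form four data-parallel replicas of a two-stage pipeline, each stage holding half of the 32 transformer layers. The parameter count per stage below uses the GQA and SwiGLU shapes from pretrain_llama3.sh:

```python
# Approximate per-stage parameter load under PP2DP4TP1 (illustrative only).
layers, pp = 32, 2
hidden, ffn, heads, kv_heads = 4096, 14336, 32, 8
head_dim = hidden // heads

attn = hidden * hidden * 2 + 2 * hidden * (kv_heads * head_dim)  # Q,O + GQA K,V
mlp = 3 * hidden * ffn                                           # SwiGLU gate/up/down
per_layer = attn + mlp
stage = (layers // pp) * per_layer

print(f"{layers // pp} layers/stage, ~{stage / 1e9:.2f}B params per pipeline stage")
```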
+
+### Run Results
+
+* Input batch sizes
+  1. local_batchsize (micro_batchsize), abbreviated LBS: the tensor batch size actually fed into the model, as written in config_A100x1x8.py; defaults to 1 in this case. **Vendors may change this freely when adapting**
+  2. seqlength (max_position_embedding), abbreviated MPE: the sequence length actually fed into the model, as written in config_A100x1x8.py; defaults to 8192 in this case and in principle must not be changed
+  3. global_batchsize is always local_batchsize \* gradient_accumulate_steps \* data_parallel_size. In this case, data_parallel_size = world_size/TPsize/PPsize. It defaults to 512, so that globalbatchsize = 4M tokens.
+
+* Common metrics
+
+| Metric | Value | Notes |
+| ------- | --------------------- | ---------------------------------------- |
+| Task category | natural language understanding | |
+| Model | llama3_8b | |
+| Dataset | wudao | wudao dataset from BAAI<br>bin/idx dataset files from the Alibaba Cloud Lingjun team<br>preprocessed with the llama3 tokenizer |
+| Precision | precision, see "Performance metrics" | fp32/amp/fp16/bf16 available |
+| Hyperparameter change | parallel, see "Performance metrics" | format PPxDPyTPz, e.g. PP2DP4TP1 |
+| Hyperparameter change | fix_hp, see "Performance metrics" | special hyperparameters needed to saturate the hardware for throughput evaluation |
+| Hardware | nvidia A100 | |
+| Memory usage | mem, see "Performance metrics" | commonly called "device memory", in GiB |
+| Compute utilization | MFU, see "Performance metrics" | as defined in the PaLM paper |
+| **Throughput** | **token/p/s, see "Performance metrics"** | average tokens processed per GPU per second |
+
+* Performance metrics
+
+Loss alignment requires that, from step 21 onward, the average relative error between each step's loss and nvidia's loss at the corresponding step is below 2%. Contact BAAI (Beijing Academy of Artificial Intelligence) to obtain the NV loss curve.
+
+| Config | precision | parallel | fix_hp | token/p/s | loss aligned | mem | MFU |
+| ------------- | --------- | --------- | ------ | --------- | ------ | ----- | ----- |
+| A100 single node, 8 GPUs (1x8) | bf16 | PP2DP4TP1 | / | 3698.7 | True | 76/80 | 56.9% |
+
diff --git a/training/nvidia/llama3_8B-megatron/config/config_A100x1x8.py b/training/nvidia/llama3_8B-megatron/config/config_A100x1x8.py
new file mode 100644
index 000000000..71ca6d08a
--- /dev/null
+++ b/training/nvidia/llama3_8B-megatron/config/config_A100x1x8.py
@@ -0,0 +1,7 @@
+tokenizer_path = "llama3_8b_hf"
+localbs = 1
+train_steps = 300
+theoryflops = 312000000000000.0
+megatron_path = "/workspace/Megatron-LM"  # need to be aligned with DockerFile. In NGCtorch, it's /workspace/ + Megatron-LM
+tensor_parallel = 1
+pipeline_parallel = 2
diff --git a/training/nvidia/llama3_8B-megatron/config/requirements.txt b/training/nvidia/llama3_8B-megatron/config/requirements.txt
new file mode 100644
index 000000000..20cdc820b
--- /dev/null
+++ b/training/nvidia/llama3_8B-megatron/config/requirements.txt
@@ -0,0 +1,2 @@
+sentencepiece==0.2.0
+transformers==4.40.1
diff --git a/training/nvidia/llama3_8B-megatron/config/training_adapter.sh b/training/nvidia/llama3_8B-megatron/config/training_adapter.sh
new file mode 100644
index 000000000..d95369427
--- /dev/null
+++ b/training/nvidia/llama3_8B-megatron/config/training_adapter.sh
@@ -0,0 +1 @@
+echo "[Prompt] nvidia adaption is NULL, for other Vendors"
diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py
index 1b4aaf984..ad912f2dd 100644
--- a/training/run_benchmarks/config/test_conf.py
+++ b/training/run_benchmarks/config/test_conf.py
@@ -62,12 +62,13 @@
 '''
 CASES = {
     # nvidia cases
-    "bert:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/bert/train/",
-    "glm:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/glm/train/",
-    "cpm:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/cpm/train/",
+    "llama3_8b:megatron_core060:A100:1:8:1": "/data/llama3_8b_pretrain"
+    # "bert:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/bert/train/",
+    # "glm:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/glm/train/",
+    # "cpm:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/cpm/train/",
 
-    #"llama2_7b_finetune:pytorch_2.0.1:A100:1:1:1": "/raid/dataset/llama2_finetune/",
-    #"aquila2_7b_finetune:flagscale:A800:1:8:1": "/raid/dataset/aquila2_7b_finetune",
+    # "llama2_7b_finetune:pytorch_2.0.1:A100:1:1:1": "/raid/dataset/llama2_finetune/",
+    # "aquila2_7b_finetune:flagscale:A800:1:8:1": "/raid/dataset/aquila2_7b_finetune",
     # "mobilenetv2:pytorch_1.8:A100:1:8:1": "/raid/dataset/ImageNet_1k_2012/",
     # "vit:pytorch_1.13:A100:1:8:1": "/raid/dataset/ImageNet_1k_2012/",
     # "efficientnet:pytorch_1.13:A100:1:8:1": "/raid/dataset/ImageNet_1k_2012/",
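For reference, a sketch of how the new CASES entry is consumed. The key appears to encode `model:docker image:chip:nnodes:nproc_per_node:repeat` (inferred from the commented examples, not confirmed by this diff), and run_pretraining.py imports the vendor config by file path, which can be reproduced standalone from the repo root:

```python
# Sketch: load the vendor config the same way run_pretraining.py does
# (run from the FlagPerf repo root; path matches the file added above).
import os
import sys
from importlib import import_module

cfg_path = "training/nvidia/llama3_8B-megatron/config/config_A100x1x8.py"
sys.path.append(os.path.dirname(cfg_path))
cfg = import_module(os.path.basename(cfg_path).split(".")[0])

print(cfg.localbs, cfg.train_steps, cfg.tensor_parallel, cfg.pipeline_parallel)
```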