TensorRT-LLM v0.16 Release #2611

Merged 1 commit on Dec 24, 2024
91 changes: 91 additions & 0 deletions .github/workflows/blossom-ci.yml
@@ -0,0 +1,91 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# A workflow to trigger ci on hybrid infra (github + self hosted runner)
name: Blossom-CI
on:
  issue_comment:
    types: [created]
  workflow_dispatch:
    inputs:
      platform:
        description: 'runs-on argument'
        required: false
      args:
        description: 'argument'
        required: false
jobs:
  Authorization:
    name: Authorization
    runs-on: blossom
    outputs:
      args: ${{ env.args }}

    # This job only runs for pull request comments
    if: |
      github.event.comment.body == '/build' && (github.actor == 'niukuo' || github.actor == 'niukuo')
    steps:
      - name: Check if comment is issued by authorized person
        run: blossom-ci
        env:
          OPERATION: 'AUTH'
          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }}

  Vulnerability-scan:
    name: Vulnerability scan
    needs: [Authorization]
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v2
        with:
          repository: ${{ fromJson(needs.Authorization.outputs.args).repo }}
          ref: ${{ fromJson(needs.Authorization.outputs.args).ref }}
          lfs: 'true'

      - name: Run blossom action
        uses: NVIDIA/blossom-action@main
        env:
          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }}
        with:
          args1: ${{ fromJson(needs.Authorization.outputs.args).args1 }}
          args2: ${{ fromJson(needs.Authorization.outputs.args).args2 }}
          args3: ${{ fromJson(needs.Authorization.outputs.args).args3 }}

  Job-trigger:
    name: Start ci job
    needs: [Vulnerability-scan]
    runs-on: blossom
    steps:
      - name: Start ci job
        run: blossom-ci
        env:
          OPERATION: 'START-CI-JOB'
          CI_SERVER: ${{ secrets.CI_SERVER }}
          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}

  Upload-Log:
    name: Upload log
    runs-on: blossom
    if: github.event_name == 'workflow_dispatch'
    steps:
      - name: Jenkins log for pull request ${{ fromJson(github.event.inputs.args).pr }} (click here)
        run: blossom-ci
        env:
          OPERATION: 'POST-PROCESSING'
          CI_SERVER: ${{ secrets.CI_SERVER }}
          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
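
For reference, a `workflow_dispatch` run of this workflow can be started from the GitHub CLI (a sketch using standard `gh` usage; the input values are illustrative placeholders, not values defined by this PR):

```
# Manually dispatch the workflow; `platform` and `args` are the
# workflow_dispatch inputs declared above (values are illustrative).
gh workflow run blossom-ci.yml -f platform=blossom -f args="stage=test"
```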
1 change: 1 addition & 0 deletions .gitignore
@@ -44,6 +44,7 @@ docs/source/llm-api-examples/llm_*.rst
# Testing
.coverage.*
results_trt/
llm-test-workspace/

# build/debug
*.safetensors
3 changes: 3 additions & 0 deletions .gitmodules
@@ -17,3 +17,6 @@
[submodule "3rdparty/pybind11"]
path = 3rdparty/pybind11
url = https://github.com/pybind/pybind11.git
[submodule "3rdparty/xgrammar"]
path = 3rdparty/xgrammar
url = https://github.com/mlc-ai/xgrammar.git
1 change: 1 addition & 0 deletions 3rdparty/xgrammar
Submodule xgrammar added at b9a16d
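
After pulling this change, the new submodule must be initialized locally; this is standard git usage, not a command introduced by this PR:

```
# Fetch the newly added xgrammar submodule at its pinned commit
git submodule update --init --recursive 3rdparty/xgrammar
```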
23 changes: 16 additions & 7 deletions README.md
@@ -5,25 +5,34 @@ TensorRT-LLM
<h4> A TensorRT Toolbox for Optimized Large Language Model Inference</h4>

[![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://nvidia.github.io/TensorRT-LLM/)
[![python](https://img.shields.io/badge/python-3.12.3-green)](https://www.python.org/downloads/release/python-3123/)
[![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31012/)
[![cuda](https://img.shields.io/badge/cuda-12.6.2-green)](https://developer.nvidia.com/cuda-downloads)
[![trt](https://img.shields.io/badge/TRT-10.6.0-green)](https://developer.nvidia.com/tensorrt)
[![version](https://img.shields.io/badge/release-0.15.0-green)](./tensorrt_llm/version.py)
[![cuda](https://img.shields.io/badge/cuda-12.6.3-green)](https://developer.nvidia.com/cuda-downloads)
[![trt](https://img.shields.io/badge/TRT-10.7.0-green)](https://developer.nvidia.com/tensorrt)
[![version](https://img.shields.io/badge/release-0.16.0-green)](./tensorrt_llm/version.py)
[![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)

[Architecture](./docs/source/architecture/overview.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Results](./docs/source/performance/perf-overview.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Examples](./examples/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Documentation](./docs/source/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Roadmap](https://docs.google.com/presentation/d/1gycPmtdh7uUcH6laOvW65Dbp9F1McUkGDIcAyjicBZs/edit?usp=sharing)
[Architecture](./docs/source/architecture/overview.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Performance](./docs/source/performance/perf-overview.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Examples](./examples/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Documentation](./docs/source/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Roadmap](https://docs.google.com/presentation/d/1gycPmtdh7uUcH6laOvW65Dbp9F1McUkGDIcAyjicBZs/edit?usp=sharing)

---
<div align="left">

## Latest News
* [2024/11/19] Llama 3.2 Full-Stack Optimizations Unlock High Performance on NVIDIA GPUs
[➡️ link](https://developer.nvidia.com/blog/llama-3-2-full-stack-optimizations-unlock-high-performance-on-nvidia-gpus/?ncid=so-link-721194)
<div align="center">
<img src="https://developer-blogs.nvidia.com/wp-content/uploads/2024/11/three-llamas-holding-number-10-signs-1.jpg" width="50%">
<div align="left">

* [2024/11/09] 🚀🚀🚀 3x Faster AllReduce with NVSwitch and TensorRT-LLM MultiShot
[➡️ link](https://developer.nvidia.com/blog/3x-faster-allreduce-with-nvswitch-and-tensorrt-llm-multishot/)

* [2024/11/09] ✨ NVIDIA advances the AI ecosystem with the AI model of LG AI Research 🙌
[➡️ link](https://blogs.nvidia.co.kr/blog/nvidia-lg-ai-research/)

* [2024/11/02] 🌟🌟🌟 NVIDIA and LlamaIndex Developer Contest
🙌 Enter for a chance to win prizes including an NVIDIA® GeForce RTX™ 4080 SUPER GPU, DLI credits, and more🙌
[➡️ link](https://developer.nvidia.com/llamaindex-developer-contest)
<div align="center">
<img src="docs/source/media/image-11-02-2024.png" width="50%">
<div align="left">

* [2024/10/28] 🏎️🏎️🏎️ NVIDIA GH200 Superchip Accelerates Inference by 2x in Multiturn Interactions with Llama Models
[➡️ link](https://developer.nvidia.com/blog/nvidia-gh200-superchip-accelerates-inference-by-2x-in-multiturn-interactions-with-llama-models/)
3 changes: 2 additions & 1 deletion benchmarks/cpp/CMakeLists.txt
@@ -25,7 +25,7 @@ if(NOT TARGET cxxopts::cxxopts)
endif()

function(add_benchmark test_name test_src)
add_executable(${test_name} ${test_src})
add_executable(${test_name} ${test_src} utils/utils.cpp)

target_link_libraries(
${test_name} PUBLIC ${SHARED_TARGET} nvinfer_plugin_tensorrt_llm
@@ -40,3 +40,4 @@ endfunction()
add_benchmark(gptSessionBenchmark gptSessionBenchmark.cpp)
add_benchmark(bertBenchmark bertBenchmark.cpp)
add_benchmark(gptManagerBenchmark gptManagerBenchmark.cpp)
add_benchmark(disaggServerBenchmark disaggServerBenchmark.cpp)
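
With the target registered, the new benchmark builds like the existing ones (a sketch assuming the repo's usual `cpp/build` CMake layout):

```
# Build only the newly added benchmark target
cmake --build cpp/build --target disaggServerBenchmark
```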
43 changes: 43 additions & 0 deletions benchmarks/cpp/README.md
@@ -352,3 +352,46 @@ If you want to obtain context and generation logits, you could build an engine w
If you want to get the logits, you could run gptSessionBenchmark with `--print_all_logits`. This will print a large number of logit values and has a certain impact on performance.

*Please note that the expected outputs in that document are only for reference, specific performance numbers depend on the GPU you're using.*


### 4. Launch C++ disaggServerBenchmark
Currently, TensorRT-LLM has limited support for disaggregated inference, where the context and generation phases of a request can run on different executors. `disaggServerBenchmark` is a tool for benchmarking disaggregated inference.

#### Usage
For detailed usage, run the following:
```
cd cpp/build

# You can directly execute the binary for help information
./benchmarks/disaggServerBenchmark --help
```
`disaggServerBenchmark` only supports `decoder-only` models.
Here is the basic usage:
```
mpirun -n ${proc} benchmarks/disaggServerBenchmark --context_engine_dirs ${context_engine_0},${context_engine_1}...,${context_engine_{m-1}} \
--generation_engine_dirs ${generation_engine_0},${generation_engine_1}...,${generation_engine_{n-1}} --dataset ${dataset_path}
```
This command launches m context engines and n generation engines. You need to ensure `proc` equals the total number of processes required by all engines, plus 1: because `disaggServerBenchmark` runs in orchestrator mode, one additional process serves as the orchestrator. For example, with two context engines (one TP2_PP1, the other TP1_PP1) and two generation engines (one TP2_PP1, the other TP1_PP1), `proc` should be set to 7:
```
mpirun -n 7 benchmarks/disaggServerBenchmark --context_engine_dirs ${llama_7b_tp2_pp1_dir},${llama_7b_tp1_pp1_dir} --generation_engine_dirs ${llama_7b_tp1_pp1_dir},${llama_7b_tp2_pp1_dir} --dataset ${dataset_path}

# This example needs 6 GPUs and 7 processes to launch the benchmark.
```
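
The rank count in this example breaks down as follows (a restatement of the rule above, not additional tool output):

```
# Each engine occupies TP * PP ranks; orchestrator mode adds one more.
# context engines:    TP2_PP1 -> 2 ranks, TP1_PP1 -> 1 rank
# generation engines: TP1_PP1 -> 1 rank,  TP2_PP1 -> 2 ranks
# total: proc = 2 + 1 + 1 + 2 + 1 (orchestrator) = 7
```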

#### Known Issues

##### 1. Error `All available sequence slots are used`

If the generation engine's `pp_size` > 1, the error "All available sequence slots are used" may occur. Setting and adjusting the `--request_rate` parameter may help alleviate the problem, as in the sketch below.
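
A sketch of such a run (the rate value is purely illustrative, not a recommended setting):

```
# Throttle request submission to reduce pressure on sequence slots
mpirun -n ${proc} benchmarks/disaggServerBenchmark --context_engine_dirs ${context_engine_dirs} \
    --generation_engine_dirs ${generation_engine_dirs} --dataset ${dataset_path} --request_rate 10
```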

##### 2. KVCache transfers default to PCIe on a single node
Currently, because of dependency libraries, KVCache transfers on a single node go over PCIe by default.

If you want to use NVLink, please check the UCX version in the container by running:
```
ucx_info -v
```
If the UCX version is 1.17 or earlier, set `UCX_RNDV_FRAG_MEM_TYPE=cuda` to enable KVCache transfers using NVLink.
If the UCX version is 1.18, set `UCX_CUDA_COPY_ASYNC_MEM_TYPE=cuda` to enable KVCache transfers using NVLink.
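
For example, export the matching variable before launching the benchmark (only one of the two applies, depending on the reported UCX version):

```
# UCX <= 1.17
export UCX_RNDV_FRAG_MEM_TYPE=cuda
# UCX == 1.18
export UCX_CUDA_COPY_ASYNC_MEM_TYPE=cuda
```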