[None][chore] Mass integration of release/1.0 - 2nd #7171
Changes from all commits
ee20f84
b7a7977
76736a1
eea0ebd
b2c953f
ccd36f4
60a944c
92e209c
8928405
771786f
0a4f757
f519b8c
be7c94f
e223cdb
34feef8
7ecbcc2
89ddff3
0b9c2ca
a0edae4
c959a07
ebe78d8
3b8c574
f4378c2
f64603e
ac36633
caa1897
7e98138
788adf2
7c1529b
e6b473a
c5fc171
3e29505
7040a83
@@ -1,10 +1,5 @@
 # This file defines code ownership rules for the repository.
 
-# The following rule should only be uncommented on release branches (e.g., release/0.19).
-# The rule below requires that any PR to release/**/* branches must be approved by at least one member
-# of the NVIDIA/trt-llm-release-branch-approval team, regardless of who else approves the PR.
-# Without approval from a member of this team, PRs cannot be merged to release branches.
-# * @NVIDIA/trt-llm-release-branch-approval
 
 ## TensorRT-LLM Infra
 ### CI
@@ -160,3 +155,9 @@ docs/source/performance/perf-benchmarking.md @NVIDIA/trtllm-bench-reviewers
 # from a member of this team, PRs affecting public APIs cannot be merged to main or release branches.
 /tests/unittest/api_stability/ @NVIDIA/trt-llm-noncommitted-api-review-committee
 /tests/unittest/api_stability/references_committed/ @NVIDIA/trt-llm-committed-api-review-committee
+
+# The following rule should only be uncommented on release branches (e.g., release/0.19).
+# The rule below requires that any PR to release/**/* branches must be approved by at least one member
+# of the NVIDIA/trt-llm-release-branch-approval team, regardless of who else approves the PR.
+# Without approval from a member of this team, PRs cannot be merged to release branches.
+# * @NVIDIA/trt-llm-release-branch-approval
Comment on lines +159 to +163

🛠️ Refactor suggestion

Clarify the comment: enabling the catch-all on release branches will override ALL specific owners (last-match-wins). If you need AND semantics (a release approver in addition to path/API owners), CODEOWNERS can't express it with a trailing catch-all.

-# The following rule should only be uncommented on release branches (e.g., release/0.19).
-# The rule below requires that any PR to release/**/* branches must be approved by at least one member
-# of the NVIDIA/trt-llm-release-branch-approval team, regardless of who else approves the PR.
-# Without approval from a member of this team, PRs cannot be merged to release branches.
+# Release-branch only — DO NOT uncomment on main.
+# Important: CODEOWNERS is last-match-wins. Uncommenting the catch-all below on a release branch
+# will override all specific owners above (including API review committees). If you need AND semantics
+# (release approver in addition to per-path owners), maintain a separate CODEOWNERS on release/**
+# that appends @NVIDIA/trt-llm-release-branch-approval to each path, or enforce via Rulesets “Required reviewers”.
+# Example for release/* (keep commented here):
 # * @NVIDIA/trt-llm-release-branch-approval
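If a release branch ever needs the per-path variant described in the comment above, a small throwaway script could generate it from the main CODEOWNERS. The sketch below is illustrative only and not part of this PR; the input/output file names and the idea of appending the release team to every rule are assumptions taken from the review comment.

```python
# Hypothetical helper (not part of this PR): derive a release-branch CODEOWNERS by
# appending the release-approval team to every ownership rule, as the review comment
# above suggests. Input/output paths are assumptions for illustration only.
from pathlib import Path

RELEASE_TEAM = "@NVIDIA/trt-llm-release-branch-approval"


def append_release_team(codeowners_text: str) -> str:
    out = []
    for line in codeowners_text.splitlines():
        stripped = line.strip()
        # Leave comments and blank lines untouched; extend every real rule.
        if stripped and not stripped.startswith("#") and RELEASE_TEAM not in line:
            out.append(f"{line} {RELEASE_TEAM}")
        else:
            out.append(line)
    return "\n".join(out) + "\n"


if __name__ == "__main__":
    Path("CODEOWNERS.release").write_text(
        append_release_team(Path("CODEOWNERS").read_text()))
```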
@@ -392,8 +392,8 @@ class FusedMoeRunner : public torch::CustomClassHolder
         std::vector<int64_t> output_shape = {num_rows, unpadded_hidden_size_val};
         auto output = torch::empty(output_shape, input.options().dtype(mOutputDtype));
 
-        WorkspaceInfo workspace_info = getWorkspaceInfo(num_rows, hidden_size, inter_size, num_experts_total,
-            static_cast<int>(experts_per_token), base_activation_type, parallelism_config, min_latency_mode);
+        WorkspaceInfo const& workspace_info = getWorkspaceInfo(num_rows, hidden_size, inter_size, num_experts_total,
+            static_cast<int>(experts_per_token), base_activation_type, parallelism_config, min_latency_mode, stream);
 
         auto const quant_params = getQuantParams(num_experts_on_rank, hidden_size, inter_size, quant_scales);
         kernels::MoeMinLatencyParams min_latency_params{};
@@ -553,8 +553,8 @@ class FusedMoeRunner : public torch::CustomClassHolder
         min_latency_params.experts_to_token_score = static_cast<float*>(experts_to_token_score.data_ptr());
         min_latency_params.active_expert_global_ids = static_cast<int*>(active_expert_global_ids.data_ptr());
 
-        WorkspaceInfo workspace_info = getWorkspaceInfo(num_rows, hidden_size, inter_size, num_experts_total,
-            static_cast<int>(experts_per_token), base_activation_type, parallelism_config, min_latency_mode);
+        WorkspaceInfo const& workspace_info = getWorkspaceInfo(num_rows, hidden_size, inter_size, num_experts_total,
+            static_cast<int>(experts_per_token), base_activation_type, parallelism_config, min_latency_mode, stream);
 
         auto const quant_params = getQuantParams(num_experts_on_rank, hidden_size, inter_size, quant_scales);
 
@@ -709,6 +709,7 @@ class FusedMoeRunner : public torch::CustomClassHolder
     // e.g. 16 nvfp4 elements are packed into a single int64 element
     int64_t mInnerDimMultiplier;
     char* mProfileWorkspace = nullptr;
+    WorkspaceInfo workspace_info;
 
     bool mUseDeepSeekFP8BlockScaling = false;
     bool mUseW4GroupScaling = false;
@@ -757,9 +758,9 @@ class FusedMoeRunner : public torch::CustomClassHolder
         mKernelRunner->setTactic(best_gemm1_profile, best_gemm2_profile);
     }
 
-    WorkspaceInfo getWorkspaceInfo(int64_t const num_rows, int64_t const hidden_size, int64_t const inter_size,
+    WorkspaceInfo const& getWorkspaceInfo(int64_t const num_rows, int64_t const hidden_size, int64_t const inter_size,
         int num_experts, int experts_per_token, ActivationType activation_type,
-        kernels::MOEParallelismConfig const& parallelismConfig, bool min_latency_mode)
+        kernels::MOEParallelismConfig const& parallelismConfig, bool min_latency_mode, cudaStream_t stream)
     {
         size_t moe_workspace_size = mKernelRunner->getWorkspaceSize(num_rows, hidden_size, inter_size, num_experts,
             experts_per_token, activation_type, parallelismConfig, /* use_lora */ false, mUseDeepSeekFP8BlockScaling,
@@ -768,15 +769,29 @@ class FusedMoeRunner : public torch::CustomClassHolder
 
         std::vector<size_t> workspaces{moe_workspace_size, src_to_dest_map_size};
 
-        size_t total_workspace_size = common::calculateTotalWorkspaceSize(workspaces.data(), workspaces.size());
+        int64_t const total_workspace_size = common::calculateTotalWorkspaceSize(workspaces.data(), workspaces.size());
 
-        WorkspaceInfo info{};
-        info.workspace = torch::empty({static_cast<long>(total_workspace_size)},
-            torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
-        info.src_to_dest_map
-            = common::nextWorkspacePtr(static_cast<int8_t*>(info.workspace.data_ptr()), moe_workspace_size);
+        bool is_capturing = tensorrt_llm::common::isCapturing(stream);
+        // Always allocate workspace when capturing cuda graph to avoid illegal memory access during replay
+        if (is_capturing || workspace_info.workspace.numel() < total_workspace_size)
+        {
+            if (is_capturing)
+            {
+                TLLM_LOG_DEBUG(
+                    "Allocating MoE workspace with %ld bytes size during cuda graph capture", total_workspace_size);
+            }
+            else
+            {
+                TLLM_LOG_DEBUG("MoE workspace size is not enough, increase the size from %ld bytes to %ld bytes",
+                    workspace_info.workspace.numel(), total_workspace_size);
+            }
+            workspace_info.workspace = torch::empty({static_cast<long>(total_workspace_size)},
+                torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
+        }
Comment on lines +772 to +790

🛠️ Refactor suggestion

Use size_t for byte sizes and correct printf-specifiers; avoid narrowing to long.

Apply this diff:

-        int64_t const total_workspace_size = common::calculateTotalWorkspaceSize(workspaces.data(), workspaces.size());
+        size_t const total_workspace_size = common::calculateTotalWorkspaceSize(workspaces.data(), workspaces.size());
@@
-                TLLM_LOG_DEBUG(
-                    "Allocating MoE workspace with %ld bytes size during cuda graph capture", total_workspace_size);
+                TLLM_LOG_DEBUG(
+                    "Allocating MoE workspace with %zu bytes size during cuda graph capture", total_workspace_size);
@@
-                TLLM_LOG_DEBUG("MoE workspace size is not enough, increase the size from %ld bytes to %ld bytes",
-                    workspace_info.workspace.numel(), total_workspace_size);
+                TLLM_LOG_DEBUG("MoE workspace size is not enough, increase the size from %lld bytes to %zu bytes",
+                    static_cast<long long>(mWorkspaceInfo.workspace.numel()), total_workspace_size);
@@
-            workspace_info.workspace = torch::empty({static_cast<long>(total_workspace_size)},
+            mWorkspaceInfo.workspace = torch::empty({static_cast<int64_t>(total_workspace_size)},
                 torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
+
+        workspace_info.src_to_dest_map
+            = common::nextWorkspacePtr(static_cast<int8_t*>(workspace_info.workspace.data_ptr()), moe_workspace_size);
 
-        return info;
+        return workspace_info;
     }
 
     kernels::QuantParams getQuantParams(int64_t const num_experts_on_rank, int64_t const hidden_size,
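For readers more familiar with the Python side, here is a minimal PyTorch sketch of the caching policy the C++ change above implements: keep one grow-only workspace tensor, but always allocate a fresh one while a CUDA graph is being captured so that replay does not hit a stale pointer. The class and method names are illustrative, not from the PR, and the sketch assumes a CUDA device is available.

```python
import torch


class MoeWorkspaceCache:
    """Illustrative grow-only workspace cache mirroring the C++ logic above (not the actual implementation)."""

    def __init__(self) -> None:
        self._workspace = torch.empty(0, dtype=torch.int8, device="cuda")

    def get(self, required_bytes: int) -> torch.Tensor:
        capturing = torch.cuda.is_current_stream_capturing()
        # Reallocate when capturing a CUDA graph (to avoid illegal memory access on
        # replay) or when the cached buffer is too small; otherwise reuse it as-is.
        if capturing or self._workspace.numel() < required_bytes:
            self._workspace = torch.empty(required_bytes, dtype=torch.int8, device="cuda")
        return self._workspace
```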
@@ -174,7 +174,8 @@ FROM wheel AS tritonbuild
 WORKDIR /src/tensorrt_llm
 RUN pip install /src/tensorrt_llm/build/tensorrt_llm*.whl
 COPY ./triton_backend/ ./triton_backend/
-RUN bash ./triton_backend/inflight_batcher_llm/scripts/build.sh
+ARG TRITON_BASE_TAG
+RUN bash ./triton_backend/inflight_batcher_llm/scripts/build.sh -s "r${TRITON_BASE_TAG%-py3}"
Comment on lines +177 to 179

🛠️ Refactor suggestion

Pass TRITON_SHORT_TAG from TRITON_BASE_TAG: good; broaden suffix stripping. The current pattern strips only "-py3". If we ever move to tags like "-py3.11", the suffix won't be removed. Use "-py3*".

Apply this diff:

-RUN bash ./triton_backend/inflight_batcher_llm/scripts/build.sh -s "r${TRITON_BASE_TAG%-py3}"
+RUN bash ./triton_backend/inflight_batcher_llm/scripts/build.sh -s "r${TRITON_BASE_TAG%-py3*}"
 
 FROM release AS tritonrelease
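As a side note on the shell parameter expansion the review discusses, the intent is to turn a Triton base tag such as "25.05-py3" into the short form "r25.05". A small Python illustration of the difference between the two patterns follows; the tag values are made up for the example and are not taken from the PR.

```python
def short_tag(base_tag: str, broadened: bool = True) -> str:
    """Mimic "r${TRITON_BASE_TAG%-py3}" (broadened=False) vs "r${TRITON_BASE_TAG%-py3*}" (broadened=True)."""
    marker = "-py3"
    if broadened:
        # "%-py3*" drops the shortest suffix starting at "-py3", whatever follows it.
        idx = base_tag.rfind(marker)
        stripped = base_tag[:idx] if idx != -1 else base_tag
    else:
        # "%-py3" only drops a literal "-py3" suffix.
        stripped = base_tag[:-len(marker)] if base_tag.endswith(marker) else base_tag
    return "r" + stripped


print(short_tag("25.05-py3", broadened=False))     # r25.05
print(short_tag("25.05-py3.11", broadened=False))  # r25.05-py3.11 (suffix not stripped)
print(short_tag("25.05-py3.11", broadened=True))   # r25.05
```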
@@ -0,0 +1,9 @@
+# LLM API with TensorRT Engine
+A simple inference example with TinyLlama using the LLM API:
+
+```{literalinclude} ../../examples/llm-api/_tensorrt_engine/quickstart_example.py
+:language: python
+:linenos:
+```
+
Comment on lines +4 to +7

💡 Verification agent

🧩 Analysis chain

Fix the literalinclude relative path (currently off by one level). The code block includes the script via "../../examples/…", but this document lives under docs/source/legacy. To reach the repo-level examples directory, you need to go up three levels (not two). The link below (line 9) already uses "../../../…", which further indicates the mismatch.

Apply this diff:

-```{literalinclude} ../../examples/llm-api/_tensorrt_engine/quickstart_example.py
+```{literalinclude} ../../../examples/llm-api/_tensorrt_engine/quickstart_example.py
 :language: python
 :linenos:

Length of output: 11225

Let's bypass the PATH issue and invoke Sphinx as a module:

🏁 Script executed:

#!/bin/bash
# Strict docs build to fail on warnings (invoke via module to avoid PATH problems)
python -m sphinx -nW docs/source docs/_build/html

Length of output: 2359

🏁 Script executed:

#!/bin/bash
# Verify existence of example file via both relative paths from the docs file
BASE="docs/source/legacy"
INCORRECT="../../examples/llm-api/_tensorrt_engine/quickstart_example.py"
CORRECT="../../../examples/llm-api/_tensorrt_engine/quickstart_example.py"
echo "Checking incorrect path (../../examples/...):"
if [[ -f "${BASE}/${INCORRECT}" ]]; then
  echo "FOUND: ${BASE}/${INCORRECT}"
else
  echo "MISSING: ${BASE}/${INCORRECT}"
fi
echo "Checking corrected path (../../../examples/...):"
if [[ -f "${BASE}/${CORRECT}" ]]; then
  echo "FOUND: ${BASE}/${CORRECT}"
else
  echo "MISSING: ${BASE}/${CORRECT}"
fi

Length of output: 949

Fix the literalinclude path in docs/source/legacy/tensorrt_quickstart.md. The current directive only goes up two levels; it needs three to reach the repo-level examples directory.

Apply this patch:

--- docs/source/legacy/tensorrt_quickstart.md
@@ lines 4-7
-```{literalinclude} ../../examples/llm-api/_tensorrt_engine/quickstart_example.py
+```{literalinclude} ../../../examples/llm-api/_tensorrt_engine/quickstart_example.py
 :language: python
 :linenos:
+For more advanced usage including distributed inference, multimodal, and speculative decoding, please refer to this [README](../../../examples/llm-api/README.md).
💡 Verification agent

🧩 Analysis chain

Avoid linking to files outside the docs tree (may break on the published site). The relative link to ../../../examples/... likely won't exist in the built site output. Prefer an absolute GitHub URL, or move/duplicate the README content into the docs tree and link via :doc: or toctree.

To validate locally (fail on warnings and catch include/link issues):

🏁 Script executed:

#!/bin/bash
set -euo pipefail
python -m pip install -r docs/requirements.txt
# Build HTML with nitpicky warnings as errors
python -m sphinx -nW docs/source docs/_build/html
# Spot-check the target file exists relative to this doc (sanity)
BASE="docs/source/legacy"
TARGET="../../../examples/llm-api/README.md"
if [[ -f "${BASE}/${TARGET}" ]]; then
  echo "Found file at ${BASE}/${TARGET} (but it still won't be copied into the site)."
else
  echo "File not found at ${BASE}/${TARGET} — link will be broken in source as well."
fi

Length of output: 13460

🏁 Script executed:

#!/bin/bash
# Verify existence of the linked README relative to the docs tree
if [[ -f docs/source/legacy/../../../examples/llm-api/README.md ]]; then
  echo "Found file at docs/source/legacy/../../../examples/llm-api/README.md"
else
  echo "File not found at docs/source/legacy/../../../examples/llm-api/README.md"
fi

Length of output: 212

Ensure documentation links remain within the built site. Linking directly to ../../../examples/llm-api/README.md will be broken on the published site, since the examples directory is not copied into the build output. To resolve, either link to the README with an absolute GitHub URL, or move/duplicate its content into the docs tree and reference it via :doc: or a toctree entry. Please update the link accordingly to prevent broken references in the published site.
@@ -0,0 +1,33 @@
+from tensorrt_llm import LLM, SamplingParams
+
🛠️ Refactor suggestion

Add the NVIDIA SPDX header per repo guidelines. Examples are source files and should carry the NVIDIA SPDX header.

Apply this diff to prepend the header:

+# SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
 from tensorrt_llm import LLM, SamplingParams
+
+def main():
+
+    # Model could accept HF model name, a path to local HF model,
+    # or TensorRT Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF.
+    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+
Comment on lines +1 to +9

🛠️ Refactor suggestion

Import the TRT engine LLM and accept --engine_dir.

-from tensorrt_llm import LLM, SamplingParams
+import argparse
+from tensorrt_llm._tensorrt_engine import LLM, SamplingParams
@@
 def main():
-
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--engine_dir", type=str, default=None)
+    args = parser.parse_args()
@@
-    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", engine_dir=args.engine_dir)
+    # Sample prompts.
+    prompts = [
+        "Hello, my name is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    # Create a sampling params.
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+    for output in llm.generate(prompts, sampling_params):
+        print(
+            f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}"
+        )
+
+    # Got output like
+    # Prompt: 'Hello, my name is', Generated text: '\n\nJane Smith. I am a student pursuing my degree in Computer Science at [university]. I enjoy learning new things, especially technology and programming'
+    # Prompt: 'The president of the United States is', Generated text: 'likely to nominate a new Supreme Court justice to fill the seat vacated by the death of Antonin Scalia. The Senate should vote to confirm the'
+    # Prompt: 'The capital of France is', Generated text: 'Paris.'
+    # Prompt: 'The future of AI is', Generated text: 'an exciting time for us. We are constantly researching, developing, and improving our platform to create the most advanced and efficient model available. We are'
+
+
+if __name__ == '__main__':
+    main()
@@ -1,11 +1,17 @@
-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import BuildConfig, SamplingParams
🛠️ Refactor suggestion

Add the NVIDIA SPDX header per repo guidelines.

+# SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
 from tensorrt_llm import BuildConfig, SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM  # NOTE the change
Comment on lines +1 to +2

🛠️ Refactor suggestion

Use the public API import for LLM; avoid the private module path. End-user examples should import LLM from the public package namespace rather than from the private tensorrt_llm._tensorrt_engine module.

Apply this diff:

-from tensorrt_llm import BuildConfig, SamplingParams
-from tensorrt_llm._tensorrt_engine import LLM  # NOTE the change
+from tensorrt_llm import BuildConfig, SamplingParams, LLM
 
 
 def main():
 
+    build_config = BuildConfig()
+    build_config.max_batch_size = 256
+    build_config.max_num_tokens = 1024
+
     # Model could accept HF model name, a path to local HF model,
     # or TensorRT Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF.
-    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+              build_config=build_config)
 
     # Sample prompts.
     prompts = [
@@ -122,6 +122,15 @@ def add_multimodal_args(parser):
                             " ├── __init__.py"
                             " ├── <model_name>.py"
                             " └── <sub_dirs>"))
+    # Add multiturn conversation related parameters
+    parser.add_argument("--multiturn",
+                        action="store_true",
+                        help="Enable multi-turn conversation mode.")
+    parser.add_argument(
+        "--conversation_turns",
+        type=int,
+        default=2,
+        help="Number of conversation turns for automated testing.")
     return parser
@@ -188,6 +197,80 @@ def main():
         f"Unsupported model_type: {model_type} found!\n" \
         f"Supported types: {MULTIMODAL_PLACEHOLDER_REGISTRY.get_registered_model_types()}"
 
+    # If multiturn mode is enabled
+    if args.multiturn:
+        # Run predefined multiturn conversation examples
+        assert args.prompt is not None, "Please provide a prompt for multiturn conversation."
+        assert args.media is not None, "Please provide media for multiturn conversation."
+        # Determine how many turns to run
+        max_turns = min(args.conversation_turns, len(args.prompt))
+        generated_outputs = []  # Store generated outputs for return
+
+        # Initialize conversation history with the first prompt
+        conversation_history = args.prompt[0] if args.prompt else ""
+
+        for i in range(max_turns):
+            print(f"\n--- Turn {i+1} ---")
+
+            try:
+                # Use multimodal input loader to process input with conversation context
+                # Use accumulated conversation history instead of just the current prompt
+                cur_prompt = conversation_history
+                inputs = default_multimodal_input_loader(
+                    tokenizer=llm.tokenizer,
+                    model_dir=llm._hf_model_dir,
+                    model_type=model_type,
+                    modality=args.modality,
+                    prompts=[cur_prompt],
+                    media=args.media,
+                    image_data_format="pt",
+                    num_frames=8,
+                    device="cpu")
+
Comment on lines +219 to +229

🛠️ Refactor suggestion

Fix: nested-media modalities will assert in the loader; also honor the CLI image_format/num_frames/device values. default_multimodal_input_loader asserts when len(prompts) == 1 and media is a list-of-lists (e.g., modality="image_audio"). Your current call passes prompts=[cur_prompt] and media=args.media unchanged, which will trip the assert for nested media. Additionally, the code ignores user CLI values and hardcodes "pt"/8/"cpu".

Apply this refactor to (1) select a single sample for nested-media modalities, (2) pass user-specified format/frames/device, and (3) keep the model_dir type consistent:

-                inputs = default_multimodal_input_loader(
-                    tokenizer=llm.tokenizer,
-                    model_dir=llm._hf_model_dir,
-                    model_type=model_type,
-                    modality=args.modality,
-                    prompts=[cur_prompt],
-                    media=args.media,
-                    image_data_format="pt",
-                    num_frames=8,
-                    device="cpu")
+                # For nested-media (e.g., image_audio = [ [img,aud], [img,aud], ... ]),
+                # pick one sample to pair with a single-turn prompt. For flat media
+                # (image/video/audio), 1 prompt + N media is supported by the loader.
+                media_for_turn = args.media
+                if isinstance(media_for_turn, list) and media_for_turn and isinstance(media_for_turn[0], list):
+                    media_for_turn = [media_for_turn[0]]
+
+                inputs = default_multimodal_input_loader(
+                    tokenizer=llm.tokenizer,
+                    model_dir=str(llm._hf_model_dir),
+                    model_type=model_type,
+                    modality=args.modality,
+                    prompts=[cur_prompt],
+                    media=media_for_turn,
+                    image_data_format=image_format,
+                    num_frames=args.num_frames,
+                    device=args.device)

Follow-up: If you want to reuse the same nested media across turns, consider extracting the first sample once outside the loop and reusing it to avoid repeated conditionals.
+                lora_request = None
+                if args.load_lora:
+                    if model_class is None:
+                        raise ValueError(
+                            "model_class must be provided when load_lora is True"
+                        )
+                    lora_request = model_class.lora_request(
+                        len(inputs), args.modality, llm._hf_model_dir)
+
+                # Generate response
+                outputs = llm.generate(inputs,
+                                       sampling_params,
+                                       lora_request=lora_request)
+                assert outputs and len(
+                    outputs) > 0 and outputs[0].outputs and len(
+                        outputs[0].outputs) > 0
+                response = outputs[0].outputs[0].text.strip()
+
+                # Store generated output
+                generated_outputs.append({
+                    "turn": i + 1,
+                    "user_input": cur_prompt,
+                    "assistant_response": response,
+                    "media": args.media
+                })
+
+                conversation_history = conversation_history + "\n" + response
+                if i + 1 < len(args.prompt):
+                    conversation_history = conversation_history + "\n" + args.prompt[
+                        i + 1]
+
+            except Exception as e:
+                print(f"Error in turn {i+1}: {e}")
+                import traceback
+                traceback.print_exc()
+                continue
+
+        for i, output in enumerate(generated_outputs):
+            print(
+                f"[{i}] Prompt: {output['user_input']!r}, Generated text: {output['assistant_response']!r}"
+            )
+        return
+
+    # Original single-turn processing logic
     # set prompts and media to example prompts and images if they are not provided
     if args.prompt is None:
         args.prompt = example_medias_and_prompts[args.modality]["prompt"]
Reviewer: This should not be done.

Reviewer: Yes, we need to find a way of preventing this from being merged back into main.

Reviewer: Curious how this has been handled all this while in previous MIs 🤔