
Commit 982038d

Merge branch 'main' into supportFP8BlockWideEp
2 parents 22c253d + b0558c7

39 files changed (+937, -630 lines)


.dockerignore

Lines changed: 1 addition & 0 deletions
@@ -9,5 +9,6 @@ examples/**/.git
 examples/**/*.bin
 examples/**/*.engine
 examples/**/*.onnx
+examples/**/*.safetensors
 examples/**/c-model
 examples/models/core/gpt/gpt*

.github/CODEOWNERS

Lines changed: 6 additions & 5 deletions
@@ -1,10 +1,5 @@
 # This file defines code ownership rules for the repository.

-# The following rule should only be uncommented on release branches (e.g., release/0.19).
-# The rule below requires that any PR to release/**/* branches must be approved by at least one member
-# of the NVIDIA/trt-llm-release-branch-approval team, regardless of who else approves the PR.
-# Without approval from a member of this team, PRs cannot be merged to release branches.
-# * @NVIDIA/trt-llm-release-branch-approval

 ## TensorRT-LLM Infra
 ### CI
@@ -160,3 +155,9 @@ docs/source/performance/perf-benchmarking.md @NVIDIA/trtllm-bench-reviewers
 # from a member of this team, PRs affecting public APIs cannot be merged to main or release branches.
 /tests/unittest/api_stability/ @NVIDIA/trt-llm-noncommitted-api-review-committee
 /tests/unittest/api_stability/references_committed/ @NVIDIA/trt-llm-committed-api-review-committee
+
+# The following rule should only be uncommented on release branches (e.g., release/0.19).
+# The rule below requires that any PR to release/**/* branches must be approved by at least one member
+# of the NVIDIA/trt-llm-release-branch-approval team, regardless of who else approves the PR.
+# Without approval from a member of this team, PRs cannot be merged to release branches.
+# * @NVIDIA/trt-llm-release-branch-approval

cpp/tensorrt_llm/thop/moeOp.cpp

Lines changed: 28 additions & 13 deletions
@@ -392,8 +392,8 @@ class FusedMoeRunner : public torch::CustomClassHolder
         std::vector<int64_t> output_shape = {num_rows, unpadded_hidden_size_val};
         auto output = torch::empty(output_shape, input.options().dtype(mOutputDtype));

-        WorkspaceInfo workspace_info = getWorkspaceInfo(num_rows, hidden_size, inter_size, num_experts_total,
-            static_cast<int>(experts_per_token), base_activation_type, parallelism_config, min_latency_mode);
+        WorkspaceInfo const& workspace_info = getWorkspaceInfo(num_rows, hidden_size, inter_size, num_experts_total,
+            static_cast<int>(experts_per_token), base_activation_type, parallelism_config, min_latency_mode, stream);

         auto const quant_params = getQuantParams(num_experts_on_rank, hidden_size, inter_size, quant_scales);
         kernels::MoeMinLatencyParams min_latency_params{};
@@ -553,8 +553,8 @@ class FusedMoeRunner : public torch::CustomClassHolder
         min_latency_params.experts_to_token_score = static_cast<float*>(experts_to_token_score.data_ptr());
         min_latency_params.active_expert_global_ids = static_cast<int*>(active_expert_global_ids.data_ptr());

-        WorkspaceInfo workspace_info = getWorkspaceInfo(num_rows, hidden_size, inter_size, num_experts_total,
-            static_cast<int>(experts_per_token), base_activation_type, parallelism_config, min_latency_mode);
+        WorkspaceInfo const& workspace_info = getWorkspaceInfo(num_rows, hidden_size, inter_size, num_experts_total,
+            static_cast<int>(experts_per_token), base_activation_type, parallelism_config, min_latency_mode, stream);

         auto const quant_params = getQuantParams(num_experts_on_rank, hidden_size, inter_size, quant_scales);

@@ -709,6 +709,7 @@ class FusedMoeRunner : public torch::CustomClassHolder
     // e.g. 16 nvfp4 elements are packed into a single int64 element
     int64_t mInnerDimMultiplier;
     char* mProfileWorkspace = nullptr;
+    WorkspaceInfo workspace_info;

     bool mUseDeepSeekFP8BlockScaling = false;
     bool mUseW4GroupScaling = false;
@@ -757,9 +758,9 @@ class FusedMoeRunner : public torch::CustomClassHolder
         mKernelRunner->setTactic(best_gemm1_profile, best_gemm2_profile);
     }

-    WorkspaceInfo getWorkspaceInfo(int64_t const num_rows, int64_t const hidden_size, int64_t const inter_size,
+    WorkspaceInfo const& getWorkspaceInfo(int64_t const num_rows, int64_t const hidden_size, int64_t const inter_size,
         int num_experts, int experts_per_token, ActivationType activation_type,
-        kernels::MOEParallelismConfig const& parallelismConfig, bool min_latency_mode)
+        kernels::MOEParallelismConfig const& parallelismConfig, bool min_latency_mode, cudaStream_t stream)
     {
         size_t moe_workspace_size = mKernelRunner->getWorkspaceSize(num_rows, hidden_size, inter_size, num_experts,
             experts_per_token, activation_type, parallelismConfig, /* use_lora */ false, mUseDeepSeekFP8BlockScaling,
@@ -768,15 +769,29 @@ class FusedMoeRunner : public torch::CustomClassHolder

         std::vector<size_t> workspaces{moe_workspace_size, src_to_dest_map_size};

-        size_t total_workspace_size = common::calculateTotalWorkspaceSize(workspaces.data(), workspaces.size());
+        int64_t const total_workspace_size = common::calculateTotalWorkspaceSize(workspaces.data(), workspaces.size());

-        WorkspaceInfo info{};
-        info.workspace = torch::empty({static_cast<long>(total_workspace_size)},
-            torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
-        info.src_to_dest_map
-            = common::nextWorkspacePtr(static_cast<int8_t*>(info.workspace.data_ptr()), moe_workspace_size);
+        bool is_capturing = tensorrt_llm::common::isCapturing(stream);
+        // Always allocate workspace when capturing cuda graph to avoid illegal memory access during replay
+        if (is_capturing || workspace_info.workspace.numel() < total_workspace_size)
+        {
+            if (is_capturing)
+            {
+                TLLM_LOG_DEBUG(
+                    "Allocating MoE workspace with %ld bytes size during cuda graph capture", total_workspace_size);
+            }
+            else
+            {
+                TLLM_LOG_DEBUG("MoE workspace size is not enough, increase the size from %ld bytes to %ld bytes",
+                    workspace_info.workspace.numel(), total_workspace_size);
+            }
+            workspace_info.workspace = torch::empty({static_cast<long>(total_workspace_size)},
+                torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
+        }
+        workspace_info.src_to_dest_map
+            = common::nextWorkspacePtr(static_cast<int8_t*>(workspace_info.workspace.data_ptr()), moe_workspace_size);

-        return info;
+        return workspace_info;
     }

     kernels::QuantParams getQuantParams(int64_t const num_experts_on_rank, int64_t const hidden_size,
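The substance of this change: `getWorkspaceInfo` now returns a reference to a member `workspace_info` instead of allocating a fresh tensor on every call. The buffer is reallocated only when the cached one is too small, and is always reallocated while a CUDA graph is being captured so the captured addresses stay valid on replay. A minimal PyTorch sketch of the same caching pattern (hypothetical class and names, not the TensorRT-LLM implementation):

```python
import torch


class WorkspaceCache:
    """Sketch of a reusable workspace buffer, mirroring the C++ change above."""

    def __init__(self) -> None:
        # Persistent buffer, grown on demand and reused across calls.
        self._workspace = torch.empty(0, dtype=torch.int8, device="cuda")

    def get(self, required_bytes: int) -> torch.Tensor:
        # Always reallocate while a CUDA graph is being captured so the
        # addresses recorded in the graph remain valid during replay;
        # otherwise only grow the buffer when it is too small.
        is_capturing = torch.cuda.is_current_stream_capturing()
        if is_capturing or self._workspace.numel() < required_bytes:
            self._workspace = torch.empty(
                required_bytes, dtype=torch.int8, device="cuda", requires_grad=False)
        return self._workspace
```

Callers then carve sub-buffers out of the returned tensor, analogous to `common::nextWorkspacePtr` in the C++ code.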

docker/Dockerfile.multi

Lines changed: 2 additions & 1 deletion
@@ -174,7 +174,8 @@ FROM wheel AS tritonbuild
 WORKDIR /src/tensorrt_llm
 RUN pip install /src/tensorrt_llm/build/tensorrt_llm*.whl
 COPY ./triton_backend/ ./triton_backend/
-RUN bash ./triton_backend/inflight_batcher_llm/scripts/build.sh
+ARG TRITON_BASE_TAG
+RUN bash ./triton_backend/inflight_batcher_llm/scripts/build.sh -s "r${TRITON_BASE_TAG%-py3}"


 FROM release AS tritonrelease
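The `${TRITON_BASE_TAG%-py3}` expansion strips a trailing `-py3` from the Triton base-image tag, and the leading `r` turns the result into the branch name passed to the build script's `-s` flag. A small Python sketch of that string transformation (the tag value below is made up for illustration):

```python
def triton_branch_from_base_tag(base_tag: str) -> str:
    # Mirrors the shell expansion r${TRITON_BASE_TAG%-py3}: drop a trailing
    # "-py3" suffix if present, then prefix with "r".
    return "r" + base_tag.removesuffix("-py3")


# Hypothetical tag, for illustration only.
print(triton_branch_from_base_tag("25.05-py3"))  # -> r25.05
```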

docker/Makefile

Lines changed: 1 addition & 1 deletion
@@ -137,7 +137,7 @@ CODE_DIR ?= /code/tensorrt_llm
 EXTRA_VOLUMES ?=
 CCACHE_DIR ?= $(CODE_DIR)/cpp/.ccache
 CONAN_DIR ?= $(CODE_DIR)/cpp/.conan
-USER_CACHE_DIR ?= $(HOME_DIR)/.cache
+USER_CACHE_DIR ?= $(shell readlink -f "${HOME_DIR}/.cache")
 RUN_CMD ?=
 CONTAINER_NAME ?= tensorrt_llm
 WORK_DIR ?= $(CODE_DIR)
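`readlink -f` canonicalizes the host cache path (following any symlinks in `$HOME_DIR/.cache`) before it is used, presumably so a symlinked cache directory still resolves to a real path when mounted into the container. For reference, a Python equivalent of that resolution, shown only as an illustration:

```python
import os


def canonical_user_cache_dir(home_dir: str) -> str:
    # Same effect as `readlink -f "$HOME_DIR/.cache"`: follow symlinks and
    # return the absolute, canonical path.
    return os.path.realpath(os.path.join(home_dir, ".cache"))


print(canonical_user_cache_dir(os.path.expanduser("~")))
```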

docs/source/index.rst

Lines changed: 6 additions & 0 deletions
@@ -160,6 +160,12 @@ Welcome to TensorRT-LLM's Documentation!
    blogs/XQA-kernel.md
    blogs/tech_blog/*

+.. toctree::
+   :maxdepth: 2
+   :caption: Use TensorRT Engine
+   :hidden:
+
+   legacy/tensorrt_quickstart.md

 Indices and tables
 ==================
docs/source/legacy/tensorrt_quickstart.md

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+# LLM API with TensorRT Engine
+A simple inference example with TinyLlama using the LLM API:
+
+```{literalinclude} ../../examples/llm-api/_tensorrt_engine/quickstart_example.py
+:language: python
+:linenos:
+```
+
+For more advanced usage including distributed inference, multimodal, and speculative decoding, please refer to this [README](../../../examples/llm-api/README.md).
examples/llm-api/_tensorrt_engine/quickstart_example.py

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
+from tensorrt_llm import LLM, SamplingParams
+
+
+def main():
+
+    # Model could accept HF model name, a path to local HF model,
+    # or TensorRT Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF.
+    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+
+    # Sample prompts.
+    prompts = [
+        "Hello, my name is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    # Create a sampling params.
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+    for output in llm.generate(prompts, sampling_params):
+        print(
+            f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}"
+        )
+
+    # Got output like
+    # Prompt: 'Hello, my name is', Generated text: '\n\nJane Smith. I am a student pursuing my degree in Computer Science at [university]. I enjoy learning new things, especially technology and programming'
+    # Prompt: 'The president of the United States is', Generated text: 'likely to nominate a new Supreme Court justice to fill the seat vacated by the death of Antonin Scalia. The Senate should vote to confirm the'
+    # Prompt: 'The capital of France is', Generated text: 'Paris.'
+    # Prompt: 'The future of AI is', Generated text: 'an exciting time for us. We are constantly researching, developing, and improving our platform to create the most advanced and efficient model available. We are'
+
+
+if __name__ == '__main__':
+    main()

examples/llm-api/llm_runtime.py

Lines changed: 2 additions & 3 deletions
@@ -29,8 +29,7 @@ def example_cuda_graph_config():
              cuda_graph_config=cuda_graph_config,  # Enable CUDA graphs
              max_batch_size=4,
              max_seq_len=512,
-              kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.8,
-                                            enable_block_reuse=True))
+              kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.5))

     prompts = [
         "Hello, my name is",
@@ -56,7 +55,7 @@ def example_kv_cache_config():
              max_batch_size=8,
              max_seq_len=1024,
              kv_cache_config=KvCacheConfig(
-                  free_gpu_memory_fraction=0.85,
+                  free_gpu_memory_fraction=0.5,
                  enable_block_reuse=True))

     prompts = [

examples/llm-api/quickstart_example.py

Lines changed: 8 additions & 2 deletions
@@ -1,11 +1,17 @@
-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import BuildConfig, SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM  # NOTE the change


 def main():

+    build_config = BuildConfig()
+    build_config.max_batch_size = 256
+    build_config.max_num_tokens = 1024
+
     # Model could accept HF model name, a path to local HF model,
     # or TensorRT Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF.
-    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+              build_config=build_config)

     # Sample prompts.
     prompts = [
