[None][doc] add legacy section for tensorrt engine (NVIDIA#6724)

Superjomn · dominicshanshan · commit fbb5f50ecf12 · 2025-08-26T23:49:00.000-07:00
Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com>
Signed-off-by: Wangshanshan <30051912+dominicshanshan@users.noreply.github.com>
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -160,6 +160,12 @@ Welcome to TensorRT-LLM's Documentation!
    blogs/XQA-kernel.md
    blogs/tech_blog/*
 
+.. toctree::
+   :maxdepth: 2
+   :caption: Use TensorRT Engine
+   :hidden:
+
+   legacy/tensorrt_quickstart.md
 
 Indices and tables
 ==================
diff --git a/docs/source/legacy/tensorrt_quickstart.md b/docs/source/legacy/tensorrt_quickstart.md
@@ -0,0 +1,9 @@
+# LLM API with TensorRT Engine
+A simple inference example with TinyLlama using the LLM API:
+
+```{literalinclude} ../../../examples/llm-api/_tensorrt_engine/quickstart_example.py
+    :language: python
+    :linenos:
+```
+
+For more advanced usage including distributed inference, multimodal, and speculative decoding, please refer to this [README](../../../examples/llm-api/README.md).
diff --git a/examples/llm-api/_tensorrt_engine/quickstart_example.py b/examples/llm-api/_tensorrt_engine/quickstart_example.py
@@ -0,0 +1,33 @@
+from tensorrt_llm import LLM, SamplingParams
+
+
+def main():
+
+    # `model` accepts an HF model name, a path to a local HF model,
+    # or a TensorRT Model Optimizer quantized checkpoint such as nvidia/Llama-3.1-8B-Instruct-FP8 on HF.
+    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+
+    # Sample prompts.
+    prompts = [
+        "Hello, my name is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    # Create the sampling parameters.
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+    for output in llm.generate(prompts, sampling_params):
+        print(
+            f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}"
+        )
+
+    # Example output, one line per entry in `prompts` (illustrative only;
+    # the generated text may differ from run to run):
+    # Prompt: 'Hello, my name is', Generated text: '\n\nJane Smith. I am a student pursuing my degree in Computer Science at [university]. I enjoy learning new things, especially technology and programming'
+    # Prompt: 'The capital of France is', Generated text: 'Paris.'
+    # Prompt: 'The future of AI is', Generated text: 'an exciting time for us. We are constantly researching, developing, and improving our platform to create the most advanced and efficient model available. We are'
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/llm-api/llm_runtime.py b/examples/llm-api/llm_runtime.py
@@ -29,8 +29,7 @@ def example_cuda_graph_config():
         cuda_graph_config=cuda_graph_config,  # Enable CUDA graphs
         max_batch_size=4,
         max_seq_len=512,
-        kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.8,
-                                      enable_block_reuse=True))
+        kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.5))
 
     prompts = [
         "Hello, my name is",
@@ -56,7 +55,7 @@ def example_kv_cache_config():
                        max_batch_size=8,
                        max_seq_len=1024,
                        kv_cache_config=KvCacheConfig(
-                           free_gpu_memory_fraction=0.85,
+                           free_gpu_memory_fraction=0.5,
                            enable_block_reuse=True))
 
     prompts = [
diff --git a/examples/llm-api/quickstart_example.py b/examples/llm-api/quickstart_example.py
@@ -1,11 +1,17 @@
-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import BuildConfig, SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM  # NOTE the change
 
 
 def main():
 
+    build_config = BuildConfig()
+    build_config.max_batch_size = 256
+    build_config.max_num_tokens = 1024
+
     # Model could accept HF model name, a path to local HF model,
     # or TensorRT Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF.
-    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+              build_config=build_config)
 
     # Sample prompts.
     prompts = [
diff --git a/tests/integration/defs/llmapi/test_llm_examples.py b/tests/integration/defs/llmapi/test_llm_examples.py
@@ -155,11 +155,18 @@ def test_llmapi_speculative_decoding_ngram(llm_root, engine_dir, llm_venv):
                         "llm_speculative_decoding.py", "NGRAM")
 
 
-@pytest.mark.skip(reason="https://nvbugs/5365825")
+@pytest.mark.skip(reason="https://nvbugs/5365825"
+                  )  # maybe unrelated, but this test will always timeout
 def test_llmapi_sampling(llm_root, engine_dir, llm_venv):
     _run_llmapi_example(llm_root, engine_dir, llm_venv, "llm_sampling.py")
 
 
-@pytest.mark.skip(reason="https://nvbugs/5365825")
+@pytest.mark.skip(reason="https://nvbugs/5365825"
+                  )  # maybe unrelated, but this test will always timeout
 def test_llmapi_runtime(llm_root, engine_dir, llm_venv):
     _run_llmapi_example(llm_root, engine_dir, llm_venv, "llm_runtime.py")
+
+
+def test_llmapi_tensorrt_engine(llm_root, engine_dir, llm_venv):
+    _run_llmapi_example(llm_root, engine_dir, llm_venv,
+                        "_tensorrt_engine/quickstart_example.py")
diff --git a/tests/integration/test_lists/test-db/l0_sanity_check.yml b/tests/integration/test_lists/test-db/l0_sanity_check.yml
@@ -30,4 +30,5 @@ l0_sanity_check:
       - llmapi/test_llm_examples.py::test_llmapi_speculative_decoding_ngram
       - llmapi/test_llm_examples.py::test_llmapi_sampling
       - llmapi/test_llm_examples.py::test_llmapi_runtime
+      - llmapi/test_llm_examples.py::test_llmapi_tensorrt_engine
       - examples/test_llm_api_with_mpi.py::test_llm_api_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0]