
Commit 52af25f

init doc
Signed-off-by: Superjomn <[email protected]>
1 parent: 4adde41

File tree

4 files changed: +56 -2 lines changed


docs/source/index.rst

Lines changed: 6 additions & 0 deletions
```diff
@@ -150,6 +150,12 @@ Welcome to TensorRT-LLM's Documentation!
     blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md
     blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.md
 
+.. toctree::
+    :maxdepth: 2
+    :caption: LLM with TensorRT Engine
+    :hidden:
+
+    legacy/tensorrt_quickstart.md
 
 Indices and tables
 ==================
```

docs/source/legacy/tensorrt_quickstart.md

Lines changed: 9 additions & 0 deletions

````diff
@@ -0,0 +1,9 @@
+# LLM API with TensorRT Engine
+A simple inference example with TinyLlama using the LLM API:
+
+```{literalinclude} ../../examples/llm-api/_tensorrt_engine/quickstart_example.py
+:language: python
+:linenos:
+```
+
+For more advanced usage including distributed inference, multimodal, and speculative decoding, please refer to this [README](../../../examples/llm-api/README.md).
````

examples/llm-api/_tensorrt_engine/quickstart_example.py

Lines changed: 33 additions & 0 deletions

```diff
@@ -0,0 +1,33 @@
+from tensorrt_llm import LLM, SamplingParams
+
+
+def main():
+
+    # The model can be an HF model name, a path to a local HF model,
+    # or a TensorRT Model Optimizer quantized checkpoint like nvidia/Llama-3.1-8B-Instruct-FP8 on HF.
+    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+
+    # Sample prompts.
+    prompts = [
+        "Hello, my name is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    # Create sampling parameters.
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+    for output in llm.generate(prompts, sampling_params):
+        print(
+            f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}"
+        )
+
+    # Example output:
+    # Prompt: 'Hello, my name is', Generated text: '\n\nJane Smith. I am a student pursuing my degree in Computer Science at [university]. I enjoy learning new things, especially technology and programming'
+    # Prompt: 'The president of the United States is', Generated text: 'likely to nominate a new Supreme Court justice to fill the seat vacated by the death of Antonin Scalia. The Senate should vote to confirm the'
+    # Prompt: 'The capital of France is', Generated text: 'Paris.'
+    # Prompt: 'The future of AI is', Generated text: 'an exciting time for us. We are constantly researching, developing, and improving our platform to create the most advanced and efficient model available. We are'
+
+
+if __name__ == '__main__':
+    main()
```
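
The comment at the top of the example notes that `model` also accepts a local path or a quantized checkpoint. A minimal sketch of those alternatives, with a hypothetical local path:

```python
from tensorrt_llm import LLM

# Hypothetical local HuggingFace checkpoint directory:
llm_local = LLM(model="/path/to/TinyLlama-1.1B-Chat-v1.0")

# TensorRT Model Optimizer quantized checkpoint on the HF Hub,
# as named in the comment in the example above:
llm_fp8 = LLM(model="nvidia/Llama-3.1-8B-Instruct-FP8")
```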

examples/llm-api/quickstart_example.py

Lines changed: 8 additions & 2 deletions
```diff
@@ -1,11 +1,17 @@
-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import BuildConfig, SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM  # NOTE the change
 
 
 def main():
 
+    build_config = BuildConfig()
+    build_config.max_batch_size = 256
+    build_config.max_num_tokens = 1024
+
     # The model can be an HF model name, a path to a local HF model,
     # or a TensorRT Model Optimizer quantized checkpoint like nvidia/Llama-3.1-8B-Instruct-FP8 on HF.
-    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+              build_config=build_config)
 
     # Sample prompts.
     prompts = [
```