6 changes: 6 additions & 0 deletions docs/source/index.rst
@@ -150,6 +150,12 @@ Welcome to TensorRT-LLM's Documentation!
   blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md
   blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.md

.. toctree::
   :maxdepth: 2
   :caption: Use TensorRT Engine
   :hidden:

   legacy/tensorrt_quickstart.md

Indices and tables
==================
9 changes: 9 additions & 0 deletions docs/source/legacy/tensorrt_quickstart.md
@@ -0,0 +1,9 @@
# LLM API with TensorRT Engine
A simple inference example with TinyLlama using the LLM API:

```{literalinclude} ../../examples/llm-api/_tensorrt_engine/quickstart_example.py
:language: python
:linenos:
```

For more advanced usage, including distributed inference, multimodal models, and speculative decoding, refer to this [README](../../../examples/llm-api/README.md).
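The README referenced above covers the distributed-inference case in depth; as a rough illustration only (not part of this PR's files), multi-GPU tensor parallelism with the same LLM API is expected to look like the sketch below, where the `tensor_parallel_size` argument and the prompt are assumptions for the example:

```python
from tensorrt_llm import LLM, SamplingParams

# Illustrative sketch only: assumes two visible GPUs and that the LLM
# constructor's tensor_parallel_size argument shards the model across them.
llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", tensor_parallel_size=2)

sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

for output in llm.generate(["The capital of France is"], sampling_params):
    print(output.outputs[0].text)
```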
33 changes: 33 additions & 0 deletions examples/llm-api/_tensorrt_engine/quickstart_example.py
@@ -0,0 +1,33 @@
from tensorrt_llm import LLM, SamplingParams


def main():

    # The model argument accepts an HF model name, a path to a local HF model,
    # or a TensorRT Model Optimizer quantized checkpoint such as
    # nvidia/Llama-3.1-8B-Instruct-FP8 on HF.
    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")

    # Sample prompts.
    prompts = [
        "Hello, my name is",
        "The capital of France is",
        "The future of AI is",
    ]

    # Create the sampling parameters.
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    for output in llm.generate(prompts, sampling_params):
        print(
            f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}"
        )

    # Example output:
    # Prompt: 'Hello, my name is', Generated text: '\n\nJane Smith. I am a student pursuing my degree in Computer Science at [university]. I enjoy learning new things, especially technology and programming'
    # Prompt: 'The capital of France is', Generated text: 'Paris.'
    # Prompt: 'The future of AI is', Generated text: 'an exciting time for us. We are constantly researching, developing, and improving our platform to create the most advanced and efficient model available. We are'


if __name__ == '__main__':
    main()
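A side note on the checkpoint comment in the file above: per that comment, loading a TensorRT Model Optimizer quantized checkpoint is just a different `model` value, with the rest of the example unchanged. A minimal sketch (illustrative, not part of this PR):

```python
from tensorrt_llm import LLM

# Same API, pointed at the pre-quantized FP8 checkpoint named in the comment above.
llm = LLM(model="nvidia/Llama-3.1-8B-Instruct-FP8")
```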
5 changes: 2 additions & 3 deletions examples/llm-api/llm_runtime.py
@@ -29,8 +29,7 @@ def example_cuda_graph_config():
cuda_graph_config=cuda_graph_config, # Enable CUDA graphs
max_batch_size=4,
max_seq_len=512,
- kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.8,
-                               enable_block_reuse=True))
+ kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.5))

prompts = [
"Hello, my name is",
@@ -56,7 +55,7 @@ def example_kv_cache_config():
max_batch_size=8,
max_seq_len=1024,
kv_cache_config=KvCacheConfig(
- free_gpu_memory_fraction=0.85,
+ free_gpu_memory_fraction=0.5,
enable_block_reuse=True))

prompts = [
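For readability, here is a consolidated sketch of the first call site after this change, reconstructed from the hunk above; the import paths, config constructor, and model name are assumed to match what examples/llm-api/llm_runtime.py already uses:

```python
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import CudaGraphConfig, KvCacheConfig  # assumed import path

llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # model name illustrative
          cuda_graph_config=CudaGraphConfig(),  # Enable CUDA graphs
          max_batch_size=4,
          max_seq_len=512,
          # Reserve 50% of free GPU memory for the KV cache (was 0.8 before this PR);
          # enable_block_reuse now falls back to its default.
          kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.5))
```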
10 changes: 8 additions & 2 deletions examples/llm-api/quickstart_example.py
@@ -1,11 +1,17 @@
- from tensorrt_llm import LLM, SamplingParams
+ from tensorrt_llm import BuildConfig, SamplingParams
+ from tensorrt_llm._tensorrt_engine import LLM  # NOTE the change


def main():

+ build_config = BuildConfig()
+ build_config.max_batch_size = 256
+ build_config.max_num_tokens = 1024

# The model argument accepts an HF model name, a path to a local HF model,
# or a TensorRT Model Optimizer quantized checkpoint such as nvidia/Llama-3.1-8B-Instruct-FP8 on HF.
- llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+ llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+           build_config=build_config)

# Sample prompts.
prompts = [
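Putting the changed lines of this hunk together, the modified quickstart now builds the TensorRT engine with explicit limits. A consolidated sketch of just the changed lines (values copied from the diff above):

```python
from tensorrt_llm import BuildConfig
from tensorrt_llm._tensorrt_engine import LLM  # TensorRT-engine LLM, per this PR

# Engine build limits taken from the diff above.
build_config = BuildConfig()
build_config.max_batch_size = 256
build_config.max_num_tokens = 1024

llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
          build_config=build_config)
```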
11 changes: 9 additions & 2 deletions tests/integration/defs/llmapi/test_llm_examples.py
@@ -155,11 +155,18 @@ def test_llmapi_speculative_decoding_ngram(llm_root, engine_dir, llm_venv):
"llm_speculative_decoding.py", "NGRAM")


- @pytest.mark.skip(reason="https://nvbugs/5365825")
+ @pytest.mark.skip(reason="https://nvbugs/5365825"
+     )  # possibly unrelated, but this test always times out
def test_llmapi_sampling(llm_root, engine_dir, llm_venv):
_run_llmapi_example(llm_root, engine_dir, llm_venv, "llm_sampling.py")


- @pytest.mark.skip(reason="https://nvbugs/5365825")
+ @pytest.mark.skip(reason="https://nvbugs/5365825"
+     )  # possibly unrelated, but this test always times out
def test_llmapi_runtime(llm_root, engine_dir, llm_venv):
_run_llmapi_example(llm_root, engine_dir, llm_venv, "llm_runtime.py")


+ def test_llmapi_tensorrt_engine(llm_root, engine_dir, llm_venv):
+     _run_llmapi_example(llm_root, engine_dir, llm_venv,
+                         "_tensorrt_engine/quickstart_example.py")
1 change: 1 addition & 0 deletions tests/integration/test_lists/test-db/l0_sanity_check.yml
@@ -30,4 +30,5 @@ l0_sanity_check:
- llmapi/test_llm_examples.py::test_llmapi_speculative_decoding_ngram
- llmapi/test_llm_examples.py::test_llmapi_sampling
- llmapi/test_llm_examples.py::test_llmapi_runtime
- llmapi/test_llm_examples.py::test_llmapi_tensorrt_engine
- examples/test_llm_api_with_mpi.py::test_llm_api_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0]