diff --git a/docs/source/index.rst b/docs/source/index.rst
index cb04be70253..34288cd17eb 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -150,6 +150,12 @@ Welcome to TensorRT-LLM's Documentation!
    blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md
    blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.md

+.. toctree::
+   :maxdepth: 2
+   :caption: Use TensorRT Engine
+   :hidden:
+
+   legacy/tensorrt_quickstart.md

 Indices and tables
 ==================
diff --git a/docs/source/legacy/tensorrt_quickstart.md b/docs/source/legacy/tensorrt_quickstart.md
new file mode 100644
index 00000000000..df62aa38d73
--- /dev/null
+++ b/docs/source/legacy/tensorrt_quickstart.md
@@ -0,0 +1,9 @@
+# LLM API with TensorRT Engine
+A simple inference example with TinyLlama using the LLM API:
+
+```{literalinclude} ../../examples/llm-api/_tensorrt_engine/quickstart_example.py
+   :language: python
+   :linenos:
+```
+
+For more advanced usage including distributed inference, multimodal, and speculative decoding, please refer to this [README](../../../examples/llm-api/README.md).
diff --git a/examples/llm-api/_tensorrt_engine/quickstart_example.py b/examples/llm-api/_tensorrt_engine/quickstart_example.py
new file mode 100644
index 00000000000..400a241c0e9
--- /dev/null
+++ b/examples/llm-api/_tensorrt_engine/quickstart_example.py
@@ -0,0 +1,33 @@
+from tensorrt_llm import LLM, SamplingParams
+
+
+def main():
+
+    # Model could accept HF model name, a path to local HF model,
+    # or TensorRT Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF.
+    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+
+    # Sample prompts.
+    prompts = [
+        "Hello, my name is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    # Create a sampling params.
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+    for output in llm.generate(prompts, sampling_params):
+        print(
+            f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}"
+        )
+
+    # Got output like
+    # Prompt: 'Hello, my name is', Generated text: '\n\nJane Smith. I am a student pursuing my degree in Computer Science at [university]. I enjoy learning new things, especially technology and programming'
+    # Prompt: 'The president of the United States is', Generated text: 'likely to nominate a new Supreme Court justice to fill the seat vacated by the death of Antonin Scalia. The Senate should vote to confirm the'
+    # Prompt: 'The capital of France is', Generated text: 'Paris.'
+    # Prompt: 'The future of AI is', Generated text: 'an exciting time for us. We are constantly researching, developing, and improving our platform to create the most advanced and efficient model available. We are'
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/llm-api/llm_runtime.py b/examples/llm-api/llm_runtime.py
index deebdd68eb8..40c6af6a9b0 100644
--- a/examples/llm-api/llm_runtime.py
+++ b/examples/llm-api/llm_runtime.py
@@ -29,8 +29,7 @@ def example_cuda_graph_config():
         cuda_graph_config=cuda_graph_config,  # Enable CUDA graphs
         max_batch_size=4,
         max_seq_len=512,
-        kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.8,
-                                      enable_block_reuse=True))
+        kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.5))

     prompts = [
         "Hello, my name is",
@@ -56,7 +55,7 @@ def example_kv_cache_config():
         max_batch_size=8,
         max_seq_len=1024,
         kv_cache_config=KvCacheConfig(
-            free_gpu_memory_fraction=0.85,
+            free_gpu_memory_fraction=0.5,
             enable_block_reuse=True))

     prompts = [
diff --git a/examples/llm-api/quickstart_example.py b/examples/llm-api/quickstart_example.py
index 400a241c0e9..a6ba9ec5598 100644
--- a/examples/llm-api/quickstart_example.py
+++ b/examples/llm-api/quickstart_example.py
@@ -1,11 +1,17 @@
-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import BuildConfig, SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM  # NOTE the change


 def main():

+    build_config = BuildConfig()
+    build_config.max_batch_size = 256
+    build_config.max_num_tokens = 1024
+
     # Model could accept HF model name, a path to local HF model,
     # or TensorRT Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF.
-    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+              build_config=build_config)

     # Sample prompts.
     prompts = [
diff --git a/tests/integration/defs/llmapi/test_llm_examples.py b/tests/integration/defs/llmapi/test_llm_examples.py
index 993372eb540..3b6961c31d1 100644
--- a/tests/integration/defs/llmapi/test_llm_examples.py
+++ b/tests/integration/defs/llmapi/test_llm_examples.py
@@ -155,11 +155,18 @@ def test_llmapi_speculative_decoding_ngram(llm_root, engine_dir, llm_venv):
                         "llm_speculative_decoding.py", "NGRAM")


-@pytest.mark.skip(reason="https://nvbugs/5365825")
+@pytest.mark.skip(reason="https://nvbugs/5365825"
+                  )  # maybe unrelated, but this test will always timeout
 def test_llmapi_sampling(llm_root, engine_dir, llm_venv):
     _run_llmapi_example(llm_root, engine_dir, llm_venv, "llm_sampling.py")


-@pytest.mark.skip(reason="https://nvbugs/5365825")
+@pytest.mark.skip(reason="https://nvbugs/5365825"
+                  )  # maybe unrelated, but this test will always timeout
 def test_llmapi_runtime(llm_root, engine_dir, llm_venv):
     _run_llmapi_example(llm_root, engine_dir, llm_venv, "llm_runtime.py")
+
+
+def test_llmapi_tensorrt_engine(llm_root, engine_dir, llm_venv):
+    _run_llmapi_example(llm_root, engine_dir, llm_venv,
+                        "_tensorrt_engine/quickstart_example.py")
diff --git a/tests/integration/test_lists/test-db/l0_sanity_check.yml b/tests/integration/test_lists/test-db/l0_sanity_check.yml
index f713e8a1497..cbf68424178 100644
--- a/tests/integration/test_lists/test-db/l0_sanity_check.yml
+++ b/tests/integration/test_lists/test-db/l0_sanity_check.yml
@@ -30,4 +30,5 @@ l0_sanity_check:
   - llmapi/test_llm_examples.py::test_llmapi_speculative_decoding_ngram
   - llmapi/test_llm_examples.py::test_llmapi_sampling
   - llmapi/test_llm_examples.py::test_llmapi_runtime
+  - llmapi/test_llm_examples.py::test_llmapi_tensorrt_engine
   - examples/test_llm_api_with_mpi.py::test_llm_api_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0]