From 22b39e11f2eca7dd70bcef3760cd5da149412d00 Mon Sep 17 00:00:00 2001
From: Kameshwara Pavan Kumar Mantha <25398886+pavanjava@users.noreply.github.com>
Date: Thu, 15 Aug 2024 04:08:37 +0530
Subject: [PATCH] llama_index serving integration documentation (#6973)

Co-authored-by: pavanmantha
---
 docs/source/serving/integrations.rst          |  1 +
 .../serving/serving_with_llamaindex.rst       | 27 +++++++++++++++++++
 2 files changed, 28 insertions(+)
 create mode 100644 docs/source/serving/serving_with_llamaindex.rst

diff --git a/docs/source/serving/integrations.rst b/docs/source/serving/integrations.rst
index 680ea523dfe94..7882e14f3b849 100644
--- a/docs/source/serving/integrations.rst
+++ b/docs/source/serving/integrations.rst
@@ -12,3 +12,4 @@ Integrations
   deploying_with_lws
   deploying_with_dstack
   serving_with_langchain
+  serving_with_llamaindex
diff --git a/docs/source/serving/serving_with_llamaindex.rst b/docs/source/serving/serving_with_llamaindex.rst
new file mode 100644
index 0000000000000..038e961344e47
--- /dev/null
+++ b/docs/source/serving/serving_with_llamaindex.rst
@@ -0,0 +1,27 @@
+.. _run_on_llamaindex:
+
+Serving with llama_index
+============================
+
+vLLM is also available via `llama_index `_ .
+
+To install LlamaIndex, run:
+
+.. code-block:: console
+
+    $ pip install llama-index-llms-vllm -q
+
+To run inference on one or more GPUs, use the ``Vllm`` class from ``llama_index``.
+
+.. code-block:: python
+
+    from llama_index.llms.vllm import Vllm
+
+    llm = Vllm(
+        model="microsoft/Orca-2-7b",
+        tensor_parallel_size=4,
+        max_new_tokens=100,
+        vllm_kwargs={"swap_space": 1, "gpu_memory_utilization": 0.5},
+    )
+
+Please refer to this `Tutorial `_ for more details.
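Once a ``Vllm`` object like the one above is constructed, generation goes through the standard llama_index completion API. The snippet below is a minimal usage sketch, assuming a single-GPU setup; the small model, prompt, and generation settings are illustrative rather than taken from the patch.

.. code-block:: python

    from llama_index.llms.vllm import Vllm

    # Illustrative single-GPU configuration; swap in whatever model and settings you need.
    llm = Vllm(
        model="facebook/opt-125m",
        max_new_tokens=64,
        vllm_kwargs={"gpu_memory_utilization": 0.5},
    )

    # complete() returns a CompletionResponse; its .text field holds the generated string.
    response = llm.complete("Explain what vLLM is in one sentence.")
    print(response.text)

Because the underlying vLLM engine is loaded into the current process when ``Vllm`` is constructed, the same ``llm`` object can serve repeated ``complete()`` calls without reloading the model.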