From 21205655d1ea003ec0885d5e2aeb7a9c8842b083 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Thu, 12 Dec 2024 14:37:29 -0500 Subject: [PATCH 01/27] Initial conversion Signed-off-by: Rafael Vasquez --- docs/source/automatic_prefix_caching/apc.md | 102 +++ docs/source/automatic_prefix_caching/apc.rst | 110 --- docs/source/community/meetups.md | 15 + docs/source/community/meetups.rst | 16 - .../contributing/dockerfile/dockerfile.md | 50 ++ .../contributing/dockerfile/dockerfile.rst | 50 -- .../{overview.rst => overview.md} | 137 ++- .../contributing/profiling/profiling_index.md | 41 + .../profiling/profiling_index.rst | 48 - .../{arch_overview.rst => arch_overview.md} | 250 +++--- docs/source/design/huggingface_integration.md | 36 + .../source/design/huggingface_integration.rst | 40 - .../input_processing_pipeline.md | 19 + .../input_processing_pipeline.rst | 20 - .../input_processing/model_inputs_index.md | 43 + .../input_processing/model_inputs_index.rst | 39 - docs/source/design/kernel/paged_attention.md | 527 +++++++++++ docs/source/design/kernel/paged_attention.rst | 525 ----------- .../multimodal/adding_multimodal_plugin.md | 16 + .../multimodal/adding_multimodal_plugin.rst | 17 - ...ltimodal_index.rst => multimodal_index.md} | 61 +- docs/source/design/plugin_system.md | 54 ++ docs/source/design/plugin_system.rst | 62 -- ...ync_llm_engine.rst => async_llm_engine.md} | 5 +- docs/source/dev/engine/engine_index.md | 17 + docs/source/dev/engine/engine_index.rst | 13 - .../engine/{llm_engine.rst => llm_engine.md} | 5 +- .../dev/offline_inference/{llm.rst => llm.md} | 5 +- .../{llm_inputs.rst => llm_inputs.md} | 9 +- .../dev/offline_inference/offline_index.md | 8 + .../dev/offline_inference/offline_index.rst | 8 - .../{pooling_params.rst => pooling_params.md} | 5 +- ...sampling_params.rst => sampling_params.md} | 5 +- .../getting_started/amd-installation.md | 166 ++++ .../getting_started/amd-installation.rst | 178 ---- .../getting_started/arm-installation.md | 46 
+ .../getting_started/arm-installation.rst | 50 -- .../getting_started/cpu-installation.md | 154 ++++ .../getting_started/cpu-installation.rst | 164 ---- docs/source/getting_started/debugging.md | 139 +++ docs/source/getting_started/debugging.rst | 141 --- .../getting_started/examples/api_client.md | 8 + .../getting_started/examples/aqlm_example.md | 8 + .../getting_started/examples/cpu_offload.md | 8 + .../examples/examples_index.md | 44 + .../examples/examples_index.template.md | 8 + .../examples/examples_index.template.rst | 8 - .../examples/florence2_inference.md | 8 + .../examples/gguf_inference.md | 8 + .../gradio_openai_chatbot_webserver.md | 8 + .../examples/gradio_webserver.md | 8 + .../examples/llm_engine_example.md | 8 + .../lora_with_quantization_inference.md | 8 + .../examples/multilora_inference.md | 8 + .../examples/offline_chat_with_tools.md | 8 + .../examples/offline_inference.md | 8 + .../examples/offline_inference_arctic.md | 8 + .../offline_inference_audio_language.md | 8 + .../examples/offline_inference_chat.md | 8 + .../examples/offline_inference_distributed.md | 8 + .../examples/offline_inference_embedding.md | 8 + .../offline_inference_encoder_decoder.md | 8 + .../offline_inference_mlpspeculator.md | 8 + .../examples/offline_inference_neuron.md | 8 + ...line_inference_neuron_int8_quantization.md | 8 + .../examples/offline_inference_pixtral.md | 8 + .../examples/offline_inference_tpu.md | 8 + .../offline_inference_vision_language.md | 8 + ...ine_inference_vision_language_embedding.md | 8 + ...e_inference_vision_language_multi_image.md | 8 + .../examples/offline_inference_with_prefix.md | 8 + .../offline_inference_with_profiler.md | 8 + .../examples/offline_profile.md | 8 + .../examples/openai_chat_completion_client.md | 8 + ...i_chat_completion_client_for_multimodal.md | 8 + ...penai_chat_completion_client_with_tools.md | 8 + ...ai_chat_embedding_client_for_multimodal.md | 8 + .../examples/openai_completion_client.md | 8 + 
.../examples/openai_embedding_client.md | 8 + .../examples/save_sharded_state.md | 8 + .../examples/tensorize_vllm_model.md | 8 + .../getting_started/gaudi-installation.md | 388 +++++++++ .../getting_started/gaudi-installation.rst | 402 --------- docs/source/getting_started/installation.md | 199 +++++ docs/source/getting_started/installation.rst | 214 ----- .../getting_started/neuron-installation.md | 132 +++ .../getting_started/neuron-installation.rst | 140 --- .../getting_started/openvino-installation.md | 104 +++ .../getting_started/openvino-installation.rst | 116 --- docs/source/getting_started/quickstart.md | 174 ++++ docs/source/getting_started/quickstart.rst | 181 ---- .../getting_started/tpu-installation.md | 193 ++++ .../getting_started/tpu-installation.rst | 200 ----- .../getting_started/xpu-installation.md | 74 ++ .../getting_started/xpu-installation.rst | 80 -- docs/source/index.md | 197 +++++ docs/source/index.rst | 191 ---- docs/source/models/adding_model.md | 155 ++++ docs/source/models/adding_model.rst | 159 ---- .../models/enabling_multimodal_inputs.md | 143 +++ .../models/enabling_multimodal_inputs.rst | 147 ---- docs/source/models/generative_models.md | 138 +++ docs/source/models/generative_models.rst | 146 ---- docs/source/models/pooling_models.md | 92 ++ docs/source/models/pooling_models.rst | 99 --- docs/source/models/supported_models.md | 824 ++++++++++++++++++ docs/source/performance/benchmarks.md | 28 + docs/source/performance/benchmarks.rst | 33 - docs/source/quantization/auto_awq.md | 78 ++ docs/source/quantization/auto_awq.rst | 79 -- docs/source/quantization/bnb.md | 39 + docs/source/quantization/bnb.rst | 43 - docs/source/quantization/fp8.md | 192 ++++ docs/source/quantization/fp8.rst | 204 ----- docs/source/quantization/fp8_e4m3_kvcache.md | 44 + docs/source/quantization/fp8_e4m3_kvcache.rst | 47 - docs/source/quantization/fp8_e5m2_kvcache.md | 31 + docs/source/quantization/fp8_e5m2_kvcache.rst | 34 - docs/source/quantization/gguf.md 
| 72 ++ docs/source/quantization/gguf.rst | 73 -- docs/source/quantization/int8.md | 136 +++ docs/source/quantization/int8.rst | 145 --- ...ted_hardware.rst => supported_hardware.md} | 264 +++--- docs/source/serving/deploying_with_bentoml.md | 7 + .../source/serving/deploying_with_bentoml.rst | 8 - .../serving/deploying_with_cerebrium.md | 109 +++ .../serving/deploying_with_cerebrium.rst | 112 --- docs/source/serving/deploying_with_docker.md | 48 + docs/source/serving/deploying_with_docker.rst | 53 -- docs/source/serving/deploying_with_dstack.md | 102 +++ docs/source/serving/deploying_with_dstack.rst | 103 --- ...g_with_helm.rst => deploying_with_helm.md} | 44 +- docs/source/serving/deploying_with_k8s.md | 171 ++++ docs/source/serving/deploying_with_k8s.rst | 175 ---- docs/source/serving/deploying_with_kserve.md | 7 + docs/source/serving/deploying_with_kserve.rst | 8 - docs/source/serving/deploying_with_kubeai.md | 15 + docs/source/serving/deploying_with_kubeai.rst | 17 - docs/source/serving/deploying_with_lws.md | 11 + docs/source/serving/deploying_with_lws.rst | 12 - docs/source/serving/deploying_with_nginx.md | 133 +++ docs/source/serving/deploying_with_nginx.rst | 142 --- docs/source/serving/deploying_with_triton.md | 5 + docs/source/serving/deploying_with_triton.rst | 6 - docs/source/serving/distributed_serving.md | 105 +++ docs/source/serving/distributed_serving.rst | 107 --- docs/source/serving/integrations.md | 17 + docs/source/serving/integrations.rst | 17 - docs/source/serving/metrics.md | 38 + docs/source/serving/metrics.rst | 38 - docs/source/serving/run_on_sky.md | 345 ++++++++ docs/source/serving/run_on_sky.rst | 366 -------- docs/source/serving/serving_with_langchain.md | 30 + .../source/serving/serving_with_langchain.rst | 31 - .../source/serving/serving_with_llamaindex.md | 26 + .../serving/serving_with_llamaindex.rst | 27 - .../source/serving/serving_with_llamastack.md | 38 + .../serving/serving_with_llamastack.rst | 42 - 
docs/source/serving/tensorizer.md | 16 + docs/source/serving/tensorizer.rst | 15 - ...ity_matrix.rst => compatibility_matrix.md} | 378 ++++---- .../usage/{engine_args.rst => engine_args.md} | 14 +- docs/source/usage/env_vars.md | 15 + docs/source/usage/env_vars.rst | 14 - docs/source/usage/{faq.rst => faq.md} | 31 +- docs/source/usage/lora.md | 215 +++++ docs/source/usage/lora.rst | 225 ----- docs/source/usage/multimodal_inputs.md | 398 +++++++++ docs/source/usage/multimodal_inputs.rst | 404 --------- .../usage/{performance.rst => performance.md} | 53 +- docs/source/usage/spec_decode.md | 205 +++++ docs/source/usage/spec_decode.rst | 210 ----- docs/source/usage/structured_outputs.md | 260 ++++++ docs/source/usage/structured_outputs.rst | 267 ------ 174 files changed, 8161 insertions(+), 7256 deletions(-) create mode 100644 docs/source/automatic_prefix_caching/apc.md delete mode 100644 docs/source/automatic_prefix_caching/apc.rst create mode 100644 docs/source/community/meetups.md delete mode 100644 docs/source/community/meetups.rst create mode 100644 docs/source/contributing/dockerfile/dockerfile.md delete mode 100644 docs/source/contributing/dockerfile/dockerfile.rst rename docs/source/contributing/{overview.rst => overview.md} (51%) create mode 100644 docs/source/contributing/profiling/profiling_index.md delete mode 100644 docs/source/contributing/profiling/profiling_index.rst rename docs/source/design/{arch_overview.rst => arch_overview.md} (54%) create mode 100644 docs/source/design/huggingface_integration.md delete mode 100644 docs/source/design/huggingface_integration.rst create mode 100644 docs/source/design/input_processing/input_processing_pipeline.md delete mode 100644 docs/source/design/input_processing/input_processing_pipeline.rst create mode 100644 docs/source/design/input_processing/model_inputs_index.md delete mode 100644 docs/source/design/input_processing/model_inputs_index.rst create mode 100644 docs/source/design/kernel/paged_attention.md delete 
mode 100644 docs/source/design/kernel/paged_attention.rst create mode 100644 docs/source/design/multimodal/adding_multimodal_plugin.md delete mode 100644 docs/source/design/multimodal/adding_multimodal_plugin.rst rename docs/source/design/multimodal/{multimodal_index.rst => multimodal_index.md} (61%) create mode 100644 docs/source/design/plugin_system.md delete mode 100644 docs/source/design/plugin_system.rst rename docs/source/dev/engine/{async_llm_engine.rst => async_llm_engine.md} (59%) create mode 100644 docs/source/dev/engine/engine_index.md delete mode 100644 docs/source/dev/engine/engine_index.rst rename docs/source/dev/engine/{llm_engine.rst => llm_engine.md} (60%) rename docs/source/dev/offline_inference/{llm.rst => llm.md} (67%) rename docs/source/dev/offline_inference/{llm_inputs.rst => llm_inputs.md} (78%) create mode 100644 docs/source/dev/offline_inference/offline_index.md delete mode 100644 docs/source/dev/offline_inference/offline_index.rst rename docs/source/dev/{pooling_params.rst => pooling_params.md} (55%) rename docs/source/dev/{sampling_params.rst => sampling_params.md} (55%) create mode 100644 docs/source/getting_started/amd-installation.md delete mode 100644 docs/source/getting_started/amd-installation.rst create mode 100644 docs/source/getting_started/arm-installation.md delete mode 100644 docs/source/getting_started/arm-installation.rst create mode 100644 docs/source/getting_started/cpu-installation.md delete mode 100644 docs/source/getting_started/cpu-installation.rst create mode 100644 docs/source/getting_started/debugging.md delete mode 100644 docs/source/getting_started/debugging.rst create mode 100644 docs/source/getting_started/examples/api_client.md create mode 100644 docs/source/getting_started/examples/aqlm_example.md create mode 100644 docs/source/getting_started/examples/cpu_offload.md create mode 100644 docs/source/getting_started/examples/examples_index.md create mode 100644 
docs/source/getting_started/examples/examples_index.template.md delete mode 100644 docs/source/getting_started/examples/examples_index.template.rst create mode 100644 docs/source/getting_started/examples/florence2_inference.md create mode 100644 docs/source/getting_started/examples/gguf_inference.md create mode 100644 docs/source/getting_started/examples/gradio_openai_chatbot_webserver.md create mode 100644 docs/source/getting_started/examples/gradio_webserver.md create mode 100644 docs/source/getting_started/examples/llm_engine_example.md create mode 100644 docs/source/getting_started/examples/lora_with_quantization_inference.md create mode 100644 docs/source/getting_started/examples/multilora_inference.md create mode 100644 docs/source/getting_started/examples/offline_chat_with_tools.md create mode 100644 docs/source/getting_started/examples/offline_inference.md create mode 100644 docs/source/getting_started/examples/offline_inference_arctic.md create mode 100644 docs/source/getting_started/examples/offline_inference_audio_language.md create mode 100644 docs/source/getting_started/examples/offline_inference_chat.md create mode 100644 docs/source/getting_started/examples/offline_inference_distributed.md create mode 100644 docs/source/getting_started/examples/offline_inference_embedding.md create mode 100644 docs/source/getting_started/examples/offline_inference_encoder_decoder.md create mode 100644 docs/source/getting_started/examples/offline_inference_mlpspeculator.md create mode 100644 docs/source/getting_started/examples/offline_inference_neuron.md create mode 100644 docs/source/getting_started/examples/offline_inference_neuron_int8_quantization.md create mode 100644 docs/source/getting_started/examples/offline_inference_pixtral.md create mode 100644 docs/source/getting_started/examples/offline_inference_tpu.md create mode 100644 docs/source/getting_started/examples/offline_inference_vision_language.md create mode 100644 
docs/source/getting_started/examples/offline_inference_vision_language_embedding.md create mode 100644 docs/source/getting_started/examples/offline_inference_vision_language_multi_image.md create mode 100644 docs/source/getting_started/examples/offline_inference_with_prefix.md create mode 100644 docs/source/getting_started/examples/offline_inference_with_profiler.md create mode 100644 docs/source/getting_started/examples/offline_profile.md create mode 100644 docs/source/getting_started/examples/openai_chat_completion_client.md create mode 100644 docs/source/getting_started/examples/openai_chat_completion_client_for_multimodal.md create mode 100644 docs/source/getting_started/examples/openai_chat_completion_client_with_tools.md create mode 100644 docs/source/getting_started/examples/openai_chat_embedding_client_for_multimodal.md create mode 100644 docs/source/getting_started/examples/openai_completion_client.md create mode 100644 docs/source/getting_started/examples/openai_embedding_client.md create mode 100644 docs/source/getting_started/examples/save_sharded_state.md create mode 100644 docs/source/getting_started/examples/tensorize_vllm_model.md create mode 100644 docs/source/getting_started/gaudi-installation.md delete mode 100644 docs/source/getting_started/gaudi-installation.rst create mode 100644 docs/source/getting_started/installation.md delete mode 100644 docs/source/getting_started/installation.rst create mode 100644 docs/source/getting_started/neuron-installation.md delete mode 100644 docs/source/getting_started/neuron-installation.rst create mode 100644 docs/source/getting_started/openvino-installation.md delete mode 100644 docs/source/getting_started/openvino-installation.rst create mode 100644 docs/source/getting_started/quickstart.md delete mode 100644 docs/source/getting_started/quickstart.rst create mode 100644 docs/source/getting_started/tpu-installation.md delete mode 100644 docs/source/getting_started/tpu-installation.rst create mode 100644 
docs/source/getting_started/xpu-installation.md delete mode 100644 docs/source/getting_started/xpu-installation.rst create mode 100644 docs/source/index.md delete mode 100644 docs/source/index.rst create mode 100644 docs/source/models/adding_model.md delete mode 100644 docs/source/models/adding_model.rst create mode 100644 docs/source/models/enabling_multimodal_inputs.md delete mode 100644 docs/source/models/enabling_multimodal_inputs.rst create mode 100644 docs/source/models/generative_models.md delete mode 100644 docs/source/models/generative_models.rst create mode 100644 docs/source/models/pooling_models.md delete mode 100644 docs/source/models/pooling_models.rst create mode 100644 docs/source/models/supported_models.md create mode 100644 docs/source/performance/benchmarks.md delete mode 100644 docs/source/performance/benchmarks.rst create mode 100644 docs/source/quantization/auto_awq.md delete mode 100644 docs/source/quantization/auto_awq.rst create mode 100644 docs/source/quantization/bnb.md delete mode 100644 docs/source/quantization/bnb.rst create mode 100644 docs/source/quantization/fp8.md delete mode 100644 docs/source/quantization/fp8.rst create mode 100644 docs/source/quantization/fp8_e4m3_kvcache.md delete mode 100644 docs/source/quantization/fp8_e4m3_kvcache.rst create mode 100644 docs/source/quantization/fp8_e5m2_kvcache.md delete mode 100644 docs/source/quantization/fp8_e5m2_kvcache.rst create mode 100644 docs/source/quantization/gguf.md delete mode 100644 docs/source/quantization/gguf.rst create mode 100644 docs/source/quantization/int8.md delete mode 100644 docs/source/quantization/int8.rst rename docs/source/quantization/{supported_hardware.rst => supported_hardware.md} (84%) create mode 100644 docs/source/serving/deploying_with_bentoml.md delete mode 100644 docs/source/serving/deploying_with_bentoml.rst create mode 100644 docs/source/serving/deploying_with_cerebrium.md delete mode 100644 docs/source/serving/deploying_with_cerebrium.rst create 
mode 100644 docs/source/serving/deploying_with_docker.md delete mode 100644 docs/source/serving/deploying_with_docker.rst create mode 100644 docs/source/serving/deploying_with_dstack.md delete mode 100644 docs/source/serving/deploying_with_dstack.rst rename docs/source/serving/{deploying_with_helm.rst => deploying_with_helm.md} (88%) create mode 100644 docs/source/serving/deploying_with_k8s.md delete mode 100644 docs/source/serving/deploying_with_k8s.rst create mode 100644 docs/source/serving/deploying_with_kserve.md delete mode 100644 docs/source/serving/deploying_with_kserve.rst create mode 100644 docs/source/serving/deploying_with_kubeai.md delete mode 100644 docs/source/serving/deploying_with_kubeai.rst create mode 100644 docs/source/serving/deploying_with_lws.md delete mode 100644 docs/source/serving/deploying_with_lws.rst create mode 100644 docs/source/serving/deploying_with_nginx.md delete mode 100644 docs/source/serving/deploying_with_nginx.rst create mode 100644 docs/source/serving/deploying_with_triton.md delete mode 100644 docs/source/serving/deploying_with_triton.rst create mode 100644 docs/source/serving/distributed_serving.md delete mode 100644 docs/source/serving/distributed_serving.rst create mode 100644 docs/source/serving/integrations.md delete mode 100644 docs/source/serving/integrations.rst create mode 100644 docs/source/serving/metrics.md delete mode 100644 docs/source/serving/metrics.rst create mode 100644 docs/source/serving/run_on_sky.md delete mode 100644 docs/source/serving/run_on_sky.rst create mode 100644 docs/source/serving/serving_with_langchain.md delete mode 100644 docs/source/serving/serving_with_langchain.rst create mode 100644 docs/source/serving/serving_with_llamaindex.md delete mode 100644 docs/source/serving/serving_with_llamaindex.rst create mode 100644 docs/source/serving/serving_with_llamastack.md delete mode 100644 docs/source/serving/serving_with_llamastack.rst create mode 100644 docs/source/serving/tensorizer.md delete 
mode 100644 docs/source/serving/tensorizer.rst rename docs/source/usage/{compatibility_matrix.rst => compatibility_matrix.md} (72%) rename docs/source/usage/{engine_args.rst => engine_args.md} (76%) create mode 100644 docs/source/usage/env_vars.md delete mode 100644 docs/source/usage/env_vars.rst rename docs/source/usage/{faq.rst => faq.md} (61%) create mode 100644 docs/source/usage/lora.md delete mode 100644 docs/source/usage/lora.rst create mode 100644 docs/source/usage/multimodal_inputs.md delete mode 100644 docs/source/usage/multimodal_inputs.rst rename docs/source/usage/{performance.rst => performance.md} (61%) create mode 100644 docs/source/usage/spec_decode.md delete mode 100644 docs/source/usage/spec_decode.rst create mode 100644 docs/source/usage/structured_outputs.md delete mode 100644 docs/source/usage/structured_outputs.rst diff --git a/docs/source/automatic_prefix_caching/apc.md b/docs/source/automatic_prefix_caching/apc.md new file mode 100644 index 0000000000000..c0c141c5fb7ef --- /dev/null +++ b/docs/source/automatic_prefix_caching/apc.md @@ -0,0 +1,102 @@ +(apc)= + +# Introduction + +## What is Automatic Prefix Caching + +Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part. + +```{note} +Technical details on how vLLM implements APC are in the next page. +``` + +## Enabling APC in vLLM + +Set `enable_prefix_caching=True` in vLLM engine to enable APC. Here is an example: + +```python +import time +from vllm import LLM, SamplingParams + + +# A prompt containing a large markdown table. The table is randomly generated by GPT-4. +LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. 
Here is a table as follows.\n# Table\n" + """ +| ID | Name | Age | Occupation | Country | Email | Phone Number | Address | +|-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------| +| 1 | John Doe | 29 | Engineer | USA | john.doe@example.com | 555-1234 | 123 Elm St, Springfield, IL | +| 2 | Jane Smith | 34 | Doctor | Canada | jane.smith@example.com | 555-5678 | 456 Oak St, Toronto, ON | +| 3 | Alice Johnson | 27 | Teacher | UK | alice.j@example.com | 555-8765 | 789 Pine St, London, UK | +| 4 | Bob Brown | 45 | Artist | Australia | bob.b@example.com | 555-4321 | 321 Maple St, Sydney, NSW | +| 5 | Carol White | 31 | Scientist | New Zealand | carol.w@example.com | 555-6789 | 654 Birch St, Wellington, NZ | +| 6 | Dave Green | 28 | Lawyer | Ireland | dave.g@example.com | 555-3456 | 987 Cedar St, Dublin, IE | +| 7 | Emma Black | 40 | Musician | USA | emma.b@example.com | 555-1111 | 246 Ash St, New York, NY | +| 8 | Frank Blue | 37 | Chef | Canada | frank.b@example.com | 555-2222 | 135 Spruce St, Vancouver, BC | +| 9 | Grace Yellow | 50 | Engineer | UK | grace.y@example.com | 555-3333 | 864 Fir St, Manchester, UK | +| 10 | Henry Violet | 32 | Artist | Australia | henry.v@example.com | 555-4444 | 753 Willow St, Melbourne, VIC| +| 11 | Irene Orange | 26 | Scientist | New Zealand | irene.o@example.com | 555-5555 | 912 Poplar St, Auckland, NZ | +| 12 | Jack Indigo | 38 | Teacher | Ireland | jack.i@example.com | 555-6666 | 159 Elm St, Cork, IE | +| 13 | Karen Red | 41 | Lawyer | USA | karen.r@example.com | 555-7777 | 357 Cedar St, Boston, MA | +| 14 | Leo Brown | 30 | Chef | Canada | leo.b@example.com | 555-8888 | 246 Oak St, Calgary, AB | +| 15 | Mia Green | 33 | Musician | UK | mia.g@example.com | 555-9999 | 975 Pine St, Edinburgh, UK | +| 16 | Noah Yellow | 29 | Doctor | Australia | noah.y@example.com | 555-0000 | 864 Birch St, Brisbane, QLD | +| 17 | Olivia Blue | 35 | Engineer | New Zealand | 
olivia.b@example.com | 555-1212 | 753 Maple St, Hamilton, NZ | +| 18 | Peter Black | 42 | Artist | Ireland | peter.b@example.com | 555-3434 | 912 Fir St, Limerick, IE | +| 19 | Quinn White | 28 | Scientist | USA | quinn.w@example.com | 555-5656 | 159 Willow St, Seattle, WA | +| 20 | Rachel Red | 31 | Teacher | Canada | rachel.r@example.com | 555-7878 | 357 Poplar St, Ottawa, ON | +| 21 | Steve Green | 44 | Lawyer | UK | steve.g@example.com | 555-9090 | 753 Elm St, Birmingham, UK | +| 22 | Tina Blue | 36 | Musician | Australia | tina.b@example.com | 555-1213 | 864 Cedar St, Perth, WA | +| 23 | Umar Black | 39 | Chef | New Zealand | umar.b@example.com | 555-3435 | 975 Spruce St, Christchurch, NZ| +| 24 | Victor Yellow | 43 | Engineer | Ireland | victor.y@example.com | 555-5657 | 246 Willow St, Galway, IE | +| 25 | Wendy Orange | 27 | Artist | USA | wendy.o@example.com | 555-7879 | 135 Elm St, Denver, CO | +| 26 | Xavier Green | 34 | Scientist | Canada | xavier.g@example.com | 555-9091 | 357 Oak St, Montreal, QC | +| 27 | Yara Red | 41 | Teacher | UK | yara.r@example.com | 555-1214 | 975 Pine St, Leeds, UK | +| 28 | Zack Blue | 30 | Lawyer | Australia | zack.b@example.com | 555-3436 | 135 Birch St, Adelaide, SA | +| 29 | Amy White | 33 | Musician | New Zealand | amy.w@example.com | 555-5658 | 159 Maple St, Wellington, NZ | +| 30 | Ben Black | 38 | Chef | Ireland | ben.b@example.com | 555-7870 | 246 Fir St, Waterford, IE | +""" + + +def get_generation_time(llm, sampling_params, prompts): + # time the generation + start_time = time.time() + output = llm.generate(prompts, sampling_params=sampling_params) + end_time = time.time() + # print the output and generation time + print(f"Output: {output[0].outputs[0].text}") + print(f"Generation time: {end_time - start_time} seconds.") + + +# set enable_prefix_caching=True to enable APC +llm = LLM( + model='lmsys/longchat-13b-16k', + enable_prefix_caching=True +) + +sampling_params = SamplingParams(temperature=0, max_tokens=100) 
+ +# Querying the age of John Doe +get_generation_time( + llm, + sampling_params, + LONG_PROMPT + "Question: what is the age of John Doe? Your answer: The age of John Doe is ", +) + +# Querying the age of Zack Blue +# This query will be faster since vllm avoids computing the KV cache of LONG_PROMPT again. +get_generation_time( + llm, + sampling_params, + LONG_PROMPT + "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is ", +) +``` + +## Example workloads + +We describe two example workloads, where APC can provide huge performance benefit: + +- Long document query, where the user repeatedly queries the same long document (e.g. software manual or annual report) with different queries. In this case, instead of processing the long document again and again, APC allows vLLM to process this long document *only once*, and all future requests can avoid recomputing this long document by reusing its KV cache. This allows vLLM to serve future requests with much higher throughput and much lower latency. +- Multi-round conversation, where the user may chat with the application multiple times in the same chatting session. In this case, instead of processing the whole chatting history again and again, APC allows vLLM to reuse the processing results of the chat history across all future rounds of conversation, allowing vLLM to serve future requests with much higher throughput and much lower latency. + +## Limits + +APC in general does not reduce the performance of vLLM. With that being said, APC only reduces the time of processing the queries (the prefilling phase) and does not reduce the time of generating new tokens (the decoding phase). So APC does not bring performance gain when vLLM spends most of the time generating answers to the queries (e.g. when the length of the answer is long), or new queries do not share the same prefix with any of existing queries (so that the computation cannot be reused). 
diff --git a/docs/source/automatic_prefix_caching/apc.rst b/docs/source/automatic_prefix_caching/apc.rst deleted file mode 100644 index 0d70c74689bf9..0000000000000 --- a/docs/source/automatic_prefix_caching/apc.rst +++ /dev/null @@ -1,110 +0,0 @@ -.. _apc: - -Introduction -============ - -What is Automatic Prefix Caching --------------------------------- - -Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part. - - -.. note:: - - Technical details on how vLLM implements APC are in the next page. - - - -Enabling APC in vLLM --------------------- - -Set ``enable_prefix_caching=True`` in vLLM engine to enable APC. Here is an example: - -.. code-block:: python - - import time - from vllm import LLM, SamplingParams - - - # A prompt containing a large markdown table. The table is randomly generated by GPT-4. - LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. 
Here is a table as follows.\n# Table\n" + """ - | ID | Name | Age | Occupation | Country | Email | Phone Number | Address | - |-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------| - | 1 | John Doe | 29 | Engineer | USA | john.doe@example.com | 555-1234 | 123 Elm St, Springfield, IL | - | 2 | Jane Smith | 34 | Doctor | Canada | jane.smith@example.com | 555-5678 | 456 Oak St, Toronto, ON | - | 3 | Alice Johnson | 27 | Teacher | UK | alice.j@example.com | 555-8765 | 789 Pine St, London, UK | - | 4 | Bob Brown | 45 | Artist | Australia | bob.b@example.com | 555-4321 | 321 Maple St, Sydney, NSW | - | 5 | Carol White | 31 | Scientist | New Zealand | carol.w@example.com | 555-6789 | 654 Birch St, Wellington, NZ | - | 6 | Dave Green | 28 | Lawyer | Ireland | dave.g@example.com | 555-3456 | 987 Cedar St, Dublin, IE | - | 7 | Emma Black | 40 | Musician | USA | emma.b@example.com | 555-1111 | 246 Ash St, New York, NY | - | 8 | Frank Blue | 37 | Chef | Canada | frank.b@example.com | 555-2222 | 135 Spruce St, Vancouver, BC | - | 9 | Grace Yellow | 50 | Engineer | UK | grace.y@example.com | 555-3333 | 864 Fir St, Manchester, UK | - | 10 | Henry Violet | 32 | Artist | Australia | henry.v@example.com | 555-4444 | 753 Willow St, Melbourne, VIC| - | 11 | Irene Orange | 26 | Scientist | New Zealand | irene.o@example.com | 555-5555 | 912 Poplar St, Auckland, NZ | - | 12 | Jack Indigo | 38 | Teacher | Ireland | jack.i@example.com | 555-6666 | 159 Elm St, Cork, IE | - | 13 | Karen Red | 41 | Lawyer | USA | karen.r@example.com | 555-7777 | 357 Cedar St, Boston, MA | - | 14 | Leo Brown | 30 | Chef | Canada | leo.b@example.com | 555-8888 | 246 Oak St, Calgary, AB | - | 15 | Mia Green | 33 | Musician | UK | mia.g@example.com | 555-9999 | 975 Pine St, Edinburgh, UK | - | 16 | Noah Yellow | 29 | Doctor | Australia | noah.y@example.com | 555-0000 | 864 Birch St, Brisbane, QLD | - | 17 | Olivia Blue | 35 | Engineer 
| New Zealand | olivia.b@example.com | 555-1212 | 753 Maple St, Hamilton, NZ | - | 18 | Peter Black | 42 | Artist | Ireland | peter.b@example.com | 555-3434 | 912 Fir St, Limerick, IE | - | 19 | Quinn White | 28 | Scientist | USA | quinn.w@example.com | 555-5656 | 159 Willow St, Seattle, WA | - | 20 | Rachel Red | 31 | Teacher | Canada | rachel.r@example.com | 555-7878 | 357 Poplar St, Ottawa, ON | - | 21 | Steve Green | 44 | Lawyer | UK | steve.g@example.com | 555-9090 | 753 Elm St, Birmingham, UK | - | 22 | Tina Blue | 36 | Musician | Australia | tina.b@example.com | 555-1213 | 864 Cedar St, Perth, WA | - | 23 | Umar Black | 39 | Chef | New Zealand | umar.b@example.com | 555-3435 | 975 Spruce St, Christchurch, NZ| - | 24 | Victor Yellow | 43 | Engineer | Ireland | victor.y@example.com | 555-5657 | 246 Willow St, Galway, IE | - | 25 | Wendy Orange | 27 | Artist | USA | wendy.o@example.com | 555-7879 | 135 Elm St, Denver, CO | - | 26 | Xavier Green | 34 | Scientist | Canada | xavier.g@example.com | 555-9091 | 357 Oak St, Montreal, QC | - | 27 | Yara Red | 41 | Teacher | UK | yara.r@example.com | 555-1214 | 975 Pine St, Leeds, UK | - | 28 | Zack Blue | 30 | Lawyer | Australia | zack.b@example.com | 555-3436 | 135 Birch St, Adelaide, SA | - | 29 | Amy White | 33 | Musician | New Zealand | amy.w@example.com | 555-5658 | 159 Maple St, Wellington, NZ | - | 30 | Ben Black | 38 | Chef | Ireland | ben.b@example.com | 555-7870 | 246 Fir St, Waterford, IE | - """ - - - def get_generation_time(llm, sampling_params, prompts): - # time the generation - start_time = time.time() - output = llm.generate(prompts, sampling_params=sampling_params) - end_time = time.time() - # print the output and generation time - print(f"Output: {output[0].outputs[0].text}") - print(f"Generation time: {end_time - start_time} seconds.") - - - # set enable_prefix_caching=True to enable APC - llm = LLM( - model='lmsys/longchat-13b-16k', - enable_prefix_caching=True - ) - - sampling_params = 
SamplingParams(temperature=0, max_tokens=100) - - # Querying the age of John Doe - get_generation_time( - llm, - sampling_params, - LONG_PROMPT + "Question: what is the age of John Doe? Your answer: The age of John Doe is ", - ) - - # Querying the age of Zack Blue - # This query will be faster since vllm avoids computing the KV cache of LONG_PROMPT again. - get_generation_time( - llm, - sampling_params, - LONG_PROMPT + "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is ", - ) - -Example workloads ------------------ - -We describe two example workloads, where APC can provide huge performance benefit: - -- Long document query, where the user repeatedly queries the same long document (e.g. software manual or annual report) with different queries. In this case, instead of processing the long document again and again, APC allows vLLM to process this long document *only once*, and all future requests can avoid recomputing this long document by reusing its KV cache. This allows vLLM to serve future requests with much higher throughput and much lower latency. -- Multi-round conversation, where the user may chat with the application multiple times in the same chatting session. In this case, instead of processing the whole chatting history again and again, APC allows vLLM to reuse the processing results of the chat history across all future rounds of conversation, allowing vLLM to serve future requests with much higher throughput and much lower latency. - - -Limits ------- -APC in general does not reduce the performance of vLLM. With that being said, APC only reduces the time of processing the queries (the prefilling phase) and does not reduce the time of generating new tokens (the decoding phase). So APC does not bring performance gain when vLLM spends most of the time generating answers to the queries (e.g. 
when the length of the answer is long), or new queries do not share the same prefix with any of existing queries (so that the computation cannot be reused). diff --git a/docs/source/community/meetups.md b/docs/source/community/meetups.md new file mode 100644 index 0000000000000..43fa9ee616096 --- /dev/null +++ b/docs/source/community/meetups.md @@ -0,0 +1,15 @@ +(meetups)= + +# vLLM Meetups + +We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: + +- [The seventh vLLM meetup](https://lu.ma/h0qvrajz), with Snowflake, November 14th 2024. [[Slides]](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing) +- [The sixth vLLM meetup](https://lu.ma/87q3nvnh), with NVIDIA, September 9th 2024. [[Slides]](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing) +- [The fifth vLLM meetup](https://lu.ma/lp0gyjqr), with AWS, July 24th 2024. [[Slides]](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing) +- [The fourth vLLM meetup](https://lu.ma/agivllm), with Cloudflare and BentoML, June 11th 2024. [[Slides]](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing) +- [The third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/), with Roblox, April 2nd 2024. [[Slides]](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing) +- [The second vLLM meetup](https://lu.ma/ygxbpzhl), with IBM Research, January 31st 2024. 
[[Slides]](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing) [[Video (vLLM Update)]](https://youtu.be/Y0C-DUvEnZQ) [[Video (IBM Research & torch.compile)]](https://youtu.be/m0dMtFLI-dg) +- [The first vLLM meetup](https://lu.ma/first-vllm-meetup), with a16z, October 5th 2023. [[Slides]](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing) + +We are always looking for speakers and sponsors at San Francisco Bay Area and potentially other locations. If you are interested in speaking or sponsoring, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu). diff --git a/docs/source/community/meetups.rst b/docs/source/community/meetups.rst deleted file mode 100644 index c87f01aa263b3..0000000000000 --- a/docs/source/community/meetups.rst +++ /dev/null @@ -1,16 +0,0 @@ -.. _meetups: - -vLLM Meetups -============ - -We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: - -- `The seventh vLLM meetup `__, with Snowflake, November 14th 2024. `[Slides] `__ -- `The sixth vLLM meetup `__, with NVIDIA, September 9th 2024. `[Slides] `__ -- `The fifth vLLM meetup `__, with AWS, July 24th 2024. `[Slides] `__ -- `The fourth vLLM meetup `__, with Cloudflare and BentoML, June 11th 2024. `[Slides] `__ -- `The third vLLM meetup `__, with Roblox, April 2nd 2024. `[Slides] `__ -- `The second vLLM meetup `__, with IBM Research, January 31st 2024. `[Slides] `__ `[Video (vLLM Update)] `__ `[Video (IBM Research & torch.compile)] `__ -- `The first vLLM meetup `__, with a16z, October 5th 2023. `[Slides] `__ - -We are always looking for speakers and sponsors at San Francisco Bay Area and potentially other locations. 
If you are interested in speaking or sponsoring, please contact us at `vllm-questions@lists.berkeley.edu `__. diff --git a/docs/source/contributing/dockerfile/dockerfile.md b/docs/source/contributing/dockerfile/dockerfile.md new file mode 100644 index 0000000000000..d72b99fe017b6 --- /dev/null +++ b/docs/source/contributing/dockerfile/dockerfile.md @@ -0,0 +1,50 @@ +# Dockerfile + +See [here](https://github.com/vllm-project/vllm/blob/main/Dockerfile) for the main Dockerfile to construct +the image for running an OpenAI compatible server with vLLM. More information about deploying with Docker can be found [here](https://docs.vllm.ai/en/stable/serving/deploying_with_docker.html). + +Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes: + +- All build stages +- The default build target (highlighted in grey) +- External images (with dashed borders) + +The edges of the build graph represent: + +- FROM ... dependencies (with a solid line and a full arrow head) + +- COPY --from=... dependencies (with a dashed line and an empty arrow head) + +- RUN --mount=(.\*)from=... 
dependencies (with a dotted line and an empty diamond arrow head) + + > ```{figure} ../../assets/dev/dockerfile-stages-dependency.png + > :align: center + > :alt: query + > :width: 100% + > ``` + > + > Made using: <https://github.com/patrickhoefler/dockerfilegraph> + > + > Commands to regenerate the build graph (make sure to run it **from the \`root\` directory of the vLLM repository** where the dockerfile is present): + > + > ```bash + > dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename Dockerfile + > ``` + > + > or in case you want to run it directly with the docker image: + > + > ```bash + > docker run \ + > --rm \ + > --user "$(id -u):$(id -g)" \ + > --workdir /workspace \ + > --volume "$(pwd)":/workspace \ + > ghcr.io/patrickhoefler/dockerfilegraph:alpine \ + > --output png \ + > --dpi 200 \ + > --max-label-length 50 \ + > --filename Dockerfile \ + > --legend + > ``` + > + > (To run it for a different file, you can pass in a different argument to the flag `--filename`.) diff --git a/docs/source/contributing/dockerfile/dockerfile.rst b/docs/source/contributing/dockerfile/dockerfile.rst deleted file mode 100644 index 9c17c27aa61bf..0000000000000 --- a/docs/source/contributing/dockerfile/dockerfile.rst +++ /dev/null @@ -1,50 +0,0 @@ -Dockerfile -==================== - -See `here `__ for the main Dockerfile to construct -the image for running an OpenAI compatible server with vLLM. More information about deploying with Docker can be found `here `__. - -Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes: - -- All build stages -- The default build target (highlighted in grey) -- External images (with dashed borders) - -The edges of the build graph represent: - -- FROM ... dependencies (with a solid line and a full arrow head) -- COPY --from=... dependencies (with a dashed line and an empty arrow head) -- RUN --mount=(.*)from=... dependencies (with a dotted line and an empty diamond arrow head) - - ..
figure:: ../../assets/dev/dockerfile-stages-dependency.png - :alt: query - :width: 100% - :align: center - - Made using: https://github.com/patrickhoefler/dockerfilegraph - - Commands to regenerate the build graph (make sure to run it **from the `root` directory of the vLLM repository** where the dockerfile is present): - - .. code:: bash - - dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename Dockerfile - - or in case you want to run it directly with the docker image: - - .. code:: bash - - docker run \ - --rm \ - --user "$(id -u):$(id -g)" \ - --workdir /workspace \ - --volume "$(pwd)":/workspace \ - ghcr.io/patrickhoefler/dockerfilegraph:alpine \ - --output png \ - --dpi 200 \ - --max-label-length 50 \ - --filename Dockerfile \ - --legend - - (To run it for a different file, you can pass in a different argument to the flag `--filename`.) - - \ No newline at end of file diff --git a/docs/source/contributing/overview.rst b/docs/source/contributing/overview.md similarity index 51% rename from docs/source/contributing/overview.rst rename to docs/source/contributing/overview.md index 4cea0afdaea74..53e8e78f08e72 100644 --- a/docs/source/contributing/overview.rst +++ b/docs/source/contributing/overview.md @@ -1,5 +1,4 @@ -Contributing to vLLM -===================== +# Contributing to vLLM Thank you for your interest in contributing to vLLM! Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. There are several ways you can contribute to the project: @@ -12,132 +11,121 @@ We also believe in the power of community support; thus, answering queries, offe Finally, one of the most impactful ways to support us is by raising awareness about vLLM. Talk about it in your blog posts and highlight how it's driving your incredible projects. Express your support on social media if you're using vLLM, or simply offer your appreciation by starring our repository! -License -------- +## License -See `LICENSE `_. 
+See [LICENSE](https://github.com/vllm-project/vllm/tree/main/LICENSE). -Developing ----------- +## Developing -Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the `building from source `_ documentation for details. +Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the [building from source](https://docs.vllm.ai/en/latest/getting_started/installation.html#build-from-source) documentation for details. -Testing -------- +## Testing -.. code-block:: bash +```bash +pip install -r requirements-dev.txt - pip install -r requirements-dev.txt +# linting and formatting +bash format.sh +# Static type checking +mypy +# Unit tests +pytest tests/ +``` - # linting and formatting - bash format.sh - # Static type checking - mypy - # Unit tests - pytest tests/ +```{note} +Currently, the repository does not pass the `mypy` tests. +``` -.. note:: Currently, the repository does not pass the ``mypy`` tests. +# Contribution Guidelines -Contribution Guidelines -======================= +## Issues -Issues ------- +If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. -If you encounter a bug or have a feature request, please `search existing issues `_ first to see if it has already been reported. If not, please `file a new issue `_, providing as much relevant information as possible. +```{important} +If you discover a security vulnerability, please follow the instructions [here](https://github.com/vllm-project/vllm/tree/main/SECURITY.md#reporting-a-vulnerability). +``` -.. 
important:: - If you discover a security vulnerability, please follow the instructions `here `_. - -Pull Requests & Code Reviews ----------------------------- +## Pull Requests & Code Reviews Thank you for your contribution to vLLM! Before submitting the pull request, please ensure the PR meets the following criteria. This helps vLLM maintain the code quality and improve the efficiency of the review process. -DCO and Signed-off-by -^^^^^^^^^^^^^^^^^^^^^ +### DCO and Signed-off-by -When contributing changes to this project, you must agree to the `DCO `_. -Commits must include a ``Signed-off-by:`` header which certifies agreement with -the terms of the `DCO `_. +When contributing changes to this project, you must agree to the [DCO](https://github.com/vllm-project/vllm/tree/main/DCO). +Commits must include a `Signed-off-by:` header which certifies agreement with +the terms of the [DCO](https://github.com/vllm-project/vllm/tree/main/DCO). -Using ``-s`` with ``git commit`` will automatically add this header. +Using `-s` with `git commit` will automatically add this header. -PR Title and Classification -^^^^^^^^^^^^^^^^^^^^^^^^^^^ +### PR Title and Classification Only specific types of PRs will be reviewed. The PR title is prefixed appropriately to indicate the type of change. Please use one of the following: -- ``[Bugfix]`` for bug fixes. -- ``[CI/Build]`` for build or continuous integration improvements. -- ``[Doc]`` for documentation fixes and improvements. -- ``[Model]`` for adding a new model or improving an existing model. Model name +- `[Bugfix]` for bug fixes. +- `[CI/Build]` for build or continuous integration improvements. +- `[Doc]` for documentation fixes and improvements. +- `[Model]` for adding a new model or improving an existing model. Model name should appear in the title. -- ``[Frontend]`` For changes on the vLLM frontend (e.g., OpenAI API server, - ``LLM`` class, etc.) -- ``[Kernel]`` for changes affecting CUDA kernels or other compute kernels. 
-- ``[Core]`` for changes in the core vLLM logic (e.g., ``LLMEngine``, - ``AsyncLLMEngine``, ``Scheduler``, etc.) -- ``[Hardware][Vendor]`` for hardware-specific changes. Vendor name should - appear in the prefix (e.g., ``[Hardware][AMD]``). -- ``[Misc]`` for PRs that do not fit the above categories. Please use this +- `[Frontend]` For changes on the vLLM frontend (e.g., OpenAI API server, + `LLM` class, etc.) +- `[Kernel]` for changes affecting CUDA kernels or other compute kernels. +- `[Core]` for changes in the core vLLM logic (e.g., `LLMEngine`, + `AsyncLLMEngine`, `Scheduler`, etc.) +- `[Hardware][Vendor]` for hardware-specific changes. Vendor name should + appear in the prefix (e.g., `[Hardware][AMD]`). +- `[Misc]` for PRs that do not fit the above categories. Please use this sparingly. -.. note:: - If the PR spans more than one category, please include all relevant prefixes. +```{note} +If the PR spans more than one category, please include all relevant prefixes. +``` -Code Quality -^^^^^^^^^^^^ +### Code Quality The PR needs to meet the following code quality standards: -- We adhere to `Google Python style guide - `_ and `Google C++ style guide - `_. -- Pass all linter checks. Please use `format.sh - `_ to format your +- We adhere to [Google Python style guide](https://google.github.io/styleguide/pyguide.html) and [Google C++ style guide](https://google.github.io/styleguide/cppguide.html). +- Pass all linter checks. Please use [format.sh](https://github.com/vllm-project/vllm/blob/main/format.sh) to format your code. - The code needs to be well-documented to ensure future contributors can easily understand the code. - Include sufficient tests to ensure the project stays correct and robust. This includes both unit tests and integration tests. -- Please add documentation to ``docs/source/`` if the PR modifies the +- Please add documentation to `docs/source/` if the PR modifies the user-facing behaviors of vLLM. 
It helps vLLM users understand and utilize the new features or changes. -Adding or Changing Kernels -^^^^^^^^^^^^^^^^^^^^^^^^^^ +### Adding or Changing Kernels Each custom kernel needs a schema and one or more implementations to be registered with PyTorch. - Make sure custom ops are registered following PyTorch guidelines: - `Custom C++ and CUDA Operators `_ - and `The Custom Operators Manual `_. -- Custom operations that return ``Tensors`` require meta-functions. + [Custom C++ and CUDA Operators](https://pytorch.org/tutorials/advanced/cpp_custom_ops.html#cpp-custom-ops-tutorial) + and [The Custom Operators Manual](https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU). +- Custom operations that return `Tensors` require meta-functions. Meta-functions should be implemented and registered in Python so that dynamic dims can be handled automatically. See above documents for a description of meta-functions. -- Use `torch.library.opcheck() `_ +- Use [torch.library.opcheck()](https://pytorch.org/docs/stable/library.html#torch.library.opcheck) to test the function registration and meta-function for any registered ops. - See ``tests/kernels`` for examples. + See `tests/kernels` for examples. - When changing the C++ signature of an existing op, the schema must be updated to reflect the changes. - If a new custom type is needed, see the following document: - `Custom Class Support in PT2 `_. + [Custom Class Support in PT2](https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA). -Notes for Large Changes -^^^^^^^^^^^^^^^^^^^^^^^ +### Notes for Large Changes Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag -it with ``rfc-required`` and might not go through the PR. +it with `rfc-required` and might not go through the PR. 
-What to Expect for the Reviews -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +### What to Expect for the Reviews The goal of the vLLM team is to be a *transparent reviewing machine*. We would like to make the review process transparent and efficient and make sure no @@ -150,15 +138,14 @@ review process: - After the PR is assigned, the reviewer will provide status updates every 2-3 days. If the PR is not reviewed within 7 days, please feel free to ping the reviewer or the vLLM team. -- After the review, the reviewer will put an ``action-required`` label on the PR +- After the review, the reviewer will put an `action-required` label on the PR if there are changes required. The contributor should address the comments and ping the reviewer to re-review the PR. - Please respond to all comments within a reasonable time frame. If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion. -Thank You ---------- +## Thank You Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. All of your contributions help make vLLM a great tool and community for everyone! diff --git a/docs/source/contributing/profiling/profiling_index.md b/docs/source/contributing/profiling/profiling_index.md new file mode 100644 index 0000000000000..04e01da556231 --- /dev/null +++ b/docs/source/contributing/profiling/profiling_index.md @@ -0,0 +1,41 @@ +# Profiling vLLM + +We support tracing vLLM workers using the `torch.profiler` module. You can enable tracing by setting the `VLLM_TORCH_PROFILER_DIR` environment variable to the directory where you want to save the traces: `VLLM_TORCH_PROFILER_DIR=/mnt/traces/` + +The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set. + +When using `benchmarks/benchmark_serving.py`, you can enable profiling by passing the `--profile` flag. + +```{warning} +Only enable profiling in a development environment. 
+``` + +Traces can be visualized using <https://ui.perfetto.dev/>. + +```{tip} +Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly. +``` + +```{tip} +To stop the profiler - it flushes out all the profile trace files to the directory. This takes time, for example for about 100 requests worth of data for a llama 70b, it takes about 10 minutes to flush out on a H100. +Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the server. Say something like 30 minutes. +`export VLLM_RPC_TIMEOUT=1800000` +``` + +## Example commands and usage: + +### Offline Inference: + +Refer to [examples/offline_inference_with_profiler.py](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_with_profiler.py) for an example. + +### OpenAI Server: + +```bash +VLLM_TORCH_PROFILER_DIR=./vllm_profile python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B +``` + +benchmark_serving.py: + +```bash +python benchmarks/benchmark_serving.py --backend vllm --model meta-llama/Meta-Llama-3-70B --dataset-name sharegpt --dataset-path sharegpt.json --profile --num-prompts 2 +``` diff --git a/docs/source/contributing/profiling/profiling_index.rst b/docs/source/contributing/profiling/profiling_index.rst deleted file mode 100644 index a422b1fcda521..0000000000000 --- a/docs/source/contributing/profiling/profiling_index.rst +++ /dev/null @@ -1,48 +0,0 @@ -============== -Profiling vLLM -============== - -We support tracing vLLM workers using the ``torch.profiler`` module. You can enable tracing by setting the ``VLLM_TORCH_PROFILER_DIR`` environment variable to the directory where you want to save the traces: ``VLLM_TORCH_PROFILER_DIR=/mnt/traces/`` - -The OpenAI server also needs to be started with the ``VLLM_TORCH_PROFILER_DIR`` environment variable set. - -When using ``benchmarks/benchmark_serving.py``, you can enable profiling by passing the ``--profile`` flag. - -..
warning:: - - Only enable profiling in a development environment. - - -Traces can be visualized using https://ui.perfetto.dev/. - -.. tip:: - - Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly. - -.. tip:: - - To stop the profiler - it flushes out all the profile trace files to the directory. This takes time, for example for about 100 requests worth of data for a llama 70b, it takes about 10 minutes to flush out on a H100. - Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the server. Say something like 30 minutes. - ``export VLLM_RPC_TIMEOUT=1800000`` - -Example commands and usage: -=========================== - -Offline Inference: ------------------- - -Refer to `examples/offline_inference_with_profiler.py `_ for an example. - - -OpenAI Server: --------------- - -.. code-block:: bash - - VLLM_TORCH_PROFILER_DIR=./vllm_profile python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B - -benchmark_serving.py: - -.. code-block:: bash - - python benchmarks/benchmark_serving.py --backend vllm --model meta-llama/Meta-Llama-3-70B --dataset-name sharegpt --dataset-path sharegpt.json --profile --num-prompts 2 \ No newline at end of file diff --git a/docs/source/design/arch_overview.rst b/docs/source/design/arch_overview.md similarity index 54% rename from docs/source/design/arch_overview.rst rename to docs/source/design/arch_overview.md index bc3f509f0a66e..a798ca3d2425c 100644 --- a/docs/source/design/arch_overview.rst +++ b/docs/source/design/arch_overview.md @@ -1,25 +1,24 @@ -.. _arch_overview: +(arch-overview)= -Architecture Overview -====================== +# Architecture Overview This document provides an overview of the vLLM architecture. -.. 
contents:: Table of Contents - :local: - :depth: 2 +```{contents} Table of Contents +:depth: 2 +:local: true +``` -Entrypoints ------------ +## Entrypoints vLLM provides a number of entrypoints for interacting with the system. The following diagram shows the relationship between them. -.. image:: /assets/design/arch_overview/entrypoints.excalidraw.png - :alt: Entrypoints Diagram +```{image} /assets/design/arch_overview/entrypoints.excalidraw.png +:alt: Entrypoints Diagram +``` -LLM Class -^^^^^^^^^ +### LLM Class The LLM class provides the primary Python interface for doing offline inference, which is interacting with a model without using a separate model inference @@ -27,75 +26,70 @@ server. Here is a sample of `LLM` class usage: -.. code-block:: python +```python +from vllm import LLM, SamplingParams - from vllm import LLM, SamplingParams +# Define a list of input prompts +prompts = [ + "Hello, my name is", + "The capital of France is", + "The largest ocean is", +] - # Define a list of input prompts - prompts = [ - "Hello, my name is", - "The capital of France is", - "The largest ocean is", - ] +# Define sampling parameters +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - # Define sampling parameters - sampling_params = SamplingParams(temperature=0.8, top_p=0.95) +# Initialize the LLM engine with the OPT-125M model +llm = LLM(model="facebook/opt-125m") - # Initialize the LLM engine with the OPT-125M model - llm = LLM(model="facebook/opt-125m") +# Generate outputs for the input prompts +outputs = llm.generate(prompts, sampling_params) - # Generate outputs for the input prompts - outputs = llm.generate(prompts, sampling_params) +# Print the generated outputs +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` - # Print the generated outputs - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - 
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -More API details can be found in the :doc:`Offline Inference +More API details can be found in the {doc}`Offline Inference ` section of the API docs. -The code for the `LLM` class can be found in `vllm/entrypoints/llm.py -`_. +The code for the `LLM` class can be found in [vllm/entrypoints/llm.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py). -OpenAI-compatible API server -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +### OpenAI-compatible API server The second primary interface to vLLM is via its OpenAI-compatible API server. This server can be started using the `vllm serve` command. -.. code-block:: bash - - vllm serve +```bash +vllm serve +``` -The code for the `vllm` CLI can be found in `vllm/scripts.py -`_. +The code for the `vllm` CLI can be found in [vllm/scripts.py](https://github.com/vllm-project/vllm/blob/main/vllm/scripts.py). Sometimes you may see the API server entrypoint used directly instead of via the `vllm` CLI command. For example: -.. code-block:: bash - - python -m vllm.entrypoints.openai.api_server --model +```bash +python -m vllm.entrypoints.openai.api_server --model +``` -That code can be found in `vllm/entrypoints/openai/api_server.py -`_. +That code can be found in [vllm/entrypoints/openai/api_server.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/openai/api_server.py). -More details on the API server can be found in the :doc:`OpenAI Compatible +More details on the API server can be found in the {doc}`OpenAI Compatible Server ` document. -LLM Engine ----------- +## LLM Engine The `LLMEngine` and `AsyncLLMEngine` classes are central to the functioning of the vLLM system, handling model inference and asynchronous request processing. -.. 
image:: /assets/design/arch_overview/llm_engine.excalidraw.png - :alt: LLMEngine Diagram +```{image} /assets/design/arch_overview/llm_engine.excalidraw.png +:alt: LLMEngine Diagram +``` -LLMEngine -^^^^^^^^^ +### LLMEngine The `LLMEngine` class is the core component of the vLLM engine. It is responsible for receiving requests from clients and generating outputs from the @@ -105,21 +99,15 @@ processing. - **Input Processing**: Handles tokenization of input text using the specified tokenizer. - - **Scheduling**: Chooses which requests are processed in each step. - - **Model Execution**: Manages the execution of the language model, including distributed execution across multiple GPUs. - - **Output Processing**: Processes the outputs generated by the model, decoding the token IDs from a language model into human-readable text. -The code for `LLMEngine` can be found in `vllm/engine/llm_engine.py`_. - -.. _vllm/engine/llm_engine.py: https://github.com/vllm-project/vllm/tree/main/vllm/engine/llm_engine.py +The code for `LLMEngine` can be found in [vllm/engine/llm_engine.py]. -AsyncLLMEngine -^^^^^^^^^^^^^^ +### AsyncLLMEngine The `AsyncLLMEngine` class is an asynchronous wrapper for the `LLMEngine` class. It uses `asyncio` to create a background loop that continuously processes @@ -128,54 +116,46 @@ can handle multiple concurrent requests and stream outputs to clients. The OpenAI-compatible API server uses the `AsyncLLMEngine`. There is also a demo API server that serves as a simpler example in -`vllm/entrypoints/api_server.py`_. - -.. _vllm/entrypoints/api_server.py: https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/api_server.py +[vllm/entrypoints/api_server.py]. -The code for `AsyncLLMEngine` can be found in `vllm/engine/async_llm_engine.py`_. +The code for `AsyncLLMEngine` can be found in [vllm/engine/async_llm_engine.py]. -.. 
_vllm/engine/async_llm_engine.py: https://github.com/vllm-project/vllm/tree/main/vllm/engine/async_llm_engine.py - -Worker ------- +## Worker A worker is a process that runs the model inference. vLLM follows the common practice of using one process to control one accelerator device, such as GPUs. For example, if we use tensor parallelism of size 2 and pipeline parallelism of size 2, we will have 4 workers in total. Workers are identified by their -``rank`` and ``local_rank``. ``rank`` is used for global orchestration, while -``local_rank`` is mainly used for assigning the accelerator device and accessing +`rank` and `local_rank`. `rank` is used for global orchestration, while +`local_rank` is mainly used for assigning the accelerator device and accessing local resources such as the file system and shared memory. -Model Runner ------------- +## Model Runner Every worker has one model runner object, responsible for loading and running the model. Much of the model execution logic resides here, such as preparing input tensors and capturing cudagraphs. -Model ------ +## Model Every model runner object has one model object, which is the actual -``torch.nn.Module`` instance. See :ref:`huggingface_integration` for how various +`torch.nn.Module` instance. See {ref}`huggingface_integration` for how various configurations affect the class we ultimately get. -Class Hierarchy ---------------- +## Class Hierarchy The following figure shows the class hierarchy of vLLM: - .. figure:: /assets/design/hierarchy.png - :alt: query - :width: 100% - :align: center +> ```{figure} /assets/design/hierarchy.png +> :align: center +> :alt: query +> :width: 100% +> ``` There are several important design choices behind this class hierarchy: -1. **Extensibility**: All classes in the hierarchy accept a configuration object -containing all the necessary information. The `VllmConfig -`__ +1\. 
**Extensibility**: All classes in the hierarchy accept a configuration object +containing all the necessary information. The [VllmConfig](https://github.com/vllm-project/vllm/blob/d1c6799b8870e513bf4f2305cbf6cda9fc3d773b/vllm/config.py#L2036) class is the main configuration object that is passed around. The class hierarchy is quite deep, and every class needs to read the configuration it is interested in. By encapsulating all configurations in one object, we can easily @@ -188,7 +168,7 @@ the `VllmConfig` class, and the model runner can access it directly. We don't need to change the constructor of the engine, worker, or model class to pass the new configuration option. -2. **Uniformity**: The model runner needs a unified interface to create and +2\. **Uniformity**: The model runner needs a unified interface to create and initialize the model. vLLM supports more than 50 types of popular open-source models. Each model has its own initialization logic. If the constructor signature varies with models, the model runner does not know how to call the @@ -200,46 +180,46 @@ of a vision model and a language model. By making the constructor uniform, we can easily create a vision model and a language model and compose them into a vision-language model. -.. note:: - - To support this change, all vLLM models' signatures have been updated to: - - .. code-block:: python - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - - To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one: - - .. 
code-block:: python - - class MyOldModel(nn.Module): - def __init__( - self, - config, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ) -> None: - ... - - from vllm.config import VllmConfig - class MyNewModel(MyOldModel): - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config - super().__init__(config, cache_config, quant_config, lora_config, prefix) - - if __version__ >= "0.6.4": - MyModel = MyNewModel - else: - MyModel = MyOldModel - - This way, the model can work with both old and new versions of vLLM. - -3. **Sharding and Quantization at Initialization**: Certain features require +````{note} +To support this change, all vLLM models' signatures have been updated to: + +```python +def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): +``` + +To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one: + +```python +class MyOldModel(nn.Module): + def __init__( + self, + config, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + prefix: str = "", + ) -> None: + ... 
+ +from vllm.config import VllmConfig +class MyNewModel(MyOldModel): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + super().__init__(config, cache_config, quant_config, lora_config, prefix) + +if __version__ >= "0.6.4": + MyModel = MyNewModel +else: + MyModel = MyOldModel +``` + +This way, the model can work with both old and new versions of vLLM. +```` + +3\. **Sharding and Quantization at Initialization**: Certain features require changing the model weights. For example, tensor parallelism needs to shard the model weights, and quantization needs to quantize the model weights. There are two possible ways to implement this feature. One way is to change the model @@ -252,23 +232,27 @@ initialized, we need to load the full 810GB weights to every GPU and then shard the weights, leading to a huge memory overhead. Instead, if we shard the weights during the model initialization, every layer will only create a shard of the weights it needs, leading to a much smaller memory overhead. The same idea -applies to quantization. Note that we also add an additional argument ``prefix`` +applies to quantization. Note that we also add an additional argument `prefix` to the model's constructor so that the model can initialize itself differently based on the prefix. This is useful for non-uniform quantization, where -different parts of the model are quantized differently. The ``prefix`` is -usually an empty string for the top-level model and a string like ``"vision"`` -or ``"language"`` for the sub-models. In general, it matches the name of the +different parts of the model are quantized differently. The `prefix` is +usually an empty string for the top-level model and a string like `"vision"` +or `"language"` for the sub-models. 
In general, it matches the name of the module's state dict in the checkpoint file. One disadvantage of this design is that it is hard to write unit tests for individual components in vLLM because every component needs to be initialized by a complete config object. We solve this problem by providing a default initialization function that creates a default config object with all fields set -to ``None``. If the component we want to test only cares about a few fields in +to `None`. If the component we want to test only cares about a few fields in the config object, we can create a default config object and set the fields we care about. This way, we can test the component in isolation. Note that many tests in vLLM are end-to-end tests that test the whole system, so this is not a big problem. -In summary, the complete config object ``VllmConfig`` can be treated as an +In summary, the complete config object `VllmConfig` can be treated as an engine-level global state that is shared among all vLLM classes. + +[vllm/engine/async_llm_engine.py]: https://github.com/vllm-project/vllm/tree/main/vllm/engine/async_llm_engine.py +[vllm/engine/llm_engine.py]: https://github.com/vllm-project/vllm/tree/main/vllm/engine/llm_engine.py +[vllm/entrypoints/api_server.py]: https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/api_server.py diff --git a/docs/source/design/huggingface_integration.md b/docs/source/design/huggingface_integration.md new file mode 100644 index 0000000000000..99b4cb56424c6 --- /dev/null +++ b/docs/source/design/huggingface_integration.md @@ -0,0 +1,36 @@ +(huggingface-integration)= + +# Integration with HuggingFace + +This document describes how vLLM integrates with HuggingFace libraries. We will explain step by step what happens under the hood when we run `vllm serve`. + +Let's say we want to serve the popular QWen model by running `vllm serve Qwen/Qwen2-7B`. + +1. The `model` argument is `Qwen/Qwen2-7B`. 
vLLM determines whether this model exists by checking for the corresponding config file `config.json`. See this [code snippet](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L162-L182) for the implementation. Within this process: + + - If the `model` argument corresponds to an existing local path, vLLM will load the config file directly from this path. + - If the `model` argument is a HuggingFace model ID consisting of a username and model name, vLLM will first try to use the config file from the HuggingFace local cache, using the `model` argument as the model name and the `--revision` argument as the revision. See [their website](https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hfhome) for more information on how the HuggingFace cache works. + - If the `model` argument is a HuggingFace model ID but it is not found in the cache, vLLM will download the config file from the HuggingFace model hub. Refer to [this function](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L91) for the implementation. The input arguments include the `model` argument as the model name, the `--revision` argument as the revision, and the environment variable `HF_TOKEN` as the token to access the model hub. In our case, vLLM will download the [config.json](https://huggingface.co/Qwen/Qwen2-7B/blob/main/config.json) file. + +2. After confirming the existence of the model, vLLM loads its config file and converts it into a dictionary. See this [code snippet](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L185-L186) for the implementation. + +3. 
Next, vLLM [inspects](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L189) the `model_type` field in the config dictionary to [generate](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L190-L216) the config object to use. There are some `model_type` values that vLLM directly supports; see [here](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L48) for the list. If the `model_type` is not in the list, vLLM will use [AutoConfig.from_pretrained](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoConfig.from_pretrained) to load the config class, with `model`, `--revision`, and `--trust_remote_code` as the arguments. Please note that: + + - HuggingFace also has its own logic to determine the config class to use. It will again use the `model_type` field to search for the class name in the transformers library; see [here](https://github.com/huggingface/transformers/tree/main/src/transformers/models) for the list of supported models. If the `model_type` is not found, HuggingFace will use the `auto_map` field from the config JSON file to determine the class name. Specifically, it is the `AutoConfig` field under `auto_map`. See [DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-V2.5/blob/main/config.json) for an example. + - The `AutoConfig` field under `auto_map` points to a module path in the model's repository. To create the config class, HuggingFace will import the module and use the `from_pretrained` method to load the config class. This can generally cause arbitrary code execution, so it is only executed when `--trust_remote_code` is enabled. + +4. Subsequently, vLLM applies some historical patches to the config object. 
These are mostly related to RoPE configuration; see [here](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/config.py#L244) for the implementation. + +5. Finally, vLLM can reach the model class we want to initialize. vLLM uses the `architectures` field in the config object to determine the model class to initialize, as it maintains the mapping from architecture name to model class in [its registry](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/model_executor/models/registry.py#L80). If the architecture name is not found in the registry, it means this model architecture is not supported by vLLM. For `Qwen/Qwen2-7B`, the `architectures` field is `["Qwen2ForCausalLM"]`, which corresponds to the `Qwen2ForCausalLM` class in [vLLM's code](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/model_executor/models/qwen2.py#L364). This class will initialize itself depending on various configs. + +Beyond that, there are two more things vLLM depends on HuggingFace for. + +1. **Tokenizer**: vLLM uses the tokenizer from HuggingFace to tokenize the input text. The tokenizer is loaded using [AutoTokenizer.from_pretrained](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoTokenizer.from_pretrained) with the `model` argument as the model name and the `--revision` argument as the revision. It is also possible to use a tokenizer from another model by specifying the `--tokenizer` argument in the `vllm serve` command. Other relevant arguments are `--tokenizer-revision` and `--tokenizer-mode`. Please check HuggingFace's documentation for the meaning of these arguments. This part of the logic can be found in the [get_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L87) function. 
After obtaining the tokenizer, notably, vLLM will cache some expensive attributes of the tokenizer in [get_cached_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L24). + +2. **Model weight**: vLLM downloads the model weight from the HuggingFace model hub using the `model` argument as the model name and the `--revision` argument as the revision. vLLM provides the argument `--load-format` to control what files to download from the model hub. By default, it will try to load the weights in the safetensors format and fall back to the PyTorch bin format if the safetensors format is not available. We can also pass `--load-format dummy` to skip downloading the weights. + + - It is recommended to use the safetensors format, as it is efficient for loading in distributed inference and also safe from arbitrary code execution. See the [documentation](https://huggingface.co/docs/safetensors/en/index) for more information on the safetensors format. This part of the logic can be found [here](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/model_executor/model_loader/loader.py#L385). + +This completes the integration between vLLM and HuggingFace. + +In summary, vLLM reads the config file `config.json`, tokenizer, and model weight from the HuggingFace model hub or a local directory. It uses the config class from either vLLM, HuggingFace transformers, or loads the config class from the model's repository. diff --git a/docs/source/design/huggingface_integration.rst b/docs/source/design/huggingface_integration.rst deleted file mode 100644 index e6c1cea6001ea..0000000000000 --- a/docs/source/design/huggingface_integration.rst +++ /dev/null @@ -1,40 +0,0 @@ -.. _huggingface_integration: - -Integration with HuggingFace -=================================== - -This document describes how vLLM integrates with HuggingFace libraries. 
We will explain step by step what happens under the hood when we run ``vllm serve``. - -Let's say we want to serve the popular QWen model by running ``vllm serve Qwen/Qwen2-7B``. - -1. The ``model`` argument is ``Qwen/Qwen2-7B``. vLLM determines whether this model exists by checking for the corresponding config file ``config.json``. See this `code snippet `__ for the implementation. Within this process: - - - If the ``model`` argument corresponds to an existing local path, vLLM will load the config file directly from this path. - - - If the ``model`` argument is a HuggingFace model ID consisting of a username and model name, vLLM will first try to use the config file from the HuggingFace local cache, using the ``model`` argument as the model name and the ``--revision`` argument as the revision. See `their website `__ for more information on how the HuggingFace cache works. - - - If the ``model`` argument is a HuggingFace model ID but it is not found in the cache, vLLM will download the config file from the HuggingFace model hub. Refer to `this function `__ for the implementation. The input arguments include the ``model`` argument as the model name, the ``--revision`` argument as the revision, and the environment variable ``HF_TOKEN`` as the token to access the model hub. In our case, vLLM will download the `config.json `__ file. - -2. After confirming the existence of the model, vLLM loads its config file and converts it into a dictionary. See this `code snippet `__ for the implementation. - -3. Next, vLLM `inspects `__ the ``model_type`` field in the config dictionary to `generate `__ the config object to use. There are some ``model_type`` values that vLLM directly supports; see `here `__ for the list. If the ``model_type`` is not in the list, vLLM will use `AutoConfig.from_pretrained `__ to load the config class, with ``model``, ``--revision``, and ``--trust_remote_code`` as the arguments. 
Please note that: - - - HuggingFace also has its own logic to determine the config class to use. It will again use the ``model_type`` field to search for the class name in the transformers library; see `here `__ for the list of supported models. If the ``model_type`` is not found, HuggingFace will use the ``auto_map`` field from the config JSON file to determine the class name. Specifically, it is the ``AutoConfig`` field under ``auto_map``. See `DeepSeek `__ for an example. - - - The ``AutoConfig`` field under ``auto_map`` points to a module path in the model's repository. To create the config class, HuggingFace will import the module and use the ``from_pretrained`` method to load the config class. This can generally cause arbitrary code execution, so it is only executed when ``--trust_remote_code`` is enabled. - -4. Subsequently, vLLM applies some historical patches to the config object. These are mostly related to RoPE configuration; see `here `__ for the implementation. - -5. Finally, vLLM can reach the model class we want to initialize. vLLM uses the ``architectures`` field in the config object to determine the model class to initialize, as it maintains the mapping from architecture name to model class in `its registry `__. If the architecture name is not found in the registry, it means this model architecture is not supported by vLLM. For ``Qwen/Qwen2-7B``, the ``architectures`` field is ``["Qwen2ForCausalLM"]``, which corresponds to the ``Qwen2ForCausalLM`` class in `vLLM's code `__. This class will initialize itself depending on various configs. - -Beyond that, there are two more things vLLM depends on HuggingFace for. - -1. **Tokenizer**: vLLM uses the tokenizer from HuggingFace to tokenize the input text. The tokenizer is loaded using `AutoTokenizer.from_pretrained `__ with the ``model`` argument as the model name and the ``--revision`` argument as the revision. 
It is also possible to use a tokenizer from another model by specifying the ``--tokenizer`` argument in the ``vllm serve`` command. Other relevant arguments are ``--tokenizer-revision`` and ``--tokenizer-mode``. Please check HuggingFace's documentation for the meaning of these arguments. This part of the logic can be found in the `get_tokenizer `__ function. After obtaining the tokenizer, notably, vLLM will cache some expensive attributes of the tokenizer in `get_cached_tokenizer `__. - -2. **Model weight**: vLLM downloads the model weight from the HuggingFace model hub using the ``model`` argument as the model name and the ``--revision`` argument as the revision. vLLM provides the argument ``--load-format`` to control what files to download from the model hub. By default, it will try to load the weights in the safetensors format and fall back to the PyTorch bin format if the safetensors format is not available. We can also pass ``--load-format dummy`` to skip downloading the weights. - - - It is recommended to use the safetensors format, as it is efficient for loading in distributed inference and also safe from arbitrary code execution. See the `documentation `__ for more information on the safetensors format. This part of the logic can be found `here `__. Please note that: - -This completes the integration between vLLM and HuggingFace. - -In summary, vLLM reads the config file ``config.json``, tokenizer, and model weight from the HuggingFace model hub or a local directory. It uses the config class from either vLLM, HuggingFace transformers, or loads the config class from the model's repository. diff --git a/docs/source/design/input_processing/input_processing_pipeline.md b/docs/source/design/input_processing/input_processing_pipeline.md new file mode 100644 index 0000000000000..bb16920e3d0c0 --- /dev/null +++ b/docs/source/design/input_processing/input_processing_pipeline.md @@ -0,0 +1,19 @@ +(input-processing-pipeline)= + +# Input Processing Pipeline + +1. 
Input data is passed to {class}`~vllm.LLMEngine` (or {class}`~vllm.AsyncLLMEngine`). + +2. Tokenize the data if necessary. + +3. Process the inputs using {meth}`INPUT_REGISTRY.process_input `. + + - For example, add placeholder tokens to reserve KV cache for multi-modal embeddings. + +4. Send the processed inputs to {class}`~vllm.executor.executor_base.ExecutorBase`. + +5. Distribute the inputs via {class}`~vllm.worker.worker_base.WorkerBase` to {class}`~vllm.worker.model_runner_base.ModelRunnerBase`. + +6. If the data contains multi-modal data, convert it into keyword arguments using {meth}`MULTIMODAL_REGISTRY.map_input `. + + - For example, convert a {class}`PIL.Image.Image` input to its pixel values for a vision model. diff --git a/docs/source/design/input_processing/input_processing_pipeline.rst b/docs/source/design/input_processing/input_processing_pipeline.rst deleted file mode 100644 index 48abec8f75286..0000000000000 --- a/docs/source/design/input_processing/input_processing_pipeline.rst +++ /dev/null @@ -1,20 +0,0 @@ -.. _input_processing_pipeline: - -Input Processing Pipeline -========================= - -1. Input data is passed to :class:`~vllm.LLMEngine` (or :class:`~vllm.AsyncLLMEngine`). - -2. Tokenize the data if necessary. - -3. Process the inputs using :meth:`INPUT_REGISTRY.process_input `. - - - For example, add placeholder tokens to reserve KV cache for multi-modal embeddings. - -4. Send the processed inputs to :class:`~vllm.executor.executor_base.ExecutorBase`. - -5. Distribute the inputs via :class:`~vllm.worker.worker_base.WorkerBase` to :class:`~vllm.worker.model_runner_base.ModelRunnerBase`. - -6. If the data contains multi-modal data, convert it into keyword arguments using :meth:`MULTIMODAL_REGISTRY.map_input `. - - - For example, convert a :class:`PIL.Image.Image` input to its pixel values for a vision model. 
diff --git a/docs/source/design/input_processing/model_inputs_index.md b/docs/source/design/input_processing/model_inputs_index.md new file mode 100644 index 0000000000000..3ef8d8878a47b --- /dev/null +++ b/docs/source/design/input_processing/model_inputs_index.md @@ -0,0 +1,43 @@ +(input-processing)= + +# Input Processing + +```{eval-rst} +.. currentmodule:: vllm.inputs +``` + +Each model can override parts of vLLM's {ref}`input processing pipeline ` via +{data}`~vllm.inputs.INPUT_REGISTRY` and {data}`~vllm.multimodal.MULTIMODAL_REGISTRY`. + +Currently, this mechanism is only utilized in {ref}`multi-modal ` models for preprocessing multi-modal input +data in addition to input prompt, but it can be extended to text-only language models when needed. + +## Guides + +```{toctree} +:maxdepth: 1 + +input_processing_pipeline +``` + +## Module Contents + +### LLM Engine Inputs + +```{eval-rst} +.. autoclass:: vllm.inputs.DecoderOnlyInputs + :members: + :show-inheritance: +``` + +### Registry + +```{eval-rst} +.. autodata:: vllm.inputs.INPUT_REGISTRY +``` + +```{eval-rst} +.. automodule:: vllm.inputs.registry + :members: + :show-inheritance: +``` diff --git a/docs/source/design/input_processing/model_inputs_index.rst b/docs/source/design/input_processing/model_inputs_index.rst deleted file mode 100644 index f0ec1fea15ddb..0000000000000 --- a/docs/source/design/input_processing/model_inputs_index.rst +++ /dev/null @@ -1,39 +0,0 @@ -.. _input_processing: - -Input Processing -================ - -.. currentmodule:: vllm.inputs - -Each model can override parts of vLLM's :ref:`input processing pipeline ` via -:data:`~vllm.inputs.INPUT_REGISTRY` and :data:`~vllm.multimodal.MULTIMODAL_REGISTRY`. - -Currently, this mechanism is only utilized in :ref:`multi-modal ` models for preprocessing multi-modal input -data in addition to input prompt, but it can be extended to text-only language models when needed. - -Guides -++++++ - -.. 
toctree:: - :maxdepth: 1 - - input_processing_pipeline - -Module Contents -+++++++++++++++ - -LLM Engine Inputs ------------------ - -.. autoclass:: vllm.inputs.DecoderOnlyInputs - :members: - :show-inheritance: - -Registry --------- - -.. autodata:: vllm.inputs.INPUT_REGISTRY - -.. automodule:: vllm.inputs.registry - :members: - :show-inheritance: diff --git a/docs/source/design/kernel/paged_attention.md b/docs/source/design/kernel/paged_attention.md new file mode 100644 index 0000000000000..c21985b36eb3a --- /dev/null +++ b/docs/source/design/kernel/paged_attention.md @@ -0,0 +1,527 @@ +# vLLM Paged Attention + +- Currently, vLLM utilizes its own implementation of a multi-head query + attention kernel (`csrc/attention/attention_kernels.cu`). + This kernel is designed to be compatible with + vLLM's paged KV caches, where the key and value cache are stored in + separate blocks (note that this block concept differs from the GPU + thread block. So in a later document, I will refer to vLLM paged + attention block as "block", while refer to GPU thread block as + "thread block"). +- To achieve high performance, this kernel relies on a specially + designed memory layout and access method, specifically when threads + read data from global memory to shared memory. The purpose of this + document is to provide a high-level explanation of the kernel + implementation step by step, aiding those who wish to learn about the + vLLM multi-head query attention kernel. After going through this + document, users will likely have a better understanding and feel easier + to follow the actual implementation. +- Please note that this document may not cover all details, such as how + to calculate the correct index for the corresponding data or the dot + multiplication implementation. However, after reading this document + and becoming familiar with the high-level logic flow, it should be + easier for you to read the actual code and understand the details. 
+ +## Inputs + +- The kernel function takes a list of arguments for the current thread + to perform its assigned work. The three most important arguments are + the input pointers `q`, `k_cache`, and `v_cache`, which point + to query, key, and value data on global memory that need to be read + and processed. The output pointer `out` points to global memory + where the result should be written. These four pointers actually + refer to multi-dimensional arrays, but each thread only accesses the + portion of data assigned to it. I have omitted all other runtime + parameters here for simplicity. + + ```cpp + template< + typename scalar_t, + int HEAD_SIZE, + int BLOCK_SIZE, + int NUM_THREADS, + int PARTITION_SIZE = 0> + __device__ void paged_attention_kernel( + ... // Other side args. + const scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, head_size] + const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] + const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x] + const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size] + ... // Other side args. + ) + ``` + +- There are also a list of template arguments above the function + signature that are determined during compilation time. `scalar_t` + represents the data type of the query, key, and value data elements, + such as FP16. `HEAD_SIZE` indicates the number of elements in each + head. `BLOCK_SIZE` refers to the number of tokens in each block. + `NUM_THREADS` denotes the number of threads in each thread block. + `PARTITION_SIZE` represents the number of tensor parallel GPUs (For + simplicity, we assume this is 0 and tensor parallel is disabled). + +- With these arguments, we need to perform a sequence of preparations. + This includes calculating the current head index, block index, and + other necessary variables. 
However, for now, we can ignore these + preparations and proceed directly to the actual calculations. It will + be easier to understand them once we grasp the entire flow. + +## Concepts + +- Just before we dive into the calculation flow, I want to describe a + few concepts that are needed for later sections. However, you may + skip this section and return later if you encounter any confusing + terminologies. +- **Sequence**: A sequence represents a client request. For example, + the data pointed to by `q` has a shape of + `[num_seqs, num_heads, head_size]`. This means that a total of + `num_seqs` query sequences are pointed to by `q`. Since this + kernel is a single query attention kernel, each sequence only has one + query token. Hence, the `num_seqs` equals the total number of tokens + that are processed in the batch. +- **Context**: The context consists of the generated tokens from the + sequence. For instance, `["What", "is", "your"]` are the context + tokens, and the input query token is `"name"`. The model might + generate the token `"?"`. +- **Vec**: The vec is a list of elements that are fetched and + calculated together. For query and key data, the vec size + (`VEC_SIZE`) is determined so that each thread group can fetch and + calculate 16 bytes of data at a time. For value data, the vec size + (`V_VEC_SIZE`) is determined so that each thread can fetch and + calculate 16 bytes of data at a time. For example, if the + `scalar_t` is FP16 (2 bytes) and `THREAD_GROUP_SIZE` is 2, the + `VEC_SIZE` will be 4, while the `V_VEC_SIZE` will be 8. +- **Thread group**: The thread group is a small group of + threads(`THREAD_GROUP_SIZE`) that fetches and calculates one + query token and one key token at a time. Each thread handles only a + portion of the token data. The total number of elements processed by + one thread group is referred to as `x`. 
For example, if the thread + group contains 2 threads and the head size is 8, then thread 0 + handles the query and key elements at index 0, 2, 4, 6, while thread + 1 handles the elements at index 1, 3, 5, 7. +- **Block**: The key and value cache data in vLLM are split into + blocks. Each block stores data for a fixed number(`BLOCK_SIZE`) + of tokens at one head. Each block may contain only a portion of the + whole context tokens. For example, if the block size is 16 and the + head size is 128, then for one head, one block can store 16 * 128 = + 2048 elements. +- **Warp**: A warp is a group of 32 threads(`WARP_SIZE`) that + execute simultaneously on a stream multiprocessor (SM). In this + kernel, each warp processes the calculation between one query token + and key tokens of one entire block at a time (it may process multiple + blocks in multiple iterations). For example, if there are 4 warps and + 6 blocks for one context, the assignment would be like warp 0 handles + the 0th, 4th blocks, warp 1 handles the 1st, 5th blocks, warp 2 + handles the 2nd block and warp 3 handles the 3rd block. +- **Thread block**: A thread block is a group of + threads(`NUM_THREADS`) that can access the same shared memory. + Each thread block contains multiple warps(`NUM_WARPS`), and in + this kernel, each thread block processes the calculation between one + query token and key tokens of a whole context. +- **Grid**: A grid is a collection of thread blocks and defines the + shape of the collection. In this kernel, the shape is + `(num_heads, num_seqs, max_num_partitions)`. Therefore, each thread + block only handles the calculation for one head, one sequence, and + one partition. + +## Query + +- This section will introduce how query data is stored in memory and + fetched by each thread. As mentioned above, each thread group fetches + one query token data, while each thread itself only handles a part of + one query token data. 
Within each warp, every thread group will fetch + the same query token data, but will multiply it with different key + token data. + + ```cpp + const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE; + ``` + + ```{figure} ../../assets/kernel/query.png + :align: center + :alt: query + :width: 70% + + Query data of one token at one head + ``` + +- Each thread defines its own `q_ptr` which points to the assigned + query token data on global memory. For example, if `VEC_SIZE` is 4 + and `HEAD_SIZE` is 128, the `q_ptr` points to data that contains + total of 128 elements divided into 128 / 4 = 32 vecs. + + ```{figure} ../../assets/kernel/q_vecs.png + :align: center + :alt: q_vecs + :width: 70% + + `q_vecs` for one thread group + ``` + + ```cpp + __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD]; + ``` + +- Next, we need to read the global memory data pointed to by `q_ptr` + into shared memory as `q_vecs`. It is important to note that each + vecs is assigned to a different row. For example, if the + `THREAD_GROUP_SIZE` is 2, thread 0 will handle the 0th row vecs, + while thread 1 handles the 1st row vecs. By reading the query data in + this way, neighboring threads like thread 0 and thread 1 can read + neighbor memory, achieving the memory coalescing to improve + performance. + +## Key + +- Similar to the "Query" section, this section introduces memory layout + and assignment for keys. While each thread group only handle one + query token one kernel run, it may handle multiple key tokens across + multiple iterations. Meanwhile, each warp will process multiple blocks + of key tokens in multiple iterations, ensuring that all context + tokens are processed by the entire thread group after the kernel run. + In this context, "handle" refers to performing the dot multiplication + between query data and key data. 
+
+  ```cpp
+  const scalar_t* k_ptr = k_cache + physical_block_number * kv_block_stride
+                          + kv_head_idx * kv_head_stride
+                          + physical_block_offset * x;
+  ```
+
+- Unlike `q_ptr`, `k_ptr` in each thread will point to a different
+  key token at different iterations. As shown above, `k_ptr`
+  points to key token data based on `k_cache` at the assigned block,
+  assigned head and assigned token.
+
+  ```{figure} ../../assets/kernel/key.png
+  :align: center
+  :alt: key
+  :width: 70%
+
+  Key data of all context tokens at one head
+  ```
+
+- The diagram above illustrates the memory layout for key data. It
+  assumes that the `BLOCK_SIZE` is 16, `HEAD_SIZE` is 128, `x` is
+  8, `THREAD_GROUP_SIZE` is 2, and there are a total of 4 warps. Each
+  rectangle represents all the elements for one key token at one head,
+  which will be processed by one thread group. The left half shows the
+  total 16 blocks of key token data for warp 0, while the right half
+  represents the remaining key token data for other warps or
+  iterations. Inside each rectangle, there are a total of 32 vecs (128
+  elements for one token) that will be processed by 2 threads (one
+  thread group) separately.
+
+  ```{figure} ../../assets/kernel/k_vecs.png
+  :align: center
+  :alt: k_vecs
+  :width: 70%
+
+  `k_vecs` for one thread
+  ```
+
+  ```cpp
+  K_vec k_vecs[NUM_VECS_PER_THREAD]
+  ```
+
+- Next, we need to read the key token data from `k_ptr` and store
+  them in register memory as `k_vecs`. We use register memory for
+  `k_vecs` because it will only be accessed by one thread once,
+  whereas `q_vecs` will be accessed by multiple threads multiple
+  times. Each `k_vecs` will contain multiple vectors for later
+  calculation. Each vec will be set at each inner iteration. The
+  assignment of vecs allows neighboring threads in a warp to read
+  neighboring memory together, which again promotes the memory
+  coalescing. For instance, thread 0 will read vec 0, while thread 1
+  will read vec 1. 
In the next inner loop, thread 0 will read vec 2,
+  while thread 1 will read vec 3, and so on.
+
+- You may still be a little confused about the overall flow. Don't
+  worry, please keep reading the next "QK" section. It will illustrate
+  the query and key calculation flow in a clearer and higher-level
+  manner.
+
+## QK
+
+- As shown in the pseudo code below, before the entire for loop block, we
+  fetch the query data for one token and store it in `q_vecs`. Then,
+  in the outer for loop, we iterate through different `k_ptrs` that
+  point to different tokens and prepare the `k_vecs` in the inner for
+  loop. Finally, we perform the dot multiplication between the
+  `q_vecs` and each `k_vecs`.
+
+  ```cpp
+  q_vecs = ...
+  for ... {
+      k_ptr = ...
+      for ... {
+          k_vecs[i] = ...
+      }
+      ...
+      float qk = scale * Qk_dot::dot(q_vecs[thread_group_offset], k_vecs);
+  }
+  ```
+
+- As mentioned before, for each thread, it only fetches part of the
+  query and key token data at a time. However, a cross thread group
+  reduction will happen in the `Qk_dot<>::dot`. So the `qk`
+  returned here is not just the dot multiplication between part of the
+  query and key token data, but actually the full result between the
+  entire query and key token data.
+
+- For example, if the value of `HEAD_SIZE` is 128 and
+  `THREAD_GROUP_SIZE` is 2, each thread's `k_vecs` will contain a
+  total of 64 elements. However, the returned `qk` is actually the
+  result of dot multiplication between 128 query elements and 128 key
+  elements. If you want to learn more about the details of the dot
+  multiplication and reduction, you may refer to the implementation of
+  `Qk_dot<>::dot`. However, for the sake of simplicity, I will not
+  cover it in this document.
+
+## Softmax
+
+- Next, we need to calculate the normalized softmax for all `qk`s,
+  as shown above, where each $x$ represents a `qk`. To do this,
+  we must obtain the reduced value of `qk_max`($m(x)$) and
+  the `exp_sum`($\ell(x)$) of all `qk`s. 
The reduction + should be performed across the entire thread block, encompassing + results between the query token and all context key tokens. + + ```{math} + :nowrap: true + + \begin{gather*} + m(x):=\max _i \quad x_i \\ \quad f(x):=\left[\begin{array}{lll}e^{x_1-m(x)} & \ldots & e^{x_B-m(x)}\end{array}\right]\\ \quad \ell(x):=\sum_i f(x)_i \\ + \quad \operatorname{softmax}(x):=\frac{f(x)}{\ell(x)} + \end{gather*} + ``` + +### `qk_max` and `logits` + +- Just right after we get the `qk` result, we can set the temporary + `logits` result with `qk` (In the end, the `logits` should + store the normalized softmax result). Also we can compare and collect + the `qk_max` for all `qk`s that are calculated by current + thread group. + + ```cpp + if (thread_group_offset == 0) { + const bool mask = token_idx >= context_len; + logits[token_idx - start_token_idx] = mask ? 0.f : qk; + qk_max = mask ? qk_max : fmaxf(qk_max, qk); + } + ``` + +- Please note that the `logits` here is on shared memory, so each + thread group will set the fields for its own assigned context tokens. + Overall, the size of logits should be number of context tokens. + + ```cpp + for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) { + qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask)); + } + + if (lane == 0) { + red_smem[warp_idx] = qk_max; + } + ``` + +- Then we need to get the reduced `qk_max` across each warp. The main + idea is to make threads in warp to communicate with each other and + get the final max `qk` . + + ```cpp + for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) { + qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask)); + } + qk_max = VLLM_SHFL_SYNC(qk_max, 0); + ``` + +- Finally, we can get the reduced `qk_max` from whole thread block by + compare the `qk_max` from all warps in this thread block. Then we + need to broadcast the final result to each thread. 
+ +### `exp_sum` + +- Similar to `qk_max`, we need to get the reduced sum value from the + entire thread block too. + + ```cpp + for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { + float val = __expf(logits[i] - qk_max); + logits[i] = val; + exp_sum += val; + } + ... + exp_sum = block_sum(&red_smem[NUM_WARPS], exp_sum); + ``` + +- Firstly, sum all exp values from each thread group, and meanwhile, + convert each entry of `logits` from `qk` to `exp(qk - qk_max)`. + Please note, the `qk_max` here is already the max `qk` across the + whole thread block. And then we can do reduction for `exp_sum` + across whole thread block just like the `qk_max`. + + ```cpp + const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f); + for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { + logits[i] *= inv_sum; + } + ``` + +- Finally, with the reduced `qk_max` and `exp_sum`, we can obtain + the final normalized softmax result as `logits`. This `logits` + variable will be used for dot multiplication with the value data in + later steps. Now, it should store the normalized softmax result of + `qk` for all assigned context tokens. + +## Value + +```{figure} ../../assets/kernel/value.png +:align: center +:alt: value +:width: 70% + +Value data of all context tokens at one head +``` + +```{figure} ../../assets/kernel/logits_vec.png +:align: center +:alt: logits_vec +:width: 50% + +`logits_vec` for one thread +``` + +```{figure} ../../assets/kernel/v_vec.png +:align: center +:alt: v_vec +:width: 70% + +List of `v_vec` for one thread +``` + +- Now we need to retrieve the value data and perform dot multiplication + with `logits`. Unlike query and key, there is no thread group + concept for value data. As shown in diagram, different from key token + memory layout, elements from the same column correspond to the same + value token. For one block of value data, there are `HEAD_SIZE` of + rows and `BLOCK_SIZE` of columns that are split into multiple + `v_vecs`. 
+
+- Each thread always fetches `V_VEC_SIZE` elements from the same
+  `V_VEC_SIZE` of tokens at a time. As a result, a single thread
+  retrieves multiple `v_vec`s from different rows and the same
+  columns through multiple inner iterations. For each `v_vec`, it
+  needs to be dot multiplied with the corresponding `logits_vec`,
+  which is also `V_VEC_SIZE` elements from `logits`. Overall, with
+  multiple inner iterations, each warp will process one block of value
+  tokens. And with multiple outer iterations, the whole context value
+  tokens are processed.
+
+  ```cpp
+  float accs[NUM_ROWS_PER_THREAD];
+  for ... { // Iteration over different blocks.
+      logits_vec = ...
+      for ... { // Iteration over different rows.
+          v_vec = ...
+          ...
+          accs[i] += dot(logits_vec, v_vec);
+      }
+  }
+  ```
+
+- As shown in the above pseudo code, in the outer loop, similar to
+  `k_ptr`, `logits_vec` iterates over different blocks and reads
+  `V_VEC_SIZE` elements from `logits`. In the inner loop, each
+  thread reads `V_VEC_SIZE` elements from the same tokens as a
+  `v_vec` and performs dot multiplication. It is important to note
+  that in each inner iteration, the thread fetches different head
+  position elements for the same tokens. The dot result is then
+  accumulated in `accs`. Therefore, each entry of `accs` is mapped
+  to a head position assigned to the current thread.
+
+- For example, if `BLOCK_SIZE` is 16 and `V_VEC_SIZE` is 8, each
+  thread fetches 8 value elements for 8 tokens at a time. Each element
+  is from different tokens at the same head position. If `HEAD_SIZE`
+  is 128 and `WARP_SIZE` is 32, for each inner loop, a warp needs to
+  fetch `WARP_SIZE * V_VEC_SIZE = 256` elements. This means there are
+  a total of 128 * 16 / 256 = 8 inner iterations for a warp to handle
+  a whole block of value tokens. And each `accs` in each thread
+  contains 8 elements that are accumulated at 8 different head positions.
+ For the thread 0, the `accs` variable will have 8 elements, which + are 0th, 32th … 224th elements of a value head that are accumulated + from all assigned 8 tokens. + +## LV + +- Now, we need to perform reduction for `accs` within each warp. This + process allows each thread to accumulate the `accs` for the + assigned head positions of all tokens in one block. + + ```cpp + for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { + float acc = accs[i]; + for (int mask = NUM_V_VECS_PER_ROW / 2; mask >= 1; mask /= 2) { + acc += VLLM_SHFL_XOR_SYNC(acc, mask); + } + accs[i] = acc; + } + ``` + +- Next, we perform reduction for `accs` across all warps, allowing + each thread to have the accumulation of `accs` for the assigned + head positions of all context tokens. Please note that each `accs` + in every thread only stores the accumulation for a portion of + elements of the entire head for all context tokens. However, overall, + all results for output have been calculated but are just stored in + different thread register memory. + + ```cpp + float* out_smem = reinterpret_cast(shared_mem); + for (int i = NUM_WARPS; i > 1; i /= 2) { + // Upper warps write to shared memory. + ... + float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE]; + for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { + ... + dst[row_idx] = accs[i]; + } + + // Lower warps update the output. + const float* src = &out_smem[warp_idx * HEAD_SIZE]; + for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { + ... + accs[i] += src[row_idx]; + } + + // Write out the accs. + } + ``` + +## Output + +- Now we can write all of calculated result from local register memory + to final output global memory. + + ```cpp + scalar_t* out_ptr = out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE + + head_idx * max_num_partitions * HEAD_SIZE + + partition_idx * HEAD_SIZE; + ``` + +- First, we need to define the `out_ptr` variable, which points to + the start address of the assigned sequence and assigned head. 
+ + ```cpp + for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { + const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER; + if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) { + from_float(*(out_ptr + row_idx), accs[i]); + } + } + ``` + +- Finally, we need to iterate over different assigned head positions + and write out the corresponding accumulated result based on the + `out_ptr`. diff --git a/docs/source/design/kernel/paged_attention.rst b/docs/source/design/kernel/paged_attention.rst deleted file mode 100644 index ba4f7a2718158..0000000000000 --- a/docs/source/design/kernel/paged_attention.rst +++ /dev/null @@ -1,525 +0,0 @@ -vLLM Paged Attention -==================== - -- Currently, vLLM utilizes its own implementation of a multi-head query - attention kernel (``csrc/attention/attention_kernels.cu``). - This kernel is designed to be compatible with - vLLM's paged KV caches, where the key and value cache are stored in - separate blocks (note that this block concept differs from the GPU - thread block. So in a later document, I will refer to vLLM paged - attention block as "block", while refer to GPU thread block as - "thread block"). -- To achieve high performance, this kernel relies on a specially - designed memory layout and access method, specifically when threads - read data from global memory to shared memory. The purpose of this - document is to provide a high-level explanation of the kernel - implementation step by step, aiding those who wish to learn about the - vLLM multi-head query attention kernel. After going through this - document, users will likely have a better understanding and feel easier - to follow the actual implementation. -- Please note that this document may not cover all details, such as how - to calculate the correct index for the corresponding data or the dot - multiplication implementation. 
However, after reading this document - and becoming familiar with the high-level logic flow, it should be - easier for you to read the actual code and understand the details. - -Inputs ------- - -- The kernel function takes a list of arguments for the current thread - to perform its assigned work. The three most important arguments are - the input pointers ``q``, ``k_cache``, and ``v_cache``, which point - to query, key, and value data on global memory that need to be read - and processed. The output pointer ``out`` points to global memory - where the result should be written. These four pointers actually - refer to multi-dimensional arrays, but each thread only accesses the - portion of data assigned to it. I have omitted all other runtime - parameters here for simplicity. - - .. code:: cpp - - template< - typename scalar_t, - int HEAD_SIZE, - int BLOCK_SIZE, - int NUM_THREADS, - int PARTITION_SIZE = 0> - __device__ void paged_attention_kernel( - ... // Other side args. - const scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, head_size] - const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] - const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x] - const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size] - ... // Other side args. - ) - -- There are also a list of template arguments above the function - signature that are determined during compilation time. ``scalar_t`` - represents the data type of the query, key, and value data elements, - such as FP16. ``HEAD_SIZE`` indicates the number of elements in each - head. ``BLOCK_SIZE`` refers to the number of tokens in each block. - ``NUM_THREADS`` denotes the number of threads in each thread block. - ``PARTITION_SIZE`` represents the number of tensor parallel GPUs (For - simplicity, we assume this is 0 and tensor parallel is disabled). -- With these arguments, we need to perform a sequence of preparations. 
- This includes calculating the current head index, block index, and - other necessary variables. However, for now, we can ignore these - preparations and proceed directly to the actual calculations. It will - be easier to understand them once we grasp the entire flow. - -Concepts --------- - -- Just before we dive into the calculation flow, I want to describe a - few concepts that are needed for later sections. However, you may - skip this section and return later if you encounter any confusing - terminologies. -- **Sequence**: A sequence represents a client request. For example, - the data pointed to by ``q`` has a shape of - ``[num_seqs, num_heads, head_size]``. That represents there are total - ``num_seqs`` of query sequence data are pointed by ``q``. Since this - kernel is a single query attention kernel, each sequence only has one - query token. Hence, the ``num_seqs`` equals the total number of tokens - that are processed in the batch. -- **Context**: The context consists of the generated tokens from the - sequence. For instance, ``["What", "is", "your"]`` are the context - tokens, and the input query token is ``"name"``. The model might - generate the token ``"?"``. -- **Vec**: The vec is a list of elements that are fetched and - calculated together. For query and key data, the vec size - (``VEC_SIZE``) is determined so that each thread group can fetch and - calculate 16 bytes of data at a time. For value data, the vec size - (``V_VEC_SIZE``) is determined so that each thread can fetch and - calculate 16 bytes of data at a time. For example, if the - ``scalar_t`` is FP16 (2 bytes) and ``THREAD_GROUP_SIZE`` is 2, the - ``VEC_SIZE`` will be 4, while the ``V_VEC_SIZE`` will be 8. -- **Thread group**: The thread group is a small group of - threads(\ ``THREAD_GROUP_SIZE``) that fetches and calculates one - query token and one key token at a time. Each thread handles only a - portion of the token data. 
The total number of elements processed by - one thread group is referred as ``x``. For example, if the thread - group contains 2 threads and the head size is 8, then thread 0 - handles the query and key elements at index 0, 2, 4, 6, while thread - 1 handles the elements at index 1, 3, 5, 7. -- **Block**: The key and value cache data in vLLM are split into - blocks. Each block stores data for a fixed number(\ ``BLOCK_SIZE``) - of tokens at one head. Each block may contain only a portion of the - whole context tokens. For example, if the block size is 16 and the - head size is 128, then for one head, one block can store 16 \* 128 = - 2048 elements. -- **Warp**: A warp is a group of 32 threads(\ ``WARP_SIZE``) that - execute simultaneously on a stream multiprocessor (SM). In this - kernel, each warp processes the calculation between one query token - and key tokens of one entire block at a time (it may process multiple - blocks in multiple iterations). For example, if there are 4 warps and - 6 blocks for one context, the assignment would be like warp 0 handles - the 0th, 4th blocks, warp 1 handles the 1st, 5th blocks, warp 2 - handles the 2nd block and warp 3 handles the 3rd block. -- **Thread block**: A thread block is a group of - threads(\ ``NUM_THREADS``) that can access the same shared memory. - Each thread block contains multiple warps(\ ``NUM_WARPS``), and in - this kernel, each thread block processes the calculation between one - query token and key tokens of a whole context. -- **Grid**: A grid is a collection of thread blocks and defines the - shape of the collection. In this kernel, the shape is - ``(num_heads, num_seqs, max_num_partitions)``. Therefore, each thread - block only handles the calculation for one head, one sequence, and - one partition. - -Query ------ - -- This section will introduce how query data is stored in memory and - fetched by each thread. 
As mentioned above, each thread group fetches - one query token data, while each thread itself only handles a part of - one query token data. Within each warp, every thread group will fetch - the same query token data, but will multiply it with different key - token data. - - .. code:: cpp - - const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE; - - .. figure:: ../../assets/kernel/query.png - :alt: query - :width: 70% - :align: center - - Query data of one token at one head - -- Each thread defines its own ``q_ptr`` which points to the assigned - query token data on global memory. For example, if ``VEC_SIZE`` is 4 - and ``HEAD_SIZE`` is 128, the ``q_ptr`` points to data that contains - total of 128 elements divided into 128 / 4 = 32 vecs. - - .. figure:: ../../assets/kernel/q_vecs.png - :alt: q_vecs - :width: 70% - :align: center - - ``q_vecs`` for one thread group - - .. code:: cpp - - __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD]; - -- Next, we need to read the global memory data pointed to by ``q_ptr`` - into shared memory as ``q_vecs``. It is important to note that each - vecs is assigned to a different row. For example, if the - ``THREAD_GROUP_SIZE`` is 2, thread 0 will handle the 0th row vecs, - while thread 1 handles the 1st row vecs. By reading the query data in - this way, neighboring threads like thread 0 and thread 1 can read - neighbor memory, achieving the memory coalescing to improve - performance. - -Key ---- - -- Similar to the "Query" section, this section introduces memory layout - and assignment for keys. While each thread group only handle one - query token one kernel run, it may handle multiple key tokens across - multiple iterations. Meanwhile, each warp will process multiple blocks - of key tokens in multiple iterations, ensuring that all context - tokens are processed by the entire thread group after the kernel run. 
- In this context, "handle" refers to performing the dot multiplication - between query data and key data. - - .. code:: cpp - - const scalar_t* k_ptr = k_cache + physical_block_number * kv_block_stride - + kv_head_idx * kv_head_stride - + physical_block_offset * x; - -- Unlike to ``q_ptr``, ``k_ptr`` in each thread will point to different - key token at different iterations. As shown above, that ``k_ptr`` - points to key token data based on ``k_cache`` at assigned block, - assigned head and assigned token. - - .. figure:: ../../assets/kernel/key.png - :alt: key - :width: 70% - :align: center - - Key data of all context tokens at one head - -- The diagram above illustrates the memory layout for key data. It - assumes that the ``BLOCK_SIZE`` is 16, ``HEAD_SIZE`` is 128, ``x`` is - 8, ``THREAD_GROUP_SIZE`` is 2, and there are a total of 4 warps. Each - rectangle represents all the elements for one key token at one head, - which will be processed by one thread group. The left half shows the - total 16 blocks of key token data for warp 0, while the right half - represents the remaining key token data for other warps or - iterations. Inside each rectangle, there are a total 32 vecs (128 - elements for one token) that will be processed by 2 threads (one - thread group) separately. - - .. figure:: ../../assets/kernel/k_vecs.png - :alt: k_vecs - :width: 70% - :align: center - - ``k_vecs`` for one thread - - .. code:: cpp - - K_vec k_vecs[NUM_VECS_PER_THREAD] - -- Next, we need to read the key token data from ``k_ptr`` and store - them on register memory as ``k_vecs``. We use register memory for - ``k_vecs`` because it will only be accessed by one thread once, - whereas ``q_vecs`` will be accessed by multiple threads multiple - times. Each ``k_vecs`` will contain multiple vectors for later - calculation. Each vec will be set at each inner iteration. 
The - assignment of vecs allows neighboring threads in a warp to read - neighboring memory together, which again promotes the memory - coalescing. For instance, thread 0 will read vec 0, while thread 1 - will read vec 1. In the next inner loop, thread 0 will read vec 2, - while thread 1 will read vec 3, and so on. -- You may still be a little confused about the overall flow. Don't - worry, please keep reading the next "QK" section. It will illustrate - the query and key calculation flow in a clearer and higher-level - manner. - -QK ---- - -- As shown the pseudo code below, before the entire for loop block, we - fetch the query data for one token and store it in ``q_vecs``. Then, - in the outer for loop, we iterate through different ``k_ptrs`` that - point to different tokens and prepare the ``k_vecs`` in the inner for - loop. Finally, we perform the dot multiplication between the - ``q_vecs`` and each ``k_vecs``. - - .. code:: cpp - - q_vecs = ... - for ... { - k_ptr = ... - for ... { - k_vecs[i] = ... - } - ... - float qk = scale * Qk_dot::dot(q_vecs[thread_group_offset], k_vecs); - } - -- As mentioned before, for each thread, it only fetches part of the - query and key token data at a time. However, there will be a cross - thread group reduction happen in the ``Qk_dot<>::dot`` . So ``qk`` - returned here is not just between part of the query and key token dot - multiplication, but actually a full result between entire query and - key token data. -- For example, if the value of ``HEAD_SIZE`` is 128 and - ``THREAD_GROUP_SIZE`` is 2, each thread's ``k_vecs`` will contain - total 64 elements. However, the returned ``qk`` is actually the - result of dot multiplication between 128 query elements and 128 key - elements. If you want to learn more about the details of the dot - multiplication and reduction, you may refer to the implementation of - ``Qk_dot<>::dot``. However, for the sake of simplicity, I will not - cover it in this document. 
- -Softmax -------- - -- Next, we need to calculate the normalized softmax for all ``qk``\ s, - as shown above, where each :math:`x` represents a ``qk``. To do this, - we must obtain the reduced value of ``qk_max``\ (:math:`m(x)`) and - the ``exp_sum``\ (:math:`\ell(x)`) of all ``qk``\ s. The reduction - should be performed across the entire thread block, encompassing - results between the query token and all context key tokens. - - .. math:: - :nowrap: - - \begin{gather*} - m(x):=\max _i \quad x_i \\ \quad f(x):=\left[\begin{array}{lll}e^{x_1-m(x)} & \ldots & e^{x_B-m(x)}\end{array}\right]\\ \quad \ell(x):=\sum_i f(x)_i \\ - \quad \operatorname{softmax}(x):=\frac{f(x)}{\ell(x)} - \end{gather*} - -``qk_max`` and ``logits`` -~~~~~~~~~~~~~~~~~~~~~~~~~ - -- Just right after we get the ``qk`` result, we can set the temporary - ``logits`` result with ``qk`` (In the end, the ``logits`` should - store the normalized softmax result). Also we can compare and collect - the ``qk_max`` for all ``qk``\ s that are calculated by current - thread group. - - .. code:: cpp - - if (thread_group_offset == 0) { - const bool mask = token_idx >= context_len; - logits[token_idx - start_token_idx] = mask ? 0.f : qk; - qk_max = mask ? qk_max : fmaxf(qk_max, qk); - } - -- Please note that the ``logits`` here is on shared memory, so each - thread group will set the fields for its own assigned context tokens. - Overall, the size of logits should be number of context tokens. - - .. code:: cpp - - for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) { - qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask)); - } - - if (lane == 0) { - red_smem[warp_idx] = qk_max; - } - -- Then we need to get the reduced ``qk_max`` across each warp. The main - idea is to make threads in warp to communicate with each other and - get the final max ``qk`` . - - .. 
code:: cpp - - for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) { - qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask)); - } - qk_max = VLLM_SHFL_SYNC(qk_max, 0); - -- Finally, we can get the reduced ``qk_max`` from whole thread block by - compare the ``qk_max`` from all warps in this thread block. Then we - need to broadcast the final result to each thread. - -``exp_sum`` -~~~~~~~~~~~ - -- Similar to ``qk_max``, we need to get the reduced sum value from the - entire thread block too. - - .. code:: cpp - - for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { - float val = __expf(logits[i] - qk_max); - logits[i] = val; - exp_sum += val; - } - ... - exp_sum = block_sum(&red_smem[NUM_WARPS], exp_sum); - -- Firstly, sum all exp values from each thread group, and meanwhile, - convert each entry of ``logits`` from ``qk`` to ``exp(qk - qk_max)``. - Please note, the ``qk_max`` here is already the max ``qk`` across the - whole thread block. And then we can do reduction for ``exp_sum`` - across whole thread block just like the ``qk_max``. - - .. code:: cpp - - const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f); - for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { - logits[i] *= inv_sum; - } - -- Finally, with the reduced ``qk_max`` and ``exp_sum``, we can obtain - the final normalized softmax result as ``logits``. This ``logits`` - variable will be used for dot multiplication with the value data in - later steps. Now, it should store the normalized softmax result of - ``qk`` for all assigned context tokens. - -Value ------ - -.. figure:: ../../assets/kernel/value.png - :alt: value - :width: 70% - :align: center - - Value data of all context tokens at one head - -.. figure:: ../../assets/kernel/logits_vec.png - :alt: logits_vec - :width: 50% - :align: center - - ``logits_vec`` for one thread - -.. 
figure:: ../../assets/kernel/v_vec.png - :alt: v_vec - :width: 70% - :align: center - - List of ``v_vec`` for one thread - -- Now we need to retrieve the value data and perform dot multiplication - with ``logits``. Unlike query and key, there is no thread group - concept for value data. As shown in diagram, different from key token - memory layout, elements from the same column correspond to the same - value token. For one block of value data, there are ``HEAD_SIZE`` of - rows and ``BLOCK_SIZE`` of columns that are split into multiple - ``v_vecs``. -- Each thread always fetches ``V_VEC_SIZE`` elements from the same - ``V_VEC_SIZE`` of tokens at a time. As a result, a single thread - retrieves multiple ``v_vec``\ s from different rows and the same - columns through multiple inner iterations. For each ``v_vec``, it - needs to be dot multiplied with the corresponding ``logits_vec``, - which is also ``V_VEC_SIZE`` elements from ``logits``. Overall, with - multiple inner iterations, each warp will process one block of value - tokens. And with multiple outer iterations, the whole context value - tokens are processd - - .. code:: cpp - - float accs[NUM_ROWS_PER_THREAD]; - for ... { // Iteration over different blocks. - logits_vec = ... - for ... { // Iteration over different rows. - v_vec = ... - ... - accs[i] += dot(logits_vec, v_vec); - } - } - -- As shown in the above pseudo code, in the outer loop, similar to - ``k_ptr``, ``logits_vec`` iterates over different blocks and reads - ``V_VEC_SIZE`` elements from ``logits``. In the inner loop, each - thread reads ``V_VEC_SIZE`` elements from the same tokens as a - ``v_vec`` and performs dot multiplication. It is important to note - that in each inner iteration, the thread fetches different head - position elements for the same tokens. The dot result is then - accumulated in ``accs``. Therefore, each entry of ``accs`` is mapped - to a head position assigned to the current thread. 
-- For example, if ``BLOCK_SIZE`` is 16 and ``V_VEC_SIZE`` is 8, each - thread fetches 8 value elements for 8 tokens at a time. Each element - is from different tokens at the same head position. If ``HEAD_SIZE`` - is 128 and ``WARP_SIZE`` is 32, for each inner loop, a warp needs to - fetch ``WARP_SIZE * V_VEC_SIZE = 256`` elements. This means there are - a total of 128 \* 16 / 256 = 8 inner iterations for a warp to handle - a whole block of value tokens. And each ``accs`` in each thread - contains 8 elements that accumulated at 8 different head positions. - For the thread 0, the ``accs`` variable will have 8 elements, which - are 0th, 32th … 224th elements of a value head that are accumulated - from all assigned 8 tokens. - -LV ---- -- Now, we need to perform reduction for ``accs`` within each warp. This - process allows each thread to accumulate the ``accs`` for the - assigned head positions of all tokens in one block. - - .. code:: cpp - - for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { - float acc = accs[i]; - for (int mask = NUM_V_VECS_PER_ROW / 2; mask >= 1; mask /= 2) { - acc += VLLM_SHFL_XOR_SYNC(acc, mask); - } - accs[i] = acc; - } - -- Next, we perform reduction for ``accs`` across all warps, allowing - each thread to have the accumulation of ``accs`` for the assigned - head positions of all context tokens. Please note that each ``accs`` - in every thread only stores the accumulation for a portion of - elements of the entire head for all context tokens. However, overall, - all results for output have been calculated but are just stored in - different thread register memory. - - .. code:: cpp - - float* out_smem = reinterpret_cast(shared_mem); - for (int i = NUM_WARPS; i > 1; i /= 2) { - // Upper warps write to shared memory. - ... - float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE]; - for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { - ... - dst[row_idx] = accs[i]; - } - - // Lower warps update the output. 
- const float* src = &out_smem[warp_idx * HEAD_SIZE]; - for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { - ... - accs[i] += src[row_idx]; - } - - // Write out the accs. - } - -Output ------- - -- Now we can write all of calculated result from local register memory - to final output global memory. - - .. code:: cpp - - scalar_t* out_ptr = out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE - + head_idx * max_num_partitions * HEAD_SIZE - + partition_idx * HEAD_SIZE; - -- First, we need to define the ``out_ptr`` variable, which points to - the start address of the assigned sequence and assigned head. - - .. code:: cpp - - for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { - const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER; - if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) { - from_float(*(out_ptr + row_idx), accs[i]); - } - } - -- Finally, we need to iterate over different assigned head positions - and write out the corresponding accumulated result based on the - ``out_ptr``. diff --git a/docs/source/design/multimodal/adding_multimodal_plugin.md b/docs/source/design/multimodal/adding_multimodal_plugin.md new file mode 100644 index 0000000000000..bcccd284879bb --- /dev/null +++ b/docs/source/design/multimodal/adding_multimodal_plugin.md @@ -0,0 +1,16 @@ +(adding-multimodal-plugin)= + +# Adding a Multimodal Plugin + +This document teaches you how to add a new modality to vLLM. + +Each modality in vLLM is represented by a {class}`~vllm.multimodal.MultiModalPlugin` and registered to {data}`~vllm.multimodal.MULTIMODAL_REGISTRY`. +For vLLM to recognize a new modality type, you have to create a new plugin and then pass it to {meth}`~vllm.multimodal.MultiModalRegistry.register_plugin`. + +The remainder of this document details how to define custom {class}`~vllm.multimodal.MultiModalPlugin` s. + +```{note} +This article is a work in progress. +``` + +% TODO: Add more instructions on how to add new plugins once embeddings is in. 
diff --git a/docs/source/design/multimodal/adding_multimodal_plugin.rst b/docs/source/design/multimodal/adding_multimodal_plugin.rst deleted file mode 100644 index b726138f840a3..0000000000000 --- a/docs/source/design/multimodal/adding_multimodal_plugin.rst +++ /dev/null @@ -1,17 +0,0 @@ -.. _adding_multimodal_plugin: - -Adding a Multimodal Plugin -========================== - -This document teaches you how to add a new modality to vLLM. - -Each modality in vLLM is represented by a :class:`~vllm.multimodal.MultiModalPlugin` and registered to :data:`~vllm.multimodal.MULTIMODAL_REGISTRY`. -For vLLM to recognize a new modality type, you have to create a new plugin and then pass it to :meth:`~vllm.multimodal.MultiModalRegistry.register_plugin`. - -The remainder of this document details how to define custom :class:`~vllm.multimodal.MultiModalPlugin` s. - -.. note:: - This article is a work in progress. - -.. - TODO: Add more instructions on how to add new plugins once embeddings is in. diff --git a/docs/source/design/multimodal/multimodal_index.rst b/docs/source/design/multimodal/multimodal_index.md similarity index 61% rename from docs/source/design/multimodal/multimodal_index.rst rename to docs/source/design/multimodal/multimodal_index.md index c6d47f90b62d5..a240a7446b953 100644 --- a/docs/source/design/multimodal/multimodal_index.rst +++ b/docs/source/design/multimodal/multimodal_index.md @@ -1,66 +1,83 @@ -.. _multi_modality: +(multi-modality)= -Multi-Modality -============== +# Multi-Modality +```{eval-rst} .. currentmodule:: vllm.multimodal - -vLLM provides experimental support for multi-modal models through the :mod:`vllm.multimodal` package. +``` -Multi-modal inputs can be passed alongside text and token prompts to :ref:`supported models ` -via the ``multi_modal_data`` field in :class:`vllm.inputs.PromptType`. +vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package. 
+ +Multi-modal inputs can be passed alongside text and token prompts to {ref}`supported models ` +via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`. Currently, vLLM only has built-in support for image data. You can extend vLLM to process additional modalities -by following :ref:`this guide `. +by following {ref}`this guide `. -Looking to add your own multi-modal model? Please follow the instructions listed :ref:`here `. +Looking to add your own multi-modal model? Please follow the instructions listed {ref}`here `. -Guides -++++++ +## Guides -.. toctree:: - :maxdepth: 1 +```{toctree} +:maxdepth: 1 - adding_multimodal_plugin +adding_multimodal_plugin +``` -Module Contents -+++++++++++++++ +## Module Contents +```{eval-rst} .. automodule:: vllm.multimodal +``` -Registry --------- +### Registry +```{eval-rst} .. autodata:: vllm.multimodal.MULTIMODAL_REGISTRY +``` +```{eval-rst} .. autoclass:: vllm.multimodal.MultiModalRegistry :members: :show-inheritance: +``` -Base Classes ------------- +### Base Classes +```{eval-rst} .. autodata:: vllm.multimodal.NestedTensors +``` +```{eval-rst} .. autodata:: vllm.multimodal.BatchedTensorInputs +``` +```{eval-rst} .. autoclass:: vllm.multimodal.MultiModalDataBuiltins :members: :show-inheritance: +``` +```{eval-rst} .. autodata:: vllm.multimodal.MultiModalDataDict +``` +```{eval-rst} .. autoclass:: vllm.multimodal.MultiModalKwargs :members: :show-inheritance: +``` +```{eval-rst} .. autoclass:: vllm.multimodal.MultiModalPlugin :members: :show-inheritance: +``` -Image Classes -------------- +### Image Classes +```{eval-rst} .. 
automodule:: vllm.multimodal.image :members: :show-inheritance: +``` diff --git a/docs/source/design/plugin_system.md b/docs/source/design/plugin_system.md new file mode 100644 index 0000000000000..147b5cbd58bc3 --- /dev/null +++ b/docs/source/design/plugin_system.md @@ -0,0 +1,54 @@ +(plugin-system)= + +# vLLM's Plugin System + +The community frequently requests the ability to extend vLLM with custom features. To facilitate this, vLLM includes a plugin system that allows users to add custom features without modifying the vLLM codebase. This document explains how plugins work in vLLM and how to create a plugin for vLLM. + +## How Plugins Work in vLLM + +Plugins are user-registered code that vLLM executes. Given vLLM's architecture (see {ref}`arch_overview`), multiple processes may be involved, especially when using distributed inference with various parallelism techniques. To enable plugins successfully, every process created by vLLM needs to load the plugin. This is done by the [load_general_plugins](https://github.com/vllm-project/vllm/blob/c76ac49d266e27aa3fea84ef2df1f813d24c91c7/vllm/plugins/__init__.py#L16) function in the `vllm.plugins` module. This function is called for every process created by vLLM before it starts any work. + +## How vLLM Discovers Plugins + +vLLM's plugin system uses the standard Python `entry_points` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. 
An example of a plugin:
+
+```python
+# inside `setup.py` file
+from setuptools import setup
+
+setup(name='vllm_add_dummy_model',
+      version='0.1',
+      packages=['vllm_add_dummy_model'],
+      entry_points={
+          'vllm.general_plugins':
+          ["register_dummy_model = vllm_add_dummy_model:register"]
+      })
+
+# inside `vllm_add_dummy_model.py` file
+def register():
+    from vllm import ModelRegistry
+
+    if "MyLlava" not in ModelRegistry.get_supported_archs():
+        ModelRegistry.register_model("MyLlava",
+                                     "vllm_add_dummy_model.my_llava:MyLlava")
+```
+
+For more information on adding entry points to your package, please check the [official documentation](https://setuptools.pypa.io/en/latest/userguide/entry_point.html).
+
+Every plugin has three parts:
+
+1. **Plugin group**: The name of the entry point group. vLLM uses the entry point group `vllm.general_plugins` to register general plugins. This is the key of `entry_points` in the `setup.py` file. Always use `vllm.general_plugins` for vLLM's general plugins.
+2. **Plugin name**: The name of the plugin. This is the value in the `entry_points` dictionary. In the example above, the plugin name is `register_dummy_model`. Plugins can be filtered by their names using the `VLLM_PLUGINS` environment variable. To load only a specific plugin, set `VLLM_PLUGINS` to the plugin name.
+3. **Plugin value**: The fully qualified name of the function to register in the plugin system. In the example above, the plugin value is `vllm_add_dummy_model:register`, which refers to a function named `register` in the `vllm_add_dummy_model` module.
+
+## What Can Plugins Do?
+
+Currently, the primary use case for plugins is to register custom, out-of-the-tree models into vLLM. This is done by calling `ModelRegistry.register_model` to register the model. In the future, the plugin system may be extended to support more features, such as swapping in custom implementations for certain classes in vLLM.
+ +## Guidelines for Writing Plugins + +- **Being re-entrant**: The function specified in the entry point should be re-entrant, meaning it can be called multiple times without causing issues. This is necessary because the function might be called multiple times in some processes. + +## Compatibility Guarantee + +vLLM guarantees the interface of documented plugins, such as `ModelRegistry.register_model`, will always be available for plugins to register models. However, it is the responsibility of plugin developers to ensure their plugins are compatible with the version of vLLM they are targeting. For example, `"vllm_add_dummy_model.my_llava:MyLlava"` should be compatible with the version of vLLM that the plugin targets. The interface for the model may change during vLLM's development. diff --git a/docs/source/design/plugin_system.rst b/docs/source/design/plugin_system.rst deleted file mode 100644 index 5a96cc8b3a464..0000000000000 --- a/docs/source/design/plugin_system.rst +++ /dev/null @@ -1,62 +0,0 @@ -.. _plugin_system: - -vLLM's Plugin System -==================== - -The community frequently requests the ability to extend vLLM with custom features. To facilitate this, vLLM includes a plugin system that allows users to add custom features without modifying the vLLM codebase. This document explains how plugins work in vLLM and how to create a plugin for vLLM. - -How Plugins Work in vLLM ------------------------- - -Plugins are user-registered code that vLLM executes. Given vLLM's architecture (see :ref:`arch_overview`), multiple processes may be involved, especially when using distributed inference with various parallelism techniques. To enable plugins successfully, every process created by vLLM needs to load the plugin. This is done by the `load_general_plugins `__ function in the ``vllm.plugins`` module. This function is called for every process created by vLLM before it starts any work. 
- -How vLLM Discovers Plugins --------------------------- - -vLLM's plugin system uses the standard Python ``entry_points`` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. An example of a plugin: - -.. code-block:: python - - # inside `setup.py` file - from setuptools import setup - - setup(name='vllm_add_dummy_model', - version='0.1', - packages=['vllm_add_dummy_model'], - entry_points={ - 'vllm.general_plugins': - ["register_dummy_model = vllm_add_dummy_model:register"] - }) - - # inside `vllm_add_dummy_model.py` file - def register(): - from vllm import ModelRegistry - - if "MyLlava" not in ModelRegistry.get_supported_archs(): - ModelRegistry.register_model("MyLlava", - "vllm_add_dummy_model.my_llava:MyLlava") - -For more information on adding entry points to your package, please check the `official documentation `__. - -Every plugin has three parts: - -1. **Plugin group**: The name of the entry point group. vLLM uses the entry point group ``vllm.general_plugins`` to register general plugins. This is the key of ``entry_points`` in the ``setup.py`` file. Always use ``vllm.general_plugins`` for vLLM's general plugins. - -2. **Plugin name**: The name of the plugin. This is the value in the dictionary of the ``entry_points`` dictionary. In the example above, the plugin name is ``register_dummy_model``. Plugins can be filtered by their names using the ``VLLM_PLUGINS`` environment variable. To load only a specific plugin, set ``VLLM_PLUGINS`` to the plugin name. - -3. **Plugin value**: The fully qualified name of the function to register in the plugin system. In the example above, the plugin value is ``vllm_add_dummy_model:register``, which refers to a function named ``register`` in the ``vllm_add_dummy_model`` module. - -What Can Plugins Do? --------------------- - -Currently, the primary use case for plugins is to register custom, out-of-the-tree models into vLLM. 
This is done by calling ``ModelRegistry.register_model`` to register the model. In the future, the plugin system may be extended to support more features, such as swapping in custom implementations for certain classes in vLLM. - -Guidelines for Writing Plugins ------------------------------- - -- **Being re-entrant**: The function specified in the entry point should be re-entrant, meaning it can be called multiple times without causing issues. This is necessary because the function might be called multiple times in some processes. - -Compatibility Guarantee ------------------------ - -vLLM guarantees the interface of documented plugins, such as ``ModelRegistry.register_model``, will always be available for plugins to register models. However, it is the responsibility of plugin developers to ensure their plugins are compatible with the version of vLLM they are targeting. For example, ``"vllm_add_dummy_model.my_llava:MyLlava"`` should be compatible with the version of vLLM that the plugin targets. The interface for the model may change during vLLM's development. diff --git a/docs/source/dev/engine/async_llm_engine.rst b/docs/source/dev/engine/async_llm_engine.md similarity index 59% rename from docs/source/dev/engine/async_llm_engine.rst rename to docs/source/dev/engine/async_llm_engine.md index 93fc310cb543b..904feaa505164 100644 --- a/docs/source/dev/engine/async_llm_engine.rst +++ b/docs/source/dev/engine/async_llm_engine.md @@ -1,6 +1,7 @@ -AsyncLLMEngine -================================= +# AsyncLLMEngine +```{eval-rst} .. autoclass:: vllm.AsyncLLMEngine :members: :show-inheritance: +``` diff --git a/docs/source/dev/engine/engine_index.md b/docs/source/dev/engine/engine_index.md new file mode 100644 index 0000000000000..701cb95d3be33 --- /dev/null +++ b/docs/source/dev/engine/engine_index.md @@ -0,0 +1,17 @@ +# vLLM Engine + +```{eval-rst} +.. automodule:: vllm.engine +``` + +```{eval-rst} +.. 
currentmodule:: vllm.engine +``` + +```{toctree} +:caption: Engines +:maxdepth: 2 + +llm_engine +async_llm_engine +``` diff --git a/docs/source/dev/engine/engine_index.rst b/docs/source/dev/engine/engine_index.rst deleted file mode 100644 index ba9ae55ddea46..0000000000000 --- a/docs/source/dev/engine/engine_index.rst +++ /dev/null @@ -1,13 +0,0 @@ -vLLM Engine -================================= - -.. automodule:: vllm.engine -.. currentmodule:: vllm.engine - -.. toctree:: - :maxdepth: 2 - :caption: Engines - - llm_engine - async_llm_engine - diff --git a/docs/source/dev/engine/llm_engine.rst b/docs/source/dev/engine/llm_engine.md similarity index 60% rename from docs/source/dev/engine/llm_engine.rst rename to docs/source/dev/engine/llm_engine.md index 0b8c1e219d7c9..d6613ef5562dc 100644 --- a/docs/source/dev/engine/llm_engine.rst +++ b/docs/source/dev/engine/llm_engine.md @@ -1,6 +1,7 @@ -LLMEngine -================================= +# LLMEngine +```{eval-rst} .. autoclass:: vllm.LLMEngine :members: :show-inheritance: +``` diff --git a/docs/source/dev/offline_inference/llm.rst b/docs/source/dev/offline_inference/llm.md similarity index 67% rename from docs/source/dev/offline_inference/llm.rst rename to docs/source/dev/offline_inference/llm.md index 83ba1b6987c6d..9f129d5e41686 100644 --- a/docs/source/dev/offline_inference/llm.rst +++ b/docs/source/dev/offline_inference/llm.md @@ -1,6 +1,7 @@ -LLM Class -========= +# LLM Class +```{eval-rst} .. 
autoclass:: vllm.LLM :members: :show-inheritance: +``` diff --git a/docs/source/dev/offline_inference/llm_inputs.rst b/docs/source/dev/offline_inference/llm_inputs.md similarity index 78% rename from docs/source/dev/offline_inference/llm_inputs.rst rename to docs/source/dev/offline_inference/llm_inputs.md index 0d47281db485e..21f688a12c536 100644 --- a/docs/source/dev/offline_inference/llm_inputs.rst +++ b/docs/source/dev/offline_inference/llm_inputs.md @@ -1,14 +1,19 @@ -LLM Inputs -========== +# LLM Inputs +```{eval-rst} .. autodata:: vllm.inputs.PromptType +``` +```{eval-rst} .. autoclass:: vllm.inputs.TextPrompt :show-inheritance: :members: :member-order: bysource +``` +```{eval-rst} .. autoclass:: vllm.inputs.TokensPrompt :show-inheritance: :members: :member-order: bysource +``` diff --git a/docs/source/dev/offline_inference/offline_index.md b/docs/source/dev/offline_inference/offline_index.md new file mode 100644 index 0000000000000..318a02d8c78df --- /dev/null +++ b/docs/source/dev/offline_inference/offline_index.md @@ -0,0 +1,8 @@ +# Offline Inference + +```{toctree} +:maxdepth: 1 + +llm +llm_inputs +``` diff --git a/docs/source/dev/offline_inference/offline_index.rst b/docs/source/dev/offline_inference/offline_index.rst deleted file mode 100644 index 27dfb0e9df90e..0000000000000 --- a/docs/source/dev/offline_inference/offline_index.rst +++ /dev/null @@ -1,8 +0,0 @@ -Offline Inference -================================= - -.. toctree:: - :maxdepth: 1 - - llm - llm_inputs diff --git a/docs/source/dev/pooling_params.rst b/docs/source/dev/pooling_params.md similarity index 55% rename from docs/source/dev/pooling_params.rst rename to docs/source/dev/pooling_params.md index 334e0287aff09..74b2c57443e4b 100644 --- a/docs/source/dev/pooling_params.rst +++ b/docs/source/dev/pooling_params.md @@ -1,5 +1,6 @@ -Pooling Parameters -================== +# Pooling Parameters +```{eval-rst} .. 
autoclass:: vllm.PoolingParams :members: +``` diff --git a/docs/source/dev/sampling_params.rst b/docs/source/dev/sampling_params.md similarity index 55% rename from docs/source/dev/sampling_params.rst rename to docs/source/dev/sampling_params.md index f645941a6c022..bdc36af5153db 100644 --- a/docs/source/dev/sampling_params.rst +++ b/docs/source/dev/sampling_params.md @@ -1,5 +1,6 @@ -Sampling Parameters -=================== +# Sampling Parameters +```{eval-rst} .. autoclass:: vllm.SamplingParams :members: +``` diff --git a/docs/source/getting_started/amd-installation.md b/docs/source/getting_started/amd-installation.md new file mode 100644 index 0000000000000..3a90760e0c882 --- /dev/null +++ b/docs/source/getting_started/amd-installation.md @@ -0,0 +1,166 @@ +(installation-rocm)= + +# Installation with ROCm + +vLLM supports AMD GPUs with ROCm 6.2. + +## Requirements + +- OS: Linux +- Python: 3.9 -- 3.12 +- GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100) +- ROCm 6.2 + +Installation options: + +1. {ref}`Build from source with docker ` +2. {ref}`Build from source ` + +(build-from-source-docker-rocm)= + +## Option 1: Build from source with docker (recommended) + +You can build and install vLLM from source. + +First, build a docker image from [Dockerfile.rocm](https://github.com/vllm-project/vllm/blob/main/Dockerfile.rocm) and launch a docker container from the image. +It is important that the user kicks off the docker build using buildkit. Either the user put DOCKER_BUILDKIT=1 as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon: + +```console +{ + "features": { + "buildkit": true + } +} +``` + +[Dockerfile.rocm](https://github.com/vllm-project/vllm/blob/main/Dockerfile.rocm) uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches. 
+It provides flexibility to customize the build of docker image using the following arguments:
+
+- `BASE_IMAGE`: specifies the base image used when running `docker build`, specifically the PyTorch on ROCm base image.
+- `BUILD_FA`: specifies whether to build CK flash-attention. The default is 1. For [Radeon RX 7900 series (gfx1100)](https://rocm.docs.amd.com/projects/radeon/en/latest/index.html), this should be set to 0 before flash-attention supports this target.
+- `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build CK flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942`
+- `FA_BRANCH`: specifies the branch used to build the CK flash-attention in [ROCm's flash-attention repo](https://github.com/ROCmSoftwarePlatform/flash-attention). The default is `ae7928c`
+- `BUILD_TRITON`: specifies whether to build triton flash-attention. The default value is 1.
+
+Their values can be passed in when running `docker build` with `--build-arg` options.
+
+To build vllm on ROCm 6.2 for MI200 and MI300 series, you can use the default:
+
+```console
+$ DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm .
+```
+
+To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should specify `BUILD_FA` as below:
+
+```console
+$ DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm .
+```
+
+To run the above docker image `vllm-rocm`, use the below command:
+
+```console
+$ docker run -it \
+   --network=host \
+   --group-add=video \
+   --ipc=host \
+   --cap-add=SYS_PTRACE \
+   --security-opt seccomp=unconfined \
+   --device /dev/kfd \
+   --device /dev/dri \
+   -v <path/to/model>:/app/model \
+   vllm-rocm \
+   bash
+```
+
+Where the `<path/to/model>` is the location where the model is stored, for example, the weights for llama2 or llama3 models.
+
+(build-from-source-rocm)=
+
+## Option 2: Build from source
+
+0. 
Install prerequisites (skip if you are already in an environment/docker with the following installed): + +- [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html) +- [PyTorch](https://pytorch.org/) + +For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0`, `rocm/pytorch-nightly`. + +Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch [Getting Started](https://pytorch.org/get-started/locally/) + +1. Install [Triton flash attention for ROCm](https://github.com/ROCm/triton) + +Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from [ROCm/triton](https://github.com/ROCm/triton/blob/triton-mlir/README.md) + +> ```console +> $ python3 -m pip install ninja cmake wheel pybind11 +> $ pip uninstall -y triton +> $ git clone https://github.com/OpenAI/triton.git +> $ cd triton +> $ git checkout e192dba +> $ cd python +> $ pip3 install . +> $ cd ../.. +> ``` + +```{note} +- If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent. +``` + +2. Optionally, if you choose to use CK flash attention, you can install [flash attention for ROCm](https://github.com/ROCm/flash-attention/tree/ck_tile) + +Install ROCm's flash attention (v2.5.9.post1) following the instructions from [ROCm/flash-attention](https://github.com/ROCm/flash-attention/tree/ck_tile#amd-gpurocm-support) +Alternatively, wheels intended for vLLM use can be accessed under the releases. + +For example, for ROCm 6.2, suppose your gfx arch is `gfx90a`. +Note to get your gfx architecture, run `rocminfo |grep gfx`. + +> ```console +> $ git clone https://github.com/ROCm/flash-attention.git +> $ cd flash-attention +> $ git checkout 3cea2fb +> $ git submodule update --init +> $ GPU_ARCHS="gfx90a" python3 setup.py install +> $ cd .. 
+> ```
+
+```{note}
+- You might need to downgrade the "ninja" version to 1.10 as it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)
+```
+
+3. Build vLLM.
+
+   > For example, vLLM on ROCM 6.2 can be built with the following steps:
+   >
+   > ```console
+   > $ pip install --upgrade pip
+   >
+   > $ # Install PyTorch
+   > $ pip uninstall torch -y
+   > $ pip install --no-cache-dir --pre torch==2.6.0.dev20240918 --index-url https://download.pytorch.org/whl/nightly/rocm6.2
+   >
+   > $ # Build & install AMD SMI
+   > $ pip install /opt/rocm/share/amd_smi
+   >
+   > $ # Install dependencies
+   > $ pip install --upgrade numba scipy huggingface-hub[cli]
+   > $ pip install "numpy<2"
+   > $ pip install -r requirements-rocm.txt
+   >
+   > $ # Build vLLM for MI210/MI250/MI300.
+   > $ export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
+   > $ python3 setup.py develop
+   > ```
+   >
+   > This may take 5-10 minutes. Currently, {code}`pip install .` does not work for ROCm installation.
+
+```{tip}
+- Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
+- Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support.
+- To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention.
+- The ROCm version of PyTorch, ideally, should match the ROCm driver version.
+```
+
+```{tip}
+- For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level.
+  For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization).
+``` diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst deleted file mode 100644 index ece5d785e0c65..0000000000000 --- a/docs/source/getting_started/amd-installation.rst +++ /dev/null @@ -1,178 +0,0 @@ -.. _installation_rocm: - -Installation with ROCm -====================== - -vLLM supports AMD GPUs with ROCm 6.2. - -Requirements ------------- - -* OS: Linux -* Python: 3.9 -- 3.12 -* GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100) -* ROCm 6.2 - -Installation options: - -#. :ref:`Build from source with docker ` -#. :ref:`Build from source ` - -.. _build_from_source_docker_rocm: - -Option 1: Build from source with docker (recommended) ------------------------------------------------------ - -You can build and install vLLM from source. - -First, build a docker image from `Dockerfile.rocm `_ and launch a docker container from the image. -It is important that the user kicks off the docker build using buildkit. Either the user put DOCKER_BUILDKIT=1 as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon: - -.. code-block:: console - - { - "features": { - "buildkit": true - } - } - - -`Dockerfile.rocm `_ uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches. -It provides flexibility to customize the build of docker image using the following arguments: - -* `BASE_IMAGE`: specifies the base image used when running ``docker build``, specifically the PyTorch on ROCm base image. -* `BUILD_FA`: specifies whether to build CK flash-attention. The default is 1. For `Radeon RX 7900 series (gfx1100) `_, this should be set to 0 before flash-attention supports this target. -* `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build CK flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. 
The default is `gfx90a;gfx942` -* `FA_BRANCH`: specifies the branch used to build the CK flash-attention in `ROCm's flash-attention repo `_. The default is `ae7928c` -* `BUILD_TRITON`: specifies whether to build triton flash-attention. The default value is 1. - -Their values can be passed in when running ``docker build`` with ``--build-arg`` options. - - -To build vllm on ROCm 6.2 for MI200 and MI300 series, you can use the default: - -.. code-block:: console - - $ DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm . - -To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should specify ``BUILD_FA`` as below: - -.. code-block:: console - - $ DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm . - -To run the above docker image ``vllm-rocm``, use the below command: - -.. code-block:: console - - $ docker run -it \ - --network=host \ - --group-add=video \ - --ipc=host \ - --cap-add=SYS_PTRACE \ - --security-opt seccomp=unconfined \ - --device /dev/kfd \ - --device /dev/dri \ - -v :/app/model \ - vllm-rocm \ - bash - -Where the `` is the location where the model is stored, for example, the weights for llama2 or llama3 models. - - -.. _build_from_source_rocm: - -Option 2: Build from source ---------------------------- - -0. Install prerequisites (skip if you are already in an environment/docker with the following installed): - -- `ROCm `_ -- `PyTorch `_ - -For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0`, `rocm/pytorch-nightly`. - -Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch `Getting Started `_ - - -1. Install `Triton flash attention for ROCm `_ - -Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from `ROCm/triton `_ - - .. 
code-block:: console - - $ python3 -m pip install ninja cmake wheel pybind11 - $ pip uninstall -y triton - $ git clone https://github.com/OpenAI/triton.git - $ cd triton - $ git checkout e192dba - $ cd python - $ pip3 install . - $ cd ../.. - -.. note:: - - If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent. - - -2. Optionally, if you choose to use CK flash attention, you can install `flash attention for ROCm `_ - - -Install ROCm's flash attention (v2.5.9.post1) following the instructions from `ROCm/flash-attention `_ -Alternatively, wheels intended for vLLM use can be accessed under the releases. - -For example, for ROCm 6.2, suppose your gfx arch is `gfx90a`. -Note to get your gfx architecture, run `rocminfo |grep gfx`. - - .. code-block:: console - - $ git clone https://github.com/ROCm/flash-attention.git - $ cd flash-attention - $ git checkout 3cea2fb - $ git submodule update --init - $ GPU_ARCHS="gfx90a" python3 setup.py install - $ cd .. - -.. note:: - - You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) - -3. Build vLLM. - - For example, vLLM on ROCM 6.2 can be built with the following steps: - - .. code-block:: console - - $ pip install --upgrade pip - - $ # Install PyTorch - $ pip uninstall torch -y - $ pip install --no-cache-dir --pre torch==2.6.0.dev20240918 --index-url https://download.pytorch.org/whl/nightly/rocm6.2 - - $ # Build & install AMD SMI - $ pip install /opt/rocm/share/amd_smi - - $ # Install dependencies - $ pip install --upgrade numba scipy huggingface-hub[cli] - $ pip install "numpy<2" - $ pip install -r requirements-rocm.txt - - $ # Build vLLM for MI210/MI250/MI300. - $ export PYTORCH_ROCM_ARCH="gfx90a;gfx942" - $ python3 setup.py develop - - - This may take 5-10 minutes. Currently, :code:`pip install .` does not work for ROCm installation. - - -.. 
tip:: - - - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers. - - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support. - - To use CK flash-attention or PyTorch naive attention, please use this flag ``export VLLM_USE_TRITON_FLASH_ATTN=0`` to turn off triton flash attention. - - The ROCm version of PyTorch, ideally, should match the ROCm driver version. - - -.. tip:: - - For MI300x (gfx942) users, to achieve optimal performance, please refer to `MI300x tuning guide `_ for performance optimization and tuning tips on system and workflow level. - For vLLM, please refer to `vLLM performance optimization `_. - - diff --git a/docs/source/getting_started/arm-installation.md b/docs/source/getting_started/arm-installation.md new file mode 100644 index 0000000000000..51d6b2215cecd --- /dev/null +++ b/docs/source/getting_started/arm-installation.md @@ -0,0 +1,46 @@ +(installation-arm)= + +# Installation for ARM CPUs + +vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the x86 platform documentation covering: + +- CPU backend inference capabilities +- Relevant runtime environment variables +- Performance optimization tips + +ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes. +Contents: + +1. {ref}`Requirements ` +2. {ref}`Quick Start with Dockerfile ` +3. 
{ref}`Building from Source ` + +(arm-backend-requirements)= + +## Requirements + +- **Operating System**: Linux or macOS +- **Compiler**: gcc/g++ >= 12.3.0 (optional, but recommended) +- **Instruction Set Architecture (ISA)**: NEON support is required + +(arm-backend-quick-start-dockerfile)= + +## Quick Start with Dockerfile + +You can quickly set up vLLM on ARM using Docker: + +```console +$ docker build -f Dockerfile.arm -t vllm-cpu-env --shm-size=4g . +$ docker run -it \ + --rm \ + --network=host \ + --cpuset-cpus= \ + --cpuset-mems= \ + vllm-cpu-env +``` + +(build-arm-backend-from-source)= + +## Building from Source + +To build vLLM from source on Ubuntu 22.04 or other Linux distributions, follow a similar process as with x86. Testing has been conducted on AWS Graviton3 instances for compatibility. diff --git a/docs/source/getting_started/arm-installation.rst b/docs/source/getting_started/arm-installation.rst deleted file mode 100644 index 7b457df92c11d..0000000000000 --- a/docs/source/getting_started/arm-installation.rst +++ /dev/null @@ -1,50 +0,0 @@ -.. _installation_arm: - -Installation for ARM CPUs -========================= - -vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the x86 platform documentation covering: - -* CPU backend inference capabilities -* Relevant runtime environment variables -* Performance optimization tips - -ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes. -Contents: - -1. :ref:`Requirements ` -2. :ref:`Quick Start with Dockerfile ` -3. :ref:`Building from Source ` - -.. _arm_backend_requirements: - -Requirements ------------- - -* **Operating System**: Linux or macOS -* **Compiler**: gcc/g++ >= 12.3.0 (optional, but recommended) -* **Instruction Set Architecture (ISA)**: NEON support is required - -.. 
_arm_backend_quick_start_dockerfile: - -Quick Start with Dockerfile ---------------------------- - -You can quickly set up vLLM on ARM using Docker: - -.. code-block:: console - - $ docker build -f Dockerfile.arm -t vllm-cpu-env --shm-size=4g . - $ docker run -it \ - --rm \ - --network=host \ - --cpuset-cpus= \ - --cpuset-mems= \ - vllm-cpu-env - -.. _build_arm_backend_from_source: - -Building from Source --------------------- - -To build vLLM from source on Ubuntu 22.04 or other Linux distributions, follow a similar process as with x86. Testing has been conducted on AWS Graviton3 instances for compatibility. diff --git a/docs/source/getting_started/cpu-installation.md b/docs/source/getting_started/cpu-installation.md new file mode 100644 index 0000000000000..63be5275b6180 --- /dev/null +++ b/docs/source/getting_started/cpu-installation.md @@ -0,0 +1,154 @@ +(installation-cpu)= + +# Installation with CPU + +vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features: + +- Tensor Parallel +- Model Quantization (`INT8 W8A8, AWQ`) +- Chunked-prefill +- Prefix-caching +- FP8-E5M2 KV-Caching (TODO) + +Table of contents: + +1. {ref}`Requirements ` +2. {ref}`Quick start using Dockerfile ` +3. {ref}`Build from source ` +4. {ref}`Related runtime environment variables ` +5. {ref}`Intel Extension for PyTorch ` +6. {ref}`Performance tips ` + +(cpu-backend-requirements)= + +## Requirements + +- OS: Linux +- Compiler: gcc/g++>=12.3.0 (optional, recommended) +- Instruction set architecture (ISA) requirement: AVX512 (optional, recommended) + +(cpu-backend-quick-start-dockerfile)= + +## Quick start using Dockerfile + +```console +$ docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g . 
+$ docker run -it \
+             --rm \
+             --network=host \
+             --cpuset-cpus= \
+             --cpuset-mems= \
+             vllm-cpu-env
+```
+
+(build-cpu-backend-from-source)=
+
+## Build from source
+
+- First, install recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.04, you can run:
+
+```console
+$ sudo apt-get update -y
+$ sudo apt-get install -y gcc-12 g++-12 libnuma-dev
+$ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+```
+
+- Second, install Python packages for vLLM CPU backend building:
+
+```console
+$ pip install --upgrade pip
+$ pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy
+$ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
+```
+
+- Finally, build and install vLLM CPU backend:
+
+```console
+$ VLLM_TARGET_DEVICE=cpu python setup.py install
+```
+
+```{note}
+- AVX512_BF16 is an extension ISA that provides native BF16 data type conversion and vector product instructions, which brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16.
+- If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable VLLM_CPU_AVX512BF16=1 before the building.
+```
+
+(env-intro)=
+
+## Related runtime environment variables
+
+- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g., `VLLM_CPU_KVCACHE_SPACE=40` means 40 GB space for KV cache), larger setting will allow vLLM to run more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
+- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores.
`VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores.
+
+(ipex-guidance)=
+
+## Intel Extension for PyTorch
+
+- [Intel Extension for PyTorch (IPEX)](https://github.com/intel/intel-extension-for-pytorch) extends PyTorch with up-to-date features and optimizations for an extra performance boost on Intel hardware.
+
+(cpu-backend-performance-tips)=
+
+## Performance tips
+
+- We highly recommend using TCMalloc for high performance memory allocation and better cache locality. For example, on Ubuntu 22.04, you can run:
+
+```console
+$ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library
+$ find / -name *libtcmalloc* # find the dynamic link library path
+$ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
+$ python examples/offline_inference.py # run vLLM
+```
+
+- When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP:
+
+```console
+$ export VLLM_CPU_KVCACHE_SPACE=40
+$ export VLLM_CPU_OMP_THREADS_BIND=0-29
+$ vllm serve facebook/opt-125m
+```
+
+- If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using `VLLM_CPU_OMP_THREADS_BIND`. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores:
+
+```console
+$ lscpu -e # check the mapping between logical CPU cores and physical CPU cores
+
+# The "CPU" column means the logical CPU core IDs, and the "CORE" column means the physical core IDs. On this platform, two logical cores are sharing one physical core.
+CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ +0 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000 +1 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000 +2 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000 +3 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000 +4 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000 +5 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000 +6 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000 +7 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000 +8 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000 +9 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000 +10 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000 +11 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000 +12 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000 +13 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000 +14 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000 +15 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000 + +# On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15 +$ export VLLM_CPU_OMP_THREADS_BIND=0-7 +$ python examples/offline_inference.py +``` + +- If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross NUMA node memory access. + +## CPU Backend Considerations + +- The CPU backend significantly differs from the GPU backend since the vLLM architecture was originally optimized for GPU use. A number of optimizations are needed to enhance its performance. + +- Decouple the HTTP serving components from the inference components. In a GPU backend configuration, the HTTP serving and tokenization tasks operate on the CPU, while inference runs on the GPU, which typically does not pose a problem. However, in a CPU-based setup, the HTTP serving and tokenization can cause significant context switching and reduced cache efficiency. Therefore, it is strongly recommended to segregate these two components for improved performance. 
+
+- On CPU based setup with NUMA enabled, the memory access performance may be largely impacted by the [topology](https://github.com/intel/intel-extension-for-pytorch/blob/main/docs/tutorials/performance_tuning/tuning_guide.md#non-uniform-memory-access-numa). For NUMA architecture, two optimizations are recommended: Tensor Parallel or Data Parallel.
+
+  - Using Tensor Parallel for a latency-constrained deployment: following GPU backend design, a Megatron-LM's parallel algorithm will be used to shard the model, based on the number of NUMA nodes (e.g. TP = 2 for a two NUMA node system). With [TP feature on CPU](https://github.com/vllm-project/vllm/pull/6125) merged, Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. Below is the example script to enable Tensor Parallel = 2 for serving:
+
+    ```console
+    $ VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp
+    ```
+
+  - Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](../serving/deploying_with_nginx.html) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md).
\ No newline at end of file
diff --git a/docs/source/getting_started/cpu-installation.rst b/docs/source/getting_started/cpu-installation.rst
deleted file mode 100644
index 649de1cd9b53c..0000000000000
--- a/docs/source/getting_started/cpu-installation.rst
+++ /dev/null
@@ -1,164 +0,0 @@
-..
_installation_cpu: - -Installation with CPU -======================== - -vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features: - -- Tensor Parallel -- Model Quantization (``INT8 W8A8, AWQ``) -- Chunked-prefill -- Prefix-caching -- FP8-E5M2 KV-Caching (TODO) - -Table of contents: - -#. :ref:`Requirements ` -#. :ref:`Quick start using Dockerfile ` -#. :ref:`Build from source ` -#. :ref:`Related runtime environment variables ` -#. :ref:`Intel Extension for PyTorch ` -#. :ref:`Performance tips ` - -.. _cpu_backend_requirements: - -Requirements ------------- - -* OS: Linux -* Compiler: gcc/g++>=12.3.0 (optional, recommended) -* Instruction set architecture (ISA) requirement: AVX512 (optional, recommended) - -.. _cpu_backend_quick_start_dockerfile: - -Quick start using Dockerfile ----------------------------- - -.. code-block:: console - - $ docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g . - $ docker run -it \ - --rm \ - --network=host \ - --cpuset-cpus= \ - --cpuset-mems= \ - vllm-cpu-env - -.. _build_cpu_backend_from_source: - -Build from source ------------------ - -- First, install recommended compiler. We recommend to use ``gcc/g++ >= 12.3.0`` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run: - -.. code-block:: console - - $ sudo apt-get update -y - $ sudo apt-get install -y gcc-12 g++-12 libnuma-dev - $ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 - -- Second, install Python packages for vLLM CPU backend building: - -.. code-block:: console - - $ pip install --upgrade pip - $ pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy - $ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu - -- Finally, build and install vLLM CPU backend: - -.. 
code-block:: console - - $ VLLM_TARGET_DEVICE=cpu python setup.py install - -.. note:: - - AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, will brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16. - - - If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable VLLM_CPU_AVX512BF16=1 before the building. - -.. _env_intro: - -Related runtime environment variables -------------------------------------- - -- ``VLLM_CPU_KVCACHE_SPACE``: specify the KV Cache size (e.g, ``VLLM_CPU_KVCACHE_SPACE=40`` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. - -- ``VLLM_CPU_OMP_THREADS_BIND``: specify the CPU cores dedicated to the OpenMP threads. For example, ``VLLM_CPU_OMP_THREADS_BIND=0-31`` means there will be 32 OpenMP threads bound on 0-31 CPU cores. ``VLLM_CPU_OMP_THREADS_BIND=0-31|32-63`` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. - -.. _ipex_guidance: - -Intel Extension for PyTorch ---------------------------- - -- `Intel Extension for PyTorch (IPEX) `_ extends PyTorch with up-to-date features optimizations for an extra performance boost on Intel hardware. - -.. _cpu_backend_performance_tips: - -Performance tips ------------------ - -- We highly recommend to use TCMalloc for high performance memory allocation and better cache locality. For example, on Ubuntu 22.4, you can run: - -.. 
code-block:: console - - $ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library - $ find / -name *libtcmalloc* # find the dynamic link library path - $ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD - $ python examples/offline_inference.py # run vLLM - -- When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP: - -.. code-block:: console - - $ export VLLM_CPU_KVCACHE_SPACE=40 - $ export VLLM_CPU_OMP_THREADS_BIND=0-29 - $ vllm serve facebook/opt-125m - -- If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using ``VLLM_CPU_OMP_THREADS_BIND``. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores: - -.. code-block:: console - - $ lscpu -e # check the mapping between logical CPU cores and physical CPU cores - - # The "CPU" column means the logical CPU core IDs, and the "CORE" column means the physical core IDs. On this platform, two logical cores are sharing one physical core. 
- CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ - 0 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000 - 1 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000 - 2 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000 - 3 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000 - 4 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000 - 5 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000 - 6 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000 - 7 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000 - 8 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000 - 9 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000 - 10 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000 - 11 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000 - 12 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000 - 13 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000 - 14 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000 - 15 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000 - - # On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15 - $ export VLLM_CPU_OMP_THREADS_BIND=0-7 - $ python examples/offline_inference.py - -- If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using ``VLLM_CPU_OMP_THREADS_BIND`` to avoid cross NUMA node memory access. - -CPU Backend Considerations --------------------------- - -- The CPU backend significantly differs from the GPU backend since the vLLM architecture was originally optimized for GPU use. A number of optimizations are needed to enhance its performance. - -- Decouple the HTTP serving components from the inference components. In a GPU backend configuration, the HTTP serving and tokenization tasks operate on the CPU, while inference runs on the GPU, which typically does not pose a problem. However, in a CPU-based setup, the HTTP serving and tokenization can cause significant context switching and reduced cache efficiency. Therefore, it is strongly recommended to segregate these two components for improved performance. 
- -- On CPU based setup with NUMA enabled, the memory access performance may be largely impacted by the `topology `_. For NUMA architecture, two optimizations are to recommended: Tensor Parallel or Data Parallel. - - * Using Tensor Parallel for a latency constraints deployment: following GPU backend design, a Megatron-LM's parallel algorithm will be used to shard the model, based on the number of NUMA nodes (e.g. TP = 2 for a two NUMA node system). With `TP feature on CPU `_ merged, Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. Below is the example script to enable Tensor Parallel = 2 for serving: - - .. code-block:: console - - $ VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp - - - * Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like `Nginx <../serving/deploying_with_nginx.html>`_ or HAProxy are recommended. Anyscale Ray project provides the feature on LLM `serving `_. Here is the example to setup a scalable LLM serving with `Ray Serve `_. \ No newline at end of file diff --git a/docs/source/getting_started/debugging.md b/docs/source/getting_started/debugging.md new file mode 100644 index 0000000000000..b0566478a2fac --- /dev/null +++ b/docs/source/getting_started/debugging.md @@ -0,0 +1,139 @@ +(debugging)= + +# Debugging Tips + +This document outlines some debugging strategies you can consider. If you think you've discovered a bug, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. 
+ +```{note} +Once you've debugged a problem, remember to turn off any debugging environment variables defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionalities left activated. +``` + +## Hangs downloading a model + +If the model isn't already downloaded to disk, vLLM will download it from the internet which can take time and depend on your internet connection. +It's recommended to download the model first using the [huggingface-cli](https://huggingface.co/docs/huggingface_hub/en/guides/cli) and passing the local path to the model to vLLM. This way, you can isolate the issue. + +## Hangs loading a model from disk + +If the model is large, it can take a long time to load it from disk. Pay attention to where you store the model. Some clusters have shared filesystems across nodes, e.g. a distributed filesystem or a network filesystem, which can be slow. +It'd be better to store the model in a local disk. Additionally, have a look at the CPU memory usage, when the model is too large it might take a lot of CPU memory, slowing down the operating system because it needs to frequently swap between disk and memory. + +```{note} +To isolate the model downloading and loading issue, you can use the `--load-format dummy` argument to skip loading the model weights. This way, you can check if the model downloading and loading is the bottleneck. +``` + +## Model is too large + +If the model is too large to fit in a single GPU, you might want to [consider tensor parallelism](https://docs.vllm.ai/en/latest/serving/distributed_serving.html#distributed-inference-and-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). 
You can convert the model checkpoint to a sharded checkpoint using [this example](https://docs.vllm.ai/en/latest/getting_started/examples/save_sharded_state.html) . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. + +## Enable more logging + +If other strategies don't solve the problem, it's likely that the vLLM instance is stuck somewhere. You can use the following environment variables to help debug the issue: + +- `export VLLM_LOGGING_LEVEL=DEBUG` to turn on more logging. +- `export CUDA_LAUNCH_BLOCKING=1` to identify which CUDA kernel is causing the problem. +- `export NCCL_DEBUG=TRACE` to turn on more logging for NCCL. +- `export VLLM_TRACE_FUNCTION=1` to record all function calls for inspection in the log files to tell which function crashes or hangs. + +## Incorrect network setup + +The vLLM instance cannot get the correct IP address if you have a complicated network config. You can find a log such as `DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl` and the IP address should be the correct one. +If it's not, override the IP address using the environment variable `export VLLM_HOST_IP=`. + +You might also need to set `export NCCL_SOCKET_IFNAME=` and `export GLOO_SOCKET_IFNAME=` to specify the network interface for the IP address. + +## Error near `self.graph.replay()` + +If vLLM crashes and the error trace captures it somewhere around `self.graph.replay()` in `vllm/worker/model_runner.py`, it is a CUDA error inside CUDAGraph. +To identify the particular CUDA operation that causes the error, you can add `--enforce-eager` to the command line, or `enforce_eager=True` to the {class}`~vllm.LLM` class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error. 
+ +## Incorrect hardware/driver + +If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly. + +```python +# Test PyTorch NCCL +import torch +import torch.distributed as dist +dist.init_process_group(backend="nccl") +local_rank = dist.get_rank() % torch.cuda.device_count() +torch.cuda.set_device(local_rank) +data = torch.FloatTensor([1,] * 128).to("cuda") +dist.all_reduce(data, op=dist.ReduceOp.SUM) +torch.cuda.synchronize() +value = data.mean().item() +world_size = dist.get_world_size() +assert value == world_size, f"Expected {world_size}, got {value}" + +print("PyTorch NCCL is successful!") + +# Test PyTorch GLOO +gloo_group = dist.new_group(ranks=list(range(world_size)), backend="gloo") +cpu_data = torch.FloatTensor([1,] * 128) +dist.all_reduce(cpu_data, op=dist.ReduceOp.SUM, group=gloo_group) +value = cpu_data.mean().item() +assert value == world_size, f"Expected {world_size}, got {value}" + +print("PyTorch GLOO is successful!") + +if world_size <= 1: + exit() + +# Test vLLM NCCL, with cuda graph +from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator + +pynccl = PyNcclCommunicator(group=gloo_group, device=local_rank) + +s = torch.cuda.Stream() +with torch.cuda.stream(s): + data.fill_(1) + pynccl.all_reduce(data, stream=s) + value = data.mean().item() + assert value == world_size, f"Expected {world_size}, got {value}" + +print("vLLM NCCL is successful!") + +g = torch.cuda.CUDAGraph() +with torch.cuda.graph(cuda_graph=g, stream=s): + pynccl.all_reduce(data, stream=torch.cuda.current_stream()) + +data.fill_(1) +g.replay() +torch.cuda.current_stream().synchronize() +value = data.mean().item() +assert value == world_size, f"Expected {world_size}, got {value}" + +print("vLLM NCCL with cuda graph is successful!") + +dist.destroy_process_group(gloo_group) +dist.destroy_process_group() +``` + +If you are testing with a 
single node, adjust `--nproc-per-node` to the number of GPUs you want to use:
+
+```console
+$ NCCL_DEBUG=TRACE torchrun --nproc-per-node= test.py
+```
+
+If you are testing with multi-nodes, adjust `--nproc-per-node` and `--nnodes` according to your setup and set `MASTER_ADDR` to the correct IP address of the master node, reachable from all nodes. Then, run:
+
+```console
+$ NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py
+```
+
+If the script runs successfully, you should see a success message printed for each check, ending with `vLLM NCCL with cuda graph is successful!`.
+
+If the test script hangs or crashes, usually it means the hardware/drivers are broken in some sense. You should try to contact your system administrator or hardware vendor for further assistance. As a common workaround, you can try to tune some NCCL environment variables, such as `export NCCL_P2P_DISABLE=1` to see if it helps. Please check [their documentation](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html) for more information. Please only use these environment variables as a temporary workaround, as they might affect the performance of the system. The best solution is still to fix the hardware/drivers so that the test script can run successfully.
+
+```{note}
+A multi-node environment is more complicated than a single-node one. If you see errors such as `torch.distributed.DistNetworkError`, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign node rank and specify the IP via command line arguments:
+
+- In the first node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 0 --master_addr $MASTER_ADDR test.py`.
+- In the second node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 1 --master_addr $MASTER_ADDR test.py`.
+ +Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup, being sure to execute different commands (with different `--node-rank`) on different nodes. +``` + +## Known Issues + +- In `v0.5.2`, `v0.5.3`, and `v0.5.3.post1`, there is a bug caused by [zmq](https://github.com/zeromq/pyzmq/issues/2000) , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of `vllm` to include the [fix](https://github.com/vllm-project/vllm/pull/6759). diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst deleted file mode 100644 index 0c1afcbd7c0b9..0000000000000 --- a/docs/source/getting_started/debugging.rst +++ /dev/null @@ -1,141 +0,0 @@ -.. _debugging: - -=============== -Debugging Tips -=============== - -This document outlines some debugging strategies you can consider. If you think you've discovered a bug, please `search existing issues `_ first to see if it has already been reported. If not, please `file a new issue `_, providing as much relevant information as possible. - -.. note:: - - Once you've debugged a problem, remember to turn off any debugging environment variables defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionalities left activated. - -Hangs downloading a model ----------------------------------------- -If the model isn't already downloaded to disk, vLLM will download it from the internet which can take time and depend on your internet connection. -It's recommended to download the model first using the `huggingface-cli `_ and passing the local path to the model to vLLM. This way, you can isolate the issue. - -Hangs loading a model from disk ----------------------------------------- -If the model is large, it can take a long time to load it from disk. Pay attention to where you store the model. 
Some clusters have shared filesystems across nodes, e.g. a distributed filesystem or a network filesystem, which can be slow. -It'd be better to store the model in a local disk. Additionally, have a look at the CPU memory usage, when the model is too large it might take a lot of CPU memory, slowing down the operating system because it needs to frequently swap between disk and memory. - -.. note:: - - To isolate the model downloading and loading issue, you can use the ``--load-format dummy`` argument to skip loading the model weights. This way, you can check if the model downloading and loading is the bottleneck. - -Model is too large ----------------------------------------- -If the model is too large to fit in a single GPU, you might want to `consider tensor parallelism `_ to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using `this example `_ . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. - -Enable more logging ----------------------------------------- -If other strategies don't solve the problem, it's likely that the vLLM instance is stuck somewhere. You can use the following environment variables to help debug the issue: - -- ``export VLLM_LOGGING_LEVEL=DEBUG`` to turn on more logging. -- ``export CUDA_LAUNCH_BLOCKING=1`` to identify which CUDA kernel is causing the problem. -- ``export NCCL_DEBUG=TRACE`` to turn on more logging for NCCL. -- ``export VLLM_TRACE_FUNCTION=1`` to record all function calls for inspection in the log files to tell which function crashes or hangs. 
- -Incorrect network setup ----------------------------------------- -The vLLM instance cannot get the correct IP address if you have a complicated network config. You can find a log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl`` and the IP address should be the correct one. -If it's not, override the IP address using the environment variable ``export VLLM_HOST_IP=``. - -You might also need to set ``export NCCL_SOCKET_IFNAME=`` and ``export GLOO_SOCKET_IFNAME=`` to specify the network interface for the IP address. - -Error near ``self.graph.replay()`` ----------------------------------------- -If vLLM crashes and the error trace captures it somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a CUDA error inside CUDAGraph. -To identify the particular CUDA operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the :class:`~vllm.LLM` class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error. - -Incorrect hardware/driver ----------------------------------------- -If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly. - -.. 
code-block:: python - - # Test PyTorch NCCL - import torch - import torch.distributed as dist - dist.init_process_group(backend="nccl") - local_rank = dist.get_rank() % torch.cuda.device_count() - torch.cuda.set_device(local_rank) - data = torch.FloatTensor([1,] * 128).to("cuda") - dist.all_reduce(data, op=dist.ReduceOp.SUM) - torch.cuda.synchronize() - value = data.mean().item() - world_size = dist.get_world_size() - assert value == world_size, f"Expected {world_size}, got {value}" - - print("PyTorch NCCL is successful!") - - # Test PyTorch GLOO - gloo_group = dist.new_group(ranks=list(range(world_size)), backend="gloo") - cpu_data = torch.FloatTensor([1,] * 128) - dist.all_reduce(cpu_data, op=dist.ReduceOp.SUM, group=gloo_group) - value = cpu_data.mean().item() - assert value == world_size, f"Expected {world_size}, got {value}" - - print("PyTorch GLOO is successful!") - - if world_size <= 1: - exit() - - # Test vLLM NCCL, with cuda graph - from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator - - pynccl = PyNcclCommunicator(group=gloo_group, device=local_rank) - - s = torch.cuda.Stream() - with torch.cuda.stream(s): - data.fill_(1) - pynccl.all_reduce(data, stream=s) - value = data.mean().item() - assert value == world_size, f"Expected {world_size}, got {value}" - - print("vLLM NCCL is successful!") - - g = torch.cuda.CUDAGraph() - with torch.cuda.graph(cuda_graph=g, stream=s): - pynccl.all_reduce(data, stream=torch.cuda.current_stream()) - - data.fill_(1) - g.replay() - torch.cuda.current_stream().synchronize() - value = data.mean().item() - assert value == world_size, f"Expected {world_size}, got {value}" - - print("vLLM NCCL with cuda graph is successful!") - - dist.destroy_process_group(gloo_group) - dist.destroy_process_group() - -If you are testing with a single node, adjust ``--nproc-per-node`` to the number of GPUs you want to use: - -.. 
code-block:: console - - $ NCCL_DEBUG=TRACE torchrun --nproc-per-node= test.py - -If you are testing with multi-nodes, adjust ``--nproc-per-node`` and ``--nnodes`` according to your setup and set ``MASTER_ADDR`` to the correct IP address of the master node, reachable from all nodes. Then, run: - -.. code-block:: console - - $ NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py - -If the script runs successfully, you should see the message ``sanity check is successful!``. - -If the test script hangs or crashes, usually it means the hardware/drivers are broken in some sense. You should try to contact your system administrator or hardware vendor for further assistance. As a common workaround, you can try to tune some NCCL environment variables, such as ``export NCCL_P2P_DISABLE=1`` to see if it helps. Please check `their documentation `__ for more information. Please only use these environment variables as a temporary workaround, as they might affect the performance of the system. The best solution is still to fix the hardware/drivers so that the test script can run successfully. - -.. note:: - - A multi-node environment is more complicated than a single-node one. If you see errors such as ``torch.distributed.DistNetworkError``, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign node rank and specify the IP via command line arguments: - - - In the first node, run ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 0 --master_addr $MASTER_ADDR test.py``. - - In the second node, run ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 1 --master_addr $MASTER_ADDR test.py``. - - Adjust ``--nproc-per-node``, ``--nnodes``, and ``--node-rank`` according to your setup, being sure to execute different commands (with different ``--node-rank``) on different nodes. 
- -Known Issues ----------------------------------------- -- In ``v0.5.2``, ``v0.5.3``, and ``v0.5.3.post1``, there is a bug caused by `zmq `_ , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of ``vllm`` to include the `fix `_. diff --git a/docs/source/getting_started/examples/api_client.md b/docs/source/getting_started/examples/api_client.md new file mode 100644 index 0000000000000..925f74cdc7eb4 --- /dev/null +++ b/docs/source/getting_started/examples/api_client.md @@ -0,0 +1,8 @@ +# API Client + +Source . + +```{literalinclude} ../../../../examples/api_client.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/examples/aqlm_example.md b/docs/source/getting_started/examples/aqlm_example.md new file mode 100644 index 0000000000000..51ed19a3c61fd --- /dev/null +++ b/docs/source/getting_started/examples/aqlm_example.md @@ -0,0 +1,8 @@ +# Aqlm Example + +Source . + +```{literalinclude} ../../../../examples/aqlm_example.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/examples/cpu_offload.md b/docs/source/getting_started/examples/cpu_offload.md new file mode 100644 index 0000000000000..f5b0d58b743a2 --- /dev/null +++ b/docs/source/getting_started/examples/cpu_offload.md @@ -0,0 +1,8 @@ +# Cpu Offload + +Source . 
+ +```{literalinclude} ../../../../examples/cpu_offload.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/examples/examples_index.md b/docs/source/getting_started/examples/examples_index.md new file mode 100644 index 0000000000000..d074aae3edb8c --- /dev/null +++ b/docs/source/getting_started/examples/examples_index.md @@ -0,0 +1,44 @@ +# Examples + +```{toctree} +:caption: Scripts +:maxdepth: 1 + +api_client +aqlm_example +cpu_offload +florence2_inference +gguf_inference +gradio_openai_chatbot_webserver +gradio_webserver +llm_engine_example +lora_with_quantization_inference +multilora_inference +offline_chat_with_tools +offline_inference +offline_inference_arctic +offline_inference_audio_language +offline_inference_chat +offline_inference_distributed +offline_inference_embedding +offline_inference_encoder_decoder +offline_inference_mlpspeculator +offline_inference_neuron +offline_inference_neuron_int8_quantization +offline_inference_pixtral +offline_inference_tpu +offline_inference_vision_language +offline_inference_vision_language_embedding +offline_inference_vision_language_multi_image +offline_inference_with_prefix +offline_inference_with_profiler +offline_profile +openai_chat_completion_client +openai_chat_completion_client_for_multimodal +openai_chat_completion_client_with_tools +openai_chat_embedding_client_for_multimodal +openai_completion_client +openai_embedding_client +save_sharded_state +tensorize_vllm_model +``` diff --git a/docs/source/getting_started/examples/examples_index.template.md b/docs/source/getting_started/examples/examples_index.template.md new file mode 100644 index 0000000000000..f8e57d9e3d64e --- /dev/null +++ b/docs/source/getting_started/examples/examples_index.template.md @@ -0,0 +1,8 @@ +# Examples + +```{toctree} +:caption: Scripts +:maxdepth: 1 + +%EXAMPLE_DOCS% +``` diff --git a/docs/source/getting_started/examples/examples_index.template.rst 
b/docs/source/getting_started/examples/examples_index.template.rst deleted file mode 100644 index 1b34cccbae15a..0000000000000 --- a/docs/source/getting_started/examples/examples_index.template.rst +++ /dev/null @@ -1,8 +0,0 @@ -Examples -================================= - -.. toctree:: - :maxdepth: 1 - :caption: Scripts - - %EXAMPLE_DOCS% diff --git a/docs/source/getting_started/examples/florence2_inference.md b/docs/source/getting_started/examples/florence2_inference.md new file mode 100644 index 0000000000000..3805648736b7c --- /dev/null +++ b/docs/source/getting_started/examples/florence2_inference.md @@ -0,0 +1,8 @@ +# Florence2 Inference + +Source . + +```{literalinclude} ../../../../examples/florence2_inference.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/examples/gguf_inference.md b/docs/source/getting_started/examples/gguf_inference.md new file mode 100644 index 0000000000000..96d6da400f4f6 --- /dev/null +++ b/docs/source/getting_started/examples/gguf_inference.md @@ -0,0 +1,8 @@ +# Gguf Inference + +Source . + +```{literalinclude} ../../../../examples/gguf_inference.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/examples/gradio_openai_chatbot_webserver.md b/docs/source/getting_started/examples/gradio_openai_chatbot_webserver.md new file mode 100644 index 0000000000000..926c10d95efe1 --- /dev/null +++ b/docs/source/getting_started/examples/gradio_openai_chatbot_webserver.md @@ -0,0 +1,8 @@ +# Gradio OpenAI Chatbot Webserver + +Source . + +```{literalinclude} ../../../../examples/gradio_openai_chatbot_webserver.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/examples/gradio_webserver.md b/docs/source/getting_started/examples/gradio_webserver.md new file mode 100644 index 0000000000000..c5f8f7a739da6 --- /dev/null +++ b/docs/source/getting_started/examples/gradio_webserver.md @@ -0,0 +1,8 @@ +# Gradio Webserver + +Source . 
+ +```{literalinclude} ../../../../examples/gradio_webserver.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/examples/llm_engine_example.md b/docs/source/getting_started/examples/llm_engine_example.md new file mode 100644 index 0000000000000..909b730a5b143 --- /dev/null +++ b/docs/source/getting_started/examples/llm_engine_example.md @@ -0,0 +1,8 @@ +# LLM Engine Example + +Source . + +```{literalinclude} ../../../../examples/llm_engine_example.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/examples/lora_with_quantization_inference.md b/docs/source/getting_started/examples/lora_with_quantization_inference.md new file mode 100644 index 0000000000000..ba9af0bf3aa1e --- /dev/null +++ b/docs/source/getting_started/examples/lora_with_quantization_inference.md @@ -0,0 +1,8 @@ +# Lora With Quantization Inference + +Source . + +```{literalinclude} ../../../../examples/lora_with_quantization_inference.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/examples/multilora_inference.md b/docs/source/getting_started/examples/multilora_inference.md new file mode 100644 index 0000000000000..8c5213c238519 --- /dev/null +++ b/docs/source/getting_started/examples/multilora_inference.md @@ -0,0 +1,8 @@ +# MultiLoRA Inference + +Source . + +```{literalinclude} ../../../../examples/multilora_inference.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/examples/offline_chat_with_tools.md b/docs/source/getting_started/examples/offline_chat_with_tools.md new file mode 100644 index 0000000000000..dc126d1efc522 --- /dev/null +++ b/docs/source/getting_started/examples/offline_chat_with_tools.md @@ -0,0 +1,8 @@ +# Offline Chat With Tools + +Source . 
+ +```{literalinclude} ../../../../examples/offline_chat_with_tools.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/examples/offline_inference.md b/docs/source/getting_started/examples/offline_inference.md new file mode 100644 index 0000000000000..20c8e848089a1 --- /dev/null +++ b/docs/source/getting_started/examples/offline_inference.md @@ -0,0 +1,8 @@ +# Offline Inference + +Source . + +```{literalinclude} ../../../../examples/offline_inference.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/examples/offline_inference_arctic.md b/docs/source/getting_started/examples/offline_inference_arctic.md new file mode 100644 index 0000000000000..4ec02315dc0b1 --- /dev/null +++ b/docs/source/getting_started/examples/offline_inference_arctic.md @@ -0,0 +1,8 @@ +# Offline Inference Arctic + +Source . + +```{literalinclude} ../../../../examples/offline_inference_arctic.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/examples/offline_inference_audio_language.md b/docs/source/getting_started/examples/offline_inference_audio_language.md new file mode 100644 index 0000000000000..fc14e4d6c6f60 --- /dev/null +++ b/docs/source/getting_started/examples/offline_inference_audio_language.md @@ -0,0 +1,8 @@ +# Offline Inference Audio Language + +Source . + +```{literalinclude} ../../../../examples/offline_inference_audio_language.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/examples/offline_inference_chat.md b/docs/source/getting_started/examples/offline_inference_chat.md new file mode 100644 index 0000000000000..46f6eb4faa84c --- /dev/null +++ b/docs/source/getting_started/examples/offline_inference_chat.md @@ -0,0 +1,8 @@ +# Offline Inference Chat + +Source . 
+ +```{literalinclude} ../../../../examples/offline_inference_chat.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/examples/offline_inference_distributed.md b/docs/source/getting_started/examples/offline_inference_distributed.md new file mode 100644 index 0000000000000..e9c07c6fd9877 --- /dev/null +++ b/docs/source/getting_started/examples/offline_inference_distributed.md @@ -0,0 +1,8 @@ +# Offline Inference Distributed + +Source . + +```{literalinclude} ../../../../examples/offline_inference_distributed.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/examples/offline_inference_embedding.md b/docs/source/getting_started/examples/offline_inference_embedding.md new file mode 100644 index 0000000000000..ea4a37a83714a --- /dev/null +++ b/docs/source/getting_started/examples/offline_inference_embedding.md @@ -0,0 +1,8 @@ +# Offline Inference Embedding + +Source . + +```{literalinclude} ../../../../examples/offline_inference_embedding.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/examples/offline_inference_encoder_decoder.md b/docs/source/getting_started/examples/offline_inference_encoder_decoder.md new file mode 100644 index 0000000000000..f18a6eba70796 --- /dev/null +++ b/docs/source/getting_started/examples/offline_inference_encoder_decoder.md @@ -0,0 +1,8 @@ +# Offline Inference Encoder Decoder + +Source . + +```{literalinclude} ../../../../examples/offline_inference_encoder_decoder.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/examples/offline_inference_mlpspeculator.md b/docs/source/getting_started/examples/offline_inference_mlpspeculator.md new file mode 100644 index 0000000000000..27481d5b6f9e2 --- /dev/null +++ b/docs/source/getting_started/examples/offline_inference_mlpspeculator.md @@ -0,0 +1,8 @@ +# Offline Inference Mlpspeculator + +Source . 
+ +```{literalinclude} ../../../../examples/offline_inference_mlpspeculator.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/examples/offline_inference_neuron.md b/docs/source/getting_started/examples/offline_inference_neuron.md new file mode 100644 index 0000000000000..943c9da23d936 --- /dev/null +++ b/docs/source/getting_started/examples/offline_inference_neuron.md @@ -0,0 +1,8 @@ +# Offline Inference Neuron + +Source . + +```{literalinclude} ../../../../examples/offline_inference_neuron.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/examples/offline_inference_neuron_int8_quantization.md b/docs/source/getting_started/examples/offline_inference_neuron_int8_quantization.md new file mode 100644 index 0000000000000..69992f85c4f46 --- /dev/null +++ b/docs/source/getting_started/examples/offline_inference_neuron_int8_quantization.md @@ -0,0 +1,8 @@ +# Offline Inference Neuron Int8 Quantization + +Source . + +```{literalinclude} ../../../../examples/offline_inference_neuron_int8_quantization.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/examples/offline_inference_pixtral.md b/docs/source/getting_started/examples/offline_inference_pixtral.md new file mode 100644 index 0000000000000..7d141c2894d7f --- /dev/null +++ b/docs/source/getting_started/examples/offline_inference_pixtral.md @@ -0,0 +1,8 @@ +# Offline Inference Pixtral + +Source . + +```{literalinclude} ../../../../examples/offline_inference_pixtral.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/examples/offline_inference_tpu.md b/docs/source/getting_started/examples/offline_inference_tpu.md new file mode 100644 index 0000000000000..ad7a5482ac67b --- /dev/null +++ b/docs/source/getting_started/examples/offline_inference_tpu.md @@ -0,0 +1,8 @@ +# Offline Inference Tpu + +Source . 
+ +```{literalinclude} ../../../../examples/offline_inference_tpu.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/examples/offline_inference_vision_language.md b/docs/source/getting_started/examples/offline_inference_vision_language.md new file mode 100644 index 0000000000000..ce4549c8f47b1 --- /dev/null +++ b/docs/source/getting_started/examples/offline_inference_vision_language.md @@ -0,0 +1,8 @@ +# Offline Inference Vision Language + +Source . + +```{literalinclude} ../../../../examples/offline_inference_vision_language.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/examples/offline_inference_vision_language_embedding.md b/docs/source/getting_started/examples/offline_inference_vision_language_embedding.md new file mode 100644 index 0000000000000..ef17dcbff8fa6 --- /dev/null +++ b/docs/source/getting_started/examples/offline_inference_vision_language_embedding.md @@ -0,0 +1,8 @@ +# Offline Inference Vision Language Embedding + +Source . + +```{literalinclude} ../../../../examples/offline_inference_vision_language_embedding.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/examples/offline_inference_vision_language_multi_image.md b/docs/source/getting_started/examples/offline_inference_vision_language_multi_image.md new file mode 100644 index 0000000000000..c6f38ac8329bc --- /dev/null +++ b/docs/source/getting_started/examples/offline_inference_vision_language_multi_image.md @@ -0,0 +1,8 @@ +# Offline Inference Vision Language Multi Image + +Source . 
+ +```{literalinclude} ../../../../examples/offline_inference_vision_language_multi_image.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/examples/offline_inference_with_prefix.md b/docs/source/getting_started/examples/offline_inference_with_prefix.md new file mode 100644 index 0000000000000..8c7bff2cc649a --- /dev/null +++ b/docs/source/getting_started/examples/offline_inference_with_prefix.md @@ -0,0 +1,8 @@ +# Offline Inference With Prefix + +Source . + +```{literalinclude} ../../../../examples/offline_inference_with_prefix.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/examples/offline_inference_with_profiler.md b/docs/source/getting_started/examples/offline_inference_with_profiler.md new file mode 100644 index 0000000000000..cf4b5ae7d1336 --- /dev/null +++ b/docs/source/getting_started/examples/offline_inference_with_profiler.md @@ -0,0 +1,8 @@ +# Offline Inference With Profiler + +Source . + +```{literalinclude} ../../../../examples/offline_inference_with_profiler.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/examples/offline_profile.md b/docs/source/getting_started/examples/offline_profile.md new file mode 100644 index 0000000000000..c035fd8180a1f --- /dev/null +++ b/docs/source/getting_started/examples/offline_profile.md @@ -0,0 +1,8 @@ +# Offline Profile + +Source . + +```{literalinclude} ../../../../examples/offline_profile.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/examples/openai_chat_completion_client.md b/docs/source/getting_started/examples/openai_chat_completion_client.md new file mode 100644 index 0000000000000..62527126ea015 --- /dev/null +++ b/docs/source/getting_started/examples/openai_chat_completion_client.md @@ -0,0 +1,8 @@ +# OpenAI Chat Completion Client + +Source . 
+ +```{literalinclude} ../../../../examples/openai_chat_completion_client.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/examples/openai_chat_completion_client_for_multimodal.md b/docs/source/getting_started/examples/openai_chat_completion_client_for_multimodal.md new file mode 100644 index 0000000000000..7e10bcd402258 --- /dev/null +++ b/docs/source/getting_started/examples/openai_chat_completion_client_for_multimodal.md @@ -0,0 +1,8 @@ +# OpenAI Chat Completion Client For Multimodal + +Source . + +```{literalinclude} ../../../../examples/openai_chat_completion_client_for_multimodal.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/examples/openai_chat_completion_client_with_tools.md b/docs/source/getting_started/examples/openai_chat_completion_client_with_tools.md new file mode 100644 index 0000000000000..699b66cbf9878 --- /dev/null +++ b/docs/source/getting_started/examples/openai_chat_completion_client_with_tools.md @@ -0,0 +1,8 @@ +# OpenAI Chat Completion Client With Tools + +Source . + +```{literalinclude} ../../../../examples/openai_chat_completion_client_with_tools.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/examples/openai_chat_embedding_client_for_multimodal.md b/docs/source/getting_started/examples/openai_chat_embedding_client_for_multimodal.md new file mode 100644 index 0000000000000..ee4496e851b3d --- /dev/null +++ b/docs/source/getting_started/examples/openai_chat_embedding_client_for_multimodal.md @@ -0,0 +1,8 @@ +# OpenAI Chat Embedding Client For Multimodal + +Source . 
+ +```{literalinclude} ../../../../examples/openai_chat_embedding_client_for_multimodal.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/examples/openai_completion_client.md b/docs/source/getting_started/examples/openai_completion_client.md new file mode 100644 index 0000000000000..d0e6fb1e26e9c --- /dev/null +++ b/docs/source/getting_started/examples/openai_completion_client.md @@ -0,0 +1,8 @@ +# OpenAI Completion Client + +Source . + +```{literalinclude} ../../../../examples/openai_completion_client.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/examples/openai_embedding_client.md b/docs/source/getting_started/examples/openai_embedding_client.md new file mode 100644 index 0000000000000..730e4656f2c81 --- /dev/null +++ b/docs/source/getting_started/examples/openai_embedding_client.md @@ -0,0 +1,8 @@ +# OpenAI Embedding Client + +Source . + +```{literalinclude} ../../../../examples/openai_embedding_client.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/examples/save_sharded_state.md b/docs/source/getting_started/examples/save_sharded_state.md new file mode 100644 index 0000000000000..acaf64f9f2ab2 --- /dev/null +++ b/docs/source/getting_started/examples/save_sharded_state.md @@ -0,0 +1,8 @@ +# Save Sharded State + +Source . + +```{literalinclude} ../../../../examples/save_sharded_state.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/examples/tensorize_vllm_model.md b/docs/source/getting_started/examples/tensorize_vllm_model.md new file mode 100644 index 0000000000000..5ceb8ab492f0a --- /dev/null +++ b/docs/source/getting_started/examples/tensorize_vllm_model.md @@ -0,0 +1,8 @@ +# Tensorize vLLM Model + +Source . 
+ +```{literalinclude} ../../../../examples/tensorize_vllm_model.py +:language: python +:linenos: true +``` diff --git a/docs/source/getting_started/gaudi-installation.md b/docs/source/getting_started/gaudi-installation.md new file mode 100644 index 0000000000000..170d7e49ba806 --- /dev/null +++ b/docs/source/getting_started/gaudi-installation.md @@ -0,0 +1,388 @@ +# Installation with Intel® Gaudi® AI Accelerators + +This README provides instructions on running vLLM with Intel Gaudi devices. + +## Requirements and Installation + +Please follow the instructions provided in the [Gaudi Installation +Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html) +to set up the execution environment. To achieve the best performance, +please follow the methods outlined in the [Optimizing Training Platform +Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). + +### Requirements + +- OS: Ubuntu 22.04 LTS +- Python: 3.10 +- Intel Gaudi accelerator +- Intel Gaudi software version 1.18.0 + +### Quick start using Dockerfile + +```console +$ docker build -f Dockerfile.hpu -t vllm-hpu-env . +$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env +``` + +```{tip} +If you're observing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered. 
+``` + +### Build from source + +#### Environment verification + +To verify that the Intel Gaudi software was correctly installed, run: + +```console +$ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible +$ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed +$ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed +$ pip list | grep neural # verify that neural_compressor is installed +``` + +Refer to [Intel Gaudi Software Stack +Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade) +for more details. + +#### Run Docker Image + +It is highly recommended to use the latest Docker image from Intel Gaudi +vault. Refer to the [Intel Gaudi +documentation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#pull-prebuilt-containers) +for more details. + +Use the following commands to run a Docker image: + +```console +$ docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest +$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest +``` + +#### Build and Install vLLM + +To build and install vLLM from source, run: + +```console +$ git clone https://github.com/vllm-project/vllm.git +$ cd vllm +$ python setup.py develop +``` + +Currently, the latest features and performance optimizations are developed in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork) and we periodically upstream them to vLLM main repo. 
To install latest [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following: + +```console +$ git clone https://github.com/HabanaAI/vllm-fork.git +$ cd vllm-fork +$ git checkout habana_main +$ python setup.py develop +``` + +## Supported Features + +- [Offline batched + inference](https://docs.vllm.ai/en/latest/getting_started/quickstart.html#offline-batched-inference) +- Online inference via [OpenAI-Compatible + Server](https://docs.vllm.ai/en/latest/getting_started/quickstart.html#openai-compatible-server) +- HPU autodetection - no need to manually select device within vLLM +- Paged KV cache with algorithms enabled for Intel Gaudi accelerators +- Custom Intel Gaudi implementations of Paged Attention, KV cache ops, + prefill attention, Root Mean Square Layer Normalization, Rotary + Positional Encoding +- Tensor parallelism support for multi-card inference +- Inference with [HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) + for accelerating low-batch latency and throughput +- Attention with Linear Biases (ALiBi) + +## Unsupported Features + +- Beam search +- LoRA adapters +- Quantization +- Prefill chunking (mixed-batch inferencing) + +## Supported Configurations + +The following configurations have been validated to be functional with +Gaudi2 devices. Configurations that are not listed may or may not work. 
+ +- [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b) + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- [meta-llama/Llama-2-70b](https://huggingface.co/meta-llama/Llama-2-70b) + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B) + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3.1-70B](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B) + with tensor parallelism on 8x 
HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling + +## Performance Tuning + +### Execution modes + +Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via `PT_HPU_LAZY_MODE` environment variable), and `--enforce-eager` flag. + +```{eval-rst} +.. list-table:: vLLM execution modes + :widths: 25 25 50 + :header-rows: 1 + + * - ``PT_HPU_LAZY_MODE`` + - ``enforce_eager`` + - execution mode + * - 0 + - 0 + - torch.compile + * - 0 + - 1 + - PyTorch eager mode + * - 1 + - 0 + - HPU Graphs + * - 1 + - 1 + - PyTorch lazy mode +``` + +```{warning} +In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode. +``` + +### Bucketing mechanism + +Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. [Intel Gaudi Graph Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution. +In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. 
Currently it is achieved by "bucketing" model's forward pass across two dimensions - `batch_size` and `sequence_length`. + +```{note} +Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase. +``` + +Bucketing ranges are determined with 3 parameters - `min`, `step` and `max`. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup: + +``` +INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] +INFO 08-01 21:37:59 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] +INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] +INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] +``` + +`min` determines the lowest value of the bucket. `step` determines the interval between buckets, and `max` determines the upper bound of the bucket. 
Furthermore, the interval between `min` and `step` has special handling - `min` gets multiplied by consecutive powers of two, until `step` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes. + +Example (with ramp-up) + +``` +min = 2, step = 32, max = 64 +=> ramp_up = (2, 4, 8, 16) +=> stable = (32, 64) +=> buckets = ramp_up + stable => (2, 4, 8, 16, 32, 64) +``` + +Example (without ramp-up) + +``` +min = 128, step = 128, max = 512 +=> ramp_up = () +=> stable = (128, 256, 384, 512) +=> buckets = ramp_up + stable => (128, 256, 384, 512) +``` + +In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with specified tensor shapes. Whenever a batch of requests is processed, it is padded across batch and sequence length dimension to the smallest possible bucket. + +```{warning} +If a request exceeds maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such a scenario. +``` + +As an example, if a request of 3 sequences, with max sequence length of 412 comes in to an idle vLLM server, it will be padded and executed as `(4, 512)` prefill bucket, as `batch_size` (number of sequences) will be padded to 4 (closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (closest sequence length dimension higher than 412). 
After prefill stage, it will be executed as `(4, 512)` decode bucket and will continue as that bucket until either batch dimension changes (due to request being finished) - in which case it will become a `(2, 512)` bucket, or context length increases above 512 tokens, in which case it will become `(4, 640)` bucket. + +```{note} +Bucketing is transparent to a client - padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests. +``` + +### Warmup + +Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup: + +``` +INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB +INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB +INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB +... +INFO 08-01 22:26:59 hpu_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB +INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB +INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB +INFO 08-01 22:27:01 hpu_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB +... +INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB +INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB +``` + +This example uses the same buckets as in *Bucketing mechanism* section. 
Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations. + +```{tip} +Compiling all the buckets might take some time and can be turned off with `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment. +``` + +### HPU Graph capture + +[HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) are currently the most performant execution method of vLLM on Intel Gaudi. When HPU Graphs are enabled, execution graphs will be traced (recorded) ahead of time (after performing warmup), to be later replayed during inference, significantly reducing host overheads. Recording can take large amounts of memory, which needs to be taken into account when allocating KV cache. Enabling HPU Graphs will impact the number of available KV cache blocks, but vLLM provides user-configurable variables to control memory management. + +When HPU Graphs are being used, they share the common memory pool ("usable memory") as KV cache, determined by `gpu_memory_utilization` flag (`0.9` by default). +Before KV cache gets allocated, model weights are loaded onto the device, and a forward pass of the model is executed on dummy data, to estimate memory usage. +Only after that, `gpu_memory_utilization` flag is utilized - at its default value, will mark 90% of free device memory at that point as usable. +Next, KV cache gets allocated, model is warmed up, and HPU Graphs are captured. +Environment variable `VLLM_GRAPH_RESERVED_MEM` defines the ratio of memory reserved for HPU Graphs capture. 
+With its default value (`VLLM_GRAPH_RESERVED_MEM=0.1`), 10% of usable memory will be reserved for graph capture (later referred to as "usable graph memory"), and the remaining 90% will be utilized for KV cache. +Environment variable `VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (`VLLM_GRAPH_PROMPT_RATIO=0.3`), 30% of usable graph memory is reserved for prefill graphs and the remaining 70% for decode graphs. +A lower value corresponds to less usable graph memory reserved for the prefill stage, e.g. `VLLM_GRAPH_PROMPT_RATIO=0.2` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs. + +```{note} +`gpu_memory_utilization` does not correspond to the absolute memory usage across HPU. It specifies the memory margin after loading the model and performing a profile run. If a device has 100 GiB of total memory, and 50 GiB of free memory after loading model weights and executing the profiling run, `gpu_memory_utilization` at its default value will mark 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total device memory. +``` + +The user can also configure the strategy for capturing HPU Graphs for prompt and decode stages separately. The strategy affects the order of capturing graphs. There are two strategies implemented: +\- `max_bs` - the graph capture queue will be sorted in descending order by batch size. Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. `(64, 128)`, `(64, 256)`, `(32, 128)`, `(32, 256)`, `(1, 128)`, `(1, 256)`), default strategy for decode +\- `min_tokens` - the graph capture queue will be sorted in ascending order by the number of tokens each graph processes (`batch_size*sequence_length`), default strategy for prompt + +When there's a large number of requests pending, the vLLM scheduler will attempt to fill the maximum batch size for decode as soon as possible. When a request is finished, decode batch size decreases. 
When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size to its previous state. This means that in a full load scenario, decode batch size is often at its maximum, which makes large batch size HPU Graphs crucial to capture, as reflected by the `max_bs` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in the `min_tokens` strategy. + +```{note} +`VLLM_GRAPH_PROMPT_RATIO` does not set a hard limit on memory taken by graphs for each stage (prefill and decode). vLLM will first attempt to use up the entirety of usable prefill graph memory (usable graph memory * `VLLM_GRAPH_PROMPT_RATIO`) for capturing prefill HPU Graphs, next it will attempt to do the same for decode graphs and the usable decode graph memory pool. If one stage is fully captured, and there is unused memory left within the usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding the reserved memory pool. The behavior of that mechanism can be observed in the example below. 
+``` + +Each described step is logged by vLLM server, as follows (negative values correspond to memory being released): + +``` +INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] +INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] +INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] +INFO 08-02 17:37:44 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] +INFO 08-02 17:37:52 hpu_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) +INFO 08-02 17:37:52 hpu_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used) +INFO 08-02 17:37:52 hpu_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) +INFO 08-02 17:37:54 hpu_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 
GiB/1007 GiB used) +INFO 08-02 17:37:54 hpu_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache +INFO 08-02 17:37:54 hpu_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0 +INFO 08-02 17:37:54 hpu_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used) +INFO 08-02 17:37:54 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB +... +INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB +INFO 08-02 17:38:22 hpu_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3) +INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB +... +INFO 08-02 17:38:26 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB +INFO 08-02 17:38:27 hpu_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB +... 
+INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB +INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB +INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB +INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB +INFO 08-02 17:38:43 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB +INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)] +INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] +INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory +INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used) +``` + +### Recommended vLLM Parameters + +- We recommend running inference on Gaudi 2 with `block_size` of 128 + for BF16 data type. 
Using default values (16, 32) might lead to + sub-optimal performance due to Matrix Multiplication Engine + under-utilization (see [Gaudi + Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html)). +- For max throughput on Llama 7B, we recommend running with batch size + of 128 or 256 and max context length of 2048 with HPU Graphs enabled. + If you encounter out-of-memory issues, see troubleshooting section. + +### Environment variables + +**Diagnostic and profiling knobs:** + +- `VLLM_PROFILER_ENABLED`: if `true`, high level profiler will be enabled. Resulting JSON traces can be viewed in [perfetto.habana.ai](https://perfetto.habana.ai/#!/viewer). Disabled by default. +- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION`: if `true`, will log graph compilations per each vLLM engine step, only when there was any - highly recommended to use alongside `PT_HPU_METRICS_GC_DETAILS=1`. Disabled by default. +- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL`: if `true`, will log graph compilations per each vLLM engine step, always, even if there were none. Disabled by default. +- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS`: if `true`, will log cpu fallbacks per each vLLM engine step, only when there was any. Disabled by default. +- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL`: if `true`, will log cpu fallbacks per each vLLM engine step, always, even if there were none. Disabled by default. 
+ +**Performance tuning knobs:** + +- `VLLM_SKIP_WARMUP`: if `true`, warmup will be skipped, `false` by default + +- `VLLM_GRAPH_RESERVED_MEM`: percentage of memory dedicated for HPUGraph capture, `0.1` by default + +- `VLLM_GRAPH_PROMPT_RATIO`: percentage of reserved graph memory dedicated for prompt graphs, `0.3` by default + +- `VLLM_GRAPH_PROMPT_STRATEGY`: strategy determining order of prompt graph capture, `min_tokens` or `max_bs`, `min_tokens` by default + +- `VLLM_GRAPH_DECODE_STRATEGY`: strategy determining order of decode graph capture, `min_tokens` or `max_bs`, `max_bs` by default + +- `VLLM_{phase}_{dim}_BUCKET_{param}` - collection of 12 environment variables configuring ranges of bucketing mechanism + + - `{phase}` is either `PROMPT` or `DECODE` + + - `{dim}` is either `BS`, `SEQ` or `BLOCK` + + - `{param}` is either `MIN`, `STEP` or `MAX` + + - Default values: + + - Prompt: + : - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1` + - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` + - batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): `min(max_num_seqs, 64)` + - sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): `block_size` + - sequence length step (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size` + - sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): `max_model_len` + - Decode: + : - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1` + - batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` + - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): `max_num_seqs` + - sequence length min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `block_size` + - sequence length step (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size` + - sequence length max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max(128, (max_num_seqs*max_model_len)/block_size)` + +Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution: + +- `PT_HPU_LAZY_MODE`: if `0`, PyTorch Eager backend for Gaudi will be used, if `1` PyTorch Lazy backend for Gaudi will 
be used, `1` is default +- `PT_HPU_ENABLE_LAZY_COLLECTIVES`: required to be `true` for tensor parallel inference with HPU Graphs + +## Troubleshooting: Tweaking HPU Graphs + +If you experience device out-of-memory issues or want to attempt +inference at higher batch sizes, try tweaking HPU Graphs by following +the below: + +- Tweak `gpu_memory_utilization` knob. It will decrease the + allocation of KV cache, leaving some headroom for capturing graphs + with larger batch size. By default `gpu_memory_utilization` is set + to 0.9. It attempts to allocate ~90% of HBM left for KV cache after + short profiling run. Note that decreasing reduces the number of KV + cache blocks you have available, and therefore reduces the effective + maximum number of tokens you can handle at a given time. +- If this method is not efficient, you can disable `HPUGraph` + completely. With HPU Graphs disabled, you are trading latency and + throughput at lower batches for potentially higher throughput on + higher batches. You can do that by adding `--enforce-eager` flag to + server (for online inference), or by passing `enforce_eager=True` + argument to LLM constructor (for offline inference). diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst deleted file mode 100644 index 249e08278ff8f..0000000000000 --- a/docs/source/getting_started/gaudi-installation.rst +++ /dev/null @@ -1,402 +0,0 @@ -Installation with Intel® Gaudi® AI Accelerators -=============================================== - -This README provides instructions on running vLLM with Intel Gaudi devices. - -Requirements and Installation ------------------------------ - -Please follow the instructions provided in the `Gaudi Installation -Guide `__ -to set up the execution environment. To achieve the best performance, -please follow the methods outlined in the `Optimizing Training Platform -Guide `__. 
- -Requirements -~~~~~~~~~~~~ - -- OS: Ubuntu 22.04 LTS -- Python: 3.10 -- Intel Gaudi accelerator -- Intel Gaudi software version 1.18.0 - - -Quick start using Dockerfile -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. code:: console - - $ docker build -f Dockerfile.hpu -t vllm-hpu-env . - $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env - - -.. tip:: - If you're observing the following error: ``docker: Error response from daemon: Unknown runtime specified habana.``, please refer to "Install Using Containers" section of `Intel Gaudi Software Stack and Driver Installation `__. Make sure you have ``habana-container-runtime`` package installed and that ``habana`` container runtime is registered. - - -Build from source -~~~~~~~~~~~~~~~~~ - -Environment verification -^^^^^^^^^^^^^^^^^^^^^^^^ - -To verify that the Intel Gaudi software was correctly installed, run: - -.. code:: console - - $ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible - $ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed - $ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed - $ pip list | grep neural # verify that neural_compressor is installed - -Refer to `Intel Gaudi Software Stack -Verification `__ -for more details. - -Run Docker Image -^^^^^^^^^^^^^^^^ - -It is highly recommended to use the latest Docker image from Intel Gaudi -vault. Refer to the `Intel Gaudi -documentation `__ -for more details. - -Use the following commands to run a Docker image: - -.. 
code:: console - - $ docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest - $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest - -Build and Install vLLM -^^^^^^^^^^^^^^^^^^^^^^ - -To build and install vLLM from source, run: - -.. code:: console - - $ git clone https://github.com/vllm-project/vllm.git - $ cd vllm - $ python setup.py develop - - -Currently, the latest features and performance optimizations are developed in Gaudi's `vLLM-fork `__ and we periodically upstream them to vLLM main repo. To install latest `HabanaAI/vLLM-fork `__, run the following: - -.. code:: console - - $ git clone https://github.com/HabanaAI/vllm-fork.git - $ cd vllm-fork - $ git checkout habana_main - $ python setup.py develop - - -Supported Features ------------------- - -- `Offline batched - inference `__ -- Online inference via `OpenAI-Compatible - Server `__ -- HPU autodetection - no need to manually select device within vLLM -- Paged KV cache with algorithms enabled for Intel Gaudi accelerators -- Custom Intel Gaudi implementations of Paged Attention, KV cache ops, - prefill attention, Root Mean Square Layer Normalization, Rotary - Positional Encoding -- Tensor parallelism support for multi-card inference -- Inference with `HPU Graphs `__ - for accelerating low-batch latency and throughput -- Attention with Linear Biases (ALiBi) - -Unsupported Features --------------------- - -- Beam search -- LoRA adapters -- Quantization -- Prefill chunking (mixed-batch inferencing) - -Supported Configurations ------------------------- - -The following configurations have been validated to be function with -Gaudi2 devices. Configurations that are not listed may or may not work. 
- -- `meta-llama/Llama-2-7b `__ - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- `meta-llama/Llama-2-7b-chat-hf `__ - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- `meta-llama/Meta-Llama-3-8B `__ - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- `meta-llama/Meta-Llama-3-8B-Instruct `__ - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- `meta-llama/Meta-Llama-3.1-8B `__ - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- `meta-llama/Meta-Llama-3.1-8B-Instruct `__ - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- `meta-llama/Llama-2-70b `__ - with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling -- `meta-llama/Llama-2-70b-chat-hf `__ - with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling -- `meta-llama/Meta-Llama-3-70B `__ - with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling -- `meta-llama/Meta-Llama-3-70B-Instruct `__ - with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling -- `meta-llama/Meta-Llama-3.1-70B `__ - with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling -- `meta-llama/Meta-Llama-3.1-70B-Instruct `__ - with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling - -Performance Tuning ------------------- - -Execution modes -~~~~~~~~~~~~~~~ - -Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via ``PT_HPU_LAZY_MODE`` environment variable), and ``--enforce-eager`` flag. - -.. 
list-table:: vLLM execution modes - :widths: 25 25 50 - :header-rows: 1 - - * - ``PT_HPU_LAZY_MODE`` - - ``enforce_eager`` - - execution mode - * - 0 - - 0 - - torch.compile - * - 0 - - 1 - - PyTorch eager mode - * - 1 - - 0 - - HPU Graphs - * - 1 - - 1 - - PyTorch lazy mode - -.. warning:: - In 1.18.0, all modes utilizing ``PT_HPU_LAZY_MODE=0`` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode. - - -Bucketing mechanism -~~~~~~~~~~~~~~~~~~~ - -Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. `Intel Gaudi Graph Compiler `__ is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution. -In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently it is achieved by "bucketing" model's forward pass across two dimensions - ``batch_size`` and ``sequence_length``. - -.. note:: - Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase. - -Bucketing ranges are determined with 3 parameters - ``min``, ``step`` and ``max``. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup: - -.. 
code-block:: - - INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] - INFO 08-01 21:37:59 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] - INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] - INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] - -``min`` determines the lowest value of the bucket. ``step`` determines the interval between buckets, and ``max`` determines the upper bound of the bucket. Furthermore, interval between ``min`` and ``step`` has special handling - ``min`` gets multiplied by consecutive powers of two, until ``step`` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes. - -Example (with ramp-up) - -.. code-block:: - - min = 2, step = 32, max = 64 - => ramp_up = (2, 4, 8, 16) - => stable = (32, 64) - => buckets = ramp_up + stable => (2, 4, 8, 16, 32, 64) - -Example (without ramp-up) - -.. 
code-block:: - - min = 128, step = 128, max = 512 - => ramp_up = () - => stable = (128, 256, 384, 512) - => buckets = ramp_up + stable => (128, 256, 384, 512) - - -In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with specified tensor shapes. Whenever a batch of requests is processed, it is padded across batch and sequence length dimension to the smallest possible bucket. - -.. warning:: - If a request exceeds maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such scenario. - -As an example, if a request of 3 sequences, with max sequence length of 412 comes in to an idle vLLM server, it will be padded executed as ``(4, 512)`` prefill bucket, as ``batch_size`` (number of sequences) will be padded to 4 (closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (closest sequence length dimension higher than 412). After prefill stage, it will be executed as ``(4, 512)`` decode bucket and will continue as that bucket until either batch dimension changes (due to request being finished) - in which case it will become a ``(2, 512)`` bucket, or context length increases above 512 tokens, in which case it will become ``(4, 640)`` bucket. - -.. note:: - Bucketing is transparent to a client - padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests. - -Warmup -~~~~~~ - -Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. 
The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup: - -.. code-block:: - - INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB - INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB - INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB - ... - INFO 08-01 22:26:59 hpu_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB - INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB - INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB - INFO 08-01 22:27:01 hpu_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB - ... - INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB - INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB - -This example uses the same buckets as in *Bucketing mechanism* section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations. - -.. tip:: - Compiling all the buckets might take some time and can be turned off with ``VLLM_SKIP_WARMUP=true`` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment. 
- -HPU Graph capture -~~~~~~~~~~~~~~~~~ - -`HPU Graphs `__ are currently the most performant execution method of vLLM on Intel Gaudi. When HPU Graphs are enabled, execution graphs will be traced (recorded) ahead of time (after performing warmup), to be later replayed during inference, significantly reducing host overheads. Recording can take large amounts of memory, which needs to be taken into account when allocating KV cache. Enabling HPU Graphs will impact the number of available KV cache blocks, but vLLM provides user-configurable variables to control memory management. - - -When HPU Graphs are being used, they share the common memory pool ("usable memory") as KV cache, determined by ``gpu_memory_utilization`` flag (``0.9`` by default). -Before KV cache gets allocated, model weights are loaded onto the device, and a forward pass of the model is executed on dummy data, to estimate memory usage. -Only after that, ``gpu_memory_utilization`` flag is utilized - at its default value, will mark 90% of free device memory at that point as usable. -Next, KV cache gets allocated, model is warmed up, and HPU Graphs are captured. -Environment variable ``VLLM_GRAPH_RESERVED_MEM`` defines the ratio of memory reserved for HPU Graphs capture. -With its default value (``VLLM_GRAPH_RESERVED_MEM=0.1``), 10% of usable memory will be reserved for graph capture (later referred to as "usable graph memory"), and the remaining 90% will be utilized for KV cache. -Environment variable ``VLLM_GRAPH_PROMPT_RATIO`` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (``VLLM_GRAPH_PROMPT_RATIO=0.3``), both stages have equal memory constraints. -Lower value corresponds to less usable graph memory reserved for prefill stage, e.g. ``VLLM_GRAPH_PROMPT_RATIO=0.2`` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs. - -.. 
note:: - ``gpu_memory_utilization`` does not correspond to the absolute memory usage across HPU. It specifies the memory margin after loading the model and performing a profile run. If device has 100 GiB of total memory, and 50 GiB of free memory after loading model weights and executing profiling run, ``gpu_memory_utilization`` at its default value will mark 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total device memory. - -User can also configure the strategy for capturing HPU Graphs for prompt and decode stages separately. Strategy affects the order of capturing graphs. There are two strategies implemented: -- ``max_bs`` - graph capture queue will sorted in descending order by their batch sizes. Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. ``(64, 128)``, ``(64, 256)``, ``(32, 128)``, ``(32, 256)``, ``(1, 128)``, ``(1,256)``), default strategy for decode -- ``min_tokens`` - graph capture queue will be sorted in ascending order by the number of tokens each graph processes (``batch_size*sequence_length``), default strategy for prompt - -When there's large amount of requests pending, vLLM scheduler will attempt to fill the maximum batch size for decode as soon as possible. When a request is finished, decode batch size decreases. When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size to its previous state. This means that in a full load scenario, decode batch size is often at its maximum, which makes large batch size HPU Graphs crucial to capture, as reflected by ``max_bs`` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in ``min_tokens`` strategy. - - -.. note:: - ``VLLM_GRAPH_PROMPT_RATIO`` does not set a hard limit on memory taken by graphs for each stage (prefill and decode). 
vLLM will first attempt to use up entirety of usable prefill graph memory (usable graph memory * ``VLLM_GRAPH_PROMPT_RATIO``) for capturing prefill HPU Graphs, next it will attempt do the same for decode graphs and usable decode graph memory pool. If one stage is fully captured, and there is unused memory left within usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding reserved memory pool. The behavior on that mechanism can be observed in the example below. - - -Each described step is logged by vLLM server, as follows (negative values correspond to memory being released): - -.. code-block:: - - INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] - INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] - INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] - INFO 08-02 17:37:44 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] - INFO 08-02 17:37:52 hpu_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 
2.95 GiB of host memory (475.2 GiB/1007 GiB used) - INFO 08-02 17:37:52 hpu_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used) - INFO 08-02 17:37:52 hpu_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) - INFO 08-02 17:37:54 hpu_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used) - INFO 08-02 17:37:54 hpu_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache - INFO 08-02 17:37:54 hpu_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0 - INFO 08-02 17:37:54 hpu_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used) - INFO 08-02 17:37:54 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB - ... - INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB - INFO 08-02 17:38:22 hpu_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3) - INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB - ... - INFO 08-02 17:38:26 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB - INFO 08-02 17:38:27 hpu_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB - ... 
- INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB - INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB - INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB - INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB - INFO 08-02 17:38:43 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB - INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)] - INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] - INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory - INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used) - - -Recommended vLLM Parameters -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -- We recommend running inference on Gaudi 2 with ``block_size`` of 128 - for BF16 data type. 
Using default values (16, 32) might lead to - sub-optimal performance due to Matrix Multiplication Engine - under-utilization (see `Gaudi - Architecture `__). -- For max throughput on Llama 7B, we recommend running with batch size - of 128 or 256 and max context length of 2048 with HPU Graphs enabled. - If you encounter out-of-memory issues, see troubleshooting section. - -Environment variables -~~~~~~~~~~~~~~~~~~~~~ - -**Diagnostic and profiling knobs:** - -- ``VLLM_PROFILER_ENABLED``: if ``true``, high level profiler will be enabled. Resulting JSON traces can be viewed in `perfetto.habana.ai `__. Disabled by default. -- ``VLLM_HPU_LOG_STEP_GRAPH_COMPILATION``: if ``true``, will log graph compilations per each vLLM engine step, only when there was any - highly recommended to use alongside ``PT_HPU_METRICS_GC_DETAILS=1``. Disabled by default. -- ``VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL``: if ``true``, will log graph compilations per each vLLM engine step, always, even if there were none. Disabled by default. -- ``VLLM_HPU_LOG_STEP_CPU_FALLBACKS``: if ``true``, will log cpu fallbacks per each vLLM engine step, only when there was any. Disabled by default. -- ``VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL``: if ``true``, will log cpu fallbacks per each vLLM engine step, always, even if there were none. Disabled by default. 
- -**Performance tuning knobs:** - -- ``VLLM_SKIP_WARMUP``: if ``true``, warmup will be skipped, ``false`` by default -- ``VLLM_GRAPH_RESERVED_MEM``: percentage of memory dedicated for HPUGraph capture, ``0.1`` by default -- ``VLLM_GRAPH_PROMPT_RATIO``: percentage of reserved graph memory dedicated for prompt graphs, ``0.3`` by default -- ``VLLM_GRAPH_PROMPT_STRATEGY``: strategy determining order of prompt graph capture, ``min_tokens`` or ``max_bs``, ``min_tokens`` by default -- ``VLLM_GRAPH_DECODE_STRATEGY``: strategy determining order of decode graph capture, ``min_tokens`` or ``max_bs``, ``max_bs`` by default -- ``VLLM_{phase}_{dim}_BUCKET_{param}`` - collection of 12 environment variables configuring ranges of bucketing mechanism - - - ``{phase}`` is either ``PROMPT`` or ``DECODE`` - - ``{dim}`` is either ``BS``, ``SEQ`` or ``BLOCK`` - - ``{param}`` is either ``MIN``, ``STEP`` or ``MAX`` - - Default values: - - - Prompt: - - batch size min (``VLLM_PROMPT_BS_BUCKET_MIN``): ``1`` - - batch size step (``VLLM_PROMPT_BS_BUCKET_STEP``): ``min(max_num_seqs, 32)`` - - batch size max (``VLLM_PROMPT_BS_BUCKET_MAX``): ``min(max_num_seqs, 64)`` - - sequence length min (``VLLM_PROMPT_SEQ_BUCKET_MIN``): ``block_size`` - - sequence length step (``VLLM_PROMPT_SEQ_BUCKET_STEP``): ``block_size`` - - sequence length max (``VLLM_PROMPT_SEQ_BUCKET_MAX``): ``max_model_len`` - - - Decode: - - batch size min (``VLLM_DECODE_BS_BUCKET_MIN``): ``1`` - - batch size step (``VLLM_DECODE_BS_BUCKET_STEP``): ``min(max_num_seqs, 32)`` - - batch size max (``VLLM_DECODE_BS_BUCKET_MAX``): ``max_num_seqs`` - - sequence length min (``VLLM_DECODE_BLOCK_BUCKET_MIN``): ``block_size`` - - sequence length step (``VLLM_DECODE_BLOCK_BUCKET_STEP``): ``block_size`` - - sequence length max (``VLLM_DECODE_BLOCK_BUCKET_MAX``): ``max(128, (max_num_seqs*max_model_len)/block_size)`` - - -Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution: - -- ``PT_HPU_LAZY_MODE``: if ``0``, 
PyTorch Eager backend for Gaudi will be used, if ``1`` PyTorch Lazy backend for Gaudi will be used, ``1`` is default -- ``PT_HPU_ENABLE_LAZY_COLLECTIVES``: required to be ``true`` for tensor parallel inference with HPU Graphs - -Troubleshooting: Tweaking HPU Graphs ------------------------------------- - -If you experience device out-of-memory issues or want to attempt -inference at higher batch sizes, try tweaking HPU Graphs by following -the below: - -- Tweak ``gpu_memory_utilization`` knob. It will decrease the - allocation of KV cache, leaving some headroom for capturing graphs - with larger batch size. By default ``gpu_memory_utilization`` is set - to 0.9. It attempts to allocate ~90% of HBM left for KV cache after - short profiling run. Note that decreasing reduces the number of KV - cache blocks you have available, and therefore reduces the effective - maximum number of tokens you can handle at a given time. - -- If this method is not efficient, you can disable ``HPUGraph`` - completely. With HPU Graphs disabled, you are trading latency and - throughput at lower batches for potentially higher throughput on - higher batches. You can do that by adding ``--enforce-eager`` flag to - server (for online inference), or by passing ``enforce_eager=True`` - argument to LLM constructor (for offline inference). diff --git a/docs/source/getting_started/installation.md b/docs/source/getting_started/installation.md new file mode 100644 index 0000000000000..0029f68ef8441 --- /dev/null +++ b/docs/source/getting_started/installation.md @@ -0,0 +1,199 @@ +(installation)= + +# Installation + +vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries. + +## Requirements + +- OS: Linux +- Python: 3.9 -- 3.12 +- GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) + +## Install released versions + +You can install vLLM using pip: + +```console +$ # (Recommended) Create a new conda environment. 
+$ conda create -n myenv python=3.12 -y +$ conda activate myenv + +$ # Install vLLM with CUDA 12.1. +$ pip install vllm +``` + +```{note} +Although we recommend using `conda` to create and manage Python environments, it is highly recommended to use `pip` to install vLLM. This is because `pip` can install `torch` with separate library packages like `NCCL`, while `conda` installs `torch` with statically linked `NCCL`. This can cause issues when vLLM tries to use `NCCL`. See [this issue](https://github.com/vllm-project/vllm/issues/8420) for more details. +``` + +````{note} +As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default. +We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions: + +```console +$ # Install vLLM with CUDA 11.8. +$ export VLLM_VERSION=0.6.1.post1 +$ export PYTHON_VERSION=310 +$ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 +``` + +In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations. + +Therefore, it is recommended to install vLLM with a **fresh new** conda environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See below for instructions. +```` + +(install-the-latest-code)= + +## Install the latest code + +LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. 
To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on a x86 platform with CUDA 12 for every commit since `v0.5.3`. You can download and install it with the following command: + +```console +$ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl +``` + +If you want to access the wheels for previous commits, you can specify the commit hash in the URL: + +```console +$ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch +$ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl +``` + +Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before. + +Another way to access the latest code is to use the docker images: + +```console +$ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch +$ docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT} +``` + +These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days. + +The latest code can contain bugs and may not be stable. Please use it with caution. 
+
+(build-from-source)=
+
+## Build from source
+
+(python-only-build)=
+
+### Python-only build (without compilation)
+
+If you only need to change Python code, you can build and install vLLM without compilation. Using [pip's `--editable` flag](https://pip.pypa.io/en/stable/topics/local-project-installs/#editable-installs), changes you make to the code will be reflected when you run vLLM:
+
+```console
+$ git clone https://github.com/vllm-project/vllm.git
+$ cd vllm
+$ VLLM_USE_PRECOMPILED=1 pip install --editable .
+```
+
+This will download the latest nightly wheel and use the compiled libraries from there in the install.
+
+The `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable can be used instead of `VLLM_USE_PRECOMPILED` to specify a custom path or URL to the wheel file. For example, to use the [0.6.3.post1 PyPi wheel](https://pypi.org/project/vllm/#files):
+
+```console
+$ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://files.pythonhosted.org/packages/4a/4c/ee65ba33467a4c0de350ce29fbae39b9d0e7fcd887cc756fa993654d1228/vllm-0.6.3.post1-cp38-abi3-manylinux1_x86_64.whl
+$ pip install --editable .
+```
+
+You can find more information about vLLM's wheels [above](#install-the-latest-code).
+
+```{note}
+There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors.
+It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to [the section above](#install-the-latest-code) for instructions on how to install a specified wheel.
+```
+
+### Full build (with compilation)
+
+If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes:
+
+```console
+$ git clone https://github.com/vllm-project/vllm.git
+$ cd vllm
+$ pip install -e .
+```
+
+```{tip}
+Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results.
+ +For example, you can install [ccache](https://github.com/ccache/ccache) using `conda install ccache` or `apt install ccache` . +As long as `which ccache` command can find the `ccache` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster. + +[sccache](https://github.com/mozilla/sccache) works similarly to `ccache`, but has the capability to utilize caching in remote storage environments. +The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`. +``` + +#### Use an existing PyTorch installation + +There are scenarios where the PyTorch dependency cannot be easily installed via pip, e.g.: + +- Building vLLM with PyTorch nightly or a custom PyTorch build. +- Building vLLM with aarch64 and CUDA (GH200), where the PyTorch wheels are not available on PyPI. Currently, only the PyTorch nightly has wheels for aarch64 with CUDA. You can run `pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124` to [install PyTorch nightly](https://pytorch.org/get-started/locally/), and then build vLLM on top of it. + +To build vLLM using an existing PyTorch installation: + +```console +$ git clone https://github.com/vllm-project/vllm.git +$ cd vllm +$ python use_existing_torch.py +$ pip install -r requirements-build.txt +$ pip install -e . --no-build-isolation +``` + +#### Use the local cutlass for compilation + +Currently, before starting the build process, vLLM fetches cutlass code from GitHub. However, there may be scenarios where you want to use a local version of cutlass instead. +To achieve this, you can set the environment variable VLLM_CUTLASS_SRC_DIR to point to your local cutlass directory. 
+ +```console +$ git clone https://github.com/vllm-project/vllm.git +$ cd vllm +$ VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e . +``` + +#### Troubleshooting + +To avoid your system being overloaded, you can limit the number of compilation jobs +to be run simultaneously, via the environment variable `MAX_JOBS`. For example: + +```console +$ export MAX_JOBS=6 +$ pip install -e . +``` + +This is especially useful when you are building on less powerful machines. For example, when you use WSL it only [assigns 50% of the total memory by default](https://learn.microsoft.com/en-us/windows/wsl/wsl-config#main-wsl-settings), so using `export MAX_JOBS=1` can avoid compiling multiple files simultaneously and running out of memory. +A side effect is a much slower build process. + +Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image. + +```console +$ # Use `--ipc=host` to make sure the shared memory is large enough. +$ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3 +``` + +If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from [the official website](https://developer.nvidia.com/cuda-toolkit-archive). After installation, set the environment variable `CUDA_HOME` to the installation path of CUDA Toolkit, and make sure that the `nvcc` compiler is in your `PATH`, e.g.: + +```console +$ export CUDA_HOME=/usr/local/cuda +$ export PATH="${CUDA_HOME}/bin:$PATH" +``` + +Here is a sanity check to verify that the CUDA Toolkit is correctly installed: + +```console +$ nvcc --version # verify that nvcc is in your PATH +$ ${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME +``` + +### Unsupported OS build + +vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. 
The binaries will not be compiled and won't work on non-Linux systems. + +Simply disable the `VLLM_TARGET_DEVICE` environment variable before installing: + +```console +$ export VLLM_TARGET_DEVICE=empty +$ pip install -e . +``` diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst deleted file mode 100644 index 9b6cb0e80d60e..0000000000000 --- a/docs/source/getting_started/installation.rst +++ /dev/null @@ -1,214 +0,0 @@ -.. _installation: - -============ -Installation -============ - -vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries. - -Requirements -============ - -* OS: Linux -* Python: 3.9 -- 3.12 -* GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) - -Install released versions -========================= - -You can install vLLM using pip: - -.. code-block:: console - - $ # (Recommended) Create a new conda environment. - $ conda create -n myenv python=3.12 -y - $ conda activate myenv - - $ # Install vLLM with CUDA 12.1. - $ pip install vllm - -.. note:: - - Although we recommend using ``conda`` to create and manage Python environments, it is highly recommended to use ``pip`` to install vLLM. This is because ``pip`` can install ``torch`` with separate library packages like ``NCCL``, while ``conda`` installs ``torch`` with statically linked ``NCCL``. This can cause issues when vLLM tries to use ``NCCL``. See `this issue `_ for more details. - -.. note:: - - As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default. - We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions: - - .. code-block:: console - - $ # Install vLLM with CUDA 11.8. 
- $ export VLLM_VERSION=0.6.1.post1 - $ export PYTHON_VERSION=310 - $ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 - - In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations. - - Therefore, it is recommended to install vLLM with a **fresh new** conda environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See below for instructions. - - -.. _install-the-latest-code: - -Install the latest code -======================= - -LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on a x86 platform with CUDA 12 for every commit since ``v0.5.3``. You can download and install it with the following command: - -.. code-block:: console - - $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl - -If you want to access the wheels for previous commits, you can specify the commit hash in the URL: - -.. code-block:: console - - $ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch - $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl - -Note that the wheels are built with Python 3.8 ABI (see `PEP 425 `_ for more details about ABI), so **they are compatible with Python 3.8 and later**. 
The version string in the wheel file name (``1.0.0.dev``) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before. - -Another way to access the latest code is to use the docker images: - -.. code-block:: console - - $ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch - $ docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT} - -These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days. - -The latest code can contain bugs and may not be stable. Please use it with caution. - -.. _build_from_source: - -Build from source -================= - -.. _python-only-build: - -Python-only build (without compilation) ---------------------------------------- - -If you only need to change Python code, you can build and install vLLM without compilation. Using `pip's ``--editable`` flag `_, changes you make to the code will be reflected when you run vLLM: - -.. code-block:: console - - $ git clone https://github.com/vllm-project/vllm.git - $ cd vllm - $ VLLM_USE_PRECOMPILED=1 pip install --editable . - -This will download the latest nightly wheel and use the compiled libraries from there in the install. - -The ``VLLM_PRECOMPILED_WHEEL_LOCATION`` environment variable can be used instead of ``VLLM_USE_PRECOMPILED`` to specify a custom path or URL to the wheel file. For example, to use the `0.6.1.post1 PyPi wheel `_: - -.. code-block:: console - - $ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://files.pythonhosted.org/packages/4a/4c/ee65ba33467a4c0de350ce29fbae39b9d0e7fcd887cc756fa993654d1228/vllm-0.6.3.post1-cp38-abi3-manylinux1_x86_64.whl - $ pip install --editable . 
- -You can find more information about vLLM's wheels `above <#install-the-latest-code>`_. - -.. note:: - - There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors. - It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to `the section above <#install-the-latest-code>`_ for instructions on how to install a specified wheel. - -Full build (with compilation) ------------------------------ - -If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes: - -.. code-block:: console - - $ git clone https://github.com/vllm-project/vllm.git - $ cd vllm - $ pip install -e . - -.. tip:: - - Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results. - - For example, you can install `ccache `_ using ``conda install ccache`` or ``apt install ccache`` . - As long as ``which ccache`` command can find the ``ccache`` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster. - - `sccache `_ works similarly to ``ccache``, but has the capability to utilize caching in remote storage environments. - The following environment variables can be set to configure the vLLM ``sccache`` remote: ``SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1``. We also recommend setting ``SCCACHE_IDLE_TIMEOUT=0``. - - -Use an existing PyTorch installation -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -There are scenarios where the PyTorch dependency cannot be easily installed via pip, e.g.: - -* Building vLLM with PyTorch nightly or a custom PyTorch build. -* Building vLLM with aarch64 and CUDA (GH200), where the PyTorch wheels are not available on PyPI. 
Currently, only the PyTorch nightly has wheels for aarch64 with CUDA. You can run ``pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124`` to `install PyTorch nightly `_, and then build vLLM on top of it. - -To build vLLM using an existing PyTorch installation: - -.. code-block:: console - - $ git clone https://github.com/vllm-project/vllm.git - $ cd vllm - $ python use_existing_torch.py - $ pip install -r requirements-build.txt - $ pip install -e . --no-build-isolation - - -Use the local cutlass for compilation -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Currently, before starting the build process, vLLM fetches cutlass code from GitHub. However, there may be scenarios where you want to use a local version of cutlass instead. -To achieve this, you can set the environment variable VLLM_CUTLASS_SRC_DIR to point to your local cutlass directory. - -.. code-block:: console - - $ git clone https://github.com/vllm-project/vllm.git - $ cd vllm - $ VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e . - - -Troubleshooting -~~~~~~~~~~~~~~~ - -To avoid your system being overloaded, you can limit the number of compilation jobs -to be run simultaneously, via the environment variable ``MAX_JOBS``. For example: - -.. code-block:: console - - $ export MAX_JOBS=6 - $ pip install -e . - -This is especially useful when you are building on less powerful machines. For example, when you use WSL it only `assigns 50% of the total memory by default `_, so using ``export MAX_JOBS=1`` can avoid compiling multiple files simultaneously and running out of memory. -A side effect is a much slower build process. - -Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image. - -.. code-block:: console - - $ # Use `--ipc=host` to make sure the shared memory is large enough. 
- $ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3 - -If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from `the official website `_. After installation, set the environment variable ``CUDA_HOME`` to the installation path of CUDA Toolkit, and make sure that the ``nvcc`` compiler is in your ``PATH``, e.g.: - -.. code-block:: console - - $ export CUDA_HOME=/usr/local/cuda - $ export PATH="${CUDA_HOME}/bin:$PATH" - -Here is a sanity check to verify that the CUDA Toolkit is correctly installed: - -.. code-block:: console - - $ nvcc --version # verify that nvcc is in your PATH - $ ${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME - - -Unsupported OS build --------------------- - -vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems. - -Simply disable the ``VLLM_TARGET_DEVICE`` environment variable before installing: - -.. code-block:: console - - $ export VLLM_TARGET_DEVICE=empty - $ pip install -e . diff --git a/docs/source/getting_started/neuron-installation.md b/docs/source/getting_started/neuron-installation.md new file mode 100644 index 0000000000000..1adcefcb6124a --- /dev/null +++ b/docs/source/getting_started/neuron-installation.md @@ -0,0 +1,132 @@ +(installation-neuron)= + +# Installation with Neuron + +vLLM 0.3.3 onwards supports model inferencing and serving on AWS Trainium/Inferentia with Neuron SDK with continuous batching. +Paged Attention and Chunked Prefill are currently in development and will be available soon. +Data types currently supported in Neuron SDK are FP16 and BF16. 
+ +## Requirements + +- OS: Linux +- Python: 3.9 -- 3.11 +- Accelerator: NeuronCore_v2 (in trn1/inf2 instances) +- Pytorch 2.0.1/2.1.1 +- AWS Neuron SDK 2.16/2.17 (Verified on python 3.8) + +Installation steps: + +- {ref}`Build from source ` + + - {ref}`Step 0. Launch Trn1/Inf2 instances ` + - {ref}`Step 1. Install drivers and tools ` + - {ref}`Step 2. Install transformers-neuronx and its dependencies ` + - {ref}`Step 3. Install vLLM from source ` + +(build-from-source-neuron)= + +```{note} +The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with vLLM >= 0.5.3. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel. +``` + +## Build from source + +Following instructions are applicable to Neuron SDK 2.16 and beyond. + +(launch-instances)= + +### Step 0. Launch Trn1/Inf2 instances + +Here are the steps to launch trn1/inf2 instances, in order to install [PyTorch Neuron ("torch-neuronx") Setup on Ubuntu 22.04 LTS](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/neuron-setup/pytorch/neuronx/ubuntu/torch-neuronx-ubuntu22.html). + +- Please follow the instructions at [launch an Amazon EC2 Instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html#ec2-launch-instance) to launch an instance. When choosing the instance type at the EC2 console, please make sure to select the correct instance type. +- To get more information about instances sizes and pricing see: [Trn1 web page](https://aws.amazon.com/ec2/instance-types/trn1/), [Inf2 web page](https://aws.amazon.com/ec2/instance-types/inf2/) +- Select Ubuntu Server 22.04 TLS AMI +- When launching a Trn1/Inf2, please adjust your primary EBS volume size to a minimum of 512GB. 
+- After launching the instance, follow the instructions in [Connect to your instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AccessingInstancesLinux.html) to connect to the instance + +(install-drivers)= + +### Step 1. Install drivers and tools + +The installation of drivers and tools wouldn't be necessary, if [Deep Learning AMI Neuron](https://docs.aws.amazon.com/dlami/latest/devguide/appendix-ami-release-notes.html) is installed. In case the drivers and tools are not installed on the operating system, follow the steps below: + +```console +# Configure Linux for Neuron repository updates +. /etc/os-release +sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <` - - - :ref:`Step 0. Launch Trn1/Inf2 instances ` - - :ref:`Step 1. Install drivers and tools ` - - :ref:`Step 2. Install transformers-neuronx and its dependencies ` - - :ref:`Step 3. Install vLLM from source ` - -.. _build_from_source_neuron: - -.. note:: - - The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with vLLM >= 0.5.3. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel. - -Build from source ------------------ - -Following instructions are applicable to Neuron SDK 2.16 and beyond. - -.. _launch_instances: - -Step 0. Launch Trn1/Inf2 instances -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Here are the steps to launch trn1/inf2 instances, in order to install `PyTorch Neuron ("torch-neuronx") Setup on Ubuntu 22.04 LTS `_. - -- Please follow the instructions at `launch an Amazon EC2 Instance `_ to launch an instance. When choosing the instance type at the EC2 console, please make sure to select the correct instance type. 
-- To get more information about instances sizes and pricing see: `Trn1 web page `_, `Inf2 web page `_ -- Select Ubuntu Server 22.04 TLS AMI -- When launching a Trn1/Inf2, please adjust your primary EBS volume size to a minimum of 512GB. -- After launching the instance, follow the instructions in `Connect to your instance `_ to connect to the instance - -.. _install_drivers: - -Step 1. Install drivers and tools -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The installation of drivers and tools wouldn't be necessary, if `Deep Learning AMI Neuron `_ is installed. In case the drivers and tools are not installed on the operating system, follow the steps below: - -.. code-block:: console - - # Configure Linux for Neuron repository updates - . /etc/os-release - sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <`_ will be the backend to support inference on trn1/inf2 instances. -Follow the steps below to install transformer-neuronx package and its dependencies. - -.. code-block:: console - - # Install Python venv - sudo apt-get install -y python3.10-venv g++ - - # Create Python venv - python3.10 -m venv aws_neuron_venv_pytorch - - # Activate Python venv - source aws_neuron_venv_pytorch/bin/activate - - # Install Jupyter notebook kernel - pip install ipykernel - python3.10 -m ipykernel install --user --name aws_neuron_venv_pytorch --display-name "Python (torch-neuronx)" - pip install jupyter notebook - pip install environment_kernels - - # Set pip repository pointing to the Neuron repository - python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com - - # Install wget, awscli - python -m pip install wget - python -m pip install awscli - - # Update Neuron Compiler and Framework - python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torchvision transformers-neuronx - -.. _install_vllm: - -Step 3. 
Install vLLM from source -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Once neuronx-cc and transformers-neuronx packages are installed, we will be able to install vllm as follows: - -.. code-block:: console - - $ git clone https://github.com/vllm-project/vllm.git - $ cd vllm - $ pip install -U -r requirements-neuron.txt - $ VLLM_TARGET_DEVICE="neuron" pip install . - -If neuron packages are detected correctly in the installation process, ``vllm-0.3.0+neuron212`` will be installed. diff --git a/docs/source/getting_started/openvino-installation.md b/docs/source/getting_started/openvino-installation.md new file mode 100644 index 0000000000000..275ebd9b1892d --- /dev/null +++ b/docs/source/getting_started/openvino-installation.md @@ -0,0 +1,104 @@ +(installation-openvino)= + +# Installation with OpenVINO + +vLLM powered by OpenVINO supports all LLM models from {doc}`vLLM supported models list <../models/supported_models>` and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)). OpenVINO vLLM backend supports the following advanced vLLM features: + +- Prefix caching (`--enable-prefix-caching`) +- Chunked prefill (`--enable-chunked-prefill`) + +**Table of contents**: + +- {ref}`Requirements ` +- {ref}`Quick start using Dockerfile ` +- {ref}`Build from source ` +- {ref}`Performance tips ` +- {ref}`Limitations ` + +(openvino-backend-requirements)= + +## Requirements + +- OS: Linux +- Instruction set architecture (ISA) requirement: at least AVX2. + +(openvino-backend-quick-start-dockerfile)= + +## Quick start using Dockerfile + +```console +$ docker build -f Dockerfile.openvino -t vllm-openvino-env . +$ docker run -it --rm vllm-openvino-env +``` + +(install-openvino-backend-from-source)= + +## Install from source + +- First, install Python. 
For example, on Ubuntu 22.04, you can run: + + ```console + $ sudo apt-get update -y + $ sudo apt-get install python3 + ``` + +- Second, install prerequisites vLLM OpenVINO backend installation: + + ```console + $ pip install --upgrade pip + $ pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu + ``` + +- Finally, install vLLM with OpenVINO backend: + + ```console + $ PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v . + ``` + +- [Optional] To use vLLM OpenVINO backend with a GPU device, ensure your system is properly set up. Follow the instructions provided here: [https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html](https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html). + +(openvino-backend-performance-tips)= + +## Performance tips + +### vLLM OpenVINO backend environment variables + +- `VLLM_OPENVINO_DEVICE` to specify which device utilize for the inference. If there are multiple GPUs in the system, additional indexes can be used to choose the proper one (e.g, `VLLM_OPENVINO_DEVICE=GPU.1`). If the value is not specified, CPU device is used by default. +- `VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON` to enable U8 weights compression during model loading stage. By default, compression is turned off. You can also export model with different compression techniques using `optimum-cli` and pass exported folder as `` + +### CPU performance tips + +CPU uses the following environment variables to control behavior: + +- `VLLM_OPENVINO_KVCACHE_SPACE` to specify the KV Cache size (e.g, `VLLM_OPENVINO_KVCACHE_SPACE=40` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. +- `VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8` to control KV cache precision. 
By default, FP16 / BF16 is used depending on platform. + +To enable better TPOT / TTFT latency, you can use vLLM's chunked prefill feature (`--enable-chunked-prefill`). Based on the experiments, the recommended batch size is `256` (`--max-num-batched-tokens`) + +OpenVINO best known configuration for CPU is: + +```console +$ VLLM_OPENVINO_KVCACHE_SPACE=100 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \ + python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --enable-chunked-prefill --max-num-batched-tokens 256 +``` + +### GPU performance tips + +GPU device implements the logic for automatic detection of available GPU memory and, by default, tries to reserve as much memory as possible for the KV cache (taking into account `gpu_memory_utilization` option). However, this behavior can be overridden by explicitly specifying the desired amount of memory for the KV cache using `VLLM_OPENVINO_KVCACHE_SPACE` environment variable (e.g, `VLLM_OPENVINO_KVCACHE_SPACE=8` means 8 GB space for KV cache). + +Currently, the best performance using GPU can be achieved with the default vLLM execution parameters for models with quantized weights (8 and 4-bit integer data types are supported) and `preemption-mode=swap`. + +OpenVINO best known configuration for GPU is: + +```console +$ VLLM_OPENVINO_DEVICE=GPU VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \ + python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json +``` + +(openvino-backend-limitations)= + +## Limitations + +- LoRA serving is not supported. +- Only LLM models are currently supported. LLaVa and encoder-decoder models are not currently enabled in vLLM OpenVINO integration. +- Tensor and pipeline parallelism are not currently enabled in vLLM integration. 
diff --git a/docs/source/getting_started/openvino-installation.rst b/docs/source/getting_started/openvino-installation.rst deleted file mode 100644 index 5eeb7c78f7e51..0000000000000 --- a/docs/source/getting_started/openvino-installation.rst +++ /dev/null @@ -1,116 +0,0 @@ -.. _installation_openvino: - -Installation with OpenVINO -========================== - -vLLM powered by OpenVINO supports all LLM models from :doc:`vLLM supported models list <../models/supported_models>` and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs (`the list of supported GPUs `_). OpenVINO vLLM backend supports the following advanced vLLM features: - -- Prefix caching (``--enable-prefix-caching``) -- Chunked prefill (``--enable-chunked-prefill``) - -**Table of contents**: - -- :ref:`Requirements ` -- :ref:`Quick start using Dockerfile ` -- :ref:`Build from source ` -- :ref:`Performance tips ` -- :ref:`Limitations ` - -.. _openvino_backend_requirements: - -Requirements ------------- - -* OS: Linux -* Instruction set architecture (ISA) requirement: at least AVX2. - -.. _openvino_backend_quick_start_dockerfile: - -Quick start using Dockerfile ----------------------------- - -.. code-block:: console - - $ docker build -f Dockerfile.openvino -t vllm-openvino-env . - $ docker run -it --rm vllm-openvino-env - -.. _install_openvino_backend_from_source: - -Install from source -------------------- - -- First, install Python. For example, on Ubuntu 22.04, you can run: - - .. code-block:: console - - $ sudo apt-get update -y - $ sudo apt-get install python3 - -- Second, install prerequisites vLLM OpenVINO backend installation: - - .. code-block:: console - - $ pip install --upgrade pip - $ pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu - -- Finally, install vLLM with OpenVINO backend: - - .. 
code-block:: console - - $ PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v . - -- [Optional] To use vLLM OpenVINO backend with a GPU device, ensure your system is properly set up. Follow the instructions provided here: `https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html `_. - -.. _openvino_backend_performance_tips: - -Performance tips ----------------- - -vLLM OpenVINO backend environment variables -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -- ``VLLM_OPENVINO_DEVICE`` to specify which device utilize for the inference. If there are multiple GPUs in the system, additional indexes can be used to choose the proper one (e.g, ``VLLM_OPENVINO_DEVICE=GPU.1``). If the value is not specified, CPU device is used by default. - -- ``VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON`` to enable U8 weights compression during model loading stage. By default, compression is turned off. You can also export model with different compression techniques using `optimum-cli` and pass exported folder as `` - -CPU performance tips -~~~~~~~~~~~~~~~~~~~~ - -CPU uses the following environment variables to control behavior: - -- ``VLLM_OPENVINO_KVCACHE_SPACE`` to specify the KV Cache size (e.g, ``VLLM_OPENVINO_KVCACHE_SPACE=40`` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. - -- ``VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8`` to control KV cache precision. By default, FP16 / BF16 is used depending on platform. - -To enable better TPOT / TTFT latency, you can use vLLM's chunked prefill feature (``--enable-chunked-prefill``). Based on the experiments, the recommended batch size is ``256`` (``--max-num-batched-tokens``) - -OpenVINO best known configuration for CPU is: - -.. 
code-block:: console - - $ VLLM_OPENVINO_KVCACHE_SPACE=100 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \ - python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --enable-chunked-prefill --max-num-batched-tokens 256 - -GPU performance tips -~~~~~~~~~~~~~~~~~~~~ -GPU device implements the logic for automatic detection of available GPU memory and, by default, tries to reserve as much memory as possible for the KV cache (taking into account ``gpu_memory_utilization`` option). However, this behavior can be overridden by explicitly specifying the desired amount of memory for the KV cache using ``VLLM_OPENVINO_KVCACHE_SPACE`` environment variable (e.g, ``VLLM_OPENVINO_KVCACHE_SPACE=8`` means 8 GB space for KV cache). - -Currently, the best performance using GPU can be achieved with the default vLLM execution parameters for models with quantized weights (8 and 4-bit integer data types are supported) and `preemption-mode=swap`. - -OpenVINO best known configuration for GPU is: - -.. code-block:: console - - $ VLLM_OPENVINO_DEVICE=GPU VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \ - python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json - -.. _openvino_backend_limitations: - -Limitations ------------ - -- LoRA serving is not supported. - -- Only LLM models are currently supported. LLaVa and encoder-decoder models are not currently enabled in vLLM OpenVINO integration. - -- Tensor and pipeline parallelism are not currently enabled in vLLM integration. 
diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md new file mode 100644 index 0000000000000..5cb39791efcf3 --- /dev/null +++ b/docs/source/getting_started/quickstart.md @@ -0,0 +1,174 @@ +(quickstart)= + +# Quickstart + +This guide will help you quickly get started with vLLM to: + +- {ref}`Run offline batched inference ` +- {ref}`Run OpenAI-compatible inference ` + +## Prerequisites + +- OS: Linux +- Python: 3.9 -- 3.12 +- GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) + +## Installation + +You can install vLLM using pip. It's recommended to use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments. + +```console +$ conda create -n myenv python=3.10 -y +$ conda activate myenv +$ pip install vllm +``` + +Please refer to the {ref}`installation documentation ` for more details on installing vLLM. + +(offline-batched-inference)= + +## Offline Batched Inference + +With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). The example script for this section can be found [here](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference.py). + +The first line of this example imports the classes {class}`~vllm.LLM` and {class}`~vllm.SamplingParams`: + +- {class}`~vllm.LLM` is the main class for running offline inference with vLLM engine. +- {class}`~vllm.SamplingParams` specifies the parameters for the sampling process. + +```python +from vllm import LLM, SamplingParams +``` + +The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. 
You can find more information about the sampling parameters [here](https://docs.vllm.ai/en/stable/dev/sampling_params.html). + +```python +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) +``` + +The {class}`~vllm.LLM` class initializes vLLM's engine and the [OPT-125M model](https://arxiv.org/abs/2205.01068) for offline inference. The list of supported models can be found {ref}`here `. + +```python +llm = LLM(model="facebook/opt-125m") +``` + +```{note} +By default, vLLM downloads models from [HuggingFace](https://huggingface.co/). If you would like to use models from [ModelScope](https://www.modelscope.cn), set the environment variable `VLLM_USE_MODELSCOPE` before initializing the engine. +``` + +Now, the fun part! The outputs are generated using `llm.generate`. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of `RequestOutput` objects, which include all of the output tokens. + +```python +outputs = llm.generate(prompts, sampling_params) + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` + +(openai-compatible-server)= + +## OpenAI-Compatible Server + +vLLM can be deployed as a server that implements the OpenAI API protocol. This allows vLLM to be used as a drop-in replacement for applications using OpenAI API. +By default, it starts the server at `http://localhost:8000`. You can specify the address with `--host` and `--port` arguments. 
The server currently hosts one model at a time and implements endpoints such as [list models](https://platform.openai.com/docs/api-reference/models/list), [create chat completion](https://platform.openai.com/docs/api-reference/chat/completions/create), and [create completion](https://platform.openai.com/docs/api-reference/completions/create) endpoints. + +Run the following command to start the vLLM server with the [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) model: + +```console +$ vllm serve Qwen/Qwen2.5-1.5B-Instruct +``` + +```{note} +By default, the server uses a predefined chat template stored in the tokenizer. You can learn about overriding it [here](https://github.com/vllm-project/vllm/blob/main/docs/source/serving/openai_compatible_server.md#chat-template). +``` + +This server can be queried in the same format as OpenAI API. For example, to list the models: + +```console +$ curl http://localhost:8000/v1/models +``` + +You can pass in the argument `--api-key` or environment variable `VLLM_API_KEY` to enable the server to check for API key in the header. + +### OpenAI Completions API with vLLM + +Once your server is started, you can query the model with input prompts: + +```console +$ curl http://localhost:8000/v1/completions \ +$ -H "Content-Type: application/json" \ +$ -d '{ +$ "model": "Qwen/Qwen2.5-1.5B-Instruct", +$ "prompt": "San Francisco is a", +$ "max_tokens": 7, +$ "temperature": 0 +$ }' +``` + +Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the `openai` python package: + +```python +from openai import OpenAI + +# Modify OpenAI's API key and API base to use vLLM's API server. 
+openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" +client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, +) +completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct", + prompt="San Francisco is a") +print("Completion result:", completion) +``` + +A more detailed client example can be found [here](https://github.com/vllm-project/vllm/blob/main/examples/openai_completion_client.py). + +### OpenAI Chat Completions API with vLLM + +vLLM is designed to also support the OpenAI Chat Completions API. The chat interface is a more dynamic, interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations. + +You can use the [create chat completion](https://platform.openai.com/docs/api-reference/chat/completions/create) endpoint to interact with the model: + +```console +$ curl http://localhost:8000/v1/chat/completions \ +$ -H "Content-Type: application/json" \ +$ -d '{ +$ "model": "Qwen/Qwen2.5-1.5B-Instruct", +$ "messages": [ +$ {"role": "system", "content": "You are a helpful assistant."}, +$ {"role": "user", "content": "Who won the world series in 2020?"} +$ ] +$ }' +``` + +Alternatively, you can use the `openai` python package: + +```python +from openai import OpenAI +# Set OpenAI's API key and API base to use vLLM's API server. 
+openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + +client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, +) + +chat_response = client.chat.completions.create( + model="Qwen/Qwen2.5-1.5B-Instruct", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Tell me a joke."}, + ] +) +print("Chat response:", chat_response) +``` diff --git a/docs/source/getting_started/quickstart.rst b/docs/source/getting_started/quickstart.rst deleted file mode 100644 index 0c0491c860563..0000000000000 --- a/docs/source/getting_started/quickstart.rst +++ /dev/null @@ -1,181 +0,0 @@ -.. _quickstart: - -========== -Quickstart -========== - -This guide will help you quickly get started with vLLM to: - -* :ref:`Run offline batched inference ` -* :ref:`Run OpenAI-compatible inference ` - -Prerequisites --------------- -- OS: Linux -- Python: 3.9 -- 3.12 -- GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) - -Installation --------------- - -You can install vLLM using pip. It's recommended to use `conda `_ to create and manage Python environments. - -.. code-block:: console - - $ conda create -n myenv python=3.10 -y - $ conda activate myenv - $ pip install vllm - -Please refer to the :ref:`installation documentation ` for more details on installing vLLM. - -.. _offline_batched_inference: - -Offline Batched Inference -------------------------- - -With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). The example script for this section can be found `here `__. - -The first line of this example imports the classes :class:`~vllm.LLM` and :class:`~vllm.SamplingParams`: - -- :class:`~vllm.LLM` is the main class for running offline inference with vLLM engine. -- :class:`~vllm.SamplingParams` specifies the parameters for the sampling process. - -.. 
code-block:: python - - from vllm import LLM, SamplingParams - -The next section defines a list of input prompts and sampling parameters for text generation. The `sampling temperature `_ is set to ``0.8`` and the `nucleus sampling probability `_ is set to ``0.95``. You can find more information about the sampling parameters `here `__. - -.. code-block:: python - - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - -The :class:`~vllm.LLM` class initializes vLLM's engine and the `OPT-125M model `_ for offline inference. The list of supported models can be found :ref:`here `. - -.. code-block:: python - - llm = LLM(model="facebook/opt-125m") - -.. note:: - - By default, vLLM downloads models from `HuggingFace `_. If you would like to use models from `ModelScope `_, set the environment variable ``VLLM_USE_MODELSCOPE`` before initializing the engine. - -Now, the fun part! The outputs are generated using ``llm.generate``. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of ``RequestOutput`` objects, which include all of the output tokens. - -.. code-block:: python - - outputs = llm.generate(prompts, sampling_params) - - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -.. _openai_compatible_server: - -OpenAI-Compatible Server ------------------------- - -vLLM can be deployed as a server that implements the OpenAI API protocol. This allows vLLM to be used as a drop-in replacement for applications using OpenAI API. -By default, it starts the server at ``http://localhost:8000``. You can specify the address with ``--host`` and ``--port`` arguments. 
The server currently hosts one model at a time and implements endpoints such as `list models `_, `create chat completion `_, and `create completion `_ endpoints. - -Run the following command to start the vLLM server with the `Qwen2.5-1.5B-Instruct `_ model: - -.. code-block:: console - - $ vllm serve Qwen/Qwen2.5-1.5B-Instruct - -.. note:: - - By default, the server uses a predefined chat template stored in the tokenizer. You can learn about overriding it `here `__. - -This server can be queried in the same format as OpenAI API. For example, to list the models: - -.. code-block:: console - - $ curl http://localhost:8000/v1/models - -You can pass in the argument ``--api-key`` or environment variable ``VLLM_API_KEY`` to enable the server to check for API key in the header. - -OpenAI Completions API with vLLM -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Once your server is started, you can query the model with input prompts: - -.. code-block:: console - - $ curl http://localhost:8000/v1/completions \ - $ -H "Content-Type: application/json" \ - $ -d '{ - $ "model": "Qwen/Qwen2.5-1.5B-Instruct", - $ "prompt": "San Francisco is a", - $ "max_tokens": 7, - $ "temperature": 0 - $ }' - -Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the ``openai`` python package: - -.. code-block:: python - - from openai import OpenAI - - # Modify OpenAI's API key and API base to use vLLM's API server. - openai_api_key = "EMPTY" - openai_api_base = "http://localhost:8000/v1" - client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, - ) - completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct", - prompt="San Francisco is a") - print("Completion result:", completion) - -A more detailed client example can be found `here `__. 
- -OpenAI Chat Completions API with vLLM -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -vLLM is designed to also support the OpenAI Chat Completions API. The chat interface is a more dynamic, interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations. - -You can use the `create chat completion `_ endpoint to interact with the model: - -.. code-block:: console - - $ curl http://localhost:8000/v1/chat/completions \ - $ -H "Content-Type: application/json" \ - $ -d '{ - $ "model": "Qwen/Qwen2.5-1.5B-Instruct", - $ "messages": [ - $ {"role": "system", "content": "You are a helpful assistant."}, - $ {"role": "user", "content": "Who won the world series in 2020?"} - $ ] - $ }' - -Alternatively, you can use the ``openai`` python package: - -.. code-block:: python - - from openai import OpenAI - # Set OpenAI's API key and API base to use vLLM's API server. - openai_api_key = "EMPTY" - openai_api_base = "http://localhost:8000/v1" - - client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, - ) - - chat_response = client.chat.completions.create( - model="Qwen/Qwen2.5-1.5B-Instruct", - messages=[ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "Tell me a joke."}, - ] - ) - print("Chat response:", chat_response) diff --git a/docs/source/getting_started/tpu-installation.md b/docs/source/getting_started/tpu-installation.md new file mode 100644 index 0000000000000..f4916460026d1 --- /dev/null +++ b/docs/source/getting_started/tpu-installation.md @@ -0,0 +1,193 @@ +(installation-tpu)= + +# Installation with TPU + +Tensor Processing Units (TPUs) are Google's custom-developed application-specific +integrated circuits (ASICs) used to accelerate machine learning workloads. TPUs +are available in different versions each with different hardware specifications. 
+For more information about TPUs, see [TPU System Architecture](https://cloud.google.com/tpu/docs/system-architecture-tpu-vm).
+For more information on the TPU versions supported with vLLM, see:
+
+- [TPU v6e](https://cloud.google.com/tpu/docs/v6e)
+- [TPU v5e](https://cloud.google.com/tpu/docs/v5e)
+- [TPU v5p](https://cloud.google.com/tpu/docs/v5p)
+- [TPU v4](https://cloud.google.com/tpu/docs/v4)
+
+These TPU versions allow you to configure the physical arrangements of the TPU
+chips. This can improve throughput and networking performance. For more
+information see:
+
+- [TPU v6e topologies](https://cloud.google.com/tpu/docs/v6e#configurations)
+- [TPU v5e topologies](https://cloud.google.com/tpu/docs/v5e#tpu-v5e-config)
+- [TPU v5p topologies](https://cloud.google.com/tpu/docs/v5p#tpu-v5p-config)
+- [TPU v4 topologies](https://cloud.google.com/tpu/docs/v4#tpu-v4-config)
+
+In order for you to use Cloud TPUs you need to have TPU quota granted to your
+Google Cloud Platform project. TPU quotas specify how many TPUs you can use in a
+GCP project and are specified in terms of TPU version, the number of TPUs you
+want to use, and quota type. For more information, see [TPU quota](https://cloud.google.com/tpu/docs/quota#tpu_quota).
+
+For TPU pricing information, see [Cloud TPU pricing](https://cloud.google.com/tpu/pricing).
+
+You may need additional persistent storage for your TPU VMs. For more
+information, see [Storage options for Cloud TPU data](https://cloud.google.com/tpu/docs/storage-options).
+
+## Requirements
+
+- Google Cloud TPU VM
+- TPU versions: v6e, v5e, v5p, v4
+- Python: 3.10 or newer
+
+### Provision Cloud TPUs
+
+You can provision Cloud TPUs using the [Cloud TPU API](https://cloud.google.com/tpu/docs/reference/rest)
+or the [queued resources](https://cloud.google.com/tpu/docs/queued-resources)
+API. This section shows how to create TPUs using the queued resource API.
For +more information about using the Cloud TPU API, see [Create a Cloud TPU using the Create Node API](https://cloud.google.com/tpu/docs/managing-tpus-tpu-vm#create-node-api). +Queued resources enable you to request Cloud TPU resources in a queued manner. +When you request queued resources, the request is added to a queue maintained by +the Cloud TPU service. When the requested resource becomes available, it's +assigned to your Google Cloud project for your immediate exclusive use. + +```{note} +In all of the following commands, replace the ALL CAPS parameter names with +appropriate values. See the parameter descriptions table for more information. +``` + +## Provision a Cloud TPU with the queued resource API + +Create a TPU v5e with 4 TPU chips: + +```console +gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \ +--node-id TPU_NAME \ +--project PROJECT_ID \ +--zone ZONE \ +--accelerator-type ACCELERATOR_TYPE \ +--runtime-version RUNTIME_VERSION \ +--service-account SERVICE_ACCOUNT +``` + +```{eval-rst} +.. list-table:: Parameter descriptions + :header-rows: 1 + + * - Parameter name + - Description + * - QUEUED_RESOURCE_ID + - The user-assigned ID of the queued resource request. + * - TPU_NAME + - The user-assigned name of the TPU which is created when the queued + resource request is allocated. + * - PROJECT_ID + - Your Google Cloud project + * - ZONE + - The GCP zone where you want to create your Cloud TPU. The value you use + depends on the version of TPUs you are using. For more information, see + `TPU regions and zones `_ + * - ACCELERATOR_TYPE + - The TPU version you want to use. Specify the TPU version, for example + `v5litepod-4` specifies a v5e TPU with 4 cores. For more information, + see `TPU versions `_. + * - RUNTIME_VERSION + - The TPU VM runtime version to use. For more information see `TPU VM images `_. + * - SERVICE_ACCOUNT + - The email address for your service account. 
You can find it in the IAM + Cloud Console under *Service Accounts*. For example: + `tpu-service-account@.iam.gserviceaccount.com` +``` + +Connect to your TPU using SSH: + +```bash +gcloud compute tpus tpu-vm ssh TPU_NAME --zone ZONE +``` + +Install Miniconda + +```bash +wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh +bash Miniconda3-latest-Linux-x86_64.sh +source ~/.bashrc +``` + +Create and activate a Conda environment for vLLM: + +```bash +conda create -n vllm python=3.10 -y +conda activate vllm +``` + +Clone the vLLM repository and go to the vLLM directory: + +```bash +git clone https://github.com/vllm-project/vllm.git && cd vllm +``` + +Uninstall the existing `torch` and `torch_xla` packages: + +```bash +pip uninstall torch torch-xla -y +``` + +Install build dependencies: + +```bash +pip install -r requirements-tpu.txt +sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev +``` + +Run the setup script: + +```bash +VLLM_TARGET_DEVICE="tpu" python setup.py develop +``` + +## Provision Cloud TPUs with GKE + +For more information about using TPUs with GKE, see + + + + +(build-docker-tpu)= + +## Build a docker image with {code}`Dockerfile.tpu` + +You can use [Dockerfile.tpu](https://github.com/vllm-project/vllm/blob/main/Dockerfile.tpu) +to build a Docker image with TPU support. + +```console +$ docker build -f Dockerfile.tpu -t vllm-tpu . +``` + +Run the Docker image with the following command: + +```console +$ # Make sure to add `--privileged --net host --shm-size=16G`. +$ docker run --privileged --net host --shm-size=16G -it vllm-tpu +``` + +```{note} +Since TPU relies on XLA which requires static shapes, vLLM bucketizes the +possible input shapes and compiles an XLA graph for each shape. The +compilation time may take 20~30 minutes in the first run. 
However, the +compilation time reduces to ~5 minutes afterwards because the XLA graphs are +cached in the disk (in {code}`VLLM_XLA_CACHE_PATH` or {code}`~/.cache/vllm/xla_cache` by default). +``` + +````{tip} +If you encounter the following error: + +```console +from torch._C import * # noqa: F403 +ImportError: libopenblas.so.0: cannot open shared object file: No such +file or directory +``` + +Install OpenBLAS with the following command: + +```console +$ sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev +``` +```` diff --git a/docs/source/getting_started/tpu-installation.rst b/docs/source/getting_started/tpu-installation.rst deleted file mode 100644 index 22cc684a1c778..0000000000000 --- a/docs/source/getting_started/tpu-installation.rst +++ /dev/null @@ -1,200 +0,0 @@ -.. _installation_tpu: - -##################### -Installation with TPU -##################### - -Tensor Processing Units (TPUs) are Google's custom-developed application-specific -integrated circuits (ASICs) used to accelerate machine learning workloads. TPUs -are available in different versions each with different hardware specifications. -For more information about TPUs, see `TPU System Architecture `_. -For more information on the TPU versions supported with vLLM, see: - -* `TPU v6e `_ -* `TPU v5e `_ -* `TPU v5p `_ -* `TPU v4 `_ - -These TPU versions allow you to configure the physical arrangements of the TPU -chips. This can improve throughput and networking performance. For more -information see: - -* `TPU v6e topologies `_ -* `TPU v5e topologies `_ -* `TPU v5p topologies `_ -* `TPU v4 topologies `_ - -In order for you to use Cloud TPUs you need to have TPU quota granted to your -Google Cloud Platform project. TPU quotas specify how many TPUs you can use in a -GPC project and are specified in terms of TPU version, the number of TPU you -want to use, and quota type. For more information, see `TPU quota `_. - -For TPU pricing information, see `Cloud TPU pricing `_. 
- -You may need additional persistent storage for your TPU VMs. For more -information, see `Storage options for Cloud TPU data `_. - -Requirements ------------- - -* Google Cloud TPU VM -* TPU versions: v6e, v5e, v5p, v4 -* Python: 3.10 or newer - -Provision Cloud TPUs -==================== - -You can provision Cloud TPUs using the `Cloud TPU API `_ -or the `queued resources `_ -API. This section shows how to create TPUs using the queued resource API. For -more information about using the Cloud TPU API, see `Create a Cloud TPU using the Create Node API `_. -Queued resources enable you to request Cloud TPU resources in a queued manner. -When you request queued resources, the request is added to a queue maintained by -the Cloud TPU service. When the requested resource becomes available, it's -assigned to your Google Cloud project for your immediate exclusive use. - -.. note:: - In all of the following commands, replace the ALL CAPS parameter names with - appropriate values. See the parameter descriptions table for more information. - -Provision a Cloud TPU with the queued resource API --------------------------------------------------- -Create a TPU v5e with 4 TPU chips: - -.. code-block:: console - - gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \ - --node-id TPU_NAME \ - --project PROJECT_ID \ - --zone ZONE \ - --accelerator-type ACCELERATOR_TYPE \ - --runtime-version RUNTIME_VERSION \ - --service-account SERVICE_ACCOUNT - - -.. list-table:: Parameter descriptions - :header-rows: 1 - - * - Parameter name - - Description - * - QUEUED_RESOURCE_ID - - The user-assigned ID of the queued resource request. - * - TPU_NAME - - The user-assigned name of the TPU which is created when the queued - resource request is allocated. - * - PROJECT_ID - - Your Google Cloud project - * - ZONE - - The GCP zone where you want to create your Cloud TPU. The value you use - depends on the version of TPUs you are using. 
For more information, see - `TPU regions and zones `_ - * - ACCELERATOR_TYPE - - The TPU version you want to use. Specify the TPU version, for example - `v5litepod-4` specifies a v5e TPU with 4 cores. For more information, - see `TPU versions `_. - * - RUNTIME_VERSION - - The TPU VM runtime version to use. For more information see `TPU VM images `_. - * - SERVICE_ACCOUNT - - The email address for your service account. You can find it in the IAM - Cloud Console under *Service Accounts*. For example: - `tpu-service-account@.iam.gserviceaccount.com` - -Connect to your TPU using SSH: - -.. code-block:: bash - - gcloud compute tpus tpu-vm ssh TPU_NAME --zone ZONE - -Install Miniconda - -.. code-block:: bash - - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh - bash Miniconda3-latest-Linux-x86_64.sh - source ~/.bashrc - -Create and activate a Conda environment for vLLM: - -.. code-block:: bash - - conda create -n vllm python=3.10 -y - conda activate vllm - -Clone the vLLM repository and go to the vLLM directory: - -.. code-block:: bash - - git clone https://github.com/vllm-project/vllm.git && cd vllm - -Uninstall the existing `torch` and `torch_xla` packages: - -.. code-block:: bash - - pip uninstall torch torch-xla -y - -Install build dependencies: - -.. code-block:: bash - - pip install -r requirements-tpu.txt - sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev - -Run the setup script: - -.. code-block:: bash - - VLLM_TARGET_DEVICE="tpu" python setup.py develop - - -Provision Cloud TPUs with GKE ------------------------------ - -For more information about using TPUs with GKE, see -https://cloud.google.com/kubernetes-engine/docs/how-to/tpus -https://cloud.google.com/kubernetes-engine/docs/concepts/tpus -https://cloud.google.com/kubernetes-engine/docs/concepts/plan-tpus - -.. 
_build_docker_tpu: - -Build a docker image with :code:`Dockerfile.tpu` ------------------------------------------------- - -You can use `Dockerfile.tpu `_ -to build a Docker image with TPU support. - -.. code-block:: console - - $ docker build -f Dockerfile.tpu -t vllm-tpu . - -Run the Docker image with the following command: - -.. code-block:: console - - $ # Make sure to add `--privileged --net host --shm-size=16G`. - $ docker run --privileged --net host --shm-size=16G -it vllm-tpu - -.. note:: - - Since TPU relies on XLA which requires static shapes, vLLM bucketizes the - possible input shapes and compiles an XLA graph for each shape. The - compilation time may take 20~30 minutes in the first run. However, the - compilation time reduces to ~5 minutes afterwards because the XLA graphs are - cached in the disk (in :code:`VLLM_XLA_CACHE_PATH` or :code:`~/.cache/vllm/xla_cache` by default). - -.. tip:: - - If you encounter the following error: - - .. code-block:: console - - from torch._C import * # noqa: F403 - ImportError: libopenblas.so.0: cannot open shared object file: No such - file or directory - - - Install OpenBLAS with the following command: - - .. code-block:: console - - $ sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev - diff --git a/docs/source/getting_started/xpu-installation.md b/docs/source/getting_started/xpu-installation.md new file mode 100644 index 0000000000000..9b40ab01d7654 --- /dev/null +++ b/docs/source/getting_started/xpu-installation.md @@ -0,0 +1,74 @@ +(installation-xpu)= + +# Installation with XPU + +vLLM initially supports basic model inferencing and serving on Intel GPU platform. + +Table of contents: + +1. {ref}`Requirements ` +2. {ref}`Quick start using Dockerfile ` +3. 
{ref}`Build from source <build-xpu-backend-from-source>`
+
+(xpu-backend-requirements)=
+
+## Requirements
+
+- OS: Linux
+- Supported Hardware: Intel Data Center GPU, Intel ARC GPU
+- OneAPI requirements: oneAPI 2024.2
+
+(xpu-backend-quick-start-dockerfile)=
+
+## Quick start using Dockerfile
+
+```console
+$ docker build -f Dockerfile.xpu -t vllm-xpu-env --shm-size=4g .
+$ docker run -it \
+             --rm \
+             --network=host \
+             --device /dev/dri \
+             -v /dev/dri/by-path:/dev/dri/by-path \
+             vllm-xpu-env
+```
+
+(build-xpu-backend-from-source)=
+
+## Build from source
+
+- First, install required driver and intel OneAPI 2024.2 or later.
+- Second, install Python packages for vLLM XPU backend building:
+
+```console
+$ source /opt/intel/oneapi/setvars.sh
+$ pip install --upgrade pip
+$ pip install -v -r requirements-xpu.txt
+```
+
+- Finally, build and install vLLM XPU backend:
+
+```console
+$ VLLM_TARGET_DEVICE=xpu python setup.py install
+```
+
+```{note}
+- FP16 is the default data type in the current XPU backend. The BF16 data
+  type will be supported in the future.
+```
+
+## Distributed inference and serving
+
+XPU platform supports tensor-parallel inference/serving and also supports pipeline parallel as a beta feature for online serving. We require Ray as the distributed runtime backend. For example, a reference execution looks like the following:
+
+```console
+$ python -m vllm.entrypoints.openai.api_server \
+$     --model=facebook/opt-13b \
+$     --dtype=bfloat16 \
+$     --device=xpu \
+$     --max_model_len=1024 \
+$     --distributed-executor-backend=ray \
+$     --pipeline-parallel-size=2 \
+$     -tp=8
+```
+
+By default, a ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equal to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the helper [script](https://github.com/vllm-project/vllm/tree/main/examples/run_cluster.sh).
diff --git a/docs/source/getting_started/xpu-installation.rst b/docs/source/getting_started/xpu-installation.rst deleted file mode 100644 index b1868acbc84b0..0000000000000 --- a/docs/source/getting_started/xpu-installation.rst +++ /dev/null @@ -1,80 +0,0 @@ -.. _installation_xpu: - -Installation with XPU -======================== - -vLLM initially supports basic model inferencing and serving on Intel GPU platform. - -Table of contents: - -#. :ref:`Requirements ` -#. :ref:`Quick start using Dockerfile ` -#. :ref:`Build from source ` - -.. _xpu_backend_requirements: - -Requirements ------------- - -* OS: Linux -* Supported Hardware: Intel Data Center GPU, Intel ARC GPU -* OneAPI requirements: oneAPI 2024.2 - -.. _xpu_backend_quick_start_dockerfile: - -Quick start using Dockerfile ----------------------------- - -.. code-block:: console - - $ docker build -f Dockerfile.xpu -t vllm-xpu-env --shm-size=4g . - $ docker run -it \ - --rm \ - --network=host \ - --device /dev/dri \ - -v /dev/dri/by-path:/dev/dri/by-path \ - vllm-xpu-env - -.. _build_xpu_backend_from_source: - -Build from source ------------------ - -- First, install required driver and intel OneAPI 2024.2 or later. - -- Second, install Python packages for vLLM XPU backend building: - -.. code-block:: console - - $ source /opt/intel/oneapi/setvars.sh - $ pip install --upgrade pip - $ pip install -v -r requirements-xpu.txt - -- Finally, build and install vLLM XPU backend: - -.. code-block:: console - - $ VLLM_TARGET_DEVICE=xpu python setup.py install - -.. note:: - - FP16 is the default data type in the current XPU backend. The BF16 data - type will be supported in the future. - - -Distributed inference and serving ---------------------------------- - -XPU platform supports tensor-parallel inference/serving and also supports pipeline parallel as a beta feature for online serving. We requires Ray as the distributed runtime backend. For example, a reference execution likes following: - -.. 
code-block:: console - - $ python -m vllm.entrypoints.openai.api_server \ - $ --model=facebook/opt-13b \ - $ --dtype=bfloat16 \ - $ --device=xpu \ - $ --max_model_len=1024 \ - $ --distributed-executor-backend=ray \ - $ --pipeline-parallel-size=2 \ - $ -tp=8 - -By default, a ray instance will be launched automatically if no existing one is detected in system, with ``num-gpus`` equals to ``parallel_config.world_size``. We recommend properly starting a ray cluster before execution, referring helper `script `_. diff --git a/docs/source/index.md b/docs/source/index.md new file mode 100644 index 0000000000000..1e55847ab2b0b --- /dev/null +++ b/docs/source/index.md @@ -0,0 +1,197 @@ +# Welcome to vLLM! + +```{figure} ./assets/logos/vllm-logo-text-light.png +:align: center +:alt: vLLM +:class: no-scaled-link +:width: 60% +``` + +```{raw} html +

+Easy, fast, and cheap LLM serving for everyone + +

+ +

+ +Star +Watch +Fork +

+``` + +vLLM is a fast and easy-to-use library for LLM inference and serving. + +vLLM is fast with: + +- State-of-the-art serving throughput +- Efficient management of attention key and value memory with **PagedAttention** +- Continuous batching of incoming requests +- Fast model execution with CUDA/HIP graph +- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8 +- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer. +- Speculative decoding +- Chunked prefill + +vLLM is flexible and easy to use with: + +- Seamless integration with popular HuggingFace models +- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more +- Tensor parallelism and pipeline parallelism support for distributed inference +- Streaming outputs +- OpenAI-compatible API server +- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, PowerPC CPUs, TPU, and AWS Trainium and Inferentia Accelerators. +- Prefix caching support +- Multi-lora support + +For more information, check out the following: + +- [vLLM announcing blog post](https://vllm.ai) (intro to PagedAttention) +- [vLLM paper](https://arxiv.org/abs/2309.06180) (SOSP 2023) +- [How continuous batching enables 23x throughput in LLM inference while reducing p50 latency](https://www.anyscale.com/blog/continuous-batching-llm-inference) by Cade Daniel et al. +- {ref}`vLLM Meetups `. 
+ +## Documentation + +```{toctree} +:caption: Getting Started +:maxdepth: 1 + +getting_started/installation +getting_started/amd-installation +getting_started/openvino-installation +getting_started/cpu-installation +getting_started/gaudi-installation +getting_started/arm-installation +getting_started/neuron-installation +getting_started/tpu-installation +getting_started/xpu-installation +getting_started/quickstart +getting_started/debugging +getting_started/examples/examples_index +``` + +```{toctree} +:caption: Serving +:maxdepth: 1 + +serving/openai_compatible_server +serving/deploying_with_docker +serving/deploying_with_k8s +serving/deploying_with_helm +serving/deploying_with_nginx +serving/distributed_serving +serving/metrics +serving/integrations +serving/tensorizer +``` + +```{toctree} +:caption: Models +:maxdepth: 1 + +models/supported_models +models/generative_models +models/pooling_models +models/adding_model +models/enabling_multimodal_inputs +``` + +```{toctree} +:caption: Usage +:maxdepth: 1 + +usage/lora +usage/multimodal_inputs +usage/tool_calling +usage/structured_outputs +usage/spec_decode +usage/compatibility_matrix +usage/performance +usage/faq +usage/engine_args +usage/env_vars +usage/usage_stats +``` + +```{toctree} +:caption: Quantization +:maxdepth: 1 + +quantization/supported_hardware +quantization/auto_awq +quantization/bnb +quantization/gguf +quantization/int8 +quantization/fp8 +quantization/fp8_e5m2_kvcache +quantization/fp8_e4m3_kvcache +``` + +```{toctree} +:caption: Automatic Prefix Caching +:maxdepth: 1 + +automatic_prefix_caching/apc +automatic_prefix_caching/details +``` + +```{toctree} +:caption: Performance +:maxdepth: 1 + +performance/benchmarks +``` + +% Community: User community resources + +```{toctree} +:caption: Community +:maxdepth: 1 + +community/meetups +community/sponsors +``` + +% API Documentation: API reference aimed at vllm library usage + +```{toctree} +:caption: API Documentation +:maxdepth: 2 + 
+dev/sampling_params +dev/pooling_params +dev/offline_inference/offline_index +dev/engine/engine_index +``` + +% Design: docs about vLLM internals + +```{toctree} +:caption: Design +:maxdepth: 2 + +design/arch_overview +design/huggingface_integration +design/plugin_system +design/input_processing/model_inputs_index +design/kernel/paged_attention +design/multimodal/multimodal_index +``` + +% For Developers: contributing to the vLLM project + +```{toctree} +:caption: For Developers +:maxdepth: 2 + +contributing/overview +contributing/profiling/profiling_index +contributing/dockerfile/dockerfile +``` + +# Indices and tables + +- {ref}`genindex` +- {ref}`modindex` diff --git a/docs/source/index.rst b/docs/source/index.rst deleted file mode 100644 index 842013d6d49c4..0000000000000 --- a/docs/source/index.rst +++ /dev/null @@ -1,191 +0,0 @@ -Welcome to vLLM! -================ - -.. figure:: ./assets/logos/vllm-logo-text-light.png - :width: 60% - :align: center - :alt: vLLM - :class: no-scaled-link - -.. raw:: html - -

- Easy, fast, and cheap LLM serving for everyone - -

- -

- - Star - Watch - Fork -

- - - -vLLM is a fast and easy-to-use library for LLM inference and serving. - -vLLM is fast with: - -* State-of-the-art serving throughput -* Efficient management of attention key and value memory with **PagedAttention** -* Continuous batching of incoming requests -* Fast model execution with CUDA/HIP graph -* Quantization: `GPTQ `_, `AWQ `_, INT4, INT8, and FP8 -* Optimized CUDA kernels, including integration with FlashAttention and FlashInfer. -* Speculative decoding -* Chunked prefill - -vLLM is flexible and easy to use with: - -* Seamless integration with popular HuggingFace models -* High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more -* Tensor parallelism and pipeline parallelism support for distributed inference -* Streaming outputs -* OpenAI-compatible API server -* Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, PowerPC CPUs, TPU, and AWS Trainium and Inferentia Accelerators. -* Prefix caching support -* Multi-lora support - -For more information, check out the following: - -* `vLLM announcing blog post `_ (intro to PagedAttention) -* `vLLM paper `_ (SOSP 2023) -* `How continuous batching enables 23x throughput in LLM inference while reducing p50 latency `_ by Cade Daniel et al. -* :ref:`vLLM Meetups `. - - -Documentation -------------- - -.. toctree:: - :maxdepth: 1 - :caption: Getting Started - - getting_started/installation - getting_started/amd-installation - getting_started/openvino-installation - getting_started/cpu-installation - getting_started/gaudi-installation - getting_started/arm-installation - getting_started/neuron-installation - getting_started/tpu-installation - getting_started/xpu-installation - getting_started/quickstart - getting_started/debugging - getting_started/examples/examples_index - -.. 
toctree:: - :maxdepth: 1 - :caption: Serving - - serving/openai_compatible_server - serving/deploying_with_docker - serving/deploying_with_k8s - serving/deploying_with_helm - serving/deploying_with_nginx - serving/distributed_serving - serving/metrics - serving/integrations - serving/tensorizer - -.. toctree:: - :maxdepth: 1 - :caption: Models - - models/supported_models - models/generative_models - models/pooling_models - models/adding_model - models/enabling_multimodal_inputs - -.. toctree:: - :maxdepth: 1 - :caption: Usage - - usage/lora - usage/multimodal_inputs - usage/tool_calling - usage/structured_outputs - usage/spec_decode - usage/compatibility_matrix - usage/performance - usage/faq - usage/engine_args - usage/env_vars - usage/usage_stats - -.. toctree:: - :maxdepth: 1 - :caption: Quantization - - quantization/supported_hardware - quantization/auto_awq - quantization/bnb - quantization/gguf - quantization/int8 - quantization/fp8 - quantization/fp8_e5m2_kvcache - quantization/fp8_e4m3_kvcache - -.. toctree:: - :maxdepth: 1 - :caption: Automatic Prefix Caching - - automatic_prefix_caching/apc - automatic_prefix_caching/details - -.. toctree:: - :maxdepth: 1 - :caption: Performance - - performance/benchmarks - -.. Community: User community resources - -.. toctree:: - :maxdepth: 1 - :caption: Community - - community/meetups - community/sponsors - -.. API Documentation: API reference aimed at vllm library usage - -.. toctree:: - :maxdepth: 2 - :caption: API Documentation - - dev/sampling_params - dev/pooling_params - dev/offline_inference/offline_index - dev/engine/engine_index - -.. Design: docs about vLLM internals - -.. toctree:: - :maxdepth: 2 - :caption: Design - - design/arch_overview - design/huggingface_integration - design/plugin_system - design/input_processing/model_inputs_index - design/kernel/paged_attention - design/multimodal/multimodal_index - -.. For Developers: contributing to the vLLM project - -.. 
toctree:: - :maxdepth: 2 - :caption: For Developers - - contributing/overview - contributing/profiling/profiling_index - contributing/dockerfile/dockerfile - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` diff --git a/docs/source/models/adding_model.md b/docs/source/models/adding_model.md new file mode 100644 index 0000000000000..e28d6020c0a7e --- /dev/null +++ b/docs/source/models/adding_model.md @@ -0,0 +1,155 @@ +(adding-a-new-model)= + +# Adding a New Model + +This document provides a high-level guide on integrating a [HuggingFace Transformers](https://github.com/huggingface/transformers) model into vLLM. + +```{note} +The complexity of adding a new model depends heavily on the model's architecture. +The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM. +However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex. +``` + +```{note} +By default, vLLM models do not support multi-modal inputs. To enable multi-modal support, +please follow {ref}`this guide ` after implementing the model here. +``` + +```{tip} +If you are encountering issues while integrating your model into vLLM, feel free to open an issue on our [GitHub](https://github.com/vllm-project/vllm/issues) repository. +We will be happy to help you out! +``` + +## 0. Fork the vLLM repository + +Start by forking our [GitHub] repository and then {ref}`build it from source `. +This gives you the ability to modify the codebase and test your model. + +```{tip} +If you don't want to fork the repository and modify vLLM's codebase, please refer to the "Out-of-Tree Model Integration" section below. +``` + +## 1. Bring your model code + +Clone the PyTorch model code from the HuggingFace Transformers repository and put it into the [vllm/model_executor/models](https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models) directory. 
+For instance, vLLM's [OPT model](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/opt.py) was adapted from the HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file. + +```{warning} +When copying the model code, make sure to review and adhere to the code's copyright and licensing terms. +``` + +## 2. Make your code compatible with vLLM + +To ensure compatibility with vLLM, your model must meet the following requirements: + +### Initialization Code + +All vLLM modules within the model must include a `prefix` argument in their constructor. This `prefix` is typically the full name of the module in the model's state dictionary and is crucial for: + +- Runtime support: vLLM's attention operators are registered in a model's state by their full names. Each attention operator must have a unique prefix as its layer name to avoid conflicts. +- Non-uniform quantization support: A quantized checkpoint can selectively quantize certain layers while keeping others in full precision. By providing the `prefix` during initialization, vLLM can match the current layer's `prefix` with the quantization configuration to determine if the layer should be initialized in quantized mode. 
+ +The initialization code should look like this: + +```python +from torch import nn +from vllm.config import VllmConfig +from vllm.attention import Attention + +class MyAttention(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str): + super().__init__() + self.attn = Attention(prefix=f"{prefix}.attn") + +class MyDecoderLayer(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str): + super().__init__() + self.self_attn = MyAttention(prefix=f"{prefix}.self_attn") + +class MyModel(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str): + super().__init__() + self.layers = nn.ModuleList( + [MyDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") for i in range(vllm_config.model_config.hf_config.num_hidden_layers)] + ) + +class MyModelForCausalLM(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + self.model = MyModel(vllm_config, prefix=f"{prefix}.model") +``` + +### Computation Code + +Rewrite the {meth}`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension. + +```python +def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, +) -> torch.Tensor: + ... +``` + +```{note} +Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings. +If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM. +``` + +For reference, check out the [LLAMA model](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llama.py). vLLM already supports a large number of models. 
It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out the [vLLM models](https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models) directory for more examples. + +## 3. (Optional) Implement tensor parallelism and quantization support + +If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it. +To do this, substitute your model's linear and embedding layers with their tensor-parallel versions. +For the embedding layer, you can simply replace {class}`torch.nn.Embedding` with {code}`VocabParallelEmbedding`. For the output LM head, you can use {code}`ParallelLMHead`. +When it comes to the linear layers, we provide the following options to parallelize them: + +- {code}`ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving. +- {code}`RowParallelLinear`: The input tensor is partitioned along the hidden dimension. The weight matrix is partitioned along the rows (input dimension). An *all-reduce* operation is performed after the matrix multiplication to reduce the results. Typically used for the second FFN layer and the output linear transformation of the attention layer. +- {code}`ColumnParallelLinear`: The input tensor is replicated. The weight matrix is partitioned along the columns (output dimension). The result is partitioned along the column dimension. Typically used for the first FFN layer and the separated QKV transformation of the attention layer in the original Transformer. +- {code}`MergedColumnParallelLinear`: Column-parallel linear that merges multiple {code}`ColumnParallelLinear` operators. Typically used for the first FFN layer with weighted activation functions (e.g., SiLU). This class handles the sharded weight loading logic of multiple weight matrices. +- {code}`QKVParallelLinear`: Parallel linear layer for the query, key, and value projections of the multi-head and grouped-query attention mechanisms. 
When the number of key/value heads is less than the world size, this class replicates the key/value heads properly. This class handles the weight loading and replication of the weight matrices. + +Note that all the linear layers above take {code}`linear_method` as an input. vLLM will set this parameter according to different quantization schemes to support weight quantization. + +## 4. Implement the weight loading logic + +You now need to implement the {code}`load_weights` method in your {code}`*ForCausalLM` class. +This method should load the weights from the HuggingFace checkpoint file and assign them to the corresponding layers in your model. Specifically, for {code}`MergedColumnParallelLinear` and {code}`QKVParallelLinear` layers, if the original model has separated weight matrices, you need to load the different parts separately. + +## 5. Register your model + +Finally, register your {code}`*ForCausalLM` class to the {code}`_VLLM_MODELS` in [vllm/model_executor/models/registry.py](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/registry.py). + +## 6. Out-of-Tree Model Integration + +You can integrate a model without modifying the vLLM codebase. Steps 2, 3, and 4 are still required, but you can skip steps 1 and 5. Instead, write a plugin to register your model. For a general introduction to the plugin system, see {ref}`plugin_system`. 
+ +To register the model, use the following code: + +```python +from vllm import ModelRegistry +from your_code import YourModelForCausalLM +ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) +``` + +If your model imports modules that initialize CUDA, consider lazy-importing it to avoid errors like {code}`RuntimeError: Cannot re-initialize CUDA in forked subprocess`: + +```python +from vllm import ModelRegistry + +ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM") +``` + +```{important} +If your model is a multimodal model, ensure the model class implements the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. +Read more about that {ref}`here `. +``` + +```{note} +Although you can directly put these code snippets in your script using `vllm.LLM`, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server. +``` diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst deleted file mode 100644 index df06d736ca86b..0000000000000 --- a/docs/source/models/adding_model.rst +++ /dev/null @@ -1,159 +0,0 @@ -.. _adding_a_new_model: - -Adding a New Model -================== - -This document provides a high-level guide on integrating a `HuggingFace Transformers `_ model into vLLM. - -.. note:: - The complexity of adding a new model depends heavily on the model's architecture. - The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM. - However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex. - -.. note:: - By default, vLLM models do not support multi-modal inputs. To enable multi-modal support, - please follow :ref:`this guide ` after implementing the model here. - -.. 
tip:: - If you are encountering issues while integrating your model into vLLM, feel free to open an issue on our `GitHub `_ repository. - We will be happy to help you out! - - -0. Fork the vLLM repository --------------------------------- - -Start by forking our `GitHub`_ repository and then :ref:`build it from source `. -This gives you the ability to modify the codebase and test your model. - -.. tip:: - If you don't want to fork the repository and modify vLLM's codebase, please refer to the "Out-of-Tree Model Integration" section below. - -1. Bring your model code ------------------------- - -Clone the PyTorch model code from the HuggingFace Transformers repository and put it into the `vllm/model_executor/models `_ directory. -For instance, vLLM's `OPT model `_ was adapted from the HuggingFace's `modeling_opt.py `_ file. - -.. warning:: - When copying the model code, make sure to review and adhere to the code's copyright and licensing terms. - - -2. Make your code compatible with vLLM --------------------------------------- - -To ensure compatibility with vLLM, your model must meet the following requirements: - -Initialization Code -^^^^^^^^^^^^^^^^^^^ - -All vLLM modules within the model must include a ``prefix`` argument in their constructor. This ``prefix`` is typically the full name of the module in the model's state dictionary and is crucial for: - -* Runtime support: vLLM's attention operators are registered in a model's state by their full names. Each attention operator must have a unique prefix as its layer name to avoid conflicts. -* Non-uniform quantization support: A quantized checkpoint can selectively quantize certain layers while keeping others in full precision. By providing the ``prefix`` during initialization, vLLM can match the current layer's ``prefix`` with the quantization configuration to determine if the layer should be initialized in quantized mode. - -The initialization code should look like this: - -.. 
code-block:: python - - from torch import nn - from vllm.config import VllmConfig - from vllm.attention import Attention - - class MyAttention(nn.Module): - def __init__(self, vllm_config: VllmConfig, prefix: str): - super().__init__() - self.attn = Attention(prefix=f"{prefix}.attn") - - class MyDecoderLayer(nn.Module): - def __init__(self, vllm_config: VllmConfig, prefix: str): - super().__init__() - self.self_attn = MyAttention(prefix=f"{prefix}.self_attn") - - class MyModel(nn.Module): - def __init__(self, vllm_config: VllmConfig, prefix: str): - super().__init__() - self.layers = nn.ModuleList( - [MyDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") for i in range(vllm_config.model_config.hf_config.num_hidden_layers)] - ) - - class MyModelForCausalLM(nn.Module): - def __init__(self, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - self.model = MyModel(vllm_config, prefix=f"{prefix}.model") - -Computation Code -^^^^^^^^^^^^^^^^ - -Rewrite the :meth:`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat ``input_ids`` and ``positions`` as flattened tensors with a single batch size dimension, without a max-sequence length dimension. - -.. code-block:: python - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - ) -> torch.Tensor: - ... - -.. note:: - Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings. - If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM. - -For reference, check out the `LLAMA model `__. vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out the `vLLM models `__ directory for more examples. - -3. 
(Optional) Implement tensor parallelism and quantization support -------------------------------------------------------------------- - -If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it. -To do this, substitute your model's linear and embedding layers with their tensor-parallel versions. -For the embedding layer, you can simply replace :class:`torch.nn.Embedding` with :code:`VocabParallelEmbedding`. For the output LM head, you can use :code:`ParallelLMHead`. -When it comes to the linear layers, we provide the following options to parallelize them: - -* :code:`ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving. -* :code:`RowParallelLinear`: The input tensor is partitioned along the hidden dimension. The weight matrix is partitioned along the rows (input dimension). An *all-reduce* operation is performed after the matrix multiplication to reduce the results. Typically used for the second FFN layer and the output linear transformation of the attention layer. -* :code:`ColumnParallelLinear`: The input tensor is replicated. The weight matrix is partitioned along the columns (output dimension). The result is partitioned along the column dimension. Typically used for the first FFN layer and the separated QKV transformation of the attention layer in the original Transformer. -* :code:`MergedColumnParallelLinear`: Column-parallel linear that merges multiple :code:`ColumnParallelLinear` operators. Typically used for the first FFN layer with weighted activation functions (e.g., SiLU). This class handles the sharded weight loading logic of multiple weight matrices. -* :code:`QKVParallelLinear`: Parallel linear layer for the query, key, and value projections of the multi-head and grouped-query attention mechanisms. When number of key/value heads are less than the world size, this class replicates the key/value heads properly. 
This class handles the weight loading and replication of the weight matrices. - -Note that all the linear layers above take :code:`linear_method` as an input. vLLM will set this parameter according to different quantization schemes to support weight quantization. - -4. Implement the weight loading logic -------------------------------------- - -You now need to implement the :code:`load_weights` method in your :code:`*ForCausalLM` class. -This method should load the weights from the HuggingFace's checkpoint file and assign them to the corresponding layers in your model. Specifically, for :code:`MergedColumnParallelLinear` and :code:`QKVParallelLinear` layers, if the original model has separated weight matrices, you need to load the different parts separately. - -5. Register your model ----------------------- - -Finally, register your :code:`*ForCausalLM` class to the :code:`_VLLM_MODELS` in `vllm/model_executor/models/registry.py `_. - -6. Out-of-Tree Model Integration --------------------------------- - -You can integrate a model without modifying the vLLM codebase. Steps 2, 3, and 4 are still required, but you can skip steps 1 and 5. Instead, write a plugin to register your model. For general introduction of the plugin system, see :ref:`plugin_system`. - -To register the model, use the following code: - -.. code-block:: python - - from vllm import ModelRegistry - from your_code import YourModelForCausalLM - ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) - -If your model imports modules that initialize CUDA, consider lazy-importing it to avoid errors like :code:`RuntimeError: Cannot re-initialize CUDA in forked subprocess`: - -.. code-block:: python - - from vllm import ModelRegistry - - ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM") - -.. 
important:: - If your model is a multimodal model, ensure the model class implements the :class:`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. - Read more about that :ref:`here `. - -.. note:: - Although you can directly put these code snippets in your script using ``vllm.LLM``, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server. diff --git a/docs/source/models/enabling_multimodal_inputs.md b/docs/source/models/enabling_multimodal_inputs.md new file mode 100644 index 0000000000000..bf37ebb6e87c6 --- /dev/null +++ b/docs/source/models/enabling_multimodal_inputs.md @@ -0,0 +1,143 @@ +(enabling-multimodal-inputs)= + +# Enabling Multimodal Inputs + +This document walks you through the steps to extend a vLLM model so that it accepts {ref}`multi-modal inputs `. + +```{seealso} +{ref}`adding_a_new_model` +``` + +## 1. Update the base vLLM model + +It is assumed that you have already implemented the model in vLLM according to {ref}`these steps `. +Further update the model as follows: + +- Implement the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. + + ```diff + + from vllm.model_executor.models.interfaces import SupportsMultiModal + + - class YourModelForImage2Seq(nn.Module): + + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): + ``` + + ```{note} + The model class does not have to be named {code}`*ForCausalLM`. + Check out [the HuggingFace Transformers documentation](https://huggingface.co/docs/transformers/model_doc/auto#multimodal) for some examples. 
+ ``` + +- If you haven't already done so, reserve a keyword parameter in {meth}`~torch.nn.Module.forward` + for each input tensor that corresponds to a multi-modal input, as shown in the following example: + + ```diff + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + + pixel_values: torch.Tensor, + ) -> SamplerOutput: + ``` + +## 2. Register input mappers + +For each modality type that the model accepts as input, decorate the model class with {meth}`MULTIMODAL_REGISTRY.register_input_mapper `. +This decorator accepts a function that maps multi-modal inputs to the keyword arguments you have previously defined in {meth}`~torch.nn.Module.forward`. + +```diff + from vllm.model_executor.models.interfaces import SupportsMultiModal ++ from vllm.multimodal import MULTIMODAL_REGISTRY + ++ @MULTIMODAL_REGISTRY.register_image_input_mapper() + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): +``` + +A default mapper is available for each modality in the core vLLM library. This input mapper will be used if you do not provide your own function. + +```{seealso} +{ref}`input_processing_pipeline` +``` + +## 3. Register maximum number of multi-modal tokens + +For each modality type that the model accepts as input, calculate the maximum possible number of tokens per data item +and register it via {meth}`MULTIMODAL_REGISTRY.register_max_multimodal_tokens `. 
+ +```diff + from vllm.inputs import INPUT_REGISTRY + from vllm.model_executor.models.interfaces import SupportsMultiModal + from vllm.multimodal import MULTIMODAL_REGISTRY + + @MULTIMODAL_REGISTRY.register_image_input_mapper() ++ @MULTIMODAL_REGISTRY.register_max_image_tokens() + @INPUT_REGISTRY.register_dummy_data() + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): +``` + +Here are some examples: + +- Image inputs (static feature size): [LLaVA-1.5 Model](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava.py) +- Image inputs (dynamic feature size): [LLaVA-NeXT Model](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava_next.py) + +```{seealso} +{ref}`input_processing_pipeline` +``` + +## 4. (Optional) Register dummy data + +During startup, dummy data is passed to the vLLM model to allocate memory. This only consists of text input by default, which may not be applicable to multi-modal models. +In such cases, you can define your own dummy data by registering a factory method via {meth}`INPUT_REGISTRY.register_dummy_data `. + +```diff + from vllm.inputs import INPUT_REGISTRY + from vllm.model_executor.models.interfaces import SupportsMultiModal + from vllm.multimodal import MULTIMODAL_REGISTRY + + @MULTIMODAL_REGISTRY.register_image_input_mapper() + @MULTIMODAL_REGISTRY.register_max_image_tokens() ++ @INPUT_REGISTRY.register_dummy_data() + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): +``` + +```{note} +The dummy data should have the maximum possible number of multi-modal tokens, as described in the previous step. 
+``` + +Here are some examples: + +- Image inputs (static feature size): [LLaVA-1.5 Model](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava.py) +- Image inputs (dynamic feature size): [LLaVA-NeXT Model](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava_next.py) + +```{seealso} +{ref}`input_processing_pipeline` +``` + +## 5. (Optional) Register input processor + +Sometimes, there is a need to process inputs at the {class}`~vllm.LLMEngine` level before they are passed to the model executor. +This is often due to the fact that unlike implementations in HuggingFace Transformers, the reshaping and/or expansion of multi-modal embeddings needs to take place outside model's {meth}`~torch.nn.Module.forward` call. +You can register input processors via {meth}`INPUT_REGISTRY.register_input_processor `. + +```diff + from vllm.inputs import INPUT_REGISTRY + from vllm.model_executor.models.interfaces import SupportsMultiModal + from vllm.multimodal import MULTIMODAL_REGISTRY + + @MULTIMODAL_REGISTRY.register_image_input_mapper() + @MULTIMODAL_REGISTRY.register_max_image_tokens() + @INPUT_REGISTRY.register_dummy_data() ++ @INPUT_REGISTRY.register_input_processor() + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): +``` + +A common use case of input processors is inserting placeholder tokens to leverage the vLLM framework for attention mask generation. 
+Here are some examples: + +- Insert static number of image tokens: [LLaVA-1.5 Model](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava.py) +- Insert dynamic number of image tokens: [LLaVA-NeXT Model](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava_next.py) + +```{seealso} +{ref}`input_processing_pipeline` +``` diff --git a/docs/source/models/enabling_multimodal_inputs.rst b/docs/source/models/enabling_multimodal_inputs.rst deleted file mode 100644 index 5c1236e1a8972..0000000000000 --- a/docs/source/models/enabling_multimodal_inputs.rst +++ /dev/null @@ -1,147 +0,0 @@ -.. _enabling_multimodal_inputs: - -Enabling Multimodal Inputs -========================== - -This document walks you through the steps to extend a vLLM model so that it accepts :ref:`multi-modal inputs `. - -.. seealso:: - :ref:`adding_a_new_model` - - -1. Update the base vLLM model ------------------------------ - -It is assumed that you have already implemented the model in vLLM according to :ref:`these steps `. -Further update the model as follows: - -- Implement the :class:`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. - - .. code-block:: diff - - + from vllm.model_executor.models.interfaces import SupportsMultiModal - - - class YourModelForImage2Seq(nn.Module): - + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): - - .. note:: - The model class does not have to be named :code:`*ForCausalLM`. - Check out `the HuggingFace Transformers documentation `__ for some examples. - -- If you haven't already done so, reserve a keyword parameter in :meth:`~torch.nn.Module.forward` - for each input tensor that corresponds to a multi-modal input, as shown in the following example: - - .. code-block:: diff - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - + pixel_values: torch.Tensor, - ) -> SamplerOutput: - - -2. 
Register input mappers -------------------------- - -For each modality type that the model accepts as input, decorate the model class with :meth:`MULTIMODAL_REGISTRY.register_input_mapper `. -This decorator accepts a function that maps multi-modal inputs to the keyword arguments you have previously defined in :meth:`~torch.nn.Module.forward`. - -.. code-block:: diff - - from vllm.model_executor.models.interfaces import SupportsMultiModal - + from vllm.multimodal import MULTIMODAL_REGISTRY - - + @MULTIMODAL_REGISTRY.register_image_input_mapper() - class YourModelForImage2Seq(nn.Module, SupportsMultiModal): - -A default mapper is available for each modality in the core vLLM library. This input mapper will be used if you do not provide your own function. - -.. seealso:: - :ref:`input_processing_pipeline` - - -3. Register maximum number of multi-modal tokens ------------------------------------------------- - -For each modality type that the model accepts as input, calculate the maximum possible number of tokens per data item -and register it via :meth:`INPUT_REGISTRY.register_dummy_data `. - -.. code-block:: diff - - from vllm.inputs import INPUT_REGISTRY - from vllm.model_executor.models.interfaces import SupportsMultiModal - from vllm.multimodal import MULTIMODAL_REGISTRY - - @MULTIMODAL_REGISTRY.register_image_input_mapper() - + @MULTIMODAL_REGISTRY.register_max_image_tokens() - @INPUT_REGISTRY.register_dummy_data() - class YourModelForImage2Seq(nn.Module, SupportsMultiModal): - -Here are some examples: - -- Image inputs (static feature size): `LLaVA-1.5 Model `__ -- Image inputs (dynamic feature size): `LLaVA-NeXT Model `__ - -.. seealso:: - :ref:`input_processing_pipeline` - - -4. (Optional) Register dummy data ---------------------------------- - -During startup, dummy data is passed to the vLLM model to allocate memory. This only consists of text input by default, which may not be applicable to multi-modal models. 
-In such cases, you can define your own dummy data by registering a factory method via :meth:`INPUT_REGISTRY.register_dummy_data `. - -.. code-block:: diff - - from vllm.inputs import INPUT_REGISTRY - from vllm.model_executor.models.interfaces import SupportsMultiModal - from vllm.multimodal import MULTIMODAL_REGISTRY - - @MULTIMODAL_REGISTRY.register_image_input_mapper() - @MULTIMODAL_REGISTRY.register_max_image_tokens() - + @INPUT_REGISTRY.register_dummy_data() - class YourModelForImage2Seq(nn.Module, SupportsMultiModal): - -.. note:: - The dummy data should have the maximum possible number of multi-modal tokens, as described in the previous step. - -Here are some examples: - -- Image inputs (static feature size): `LLaVA-1.5 Model `__ -- Image inputs (dynamic feature size): `LLaVA-NeXT Model `__ - -.. seealso:: - :ref:`input_processing_pipeline` - - -5. (Optional) Register input processor --------------------------------------- - -Sometimes, there is a need to process inputs at the :class:`~vllm.LLMEngine` level before they are passed to the model executor. -This is often due to the fact that unlike implementations in HuggingFace Transformers, the reshaping and/or expansion of multi-modal embeddings needs to take place outside model's :meth:`~torch.nn.Module.forward` call. -You can register input processors via :meth:`INPUT_REGISTRY.register_input_processor `. - -.. code-block:: diff - - from vllm.inputs import INPUT_REGISTRY - from vllm.model_executor.models.interfaces import SupportsMultiModal - from vllm.multimodal import MULTIMODAL_REGISTRY - - @MULTIMODAL_REGISTRY.register_image_input_mapper() - @MULTIMODAL_REGISTRY.register_max_image_tokens() - @INPUT_REGISTRY.register_dummy_data() - + @INPUT_REGISTRY.register_input_processor() - class YourModelForImage2Seq(nn.Module, SupportsMultiModal): - -A common use case of input processors is inserting placeholder tokens to leverage the vLLM framework for attention mask generation. 
-Here are some examples: - -- Insert static number of image tokens: `LLaVA-1.5 Model `__ -- Insert dynamic number of image tokens: `LLaVA-NeXT Model `__ - -.. seealso:: - :ref:`input_processing_pipeline` diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md new file mode 100644 index 0000000000000..33ebc1244cd21 --- /dev/null +++ b/docs/source/models/generative_models.md @@ -0,0 +1,138 @@ +(generative-models)= + +# Generative Models + +vLLM provides first-class support for generative models, which covers most of LLMs. + +In vLLM, generative models implement the {class}`~vllm.model_executor.models.VllmModelForTextGeneration` interface. +Based on the final hidden states of the input, these models output log probabilities of the tokens to generate, +which are then passed through {class}`~vllm.model_executor.layers.Sampler` to obtain the final text. + +## Offline Inference + +The {class}`~vllm.LLM` class provides various methods for offline inference. +See {ref}`Engine Arguments ` for a list of options when initializing the model. + +For generative models, the only supported {code}`task` option is {code}`"generate"`. +Usually, this is automatically inferred so you don't have to specify it. + +### `LLM.generate` + +The {class}`~vllm.LLM.generate` method is available to all generative models in vLLM. +It is similar to [its counterpart in HF Transformers](https://huggingface.co/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationMixin.generate), +except that tokenization and detokenization are also performed automatically. + +```python +llm = LLM(model="facebook/opt-125m") +outputs = llm.generate("Hello, my name is") + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` + +You can optionally control the language generation by passing {class}`~vllm.SamplingParams`. 
+For example, you can use greedy sampling by setting {code}`temperature=0`: + +```python +llm = LLM(model="facebook/opt-125m") +params = SamplingParams(temperature=0) +outputs = llm.generate("Hello, my name is", params) + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` + +A code example can be found in [examples/offline_inference.py](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference.py). + +### `LLM.beam_search` + +The {class}`~vllm.LLM.beam_search` method implements [beam search](https://huggingface.co/docs/transformers/en/generation_strategies#beam-search-decoding) on top of {class}`~vllm.LLM.generate`. +For example, to search using 5 beams and output at most 50 tokens: + +```python +llm = LLM(model="facebook/opt-125m") +params = BeamSearchParams(beam_width=5, max_tokens=50) +outputs = llm.beam_search([{"prompt": "Hello, my name is "}], params) + +for output in outputs: + generated_text = output.sequences[0].text + print(f"Generated text: {generated_text!r}") +``` + +### `LLM.chat` + +The {class}`~vllm.LLM.chat` method implements chat functionality on top of {class}`~vllm.LLM.generate`. +In particular, it accepts input similar to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat) +and automatically applies the model's [chat template](https://huggingface.co/docs/transformers/en/chat_templating) to format the prompt. + +```{important} +In general, only instruction-tuned models have a chat template. +Base models may perform poorly as they are not trained to respond to the chat conversation. +``` + +```python +llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct") +conversation = [ + { + "role": "system", + "content": "You are a helpful assistant" + }, + { + "role": "user", + "content": "Hello" + }, + { + "role": "assistant", + "content": "Hello! How can I assist you today?" 
+ }, + { + "role": "user", + "content": "Write an essay about the importance of higher education.", + }, +] +outputs = llm.chat(conversation) + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` + +A code example can be found in [examples/offline_inference_chat.py](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_chat.py). + +If the model doesn't have a chat template or you want to specify another one, +you can explicitly pass a chat template: + +```python +from vllm.entrypoints.chat_utils import load_chat_template + +# You can find a list of existing chat templates under `examples/` +custom_template = load_chat_template(chat_template="") +print("Loaded chat template:", custom_template) + +outputs = llm.chat(conversation, chat_template=custom_template) +``` + +## Online Inference + +Our [OpenAI Compatible Server](../serving/openai_compatible_server) can be used for online inference. +Please click on the above link for more details on how to launch the server. + +### Completions API + +Our Completions API is similar to `LLM.generate` but only accepts text. +It is compatible with [OpenAI Completions API](https://platform.openai.com/docs/api-reference/completions) +so that you can use OpenAI client to interact with it. +A code example can be found in [examples/openai_completion_client.py](https://github.com/vllm-project/vllm/blob/main/examples/openai_completion_client.py). + +### Chat API + +Our Chat API is similar to `LLM.chat`, accepting both text and {ref}`multi-modal inputs `. +It is compatible with [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat) +so that you can use OpenAI client to interact with it. +A code example can be found in [examples/openai_chat_completion_client.py](https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_completion_client.py). 
diff --git a/docs/source/models/generative_models.rst b/docs/source/models/generative_models.rst deleted file mode 100644 index fb71185600863..0000000000000 --- a/docs/source/models/generative_models.rst +++ /dev/null @@ -1,146 +0,0 @@ -.. _generative_models: - -Generative Models -================= - -vLLM provides first-class support for generative models, which covers most of LLMs. - -In vLLM, generative models implement the :class:`~vllm.model_executor.models.VllmModelForTextGeneration` interface. -Based on the final hidden states of the input, these models output log probabilities of the tokens to generate, -which are then passed through :class:`~vllm.model_executor.layers.Sampler` to obtain the final text. - -Offline Inference ------------------ - -The :class:`~vllm.LLM` class provides various methods for offline inference. -See :ref:`Engine Arguments ` for a list of options when initializing the model. - -For generative models, the only supported :code:`task` option is :code:`"generate"`. -Usually, this is automatically inferred so you don't have to specify it. - -``LLM.generate`` -^^^^^^^^^^^^^^^^ - -The :class:`~vllm.LLM.generate` method is available to all generative models in vLLM. -It is similar to `its counterpart in HF Transformers `__, -except that tokenization and detokenization are also performed automatically. - -.. code-block:: python - - llm = LLM(model="facebook/opt-125m") - outputs = llm.generate("Hello, my name is") - - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -You can optionally control the language generation by passing :class:`~vllm.SamplingParams`. -For example, you can use greedy sampling by setting :code:`temperature=0`: - -.. 
code-block:: python - - llm = LLM(model="facebook/opt-125m") - params = SamplingParams(temperature=0) - outputs = llm.generate("Hello, my name is", params) - - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -A code example can be found in `examples/offline_inference.py `_. - -``LLM.beam_search`` -^^^^^^^^^^^^^^^^^^^ - -The :class:`~vllm.LLM.beam_search` method implements `beam search `__ on top of :class:`~vllm.LLM.generate`. -For example, to search using 5 beams and output at most 50 tokens: - -.. code-block:: python - - llm = LLM(model="facebook/opt-125m") - params = BeamSearchParams(beam_width=5, max_tokens=50) - outputs = llm.generate("Hello, my name is", params) - - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -``LLM.chat`` -^^^^^^^^^^^^ - -The :class:`~vllm.LLM.chat` method implements chat functionality on top of :class:`~vllm.LLM.generate`. -In particular, it accepts input similar to `OpenAI Chat Completions API `__ -and automatically applies the model's `chat template `__ to format the prompt. - -.. important:: - - In general, only instruction-tuned models have a chat template. - Base models may perform poorly as they are not trained to respond to the chat conversation. - -.. code-block:: python - - llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct") - conversation = [ - { - "role": "system", - "content": "You are a helpful assistant" - }, - { - "role": "user", - "content": "Hello" - }, - { - "role": "assistant", - "content": "Hello! How can I assist you today?" 
- }, - { - "role": "user", - "content": "Write an essay about the importance of higher education.", - }, - ] - outputs = llm.chat(conversation) - - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -A code example can be found in `examples/offline_inference_chat.py `_. - -If the model doesn't have a chat template or you want to specify another one, -you can explicitly pass a chat template: - -.. code-block:: python - - from vllm.entrypoints.chat_utils import load_chat_template - - # You can find a list of existing chat templates under `examples/` - custom_template = load_chat_template(chat_template="") - print("Loaded chat template:", custom_template) - - outputs = llm.chat(conversation, chat_template=custom_template) - -Online Inference ----------------- - -Our `OpenAI Compatible Server <../serving/openai_compatible_server>`__ can be used for online inference. -Please click on the above link for more details on how to launch the server. - -Completions API -^^^^^^^^^^^^^^^ - -Our Completions API is similar to ``LLM.generate`` but only accepts text. -It is compatible with `OpenAI Completions API `__ -so that you can use OpenAI client to interact with it. -A code example can be found in `examples/openai_completion_client.py `_. - -Chat API -^^^^^^^^ - -Our Chat API is similar to ``LLM.chat``, accepting both text and :ref:`multi-modal inputs `. -It is compatible with `OpenAI Chat Completions API `__ -so that you can use OpenAI client to interact with it. -A code example can be found in `examples/openai_chat_completion_client.py `_. 
diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md new file mode 100644 index 0000000000000..caab85f5be8c1 --- /dev/null +++ b/docs/source/models/pooling_models.md @@ -0,0 +1,92 @@ +(pooling-models)= + +# Pooling Models + +vLLM also supports pooling models, including embedding, reranking and reward models. + +In vLLM, pooling models implement the {class}`~vllm.model_executor.models.VllmModelForPooling` interface. +These models use a {class}`~vllm.model_executor.layers.Pooler` to aggregate the final hidden states of the input +before returning them. + +```{note} +We currently support pooling models primarily as a matter of convenience. +As shown in the {ref}`Compatibility Matrix `, most vLLM features are not applicable to +pooling models as they only work on the generation or decode stage, so performance may not improve as much. +``` + +## Offline Inference + +The {class}`~vllm.LLM` class provides various methods for offline inference. +See {ref}`Engine Arguments ` for a list of options when initializing the model. + +For pooling models, we support the following {code}`task` options: + +- Embedding ({code}`"embed"` / {code}`"embedding"`) +- Classification ({code}`"classify"`) +- Sentence Pair Scoring ({code}`"score"`) +- Reward Modeling ({code}`"reward"`) + +The selected task determines the default {class}`~vllm.model_executor.layers.Pooler` that is used: + +- Embedding: Extract only the hidden states corresponding to the last token, and apply normalization. +- Classification: Extract only the hidden states corresponding to the last token, and apply softmax. +- Sentence Pair Scoring: Extract only the hidden states corresponding to the last token, and apply softmax. +- Reward Modeling: Extract all of the hidden states and return them directly. 
+
+When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models,
+we attempt to override the default pooler based on its Sentence Transformers configuration file ({code}`modules.json`).
+
+You can customize the model's pooling method via the {code}`override_pooler_config` option,
+which takes priority over both the model's and Sentence Transformers's defaults.
+
+### `LLM.encode`
+
+The {class}`~vllm.LLM.encode` method is available to all pooling models in vLLM.
+It returns the aggregated hidden states directly.
+
+```python
+llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed")
+outputs = llm.encode("Hello, my name is")
+
+for output in outputs:
+    prompt = output.prompt
+    embeddings = output.outputs.embedding
+    print(f"Prompt: {prompt!r}, Embeddings (size={len(embeddings)}): {embeddings!r}")
+```
+
+A code example can be found in [examples/offline_inference_embedding.py](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_embedding.py).
+
+### `LLM.score`
+
+The {class}`~vllm.LLM.score` method outputs similarity scores between sentence pairs.
+It is primarily designed for [cross-encoder models](https://www.sbert.net/examples/applications/cross-encoder/README.html).
+These types of models serve as rerankers between candidate query-document pairs in RAG systems.
+
+```{note}
+vLLM can only perform the model inference component (e.g. embedding, reranking) of RAG.
+To handle RAG at a higher level, you should use integration frameworks such as [LangChain](https://github.com/langchain-ai/langchain).
+```
+
+You can use [these tests](https://github.com/vllm-project/vllm/blob/main/tests/models/embedding/language/test_scoring.py) as reference.
+
+## Online Inference
+
+Our [OpenAI Compatible Server](../serving/openai_compatible_server) can be used for online inference.
+Please click on the above link for more details on how to launch the server.
+ +### Embeddings API + +Our Embeddings API is similar to `LLM.encode`, accepting both text and {ref}`multi-modal inputs `. + +The text-only API is compatible with [OpenAI Embeddings API](https://platform.openai.com/docs/api-reference/embeddings) +so that you can use OpenAI client to interact with it. +A code example can be found in [examples/openai_embedding_client.py](https://github.com/vllm-project/vllm/blob/main/examples/openai_embedding_client.py). + +The multi-modal API is an extension of the [OpenAI Embeddings API](https://platform.openai.com/docs/api-reference/embeddings) +that incorporates [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat), +so it is not part of the OpenAI standard. Please see {ref}`this page ` for more details on how to use it. + +### Score API + +Our Score API is similar to `LLM.score`. +Please see [this page](../serving/openai_compatible_server.html#score-api-for-cross-encoder-models) for more details on how to use it. diff --git a/docs/source/models/pooling_models.rst b/docs/source/models/pooling_models.rst deleted file mode 100644 index 7fa66274c3c5a..0000000000000 --- a/docs/source/models/pooling_models.rst +++ /dev/null @@ -1,99 +0,0 @@ -.. _pooling_models: - -Pooling Models -============== - -vLLM also supports pooling models, including embedding, reranking and reward models. - -In vLLM, pooling models implement the :class:`~vllm.model_executor.models.VllmModelForPooling` interface. -These models use a :class:`~vllm.model_executor.layers.Pooler` to aggregate the final hidden states of the input -before returning them. - -.. note:: - - We currently support pooling models primarily as a matter of convenience. - As shown in the :ref:`Compatibility Matrix `, most vLLM features are not applicable to - pooling models as they only work on the generation or decode stage, so performance may not improve as much. 
- -Offline Inference ------------------ - -The :class:`~vllm.LLM` class provides various methods for offline inference. -See :ref:`Engine Arguments ` for a list of options when initializing the model. - -For pooling models, we support the following :code:`task` options: - -- Embedding (:code:`"embed"` / :code:`"embedding"`) -- Classification (:code:`"classify"`) -- Sentence Pair Scoring (:code:`"score"`) -- Reward Modeling (:code:`"reward"`) - -The selected task determines the default :class:`~vllm.model_executor.layers.Pooler` that is used: - -- Embedding: Extract only the hidden states corresponding to the last token, and apply normalization. -- Classification: Extract only the hidden states corresponding to the last token, and apply softmax. -- Sentence Pair Scoring: Extract only the hidden states corresponding to the last token, and apply softmax. -- Reward Modeling: Extract all of the hidden states and return them directly. - -When loading `Sentence Transformers `__ models, -we attempt to override the default pooler based on its Sentence Transformers configuration file (:code:`modules.json`). - -You can customize the model's pooling method via the :code:`override_pooler_config` option, -which takes priority over both the model's and Sentence Transformers's defaults. - -``LLM.encode`` -^^^^^^^^^^^^^^ - -The :class:`~vllm.LLM.encode` method is available to all pooling models in vLLM. -It returns the aggregated hidden states directly. - -.. code-block:: python - - llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed") - outputs = llm.encode("Hello, my name is") - - outputs = model.encode(prompts) - for output in outputs: - embeddings = output.outputs.embedding - print(f"Prompt: {prompt!r}, Embeddings (size={len(embeddings)}: {embeddings!r}") - -A code example can be found in `examples/offline_inference_embedding.py `_. - -``LLM.score`` -^^^^^^^^^^^^^ - -The :class:`~vllm.LLM.score` method outputs similarity scores between sentence pairs. 
-It is primarily designed for `cross-encoder models `__. -These types of models serve as rerankers between candidate query-document pairs in RAG systems. - -.. note:: - - vLLM can only perform the model inference component (e.g. embedding, reranking) of RAG. - To handle RAG at a higher level, you should use integration frameworks such as `LangChain `_. - -You can use `these tests `_ as reference. - -Online Inference ----------------- - -Our `OpenAI Compatible Server <../serving/openai_compatible_server>`__ can be used for online inference. -Please click on the above link for more details on how to launch the server. - -Embeddings API -^^^^^^^^^^^^^^ - -Our Embeddings API is similar to ``LLM.encode``, accepting both text and :ref:`multi-modal inputs `. - -The text-only API is compatible with `OpenAI Embeddings API `__ -so that you can use OpenAI client to interact with it. -A code example can be found in `examples/openai_embedding_client.py `_. - -The multi-modal API is an extension of the `OpenAI Embeddings API `__ -that incorporates `OpenAI Chat Completions API `__, -so it is not part of the OpenAI standard. Please see :ref:`this page ` for more details on how to use it. - -Score API -^^^^^^^^^ - -Our Score API is similar to ``LLM.score``. -Please see `this page <../serving/openai_compatible_server.html#score-api-for-cross-encoder-models>`__ for more details on how to use it. diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md new file mode 100644 index 0000000000000..59b677c6a4edd --- /dev/null +++ b/docs/source/models/supported_models.md @@ -0,0 +1,824 @@ +(supported-models)= + +# Supported Models + +vLLM supports generative and pooling models across various tasks. +If a model supports more than one task, you can set the task via the {code}`--task` argument. + +For each task, we list the model architectures that have been implemented in vLLM. +Alongside each architecture, we include some popular models that use it. 
+ +## Loading a Model + +### HuggingFace Hub + +By default, vLLM loads models from [HuggingFace (HF) Hub](https://huggingface.co/models). + +To determine whether a given model is supported, you can check the {code}`config.json` file inside the HF repository. +If the {code}`"architectures"` field contains a model architecture listed below, then it should be supported in theory. + +````{tip} +The easiest way to check if your model is really supported at runtime is to run the program below: + +```python +from vllm import LLM + +# For generative models (task=generate) only +llm = LLM(model=..., task="generate") # Name or path of your model +output = llm.generate("Hello, my name is") +print(output) + +# For pooling models (task={embed,classify,reward}) only +llm = LLM(model=..., task="embed") # Name or path of your model +output = llm.encode("Hello, my name is") +print(output) +``` + +If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported. +```` + +Otherwise, please refer to {ref}`Adding a New Model ` and {ref}`Enabling Multimodal Inputs ` +for instructions on how to implement your model in vLLM. +Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support. + +### ModelScope + +To use models from [ModelScope](https://www.modelscope.cn) instead of HuggingFace Hub, set an environment variable: + +```shell +$ export VLLM_USE_MODELSCOPE=True +``` + +And use with {code}`trust_remote_code=True`. 
+ +```python +from vllm import LLM + +llm = LLM(model=..., revision=..., task=..., trust_remote_code=True) + +# For generative models (task=generate) only +output = llm.generate("Hello, my name is") +print(output) + +# For pooling models (task={embed,classify,reward}) only +output = llm.encode("Hello, my name is") +print(output) +``` + +## List of Text-only Language Models + +### Generative Models + +See {ref}`this page ` for more information on how to use generative models. + +#### Text Generation (`--task generate`) + +```{eval-rst} +.. list-table:: + :widths: 25 25 50 5 5 + :header-rows: 1 + + * - Architecture + - Models + - Example HF Models + - :ref:`LoRA ` + - :ref:`PP ` + * - :code:`AquilaForCausalLM` + - Aquila, Aquila2 + - :code:`BAAI/Aquila-7B`, :code:`BAAI/AquilaChat-7B`, etc. + - ✅︎ + - ✅︎ + * - :code:`ArcticForCausalLM` + - Arctic + - :code:`Snowflake/snowflake-arctic-base`, :code:`Snowflake/snowflake-arctic-instruct`, etc. + - + - ✅︎ + * - :code:`BaiChuanForCausalLM` + - Baichuan2, Baichuan + - :code:`baichuan-inc/Baichuan2-13B-Chat`, :code:`baichuan-inc/Baichuan-7B`, etc. + - ✅︎ + - ✅︎ + * - :code:`BloomForCausalLM` + - BLOOM, BLOOMZ, BLOOMChat + - :code:`bigscience/bloom`, :code:`bigscience/bloomz`, etc. + - + - ✅︎ + * - :code:`BartForConditionalGeneration` + - BART + - :code:`facebook/bart-base`, :code:`facebook/bart-large-cnn`, etc. + - + - + * - :code:`ChatGLMModel` + - ChatGLM + - :code:`THUDM/chatglm2-6b`, :code:`THUDM/chatglm3-6b`, etc. + - ✅︎ + - ✅︎ + * - :code:`CohereForCausalLM` + - Command-R + - :code:`CohereForAI/c4ai-command-r-v01`, etc. + - ✅︎ + - ✅︎ + * - :code:`DbrxForCausalLM` + - DBRX + - :code:`databricks/dbrx-base`, :code:`databricks/dbrx-instruct`, etc. + - + - ✅︎ + * - :code:`DeciLMForCausalLM` + - DeciLM + - :code:`Deci/DeciLM-7B`, :code:`Deci/DeciLM-7B-instruct`, etc. + - + - ✅︎ + * - :code:`DeepseekForCausalLM` + - DeepSeek + - :code:`deepseek-ai/deepseek-llm-67b-base`, :code:`deepseek-ai/deepseek-llm-7b-chat` etc. 
+ - + - ✅︎ + * - :code:`DeepseekV2ForCausalLM` + - DeepSeek-V2 + - :code:`deepseek-ai/DeepSeek-V2`, :code:`deepseek-ai/DeepSeek-V2-Chat` etc. + - + - ✅︎ + * - :code:`ExaoneForCausalLM` + - EXAONE-3 + - :code:`LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. + - ✅︎ + - ✅︎ + * - :code:`FalconForCausalLM` + - Falcon + - :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc. + - + - ✅︎ + * - :code:`FalconMambaForCausalLM` + - FalconMamba + - :code:`tiiuae/falcon-mamba-7b`, :code:`tiiuae/falcon-mamba-7b-instruct`, etc. + - ✅︎ + - ✅︎ + * - :code:`GemmaForCausalLM` + - Gemma + - :code:`google/gemma-2b`, :code:`google/gemma-7b`, etc. + - ✅︎ + - ✅︎ + * - :code:`Gemma2ForCausalLM` + - Gemma2 + - :code:`google/gemma-2-9b`, :code:`google/gemma-2-27b`, etc. + - ✅︎ + - ✅︎ + * - :code:`GlmForCausalLM` + - GLM-4 + - :code:`THUDM/glm-4-9b-chat-hf`, etc. + - ✅︎ + - ✅︎ + * - :code:`GPT2LMHeadModel` + - GPT-2 + - :code:`gpt2`, :code:`gpt2-xl`, etc. + - + - ✅︎ + * - :code:`GPTBigCodeForCausalLM` + - StarCoder, SantaCoder, WizardCoder + - :code:`bigcode/starcoder`, :code:`bigcode/gpt_bigcode-santacoder`, :code:`WizardLM/WizardCoder-15B-V1.0`, etc. + - ✅︎ + - ✅︎ + * - :code:`GPTJForCausalLM` + - GPT-J + - :code:`EleutherAI/gpt-j-6b`, :code:`nomic-ai/gpt4all-j`, etc. + - + - ✅︎ + * - :code:`GPTNeoXForCausalLM` + - GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM + - :code:`EleutherAI/gpt-neox-20b`, :code:`EleutherAI/pythia-12b`, :code:`OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, :code:`databricks/dolly-v2-12b`, :code:`stabilityai/stablelm-tuned-alpha-7b`, etc. + - + - ✅︎ + * - :code:`GraniteForCausalLM` + - Granite 3.0, PowerLM + - :code:`ibm-granite/granite-3.0-2b-base`, :code:`ibm-granite/granite-3.0-8b-instruct`, :code:`ibm/PowerLM-3b`, etc. + - ✅︎ + - ✅︎ + * - :code:`GraniteMoeForCausalLM` + - Granite 3.0 MoE, PowerMoE + - :code:`ibm-granite/granite-3.0-1b-a400m-base`, :code:`ibm-granite/granite-3.0-3b-a800m-instruct`, :code:`ibm/PowerMoE-3b`, etc. 
+ - ✅︎ + - ✅︎ + * - :code:`InternLMForCausalLM` + - InternLM + - :code:`internlm/internlm-7b`, :code:`internlm/internlm-chat-7b`, etc. + - ✅︎ + - ✅︎ + * - :code:`InternLM2ForCausalLM` + - InternLM2 + - :code:`internlm/internlm2-7b`, :code:`internlm/internlm2-chat-7b`, etc. + - ✅︎ + - ✅︎ + * - :code:`JAISLMHeadModel` + - Jais + - :code:`inceptionai/jais-13b`, :code:`inceptionai/jais-13b-chat`, :code:`inceptionai/jais-30b-v3`, :code:`inceptionai/jais-30b-chat-v3`, etc. + - + - ✅︎ + * - :code:`JambaForCausalLM` + - Jamba + - :code:`ai21labs/AI21-Jamba-1.5-Large`, :code:`ai21labs/AI21-Jamba-1.5-Mini`, :code:`ai21labs/Jamba-v0.1`, etc. + - ✅︎ + - ✅︎ + * - :code:`LlamaForCausalLM` + - Llama 3.1, Llama 3, Llama 2, LLaMA, Yi + - :code:`meta-llama/Meta-Llama-3.1-405B-Instruct`, :code:`meta-llama/Meta-Llama-3.1-70B`, :code:`meta-llama/Meta-Llama-3-70B-Instruct`, :code:`meta-llama/Llama-2-70b-hf`, :code:`01-ai/Yi-34B`, etc. + - ✅︎ + - ✅︎ + * - :code:`MambaForCausalLM` + - Mamba + - :code:`state-spaces/mamba-130m-hf`, :code:`state-spaces/mamba-790m-hf`, :code:`state-spaces/mamba-2.8b-hf`, etc. + - + - ✅︎ + * - :code:`MiniCPMForCausalLM` + - MiniCPM + - :code:`openbmb/MiniCPM-2B-sft-bf16`, :code:`openbmb/MiniCPM-2B-dpo-bf16`, :code:`openbmb/MiniCPM-S-1B-sft`, etc. + - ✅︎ + - ✅︎ + * - :code:`MiniCPM3ForCausalLM` + - MiniCPM3 + - :code:`openbmb/MiniCPM3-4B`, etc. + - ✅︎ + - ✅︎ + * - :code:`MistralForCausalLM` + - Mistral, Mistral-Instruct + - :code:`mistralai/Mistral-7B-v0.1`, :code:`mistralai/Mistral-7B-Instruct-v0.1`, etc. + - ✅︎ + - ✅︎ + * - :code:`MixtralForCausalLM` + - Mixtral-8x7B, Mixtral-8x7B-Instruct + - :code:`mistralai/Mixtral-8x7B-v0.1`, :code:`mistralai/Mixtral-8x7B-Instruct-v0.1`, :code:`mistral-community/Mixtral-8x22B-v0.1`, etc. + - ✅︎ + - ✅︎ + * - :code:`MPTForCausalLM` + - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter + - :code:`mosaicml/mpt-7b`, :code:`mosaicml/mpt-7b-storywriter`, :code:`mosaicml/mpt-30b`, etc. 
+ - + - ✅︎ + * - :code:`NemotronForCausalLM` + - Nemotron-3, Nemotron-4, Minitron + - :code:`nvidia/Minitron-8B-Base`, :code:`mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. + - ✅︎ + - ✅︎ + * - :code:`OLMoForCausalLM` + - OLMo + - :code:`allenai/OLMo-1B-hf`, :code:`allenai/OLMo-7B-hf`, etc. + - + - ✅︎ + * - :code:`OLMo2ForCausalLM` + - OLMo2 + - :code:`allenai/OLMo2-7B-1124`, etc. + - + - ✅︎ + * - :code:`OLMoEForCausalLM` + - OLMoE + - :code:`allenai/OLMoE-1B-7B-0924`, :code:`allenai/OLMoE-1B-7B-0924-Instruct`, etc. + - ✅︎ + - ✅︎ + * - :code:`OPTForCausalLM` + - OPT, OPT-IML + - :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc. + - + - ✅︎ + * - :code:`OrionForCausalLM` + - Orion + - :code:`OrionStarAI/Orion-14B-Base`, :code:`OrionStarAI/Orion-14B-Chat`, etc. + - + - ✅︎ + * - :code:`PhiForCausalLM` + - Phi + - :code:`microsoft/phi-1_5`, :code:`microsoft/phi-2`, etc. + - ✅︎ + - ✅︎ + * - :code:`Phi3ForCausalLM` + - Phi-3 + - :code:`microsoft/Phi-3-mini-4k-instruct`, :code:`microsoft/Phi-3-mini-128k-instruct`, :code:`microsoft/Phi-3-medium-128k-instruct`, etc. + - ✅︎ + - ✅︎ + * - :code:`Phi3SmallForCausalLM` + - Phi-3-Small + - :code:`microsoft/Phi-3-small-8k-instruct`, :code:`microsoft/Phi-3-small-128k-instruct`, etc. + - + - ✅︎ + * - :code:`PhiMoEForCausalLM` + - Phi-3.5-MoE + - :code:`microsoft/Phi-3.5-MoE-instruct`, etc. + - ✅︎ + - ✅︎ + * - :code:`PersimmonForCausalLM` + - Persimmon + - :code:`adept/persimmon-8b-base`, :code:`adept/persimmon-8b-chat`, etc. + - + - ✅︎ + * - :code:`QWenLMHeadModel` + - Qwen + - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc. + - ✅︎ + - ✅︎ + * - :code:`Qwen2ForCausalLM` + - Qwen2 + - :code:`Qwen/Qwen2-7B-Instruct`, :code:`Qwen/Qwen2-7B`, etc. + - ✅︎ + - ✅︎ + * - :code:`Qwen2MoeForCausalLM` + - Qwen2MoE + - :code:`Qwen/Qwen1.5-MoE-A2.7B`, :code:`Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. 
+ - + - ✅︎ + * - :code:`StableLmForCausalLM` + - StableLM + - :code:`stabilityai/stablelm-3b-4e1t`, :code:`stabilityai/stablelm-base-alpha-7b-v2`, etc. + - + - ✅︎ + * - :code:`Starcoder2ForCausalLM` + - Starcoder2 + - :code:`bigcode/starcoder2-3b`, :code:`bigcode/starcoder2-7b`, :code:`bigcode/starcoder2-15b`, etc. + - + - ✅︎ + * - :code:`SolarForCausalLM` + - Solar Pro + - :code:`upstage/solar-pro-preview-instruct`, etc. + - ✅︎ + - ✅︎ + * - :code:`TeleChat2ForCausalLM` + - TeleChat2 + - :code:`TeleAI/TeleChat2-3B`, :code:`TeleAI/TeleChat2-7B`, :code:`TeleAI/TeleChat2-35B`, etc. + - ✅︎ + - ✅︎ + * - :code:`XverseForCausalLM` + - XVERSE + - :code:`xverse/XVERSE-7B-Chat`, :code:`xverse/XVERSE-13B-Chat`, :code:`xverse/XVERSE-65B-Chat`, etc. + - ✅︎ + - ✅︎ +``` + +```{note} +Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096. +``` + +### Pooling Models + +See {ref}`this page ` for more information on how to use pooling models. + +```{important} +Since some model architectures support both generative and pooling tasks, +you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode. +``` + +#### Text Embedding (`--task embed`) + +Any text generation model can be converted into an embedding model by passing {code}`--task embed`. + +```{note} +To get the best results, you should use pooling models that are specifically trained as such. +``` + +The following table lists those that are tested in vLLM. + +```{eval-rst} +.. list-table:: + :widths: 25 25 50 5 5 + :header-rows: 1 + + * - Architecture + - Models + - Example HF Models + - :ref:`LoRA ` + - :ref:`PP ` + * - :code:`BertModel` + - BERT-based + - :code:`BAAI/bge-base-en-v1.5`, etc. + - + - + * - :code:`Gemma2Model` + - Gemma2-based + - :code:`BAAI/bge-multilingual-gemma2`, etc. + - + - ✅︎ + * - :code:`LlamaModel`, :code:`LlamaForCausalLM`, :code:`MistralModel`, etc. 
+ - Llama-based + - :code:`intfloat/e5-mistral-7b-instruct`, etc. + - ✅︎ + - ✅︎ + * - :code:`Qwen2Model`, :code:`Qwen2ForCausalLM` + - Qwen2-based + - :code:`ssmits/Qwen2-7B-Instruct-embed-base` (see note), :code:`Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. + - ✅︎ + - ✅︎ + * - :code:`RobertaModel`, :code:`RobertaForMaskedLM` + - RoBERTa-based + - :code:`sentence-transformers/all-roberta-large-v1`, :code:`sentence-transformers/all-roberta-large-v1`, etc. + - + - + * - :code:`XLMRobertaModel` + - XLM-RoBERTa-based + - :code:`intfloat/multilingual-e5-large`, etc. + - + - +``` + +```{note} +{code}`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config. +You should manually set mean pooling by passing {code}`--override-pooler-config '{"pooling_type": "MEAN"}'`. +``` + +```{note} +Unlike base Qwen2, {code}`Alibaba-NLP/gte-Qwen2-7B-instruct` uses bi-directional attention. +You can set {code}`--hf-overrides '{"is_causal": false}'` to change the attention mask accordingly. + +On the other hand, its 1.5B variant ({code}`Alibaba-NLP/gte-Qwen2-1.5B-instruct`) uses causal attention +despite being described otherwise on its model card. +``` + +#### Reward Modeling (`--task reward`) + +```{eval-rst} +.. list-table:: + :widths: 25 25 50 5 5 + :header-rows: 1 + + * - Architecture + - Models + - Example HF Models + - :ref:`LoRA ` + - :ref:`PP ` + * - :code:`LlamaForCausalLM` + - Llama-based + - :code:`peiyi9979/math-shepherd-mistral-7b-prm`, etc. + - ✅︎ + - ✅︎ + * - :code:`Qwen2ForRewardModel` + - Qwen2-based + - :code:`Qwen/Qwen2.5-Math-RM-72B`, etc. + - ✅︎ + - ✅︎ +``` + +```{important} +For process-supervised reward models such as {code}`peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, +e.g.: {code}`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`. +``` + +#### Classification (`--task classify`) + +```{eval-rst} +.. 
list-table:: + :widths: 25 25 50 5 5 + :header-rows: 1 + + * - Architecture + - Models + - Example HF Models + - :ref:`LoRA ` + - :ref:`PP ` + * - :code:`Qwen2ForSequenceClassification` + - Qwen2-based + - :code:`jason9693/Qwen2.5-1.5B-apeach`, etc. + - ✅︎ + - ✅︎ +``` + +#### Sentence Pair Scoring (`--task score`) + +```{eval-rst} +.. list-table:: + :widths: 25 25 50 5 5 + :header-rows: 1 + + * - Architecture + - Models + - Example HF Models + - :ref:`LoRA ` + - :ref:`PP ` + * - :code:`BertForSequenceClassification` + - BERT-based + - :code:`cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. + - + - + * - :code:`RobertaForSequenceClassification` + - RoBERTa-based + - :code:`cross-encoder/quora-roberta-base`, etc. + - + - + * - :code:`XLMRobertaForSequenceClassification` + - XLM-RoBERTa-based + - :code:`BAAI/bge-reranker-v2-m3`, etc. + - + - +``` + +(supported-mm-models)= + +## List of Multimodal Language Models + +The following modalities are supported depending on the model: + +- **T**ext +- **I**mage +- **V**ideo +- **A**udio + +Any combination of modalities joined by {code}`+` are supported. + +- e.g.: {code}`T + I` means that the model supports text-only, image-only, and text-with-image inputs. + +On the other hand, modalities separated by {code}`/` are mutually exclusive. + +- e.g.: {code}`T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs. + +See {ref}`this page ` on how to pass multi-modal inputs to the model. + +### Generative Models + +See {ref}`this page ` for more information on how to use generative models. + +#### Text Generation (`--task generate`) + +```{eval-rst} +.. 
list-table:: + :widths: 25 25 15 20 5 5 5 + :header-rows: 1 + + * - Architecture + - Models + - Inputs + - Example HF Models + - :ref:`LoRA ` + - :ref:`PP ` + - V1 + * - :code:`AriaForConditionalGeneration` + - Aria + - T + I + - :code:`rhymes-ai/Aria` + - + - ✅︎ + - + * - :code:`Blip2ForConditionalGeneration` + - BLIP-2 + - T + I\ :sup:`E` + - :code:`Salesforce/blip2-opt-2.7b`, :code:`Salesforce/blip2-opt-6.7b`, etc. + - + - ✅︎ + - + * - :code:`ChameleonForConditionalGeneration` + - Chameleon + - T + I + - :code:`facebook/chameleon-7b` etc. + - + - ✅︎ + - + * - :code:`FuyuForCausalLM` + - Fuyu + - T + I + - :code:`adept/fuyu-8b` etc. + - + - ✅︎ + - + * - :code:`ChatGLMModel` + - GLM-4V + - T + I + - :code:`THUDM/glm-4v-9b` etc. + - ✅︎ + - ✅︎ + - + * - :code:`H2OVLChatModel` + - H2OVL + - T + I\ :sup:`E+` + - :code:`h2oai/h2ovl-mississippi-800m`, :code:`h2oai/h2ovl-mississippi-2b`, etc. + - + - ✅︎ + - + * - :code:`Idefics3ForConditionalGeneration` + - Idefics3 + - T + I + - :code:`HuggingFaceM4/Idefics3-8B-Llama3` etc. + - ✅︎ + - + - + * - :code:`InternVLChatModel` + - InternVL 2.5, Mono-InternVL, InternVL 2.0 + - T + I\ :sup:`E+` + - :code:`OpenGVLab/InternVL2_5-4B`, :code:`OpenGVLab/Mono-InternVL-2B`, :code:`OpenGVLab/InternVL2-4B`, etc. + - + - ✅︎ + - ✅︎ + * - :code:`LlavaForConditionalGeneration` + - LLaVA-1.5 + - T + I\ :sup:`E+` + - :code:`llava-hf/llava-1.5-7b-hf`, :code:`TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. + - + - ✅︎ + - ✅︎ + * - :code:`LlavaNextForConditionalGeneration` + - LLaVA-NeXT + - T + I\ :sup:`E+` + - :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc. + - + - ✅︎ + - + * - :code:`LlavaNextVideoForConditionalGeneration` + - LLaVA-NeXT-Video + - T + V + - :code:`llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. 
+ - + - ✅︎ + - + * - :code:`LlavaOnevisionForConditionalGeneration` + - LLaVA-Onevision + - T + I\ :sup:`+` + V\ :sup:`+` + - :code:`llava-hf/llava-onevision-qwen2-7b-ov-hf`, :code:`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. + - + - ✅︎ + - + * - :code:`MiniCPMV` + - MiniCPM-V + - T + I\ :sup:`E+` + - :code:`openbmb/MiniCPM-V-2` (see note), :code:`openbmb/MiniCPM-Llama3-V-2_5`, :code:`openbmb/MiniCPM-V-2_6`, etc. + - ✅︎ + - ✅︎ + - + * - :code:`MllamaForConditionalGeneration` + - Llama 3.2 + - T + I\ :sup:`+` + - :code:`meta-llama/Llama-3.2-90B-Vision-Instruct`, :code:`meta-llama/Llama-3.2-11B-Vision`, etc. + - + - + - + * - :code:`MolmoForCausalLM` + - Molmo + - T + I + - :code:`allenai/Molmo-7B-D-0924`, :code:`allenai/Molmo-72B-0924`, etc. + - + - ✅︎ + - ✅︎ + * - :code:`NVLM_D_Model` + - NVLM-D 1.0 + - T + I\ :sup:`E+` + - :code:`nvidia/NVLM-D-72B`, etc. + - + - ✅︎ + - ✅︎ + * - :code:`PaliGemmaForConditionalGeneration` + - PaliGemma + - T + I\ :sup:`E` + - :code:`google/paligemma-3b-pt-224`, :code:`google/paligemma-3b-mix-224`, etc. + - + - ✅︎ + - + * - :code:`Phi3VForCausalLM` + - Phi-3-Vision, Phi-3.5-Vision + - T + I\ :sup:`E+` + - :code:`microsoft/Phi-3-vision-128k-instruct`, :code:`microsoft/Phi-3.5-vision-instruct` etc. + - + - ✅︎ + - ✅︎ + * - :code:`PixtralForConditionalGeneration` + - Pixtral + - T + I\ :sup:`+` + - :code:`mistralai/Pixtral-12B-2409`, :code:`mistral-community/pixtral-12b` etc. + - + - ✅︎ + - ✅︎ + * - :code:`QWenLMHeadModel` + - Qwen-VL + - T + I\ :sup:`E+` + - :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc. + - ✅︎ + - ✅︎ + - + * - :code:`Qwen2AudioForConditionalGeneration` + - Qwen2-Audio + - T + A\ :sup:`+` + - :code:`Qwen/Qwen2-Audio-7B-Instruct` + - + - ✅︎ + - + * - :code:`Qwen2VLForConditionalGeneration` + - Qwen2-VL + - T + I\ :sup:`E+` + V\ :sup:`E+` + - :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc. 
+ - ✅︎ + - ✅︎ + - + * - :code:`UltravoxModel` + - Ultravox + - T + A\ :sup:`E+` + - :code:`fixie-ai/ultravox-v0_3` + - + - ✅︎ + - +``` + +{sup}`E` + + Pre-computed embeddings can be inputted for this modality. + +{sup}`+` + + Multiple items can be inputted per text prompt for this modality. + +````{important} +To enable multiple multi-modal items per text prompt, you have to set {code}`limit_mm_per_prompt` (offline inference) +or {code}`--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt: + +```python +llm = LLM( + model="Qwen/Qwen2-VL-7B-Instruct", + limit_mm_per_prompt={"image": 4}, +) +``` + +```bash +vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4 +``` +```` + +```{note} +vLLM currently only supports adding LoRA to the language backbone of multimodal models. +``` + +```{note} +To use {code}`TIGER-Lab/Mantis-8B-siglip-llama3`, you have to install their GitHub repo ({code}`pip install git+https://github.com/TIGER-AI-Lab/Mantis.git`) +and pass {code}`--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. +``` + +```{note} +The official {code}`openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork ({code}`HwwwH/MiniCPM-V-2`) for now. +For more details, please see: <https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630> +``` + +### Pooling Models + +See {ref}`this page ` for more information on how to use pooling models. + +```{important} +Since some model architectures support both generative and pooling tasks, +you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode. +``` + +#### Text Embedding (`--task embed`) + +Any text generation model can be converted into an embedding model by passing {code}`--task embed`. + +```{note} +To get the best results, you should use pooling models that are specifically trained as such. +``` + +The following table lists those that are tested in vLLM. + +```{eval-rst} +..
list-table:: + :widths: 25 25 15 25 5 5 + :header-rows: 1 + + * - Architecture + - Models + - Inputs + - Example HF Models + - :ref:`LoRA ` + - :ref:`PP ` + * - :code:`LlavaNextForConditionalGeneration` + - LLaVA-NeXT-based + - T / I + - :code:`royokong/e5-v` + - + - ✅︎ + * - :code:`Phi3VForCausalLM` + - Phi-3-Vision-based + - T + I + - :code:`TIGER-Lab/VLM2Vec-Full` + - 🚧 + - ✅︎ + * - :code:`Qwen2VLForConditionalGeneration` + - Qwen2-VL-based + - T + I + - :code:`MrLight/dse-qwen2-2b-mrl-v1` + - + - ✅︎ +``` + +______________________________________________________________________ + +# Model Support Policy + +At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. Here’s how we manage third-party model support: + +1. **Community-Driven Support**: We encourage community contributions for adding new models. When a user requests support for a new model, we welcome pull requests (PRs) from the community. These contributions are evaluated primarily on the sensibility of the output they generate, rather than strict consistency with existing implementations such as those in transformers. **Call for contribution:** PRs coming directly from model vendors are greatly appreciated! +2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results. 
+ +```{tip} +When comparing the output of {code}`model.generate` from HuggingFace Transformers with the output of {code}`llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs. +``` + +3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback. +4. **Monitoring and Updates**: Users interested in specific models should monitor the commit history for those models (e.g., by tracking changes in the main/vllm/model_executor/models directory). This proactive approach helps users stay informed about updates and changes that may affect the models they use. +5. **Selective Focus**: Our resources are primarily directed towards models with significant user interest and impact. Models that are less frequently used may receive less attention, and we rely on the community to play a more active role in their upkeep and improvement. + +Through this approach, vLLM fosters a collaborative environment where both the core development team and the broader community contribute to the robustness and diversity of the third-party models supported in our ecosystem. + +Note that, as an inference engine, vLLM does not introduce new models. 
Therefore, all models supported by vLLM are third-party models in this regard. + +We have the following levels of testing for models: + +1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to [models tests](https://github.com/vllm-project/vllm/blob/main/tests/models) for the models that have passed this test. +2. **Output Sensibility**: We check if the output of the model is sensible and coherent, by measuring the perplexity of the output and checking for any obvious errors. This is a less stringent test. +3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to [functionality tests](https://github.com/vllm-project/vllm/tree/main/tests) and [examples](https://github.com/vllm-project/vllm/tree/main/examples) for the models that have passed this test. +4. **Community Feedback**: We rely on the community to provide feedback on the models. If a model is broken or not working as expected, we encourage users to raise issues to report it or open pull requests to fix it. The rest of the models fall under this category. diff --git a/docs/source/performance/benchmarks.md b/docs/source/performance/benchmarks.md new file mode 100644 index 0000000000000..7859c4ed2353d --- /dev/null +++ b/docs/source/performance/benchmarks.md @@ -0,0 +1,28 @@ +(benchmarks)= + +# Benchmark Suites + +vLLM contains two sets of benchmarks: + +- {ref}`Performance benchmarks ` +- {ref}`Nightly benchmarks ` + +(performance-benchmarks)= + +## Performance Benchmarks + +The performance benchmarks are used for development to confirm whether new changes improve performance under various workloads. They are triggered on every commit with both the `perf-benchmarks` and `ready` labels, and when a PR is merged into vLLM. 
+ +The latest performance results are hosted on the public [vLLM Performance Dashboard](https://perf.vllm.ai). + +More information on the performance benchmarks and their parameters can be found [here](https://github.com/vllm-project/vllm/blob/main/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md). + +(nightly-benchmarks)= + +## Nightly Benchmarks + +These compare vLLM's performance against alternatives (`tgi`, `trt-llm`, and `lmdeploy`) when there are major updates of vLLM (e.g., bumping up to a new version). They are primarily intended for consumers to evaluate when to choose vLLM over other options and are triggered on every commit with both the `perf-benchmarks` and `nightly-benchmarks` labels. + +The latest nightly benchmark results are shared in major release blog posts such as [vLLM v0.6.0](https://blog.vllm.ai/2024/09/05/perf-update.html). + +More information on the nightly benchmarks and their parameters can be found [here](https://github.com/vllm-project/vllm/blob/main/.buildkite/nightly-benchmarks/nightly-descriptions.md). diff --git a/docs/source/performance/benchmarks.rst b/docs/source/performance/benchmarks.rst deleted file mode 100644 index 6d4d7b544cb5d..0000000000000 --- a/docs/source/performance/benchmarks.rst +++ /dev/null @@ -1,33 +0,0 @@ -.. _benchmarks: - -================ -Benchmark Suites -================ - -vLLM contains two sets of benchmarks: - -+ :ref:`Performance benchmarks ` -+ :ref:`Nightly benchmarks ` - - -.. _performance_benchmarks: - -Performance Benchmarks ----------------------- - -The performance benchmarks are used for development to confirm whether new changes improve performance under various workloads. They are triggered on every commit with both the ``perf-benchmarks`` and ``ready`` labels, and when a PR is merged into vLLM. - -The latest performance results are hosted on the public `vLLM Performance Dashboard `_. 
- -More information on the performance benchmarks and their parameters can be found `here `__. - -.. _nightly_benchmarks: - -Nightly Benchmarks ------------------- - -These compare vLLM's performance against alternatives (``tgi``, ``trt-llm``, and ``lmdeploy``) when there are major updates of vLLM (e.g., bumping up to a new version). They are primarily intended for consumers to evaluate when to choose vLLM over other options and are triggered on every commit with both the ``perf-benchmarks`` and ``nightly-benchmarks`` labels. - -The latest nightly benchmark results are shared in major release blog posts such as `vLLM v0.6.0 `_. - -More information on the nightly benchmarks and their parameters can be found `here `__. \ No newline at end of file diff --git a/docs/source/quantization/auto_awq.md b/docs/source/quantization/auto_awq.md new file mode 100644 index 0000000000000..c02fbf0605a8c --- /dev/null +++ b/docs/source/quantization/auto_awq.md @@ -0,0 +1,78 @@ +(auto-awq)= + +# AutoAWQ + +```{warning} +Please note that AWQ support in vLLM is under-optimized at the moment. We would recommend using the unquantized version of the model for better +accuracy and higher throughput. Currently, you can use AWQ as a way to reduce memory footprint. As of now, it is more suitable for low latency +inference with a small number of concurrent requests. vLLM's AWQ implementation has lower throughput than the unquantized version. +``` + +To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github.com/casper-hansen/AutoAWQ). +Quantizing reduces the model's precision from FP16 to INT4 which effectively reduces the file size by ~70%. +The main benefits are lower latency and memory usage. + +You can quantize your own models by installing AutoAWQ or picking one of the [400+ models on Huggingface](https://huggingface.co/models?sort=trending&search=awq). + +```console +$ pip install autoawq +``` + +After installing AutoAWQ, you are ready to quantize a model.
Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`: + +```python +from awq import AutoAWQForCausalLM +from transformers import AutoTokenizer + +model_path = 'mistralai/Mistral-7B-Instruct-v0.2' +quant_path = 'mistral-instruct-v0.2-awq' +quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" } + +# Load model +model = AutoAWQForCausalLM.from_pretrained( + model_path, **{"low_cpu_mem_usage": True, "use_cache": False} +) +tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + +# Quantize +model.quantize(tokenizer, quant_config=quant_config) + +# Save quantized model +model.save_quantized(quant_path) +tokenizer.save_pretrained(quant_path) + +print(f'Model is quantized and saved at "{quant_path}"') +``` + +To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command: + +```console +$ python examples/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq +``` + +AWQ models are also supported directly through the LLM entrypoint: + +```python +from vllm import LLM, SamplingParams + +# Sample prompts. +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +# Create a sampling params object. +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +# Create an LLM. +llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ") +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. +outputs = llm.generate(prompts, sampling_params) +# Print the outputs. 
+for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` diff --git a/docs/source/quantization/auto_awq.rst b/docs/source/quantization/auto_awq.rst deleted file mode 100644 index 8eb6fa2f4cbe1..0000000000000 --- a/docs/source/quantization/auto_awq.rst +++ /dev/null @@ -1,79 +0,0 @@ -.. _auto_awq: - -AutoAWQ -================== - -.. warning:: - - Please note that AWQ support in vLLM is under-optimized at the moment. We would recommend using the unquantized version of the model for better - accuracy and higher throughput. Currently, you can use AWQ as a way to reduce memory footprint. As of now, it is more suitable for low latency - inference with small number of concurrent requests. vLLM's AWQ implementation have lower throughput than unquantized version. - -To create a new 4-bit quantized model, you can leverage `AutoAWQ `_. -Quantizing reduces the model's precision from FP16 to INT4 which effectively reduces the file size by ~70%. -The main benefits are lower latency and memory usage. - -You can quantize your own models by installing AutoAWQ or picking one of the `400+ models on Huggingface `_. - -.. code-block:: console - - $ pip install autoawq - -After installing AutoAWQ, you are ready to quantize a model. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`: - -.. 
code-block:: python - - from awq import AutoAWQForCausalLM - from transformers import AutoTokenizer - - model_path = 'mistralai/Mistral-7B-Instruct-v0.2' - quant_path = 'mistral-instruct-v0.2-awq' - quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" } - - # Load model - model = AutoAWQForCausalLM.from_pretrained( - model_path, **{"low_cpu_mem_usage": True, "use_cache": False} - ) - tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - - # Quantize - model.quantize(tokenizer, quant_config=quant_config) - - # Save quantized model - model.save_quantized(quant_path) - tokenizer.save_pretrained(quant_path) - - print(f'Model is quantized and saved at "{quant_path}"') - -To run an AWQ model with vLLM, you can use `TheBloke/Llama-2-7b-Chat-AWQ `_ with the following command: - -.. code-block:: console - - $ python examples/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq - -AWQ models are also supported directly through the LLM entrypoint: - -.. code-block:: python - - from vllm import LLM, SamplingParams - - # Sample prompts. - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - # Create a sampling params object. - sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - - # Create an LLM. - llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ") - # Generate texts from the prompts. The output is a list of RequestOutput objects - # that contain the prompt, generated text, and other information. - outputs = llm.generate(prompts, sampling_params) - # Print the outputs. 
- for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/docs/source/quantization/bnb.md b/docs/source/quantization/bnb.md new file mode 100644 index 0000000000000..2f7b779a9b361 --- /dev/null +++ b/docs/source/quantization/bnb.md @@ -0,0 +1,39 @@ +(bits-and-bytes)= + +# BitsAndBytes + +vLLM now supports [BitsAndBytes](https://github.com/TimDettmers/bitsandbytes) for more efficient model inference. +BitsAndBytes quantizes models to reduce memory usage and enhance performance without significantly sacrificing accuracy. +Compared to other quantization methods, BitsAndBytes eliminates the need for calibrating the quantized model with input data. + +Below are the steps to utilize BitsAndBytes with vLLM. + +```console +$ pip install bitsandbytes>=0.44.0 +``` + +vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint. + +You can find bitsandbytes quantized models on <https://huggingface.co/models?other=bitsandbytes>. +And usually, these repositories have a config.json file that includes a quantization_config section. + +## Read quantized checkpoint. + +```python +from vllm import LLM +import torch +# unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint. +model_id = "unsloth/tinyllama-bnb-4bit" +llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \ +quantization="bitsandbytes", load_format="bitsandbytes") +``` + +## Inflight quantization: load as 4bit quantization + +```python +from vllm import LLM +import torch +model_id = "huggyllama/llama-7b" +llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \ +quantization="bitsandbytes", load_format="bitsandbytes") +``` diff --git a/docs/source/quantization/bnb.rst b/docs/source/quantization/bnb.rst deleted file mode 100644 index 682938cc63d48..0000000000000 --- a/docs/source/quantization/bnb.rst +++ /dev/null @@ -1,43 +0,0 @@ -..
_bits_and_bytes: - -BitsAndBytes -================== - -vLLM now supports `BitsAndBytes `_ for more efficient model inference. -BitsAndBytes quantizes models to reduce memory usage and enhance performance without significantly sacrificing accuracy. -Compared to other quantization methods, BitsAndBytes eliminates the need for calibrating the quantized model with input data. - -Below are the steps to utilize BitsAndBytes with vLLM. - -.. code-block:: console - - $ pip install bitsandbytes>=0.44.0 - -vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint. - -You can find bitsandbytes quantized models on https://huggingface.co/models?other=bitsandbytes. -And usually, these repositories have a config.json file that includes a quantization_config section. - -Read quantized checkpoint. --------------------------- - -.. code-block:: python - - from vllm import LLM - import torch - # unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint. - model_id = "unsloth/tinyllama-bnb-4bit" - llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \ - quantization="bitsandbytes", load_format="bitsandbytes") - -Inflight quantization: load as 4bit quantization ------------------------------------------------- - -.. code-block:: python - - from vllm import LLM - import torch - model_id = "huggyllama/llama-7b" - llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \ - quantization="bitsandbytes", load_format="bitsandbytes") - diff --git a/docs/source/quantization/fp8.md b/docs/source/quantization/fp8.md new file mode 100644 index 0000000000000..b2eda74fd1e3b --- /dev/null +++ b/docs/source/quantization/fp8.md @@ -0,0 +1,192 @@ +(fp8)= + +# FP8 W8A8 + +vLLM supports FP8 (8-bit floating point) weight and activation quantization using hardware acceleration on GPUs such as Nvidia H100 and AMD MI300x. +Currently, only Hopper and Ada Lovelace GPUs are officially supported for W8A8. 
+Ampere GPUs are supported for W8A16 (weight-only FP8) utilizing Marlin kernels. +Quantization of models with FP8 allows for a 2x reduction in model memory requirements and up to a 1.6x improvement in throughput with minimal impact on accuracy. + +Please visit the HF collection of [quantized FP8 checkpoints of popular LLMs ready to use with vLLM](https://huggingface.co/collections/neuralmagic/fp8-llms-for-vllm-666742ed2b78b7ac8df13127). + +The FP8 types typically supported in hardware have two distinct representations, each useful in different scenarios: + +- **E4M3**: Consists of 1 sign bit, 4 exponent bits, and 3 bits of mantissa. It can store values up to +/-448 and `nan`. +- **E5M2**: Consists of 1 sign bit, 5 exponent bits, and 2 bits of mantissa. It can store values up to +/-57344, +/- `inf`, and `nan`. The tradeoff for the increased dynamic range is lower precision of the stored values. + +```{note} +FP8 computation is supported on NVIDIA GPUs with compute capability > 8.9 (Ada Lovelace, Hopper). +FP8 models will run on compute capability > 8.0 (Ampere) as weight-only W8A16, utilizing FP8 Marlin. +``` + +## Quick Start with Online Dynamic Quantization + +Dynamic quantization of an original precision BF16/FP16 model to FP8 can be achieved with vLLM without any calibration data required. You can enable the feature by specifying `--quantization="fp8"` in the command line or setting `quantization="fp8"` in the LLM constructor. + +In this mode, all Linear modules (except for the final `lm_head`) have their weights quantized down to FP8_E4M3 precision with a per-tensor scale. Activations have their minimum and maximum values calculated during each forward pass to provide a dynamic per-tensor scale for high accuracy. As a result, latency improvements are limited in this mode. 
+ +```python +from vllm import LLM +model = LLM("facebook/opt-125m", quantization="fp8") +# INFO 06-10 17:55:42 model_runner.py:157] Loading model weights took 0.1550 GB +result = model.generate("Hello, my name is") +``` + +```{warning} +Currently, we load the model at original precision before quantizing down to 8-bits, so you need enough memory to load the whole model. +``` + +## Installation + +To produce performant FP8 quantized models with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library: + +```console +$ pip install llmcompressor +``` + +## Quantization Process + +The quantization process involves three main steps: + +1. Loading the model +2. Applying quantization +3. Evaluating accuracy in vLLM + +### 1. Loading the Model + +Use `SparseAutoModelForCausalLM`, which wraps `AutoModelForCausalLM`, for saving and loading quantized models: + +```python +from llmcompressor.transformers import SparseAutoModelForCausalLM +from transformers import AutoTokenizer + +MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" + +model = SparseAutoModelForCausalLM.from_pretrained( + MODEL_ID, device_map="auto", torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +``` + +### 2. Applying Quantization + +For FP8 quantization, we can recover accuracy with simple RTN quantization. We recommend targeting all `Linear` layers using the `FP8_DYNAMIC` scheme, which uses: + +- Static, per-channel quantization on the weights +- Dynamic, per-token quantization on the activations + +Since simple RTN does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow. 
+ +```python +from llmcompressor.transformers import oneshot +from llmcompressor.modifiers.quantization import QuantizationModifier + +# Configure the simple PTQ quantization +recipe = QuantizationModifier( + targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]) + +# Apply the quantization algorithm. +oneshot(model=model, recipe=recipe) + +# Save the model. +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" +model.save_pretrained(SAVE_DIR) +tokenizer.save_pretrained(SAVE_DIR) +``` + +### 3. Evaluating Accuracy + +Install `vllm` and `lm-evaluation-harness`: + +```console +$ pip install vllm lm-eval==0.4.4 +``` + +Load and run the model in `vllm`: + +```python +from vllm import LLM +model = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic") +model.generate("Hello my name is") +``` + +Evaluate accuracy with `lm_eval` (for example on 250 samples of `gsm8k`): + +```{note} +Quantized models can be sensitive to the presence of the `bos` token. `lm_eval` does not add a `bos` token by default, so make sure to include the `add_bos_token=True` argument when running your evaluations. +``` + +```console +$ MODEL=$PWD/Meta-Llama-3-8B-Instruct-FP8-Dynamic +$ lm_eval \ + --model vllm \ + --model_args pretrained=$MODEL,add_bos_token=True \ + --tasks gsm8k --num_fewshot 5 --batch_size auto --limit 250 +``` + +Here's an example of the resulting scores: + +```text +|Tasks|Version| Filter |n-shot| Metric | |Value| |Stderr| +|-----|------:|----------------|-----:|-----------|---|----:|---|-----:| +|gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.768|± |0.0268| +| | |strict-match | 5|exact_match|↑ |0.768|± |0.0268| +``` + +## Troubleshooting and Support + +If you encounter any issues or have feature requests, please open an issue on the `vllm-project/llm-compressor` GitHub repository. + +## Deprecated Flow + +```{note} +The following information is preserved for reference and search purposes. 
+The quantization method described below is deprecated in favor of the `llmcompressor` method described above. +``` + +For static per-tensor offline quantization to FP8, please install the [AutoFP8 library](https://github.com/neuralmagic/autofp8). + +```bash +git clone https://github.com/neuralmagic/AutoFP8.git +pip install -e AutoFP8 +``` + +This package introduces the `AutoFP8ForCausalLM` and `BaseQuantizeConfig` objects for managing how your model will be compressed. + +## Offline Quantization with Static Activation Scaling Factors + +You can use AutoFP8 with calibration data to produce per-tensor static scales for both the weights and activations by enabling the `activation_scheme="static"` argument. + +```python +from datasets import load_dataset +from transformers import AutoTokenizer +from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig + +pretrained_model_dir = "meta-llama/Meta-Llama-3-8B-Instruct" +quantized_model_dir = "Meta-Llama-3-8B-Instruct-FP8" + +tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True) +tokenizer.pad_token = tokenizer.eos_token + +# Load and tokenize 512 dataset samples for calibration of activation scales +ds = load_dataset("mgoin/ultrachat_2k", split="train_sft").select(range(512)) +examples = [tokenizer.apply_chat_template(batch["messages"], tokenize=False) for batch in ds] +examples = tokenizer(examples, padding=True, truncation=True, return_tensors="pt").to("cuda") + +# Define quantization config with static activation scales +quantize_config = BaseQuantizeConfig(quant_method="fp8", activation_scheme="static") + +# Load the model, quantize, and save checkpoint +model = AutoFP8ForCausalLM.from_pretrained(pretrained_model_dir, quantize_config) +model.quantize(examples) +model.save_quantized(quantized_model_dir) +``` + +Your model checkpoint with quantized weights and activations should be available at `Meta-Llama-3-8B-Instruct-FP8/`. 
+Finally, you can load the quantized model checkpoint directly in vLLM. + +```python +from vllm import LLM +model = LLM(model="Meta-Llama-3-8B-Instruct-FP8/") +# INFO 06-10 21:15:41 model_runner.py:159] Loading model weights took 8.4596 GB +result = model.generate("Hello, my name is") +``` diff --git a/docs/source/quantization/fp8.rst b/docs/source/quantization/fp8.rst deleted file mode 100644 index 4dbf8e9d346e1..0000000000000 --- a/docs/source/quantization/fp8.rst +++ /dev/null @@ -1,204 +0,0 @@ -.. _fp8: - -FP8 W8A8 -================== - -vLLM supports FP8 (8-bit floating point) weight and activation quantization using hardware acceleration on GPUs such as Nvidia H100 and AMD MI300x. -Currently, only Hopper and Ada Lovelace GPUs are officially supported for W8A8. -Ampere GPUs are supported for W8A16 (weight-only FP8) utilizing Marlin kernels. -Quantization of models with FP8 allows for a 2x reduction in model memory requirements and up to a 1.6x improvement in throughput with minimal impact on accuracy. - -Please visit the HF collection of `quantized FP8 checkpoints of popular LLMs ready to use with vLLM `_. - -The FP8 types typically supported in hardware have two distinct representations, each useful in different scenarios: - -- **E4M3**: Consists of 1 sign bit, 4 exponent bits, and 3 bits of mantissa. It can store values up to +/-448 and ``nan``. -- **E5M2**: Consists of 1 sign bit, 5 exponent bits, and 2 bits of mantissa. It can store values up to +/-57344, +/- ``inf``, and ``nan``. The tradeoff for the increased dynamic range is lower precision of the stored values. - -.. note:: - - FP8 computation is supported on NVIDIA GPUs with compute capability > 8.9 (Ada Lovelace, Hopper). - FP8 models will run on compute capability > 8.0 (Ampere) as weight-only W8A16, utilizing FP8 Marlin. 
- -Quick Start with Online Dynamic Quantization --------------------------------------------- - -Dynamic quantization of an original precision BF16/FP16 model to FP8 can be achieved with vLLM without any calibration data required. You can enable the feature by specifying ``--quantization="fp8"`` in the command line or setting ``quantization="fp8"`` in the LLM constructor. - -In this mode, all Linear modules (except for the final ``lm_head``) have their weights quantized down to FP8_E4M3 precision with a per-tensor scale. Activations have their minimum and maximum values calculated during each forward pass to provide a dynamic per-tensor scale for high accuracy. As a result, latency improvements are limited in this mode. - -.. code-block:: python - - from vllm import LLM - model = LLM("facebook/opt-125m", quantization="fp8") - # INFO 06-10 17:55:42 model_runner.py:157] Loading model weights took 0.1550 GB - result = model.generate("Hello, my name is") - -.. warning:: - - Currently, we load the model at original precision before quantizing down to 8-bits, so you need enough memory to load the whole model. - -Installation ------------- - -To produce performant FP8 quantized models with vLLM, you'll need to install the `llm-compressor `_ library: - -.. code-block:: console - - $ pip install llmcompressor - -Quantization Process --------------------- - -The quantization process involves three main steps: - -1. Loading the model -2. Applying quantization -3. Evaluating accuracy in vLLM - -1. Loading the Model -^^^^^^^^^^^^^^^^^^^^ - -Use ``SparseAutoModelForCausalLM``, which wraps ``AutoModelForCausalLM``, for saving and loading quantized models: - -.. 
code-block:: python - - from llmcompressor.transformers import SparseAutoModelForCausalLM - from transformers import AutoTokenizer - - MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" - - model = SparseAutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto") - tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -2. Applying Quantization -^^^^^^^^^^^^^^^^^^^^^^^^ - -For FP8 quantization, we can recover accuracy with simple RTN quantization. We recommend targeting all ``Linear`` layers using the ``FP8_DYNAMIC`` scheme, which uses: - -- Static, per-channel quantization on the weights -- Dynamic, per-token quantization on the activations - -Since simple RTN does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow. - -.. code-block:: python - - from llmcompressor.transformers import oneshot - from llmcompressor.modifiers.quantization import QuantizationModifier - - # Configure the simple PTQ quantization - recipe = QuantizationModifier( - targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]) - - # Apply the quantization algorithm. - oneshot(model=model, recipe=recipe) - - # Save the model. - SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" - model.save_pretrained(SAVE_DIR) - tokenizer.save_pretrained(SAVE_DIR) - -3. Evaluating Accuracy -^^^^^^^^^^^^^^^^^^^^^^ - -Install ``vllm`` and ``lm-evaluation-harness``: - -.. code-block:: console - - $ pip install vllm lm-eval==0.4.4 - -Load and run the model in ``vllm``: - -.. code-block:: python - - from vllm import LLM - model = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic") - model.generate("Hello my name is") - -Evaluate accuracy with ``lm_eval`` (for example on 250 samples of ``gsm8k``): - -.. note:: - - Quantized models can be sensitive to the presence of the ``bos`` token. 
``lm_eval`` does not add a ``bos`` token by default, so make sure to include the ``add_bos_token=True`` argument when running your evaluations. - -.. code-block:: console - - $ MODEL=$PWD/Meta-Llama-3-8B-Instruct-FP8-Dynamic - $ lm_eval \ - --model vllm \ - --model_args pretrained=$MODEL,add_bos_token=True \ - --tasks gsm8k --num_fewshot 5 --batch_size auto --limit 250 - -Here's an example of the resulting scores: - -.. code-block:: text - - |Tasks|Version| Filter |n-shot| Metric | |Value| |Stderr| - |-----|------:|----------------|-----:|-----------|---|----:|---|-----:| - |gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.768|± |0.0268| - | | |strict-match | 5|exact_match|↑ |0.768|± |0.0268| - -Troubleshooting and Support ---------------------------- - -If you encounter any issues or have feature requests, please open an issue on the ``vllm-project/llm-compressor`` GitHub repository. - - -Deprecated Flow ------------------- - -.. note:: - - The following information is preserved for reference and search purposes. - The quantization method described below is deprecated in favor of the ``llmcompressor`` method described above. - -For static per-tensor offline quantization to FP8, please install the `AutoFP8 library `_. - -.. code-block:: bash - - git clone https://github.com/neuralmagic/AutoFP8.git - pip install -e AutoFP8 - -This package introduces the ``AutoFP8ForCausalLM`` and ``BaseQuantizeConfig`` objects for managing how your model will be compressed. - -Offline Quantization with Static Activation Scaling Factors ------------------------------------------------------------ - -You can use AutoFP8 with calibration data to produce per-tensor static scales for both the weights and activations by enabling the ``activation_scheme="static"`` argument. - -.. 
code-block:: python - - from datasets import load_dataset - from transformers import AutoTokenizer - from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig - - pretrained_model_dir = "meta-llama/Meta-Llama-3-8B-Instruct" - quantized_model_dir = "Meta-Llama-3-8B-Instruct-FP8" - - tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True) - tokenizer.pad_token = tokenizer.eos_token - - # Load and tokenize 512 dataset samples for calibration of activation scales - ds = load_dataset("mgoin/ultrachat_2k", split="train_sft").select(range(512)) - examples = [tokenizer.apply_chat_template(batch["messages"], tokenize=False) for batch in ds] - examples = tokenizer(examples, padding=True, truncation=True, return_tensors="pt").to("cuda") - - # Define quantization config with static activation scales - quantize_config = BaseQuantizeConfig(quant_method="fp8", activation_scheme="static") - - # Load the model, quantize, and save checkpoint - model = AutoFP8ForCausalLM.from_pretrained(pretrained_model_dir, quantize_config) - model.quantize(examples) - model.save_quantized(quantized_model_dir) - -Your model checkpoint with quantized weights and activations should be available at ``Meta-Llama-3-8B-Instruct-FP8/``. -Finally, you can load the quantized model checkpoint directly in vLLM. - -.. code-block:: python - - from vllm import LLM - model = LLM(model="Meta-Llama-3-8B-Instruct-FP8/") - # INFO 06-10 21:15:41 model_runner.py:159] Loading model weights took 8.4596 GB - result = model.generate("Hello, my name is") - diff --git a/docs/source/quantization/fp8_e4m3_kvcache.md b/docs/source/quantization/fp8_e4m3_kvcache.md new file mode 100644 index 0000000000000..f200c722d1d42 --- /dev/null +++ b/docs/source/quantization/fp8_e4m3_kvcache.md @@ -0,0 +1,44 @@ +(fp8-e4m3-kvcache)= + +# FP8 E4M3 KV Cache + +Quantizing the KV cache to FP8 reduces its memory footprint. This increases the number of tokens that can be stored in the cache, +improving throughput. 
OCP (Open Compute Project www.opencompute.org) specifies two common 8-bit floating point data formats: E5M2 +(5 exponent bits and 2 mantissa bits) and E4M3FN (4 exponent bits and 3 mantissa bits), often shortened as E4M3. One benefit of +the E4M3 format over E5M2 is that floating point numbers are represented in higher precision. However, the small dynamic range of +FP8 E4M3 (±240.0 can be represented) typically necessitates the use of a higher-precision (typically FP32) scaling factor alongside +each quantized tensor. For now, only per-tensor (scalar) scaling factors are supported. Development is ongoing to support scaling +factors of a finer granularity (e.g. per-channel). + +These scaling factors can be specified by passing an optional quantization param JSON to the LLM engine at load time. If +this JSON is not specified, scaling factors default to 1.0. These scaling factors are typically obtained when running an +unquantized model through a quantizer tool (e.g. AMD quantizer or NVIDIA AMMO). + +To install AMMO (AlgorithMic Model Optimization): + +```console +$ pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo +``` + +Studies have shown that FP8 E4M3 quantization typically only minimally degrades inference accuracy. The most recent silicon +offerings e.g. AMD MI300, NVIDIA Hopper or later support native hardware conversion to and from fp32, fp16, bf16, etc. +Thus, LLM inference is greatly accelerated with minimal accuracy loss. + +Here is an example of how to enable this feature: + +```python +# two float8_e4m3fn kv cache scaling factor files are provided under tests/fp8_kv, please refer to +# https://github.com/vllm-project/vllm/blob/main/examples/fp8/README.md to generate kv_cache_scales.json of your own. 
+ +from vllm import LLM, SamplingParams +sampling_params = SamplingParams(temperature=1.3, top_p=0.8) +llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", + kv_cache_dtype="fp8", + quantization_param_path="./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json") +prompt = "London is the capital of" +out = llm.generate(prompt, sampling_params)[0].outputs[0].text +print(out) + +# output w/ scaling factors: England, the United Kingdom, and one of the world's leading financial, +# output w/o scaling factors: England, located in the southeastern part of the country. It is known +``` diff --git a/docs/source/quantization/fp8_e4m3_kvcache.rst b/docs/source/quantization/fp8_e4m3_kvcache.rst deleted file mode 100644 index cc52d8f40af8f..0000000000000 --- a/docs/source/quantization/fp8_e4m3_kvcache.rst +++ /dev/null @@ -1,47 +0,0 @@ -.. _fp8_e4m3_kvcache: - -FP8 E4M3 KV Cache -================== - -Quantizing the KV cache to FP8 reduces its memory footprint. This increases the number of tokens that can be stored in the cache, -improving throughput. OCP (Open Compute Project www.opencompute.org) specifies two common 8-bit floating point data formats: E5M2 -(5 exponent bits and 2 mantissa bits) and E4M3FN (4 exponent bits and 3 mantissa bits), often shortened as E4M3. One benefit of -the E4M3 format over E5M2 is that floating point numbers are represented in higher precision. However, the small dynamic range of -FP8 E4M3 (±240.0 can be represented) typically necessitates the use of a higher-precision (typically FP32) scaling factor alongside -each quantized tensor. For now, only per-tensor (scalar) scaling factors are supported. Development is ongoing to support scaling -factors of a finer granularity (e.g. per-channel). - -These scaling factors can be specified by passing an optional quantization param JSON to the LLM engine at load time. If -this JSON is not specified, scaling factors default to 1.0. 
These scaling factors are typically obtained when running an -unquantized model through a quantizer tool (e.g. AMD quantizer or NVIDIA AMMO). - -To install AMMO (AlgorithMic Model Optimization): - -.. code-block:: console - - $ pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo - -Studies have shown that FP8 E4M3 quantization typically only minimally degrades inference accuracy. The most recent silicon -offerings e.g. AMD MI300, NVIDIA Hopper or later support native hardware conversion to and from fp32, fp16, bf16, etc. -Thus, LLM inference is greatly accelerated with minimal accuracy loss. - - -Here is an example of how to enable this feature: - -.. code-block:: python - - # two float8_e4m3fn kv cache scaling factor files are provided under tests/fp8_kv, please refer to - # https://github.com/vllm-project/vllm/blob/main/examples/fp8/README.md to generate kv_cache_scales.json of your own. - - from vllm import LLM, SamplingParams - sampling_params = SamplingParams(temperature=1.3, top_p=0.8) - llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", - kv_cache_dtype="fp8", - quantization_param_path="./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json") - prompt = "London is the capital of" - out = llm.generate(prompt, sampling_params)[0].outputs[0].text - print(out) - - # output w/ scaling factors: England, the United Kingdom, and one of the world's leading financial, - # output w/o scaling factors: England, located in the southeastern part of the country. It is known - diff --git a/docs/source/quantization/fp8_e5m2_kvcache.md b/docs/source/quantization/fp8_e5m2_kvcache.md new file mode 100644 index 0000000000000..3a81ab17f332f --- /dev/null +++ b/docs/source/quantization/fp8_e5m2_kvcache.md @@ -0,0 +1,31 @@ +(fp8-kv-cache)= + +# FP8 E5M2 KV Cache + +The int8/int4 quantization scheme requires additional scale GPU memory storage, which reduces the expected GPU memory benefits. 
+The FP8 data format retains 2~3 mantissa bits and can convert float/fp16/bfloat16 and fp8 to each other. + +Here is an example of how to enable this feature: + +```python +from vllm import LLM, SamplingParams +# Sample prompts. +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +# Create a sampling params object. +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) +# Create an LLM. +llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8") +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. +outputs = llm.generate(prompts, sampling_params) +# Print the outputs. +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` diff --git a/docs/source/quantization/fp8_e5m2_kvcache.rst b/docs/source/quantization/fp8_e5m2_kvcache.rst deleted file mode 100644 index b2d824427f786..0000000000000 --- a/docs/source/quantization/fp8_e5m2_kvcache.rst +++ /dev/null @@ -1,34 +0,0 @@ -.. _fp8_kv_cache: - -FP8 E5M2 KV Cache -================== - -The int8/int4 quantization scheme requires additional scale GPU memory storage, which reduces the expected GPU memory benefits. -The FP8 data format retains 2~3 mantissa bits and can convert float/fp16/bfloat16 and fp8 to each other. - -Here is an example of how to enable this feature: - -.. code-block:: python - - from vllm import LLM, SamplingParams - # Sample prompts. - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - # Create a sampling params object. - sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - # Create an LLM. - llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8") - # Generate texts from the prompts. 
The output is a list of RequestOutput objects - # that contain the prompt, generated text, and other information. - outputs = llm.generate(prompts, sampling_params) - # Print the outputs. - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - - diff --git a/docs/source/quantization/gguf.md b/docs/source/quantization/gguf.md new file mode 100644 index 0000000000000..eebf11dfc1b2b --- /dev/null +++ b/docs/source/quantization/gguf.md @@ -0,0 +1,72 @@ +(gguf)= + +# GGUF + +```{warning} +Please note that GGUF support in vLLM is highly experimental and under-optimized at the moment, it might be incompatible with other features. Currently, you can use GGUF as a way to reduce memory footprint. If you encounter any issues, please report them to the vLLM team. +``` + +```{warning} +Currently, vllm only supports loading single-file GGUF models. If you have a multi-files GGUF model, you can use [gguf-split](https://github.com/ggerganov/llama.cpp/pull/6135) tool to merge them to a single-file model. +``` + +To run a GGUF model with vLLM, you can download and use the local GGUF model from [TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF) with the following command: + +```console +$ wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf +$ # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. +$ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 +``` + +You can also add `--tensor-parallel-size 2` to enable tensor parallelism inference with 2 GPUs: + +```console +$ # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. 
+$ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2 +``` + +```{warning} +We recommend using the tokenizer from base model instead of GGUF model. Because the tokenizer conversion from GGUF is time-consuming and unstable, especially for some models with large vocab size. +``` + +You can also use the GGUF model directly through the LLM entrypoint: + +```python +from vllm import LLM, SamplingParams + +# In this script, we demonstrate how to pass input to the chat method: +conversation = [ + { + "role": "system", + "content": "You are a helpful assistant" + }, + { + "role": "user", + "content": "Hello" + }, + { + "role": "assistant", + "content": "Hello! How can I assist you today?" + }, + { + "role": "user", + "content": "Write an essay about the importance of higher education.", + }, +] + +# Create a sampling params object. +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +# Create an LLM. +llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf", + tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0") +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. +outputs = llm.chat(conversation, sampling_params) + +# Print the outputs. +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` diff --git a/docs/source/quantization/gguf.rst b/docs/source/quantization/gguf.rst deleted file mode 100644 index 9f00dc5563909..0000000000000 --- a/docs/source/quantization/gguf.rst +++ /dev/null @@ -1,73 +0,0 @@ -.. _gguf: - -GGUF -================== - -.. warning:: - - Please note that GGUF support in vLLM is highly experimental and under-optimized at the moment, it might be incompatible with other features. Currently, you can use GGUF as a way to reduce memory footprint. 
If you encounter any issues, please report them to the vLLM team. - -.. warning:: - - Currently, vllm only supports loading single-file GGUF models. If you have a multi-files GGUF model, you can use `gguf-split `_ tool to merge them to a single-file model. - -To run a GGUF model with vLLM, you can download and use the local GGUF model from `TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF `_ with the following command: - -.. code-block:: console - - $ wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf - $ # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. - $ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 - -You can also add ``--tensor-parallel-size 2`` to enable tensor parallelism inference with 2 GPUs: - -.. code-block:: console - - $ # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. - $ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2 - -.. warning:: - - We recommend using the tokenizer from base model instead of GGUF model. Because the tokenizer conversion from GGUF is time-consuming and unstable, especially for some models with large vocab size. - -You can also use the GGUF model directly through the LLM entrypoint: - -.. code-block:: python - - from vllm import LLM, SamplingParams - - # In this script, we demonstrate how to pass input to the chat method: - conversation = [ - { - "role": "system", - "content": "You are a helpful assistant" - }, - { - "role": "user", - "content": "Hello" - }, - { - "role": "assistant", - "content": "Hello! How can I assist you today?" - }, - { - "role": "user", - "content": "Write an essay about the importance of higher education.", - }, - ] - - # Create a sampling params object. 
- sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - - # Create an LLM. - llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf", - tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0") - # Generate texts from the prompts. The output is a list of RequestOutput objects - # that contain the prompt, generated text, and other information. - outputs = llm.chat(conversation, sampling_params) - - # Print the outputs. - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/docs/source/quantization/int8.md b/docs/source/quantization/int8.md new file mode 100644 index 0000000000000..1ac50ba987dda --- /dev/null +++ b/docs/source/quantization/int8.md @@ -0,0 +1,136 @@ +(int8)= + +# INT8 W8A8 + +vLLM supports quantizing weights and activations to INT8 for memory savings and inference acceleration. +This quantization method is particularly useful for reducing model size while maintaining good performance. + +Please visit the HF collection of [quantized INT8 checkpoints of popular LLMs ready to use with vLLM](https://huggingface.co/collections/neuralmagic/int8-llms-for-vllm-668ec32c049dca0369816415). + +```{note} +INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper). +``` + +## Prerequisites + +To use INT8 quantization with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library: + +```console +$ pip install llmcompressor +``` + +## Quantization Process + +The quantization process involves four main steps: + +1. Loading the model +2. Preparing calibration data +3. Applying quantization +4. Evaluating accuracy in vLLM + +### 1. 
Loading the Model + +Use `SparseAutoModelForCausalLM`, which wraps `AutoModelForCausalLM`, for saving and loading quantized models: + +```python +from llmcompressor.transformers import SparseAutoModelForCausalLM +from transformers import AutoTokenizer + +MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" +model = SparseAutoModelForCausalLM.from_pretrained( + MODEL_ID, device_map="auto", torch_dtype="auto", +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +``` + +### 2. Preparing Calibration Data + +When quantizing activations to INT8, you need sample data to estimate the activation scales. +It's best to use calibration data that closely matches your deployment data. +For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`: + +```python +from datasets import load_dataset + +NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 + +# Load and preprocess the dataset +ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft") +ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) + +def preprocess(example): + return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)} +ds = ds.map(preprocess) + +def tokenize(sample): + return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False) +ds = ds.map(tokenize, remove_columns=ds.column_names) +``` + +### 3. 
Applying Quantization + +Now, apply the quantization algorithms: + +```python +from llmcompressor.transformers import oneshot +from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.modifiers.smoothquant import SmoothQuantModifier + +# Configure the quantization algorithms +recipe = [ + SmoothQuantModifier(smoothing_strength=0.8), + GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]), +] + +# Apply quantization +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, +) + +# Save the compressed model +SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) +``` + +This process creates a W8A8 model with weights and activations quantized to 8-bit integers. + +### 4. Evaluating Accuracy + +After quantization, you can load and run the model in vLLM: + +```python +from vllm import LLM +model = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token") +``` + +To evaluate accuracy, you can use `lm_eval`: + +```console +$ lm_eval --model vllm \ + --model_args pretrained="./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token",add_bos_token=true \ + --tasks gsm8k \ + --num_fewshot 5 \ + --limit 250 \ + --batch_size 'auto' +``` + +```{note} +Quantized models can be sensitive to the presence of the `bos` token. Make sure to include the `add_bos_token=True` argument when running evaluations. 
+``` + +## Best Practices + +- Start with 512 samples for calibration data (increase if accuracy drops) +- Use a sequence length of 2048 as a starting point +- Employ the chat template or instruction template that the model was trained with +- If you've fine-tuned a model, consider using a sample of your training data for calibration + +## Troubleshooting and Support + +If you encounter any issues or have feature requests, please open an issue on the `vllm-project/llm-compressor` GitHub repository. diff --git a/docs/source/quantization/int8.rst b/docs/source/quantization/int8.rst deleted file mode 100644 index aa5b251becb1c..0000000000000 --- a/docs/source/quantization/int8.rst +++ /dev/null @@ -1,145 +0,0 @@ -.. _int8: - -INT8 W8A8 -================== - -vLLM supports quantizing weights and activations to INT8 for memory savings and inference acceleration. -This quantization method is particularly useful for reducing model size while maintaining good performance. - -Please visit the HF collection of `quantized INT8 checkpoints of popular LLMs ready to use with vLLM `_. - -.. note:: - - INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper). - -Prerequisites -------------- - -To use INT8 quantization with vLLM, you'll need to install the `llm-compressor `_ library: - -.. code-block:: console - - $ pip install llmcompressor - -Quantization Process --------------------- - -The quantization process involves four main steps: - -1. Loading the model -2. Preparing calibration data -3. Applying quantization -4. Evaluating accuracy in vLLM - -1. Loading the Model -^^^^^^^^^^^^^^^^^^^^ - -Use ``SparseAutoModelForCausalLM``, which wraps ``AutoModelForCausalLM``, for saving and loading quantized models: - -.. 
code-block:: python - - from llmcompressor.transformers import SparseAutoModelForCausalLM - from transformers import AutoTokenizer - - MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" - model = SparseAutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto", - ) - tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -2. Preparing Calibration Data -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -When quantizing activations to INT8, you need sample data to estimate the activation scales. -It's best to use calibration data that closely matches your deployment data. -For a general-purpose instruction-tuned model, you can use a dataset like ``ultrachat``: - -.. code-block:: python - - from datasets import load_dataset - - NUM_CALIBRATION_SAMPLES = 512 - MAX_SEQUENCE_LENGTH = 2048 - - # Load and preprocess the dataset - ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft") - ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) - - def preprocess(example): - return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)} - ds = ds.map(preprocess) - - def tokenize(sample): - return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False) - ds = ds.map(tokenize, remove_columns=ds.column_names) - -3. Applying Quantization -^^^^^^^^^^^^^^^^^^^^^^^^ - -Now, apply the quantization algorithms: - -.. 
code-block:: python - - from llmcompressor.transformers import oneshot - from llmcompressor.modifiers.quantization import GPTQModifier - from llmcompressor.modifiers.smoothquant import SmoothQuantModifier - - # Configure the quantization algorithms - recipe = [ - SmoothQuantModifier(smoothing_strength=0.8), - GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]), - ] - - # Apply quantization - oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - ) - - # Save the compressed model - SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token" - model.save_pretrained(SAVE_DIR, save_compressed=True) - tokenizer.save_pretrained(SAVE_DIR) - -This process creates a W8A8 model with weights and activations quantized to 8-bit integers. - -4. Evaluating Accuracy -^^^^^^^^^^^^^^^^^^^^^^ - -After quantization, you can load and run the model in vLLM: - -.. code-block:: python - - from vllm import LLM - model = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token") - -To evaluate accuracy, you can use ``lm_eval``: - -.. code-block:: console - - $ lm_eval --model vllm \ - --model_args pretrained="./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token",add_bos_token=true \ - --tasks gsm8k \ - --num_fewshot 5 \ - --limit 250 \ - --batch_size 'auto' - -.. note:: - - Quantized models can be sensitive to the presence of the ``bos`` token. Make sure to include the ``add_bos_token=True`` argument when running evaluations. 
- -Best Practices --------------- - -- Start with 512 samples for calibration data (increase if accuracy drops) -- Use a sequence length of 2048 as a starting point -- Employ the chat template or instruction template that the model was trained with -- If you've fine-tuned a model, consider using a sample of your training data for calibration - -Troubleshooting and Support ---------------------------- - -If you encounter any issues or have feature requests, please open an issue on the ``vllm-project/llm-compressor`` GitHub repository. diff --git a/docs/source/quantization/supported_hardware.rst b/docs/source/quantization/supported_hardware.md similarity index 84% rename from docs/source/quantization/supported_hardware.rst rename to docs/source/quantization/supported_hardware.md index 09f8e7112cf0c..d2160772a24cb 100644 --- a/docs/source/quantization/supported_hardware.rst +++ b/docs/source/quantization/supported_hardware.md @@ -1,132 +1,132 @@ -.. _supported_hardware_for_quantization: - -Supported Hardware for Quantization Kernels -=========================================== - -The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM: - -.. 
list-table:: - :header-rows: 1 - :widths: 20 8 8 8 8 8 8 8 8 8 8 - - * - Implementation - - Volta - - Turing - - Ampere - - Ada - - Hopper - - AMD GPU - - Intel GPU - - x86 CPU - - AWS Inferentia - - Google TPU - * - AWQ - - ✗ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - * - GPTQ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - * - Marlin (GPTQ/AWQ/FP8) - - ✗ - - ✗ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - * - INT8 (W8A8) - - ✗ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✅︎ - - ✗ - - ✗ - * - FP8 (W8A8) - - ✗ - - ✗ - - ✗ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - * - AQLM - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - * - bitsandbytes - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - * - DeepSpeedFP - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - * - GGUF - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - -Notes: -^^^^^^ - -- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. -- "✅︎" indicates that the quantization method is supported on the specified hardware. -- "✗" indicates that the quantization method is not supported on the specified hardware. - -Please note that this compatibility chart may be subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. - -For the most up-to-date information on hardware support and quantization methods, please check the `quantization directory `_ or consult with the vLLM development team. +(supported-hardware-for-quantization)= + +# Supported Hardware for Quantization Kernels + +The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM: + +```{eval-rst} +.. 
list-table:: + :header-rows: 1 + :widths: 20 8 8 8 8 8 8 8 8 8 8 + + * - Implementation + - Volta + - Turing + - Ampere + - Ada + - Hopper + - AMD GPU + - Intel GPU + - x86 CPU + - AWS Inferentia + - Google TPU + * - AWQ + - ✗ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + * - GPTQ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + * - Marlin (GPTQ/AWQ/FP8) + - ✗ + - ✗ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + * - INT8 (W8A8) + - ✗ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✅︎ + - ✗ + - ✗ + * - FP8 (W8A8) + - ✗ + - ✗ + - ✗ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + * - AQLM + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + * - bitsandbytes + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + * - DeepSpeedFP + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + * - GGUF + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ +``` + +## Notes: + +- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. +- "✅︎" indicates that the quantization method is supported on the specified hardware. +- "✗" indicates that the quantization method is not supported on the specified hardware. + +Please note that this compatibility chart may be subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. + +For the most up-to-date information on hardware support and quantization methods, please check the [quantization directory](https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization) or consult with the vLLM development team. 
diff --git a/docs/source/serving/deploying_with_bentoml.md b/docs/source/serving/deploying_with_bentoml.md new file mode 100644 index 0000000000000..dfa0de4f0f6d7 --- /dev/null +++ b/docs/source/serving/deploying_with_bentoml.md @@ -0,0 +1,7 @@ +(deploying-with-bentoml)= + +# Deploying with BentoML + +[BentoML](https://github.com/bentoml/BentoML) allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-complicant image and deploy it on Kubernetes. + +For details, see the tutorial [vLLM inference in the BentoML documentation](https://docs.bentoml.com/en/latest/use-cases/large-language-models/vllm.html). diff --git a/docs/source/serving/deploying_with_bentoml.rst b/docs/source/serving/deploying_with_bentoml.rst deleted file mode 100644 index 4b9d19f5bdb72..0000000000000 --- a/docs/source/serving/deploying_with_bentoml.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. _deploying_with_bentoml: - -Deploying with BentoML -====================== - -`BentoML `_ allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-complicant image and deploy it on Kubernetes. - -For details, see the tutorial `vLLM inference in the BentoML documentation `_. \ No newline at end of file diff --git a/docs/source/serving/deploying_with_cerebrium.md b/docs/source/serving/deploying_with_cerebrium.md new file mode 100644 index 0000000000000..4863936236119 --- /dev/null +++ b/docs/source/serving/deploying_with_cerebrium.md @@ -0,0 +1,109 @@ +(deploying-with-cerebrium)= + +# Deploying with Cerebrium + +```{raw} html +

+ vLLM_plus_cerebrium +

+``` + +vLLM can be run on a cloud based GPU machine with [Cerebrium](https://www.cerebrium.ai/), a serverless AI infrastructure platform that makes it easier for companies to build and deploy AI based applications. + +To install the Cerebrium client, run: + +```console
$ pip install cerebrium
$ cerebrium login
``` + +Next, create your Cerebrium project, run: + +```console
$ cerebrium init vllm-project
``` + +Next, to install the required packages, add the following to your cerebrium.toml: + +```toml
[cerebrium.deployment]
docker_base_image_url = "nvidia/cuda:12.1.1-runtime-ubuntu22.04" + +[cerebrium.dependencies.pip]
vllm = "latest"
``` + +Next, let us add our code to handle inference for the LLM of your choice (`mistralai/Mistral-7B-Instruct-v0.1` for this example), add the following code to your `main.py`: + +```python
from vllm import LLM, SamplingParams + +llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.1") + +def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95): + + sampling_params = SamplingParams(temperature=temperature, top_p=top_p) + outputs = llm.generate(prompts, sampling_params) + + # Print the outputs. + results = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + results.append({"prompt": prompt, "generated_text": generated_text}) + + return {"results": results}
``` + +Then, run the following code to deploy it to the cloud: + +```console
$ cerebrium deploy
``` + +If successful, you should be returned a CURL command that you can call inference against. 
Just remember to end the url with the function name you are calling (in our case /run) + +```python +curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \ + -H 'Content-Type: application/json' \ + -H 'Authorization: ' \ + --data '{ + "prompts": [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is" + ] + }' +``` + +You should get a response like: + +```python +{ + "run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262", + "result": { + "result": [ + { + "prompt": "Hello, my name is", + "generated_text": " Sarah, and I'm a teacher. I teach elementary school students. One of" + }, + { + "prompt": "The president of the United States is", + "generated_text": " elected every four years. This is a democratic system.\n\n5. What" + }, + { + "prompt": "The capital of France is", + "generated_text": " Paris.\n" + }, + { + "prompt": "The future of AI is", + "generated_text": " bright, but it's important to approach it with a balanced and nuanced perspective." + } + ] + }, + "run_time_ms": 152.53663063049316 +} +``` + +You now have an autoscaling endpoint where you only pay for the compute you use! diff --git a/docs/source/serving/deploying_with_cerebrium.rst b/docs/source/serving/deploying_with_cerebrium.rst deleted file mode 100644 index 9585b6ef5cb38..0000000000000 --- a/docs/source/serving/deploying_with_cerebrium.rst +++ /dev/null @@ -1,112 +0,0 @@ -.. _deploying_with_cerebrium: - -Deploying with Cerebrium -============================ - -.. raw:: html - -

- vLLM_plus_cerebrium -

- -vLLM can be run on a cloud based GPU machine with `Cerebrium `__, a serverless AI infrastructure platform that makes it easier for companies to build and deploy AI based applications. - -To install the Cerebrium client, run: - -.. code-block:: console - - $ pip install cerebrium - $ cerebrium login - -Next, create your Cerebrium project, run: - -.. code-block:: console - - $ cerebrium init vllm-project - -Next, to install the required packages, add the following to your cerebrium.toml: - -.. code-block:: toml - - [cerebrium.deployment] - docker_base_image_url = "nvidia/cuda:12.1.1-runtime-ubuntu22.04" - - [cerebrium.dependencies.pip] - vllm = "latest" - -Next, let us add our code to handle inference for the LLM of your choice(`mistralai/Mistral-7B-Instruct-v0.1` for this example), add the following code to your main.py`: - -.. code-block:: python - - from vllm import LLM, SamplingParams - - llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.1") - - def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95): - - sampling_params = SamplingParams(temperature=temperature, top_p=top_p) - outputs = llm.generate(prompts, sampling_params) - - # Print the outputs. - results = [] - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - results.append({"prompt": prompt, "generated_text": generated_text}) - - return {"results": results} - - -Then, run the following code to deploy it to the cloud - -.. code-block:: console - - $ cerebrium deploy - -If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case /run) - -.. 
code-block:: python - - curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \ - -H 'Content-Type: application/json' \ - -H 'Authorization: ' \ - --data '{ - "prompts": [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is" - ] - }' - -You should get a response like: - -.. code-block:: python - - { - "run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262", - "result": { - "result": [ - { - "prompt": "Hello, my name is", - "generated_text": " Sarah, and I'm a teacher. I teach elementary school students. One of" - }, - { - "prompt": "The president of the United States is", - "generated_text": " elected every four years. This is a democratic system.\n\n5. What" - }, - { - "prompt": "The capital of France is", - "generated_text": " Paris.\n" - }, - { - "prompt": "The future of AI is", - "generated_text": " bright, but it's important to approach it with a balanced and nuanced perspective." - } - ] - }, - "run_time_ms": 152.53663063049316 - } - -You now have an autoscaling endpoint where you only pay for the compute you use! - diff --git a/docs/source/serving/deploying_with_docker.md b/docs/source/serving/deploying_with_docker.md new file mode 100644 index 0000000000000..796e8810a444c --- /dev/null +++ b/docs/source/serving/deploying_with_docker.md @@ -0,0 +1,48 @@ +(deploying-with-docker)= + +# Deploying with Docker + +vLLM offers an official Docker image for deployment. +The image can be used to run OpenAI compatible server and is available on Docker Hub as [vllm/vllm-openai](https://hub.docker.com/r/vllm/vllm-openai/tags). 
+ +```console +$ docker run --runtime nvidia --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + --env "HUGGING_FACE_HUB_TOKEN=" \ + -p 8000:8000 \ + --ipc=host \ + vllm/vllm-openai:latest \ + --model mistralai/Mistral-7B-v0.1 +``` + +```{note} +You can either use the `ipc=host` flag or `--shm-size` flag to allow the +container to access the host's shared memory. vLLM uses PyTorch, which uses shared +memory to share data between processes under the hood, particularly for tensor parallel inference. +``` + +You can build and run vLLM from source via the provided [Dockerfile](https://github.com/vllm-project/vllm/blob/main/Dockerfile). To build vLLM: + +```console +$ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2 +``` + +```{note} +By default vLLM will build for all GPU types for widest distribution. If you are just building for the +current GPU type the machine is running on, you can add the argument `--build-arg torch_cuda_arch_list=""` +for vLLM to find the current GPU type and build for that. +``` + +To run vLLM: + +```console +$ docker run --runtime nvidia --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + -p 8000:8000 \ + --env "HUGGING_FACE_HUB_TOKEN=" \ + vllm/vllm-openai +``` + +```{note} +**For \`v0.4.1\` and \`v0.4.2\` only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. `/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with environment variable `VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` . 
+``` diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst deleted file mode 100644 index 14d94b09e9b9c..0000000000000 --- a/docs/source/serving/deploying_with_docker.rst +++ /dev/null @@ -1,53 +0,0 @@ -.. _deploying_with_docker: - -Deploying with Docker -============================ - -vLLM offers an official Docker image for deployment. -The image can be used to run OpenAI compatible server and is available on Docker Hub as `vllm/vllm-openai `_. - -.. code-block:: console - - $ docker run --runtime nvidia --gpus all \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - --env "HUGGING_FACE_HUB_TOKEN=" \ - -p 8000:8000 \ - --ipc=host \ - vllm/vllm-openai:latest \ - --model mistralai/Mistral-7B-v0.1 - - -.. note:: - - You can either use the ``ipc=host`` flag or ``--shm-size`` flag to allow the - container to access the host's shared memory. vLLM uses PyTorch, which uses shared - memory to share data between processes under the hood, particularly for tensor parallel inference. - - -You can build and run vLLM from source via the provided `Dockerfile `_. To build vLLM: - -.. code-block:: console - - $ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2 - - -.. note:: - - By default vLLM will build for all GPU types for widest distribution. If you are just building for the - current GPU type the machine is running on, you can add the argument ``--build-arg torch_cuda_arch_list=""`` - for vLLM to find the current GPU type and build for that. - - -To run vLLM: - -.. code-block:: console - - $ docker run --runtime nvidia --gpus all \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - -p 8000:8000 \ - --env "HUGGING_FACE_HUB_TOKEN=" \ - vllm/vllm-openai - -.. 
note:: - - **For `v0.4.1` and `v0.4.2` only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. ``/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`` is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with environment variable ``VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`` . diff --git a/docs/source/serving/deploying_with_dstack.md b/docs/source/serving/deploying_with_dstack.md new file mode 100644 index 0000000000000..65ef1c0016208 --- /dev/null +++ b/docs/source/serving/deploying_with_dstack.md @@ -0,0 +1,102 @@ +(deploying-with-dstack)= + +# Deploying with dstack + +```{raw} html +

+ vLLM_plus_dstack +

+``` + +vLLM can be run on a cloud based GPU machine with [dstack](https://dstack.ai/), an open-source framework for running LLMs on any cloud. This tutorial assumes that you have already configured credentials, gateway, and GPU quotas on your cloud environment. + +To install dstack client, run: + +```console
$ pip install "dstack[all]"
$ dstack server
``` + +Next, to configure your dstack project, run: + +```console
$ mkdir -p vllm-dstack
$ cd vllm-dstack
$ dstack init
``` + +Next, to provision a VM instance with LLM of your choice (`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`: + +```yaml
type: service + +python: "3.11"
env: + - MODEL=NousResearch/Llama-2-7b-chat-hf
port: 8000
resources: + gpu: 24GB
commands: + - pip install vllm + - vllm serve $MODEL --port 8000
model: + format: openai + type: chat + name: NousResearch/Llama-2-7b-chat-hf
``` + +Then, run the following CLI for provisioning: + +```console
$ dstack run . -f serve.dstack.yml + +⠸ Getting run plan... + Configuration serve.dstack.yml + Project deep-diver-main + User deep-diver + Min resources 2..xCPU, 8GB.., 1xGPU (24GB) + Max price - + Max duration - + Spot policy auto + Retry policy no + + # BACKEND REGION INSTANCE RESOURCES SPOT PRICE + 1 gcp us-central1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804 + 2 gcp us-east1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804 + 3 gcp us-west1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804 + ... + Shown 3 of 193 offers, $5.876 max + +Continue? [y/n]: y +⠙ Submitting run... +⠏ Launching spicy-treefrog-1 (pulling) +spicy-treefrog-1 provisioning completed (running) +Service is published at ... 
+``` + +After the provisioning, you can interact with the model by using the OpenAI SDK: + +```python +from openai import OpenAI + +client = OpenAI( + base_url="https://gateway.", + api_key="" +) + +completion = client.chat.completions.create( + model="NousResearch/Llama-2-7b-chat-hf", + messages=[ + { + "role": "user", + "content": "Compose a poem that explains the concept of recursion in programming.", + } + ] +) + +print(completion.choices[0].message.content) +``` + +```{note} +dstack automatically handles authentication on the gateway using dstack's tokens. Meanwhile, if you don't want to configure a gateway, you can provision dstack `Task` instead of `Service`. The `Task` is for development purpose only. If you want to know more about hands-on materials how to serve vLLM using dstack, check out [this repository](https://github.com/dstackai/dstack-examples/tree/main/deployment/vllm) +``` diff --git a/docs/source/serving/deploying_with_dstack.rst b/docs/source/serving/deploying_with_dstack.rst deleted file mode 100644 index e1eb45b225d9c..0000000000000 --- a/docs/source/serving/deploying_with_dstack.rst +++ /dev/null @@ -1,103 +0,0 @@ -.. _deploying_with_dstack: - -Deploying with dstack -============================ - -.. raw:: html - -

- vLLM_plus_dstack -

- -vLLM can be run on a cloud based GPU machine with `dstack `__, an open-source framework for running LLMs on any cloud. This tutorial assumes that you have already configured credentials, gateway, and GPU quotas on your cloud environment. - -To install dstack client, run: - -.. code-block:: console - - $ pip install "dstack[all] - $ dstack server - -Next, to configure your dstack project, run: - -.. code-block:: console - - $ mkdir -p vllm-dstack - $ cd vllm-dstack - $ dstack init - -Next, to provision a VM instance with LLM of your choice(`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`: - -.. code-block:: yaml - - type: service - - python: "3.11" - env: - - MODEL=NousResearch/Llama-2-7b-chat-hf - port: 8000 - resources: - gpu: 24GB - commands: - - pip install vllm - - vllm serve $MODEL --port 8000 - model: - format: openai - type: chat - name: NousResearch/Llama-2-7b-chat-hf - -Then, run the following CLI for provisioning: - -.. code-block:: console - - $ dstack run . -f serve.dstack.yml - - ⠸ Getting run plan... - Configuration serve.dstack.yml - Project deep-diver-main - User deep-diver - Min resources 2..xCPU, 8GB.., 1xGPU (24GB) - Max price - - Max duration - - Spot policy auto - Retry policy no - - # BACKEND REGION INSTANCE RESOURCES SPOT PRICE - 1 gcp us-central1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804 - 2 gcp us-east1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804 - 3 gcp us-west1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804 - ... - Shown 3 of 193 offers, $5.876 max - - Continue? [y/n]: y - ⠙ Submitting run... - ⠏ Launching spicy-treefrog-1 (pulling) - spicy-treefrog-1 provisioning completed (running) - Service is published at ... - -After the provisioning, you can interact with the model by using the OpenAI SDK: - -.. 
code-block:: python - - from openai import OpenAI - - client = OpenAI( - base_url="https://gateway.", - api_key="" - ) - - completion = client.chat.completions.create( - model="NousResearch/Llama-2-7b-chat-hf", - messages=[ - { - "role": "user", - "content": "Compose a poem that explains the concept of recursion in programming.", - } - ] - ) - - print(completion.choices[0].message.content) - -.. note:: - - dstack automatically handles authentication on the gateway using dstack's tokens. Meanwhile, if you don't want to configure a gateway, you can provision dstack `Task` instead of `Service`. The `Task` is for development purpose only. If you want to know more about hands-on materials how to serve vLLM using dstack, check out `this repository `__ diff --git a/docs/source/serving/deploying_with_helm.rst b/docs/source/serving/deploying_with_helm.md similarity index 88% rename from docs/source/serving/deploying_with_helm.rst rename to docs/source/serving/deploying_with_helm.md index 21b17e881b945..d10e7a344c994 100644 --- a/docs/source/serving/deploying_with_helm.rst +++ b/docs/source/serving/deploying_with_helm.md @@ -1,7 +1,6 @@ -.. _deploying_with_helm: +(deploying-with-helm)= -Deploying with Helm -=================== +# Deploying with Helm A Helm chart to deploy vLLM for Kubernetes @@ -9,44 +8,42 @@ Helm is a package manager for Kubernetes. It will help you to deploy vLLM on k8s This guide will walk you through the process of deploying vLLM with Helm, including the necessary prerequisites, steps for helm install and documentation on architecture and values file. 
-Prerequisites -------------- +## Prerequisites + Before you begin, ensure that you have the following: - A running Kubernetes cluster -- NVIDIA Kubernetes Device Plugin (``k8s-device-plugin``): This can be found at `https://github.com/NVIDIA/k8s-device-plugin `__ +- NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at [https://github.com/NVIDIA/k8s-device-plugin](https://github.com/NVIDIA/k8s-device-plugin) - Available GPU resources in your cluster - S3 with the model which will be deployed -Installing the chart --------------------- - -To install the chart with the release name ``test-vllm``: - -.. code-block:: console +## Installing the chart - helm upgrade --install --create-namespace --namespace=ns-vllm test-vllm . -f values.yaml --set secrets.s3endpoint=$ACCESS_POINT --set secrets.s3buckername=$BUCKET --set secrets.s3accesskeyid=$ACCESS_KEY --set secrets.s3accesskey=$SECRET_KEY +To install the chart with the release name `test-vllm`: -Uninstalling the Chart ----------------------- +```console +helm upgrade --install --create-namespace --namespace=ns-vllm test-vllm . -f values.yaml --set secrets.s3endpoint=$ACCESS_POINT --set secrets.s3buckername=$BUCKET --set secrets.s3accesskeyid=$ACCESS_KEY --set secrets.s3accesskey=$SECRET_KEY +``` -To uninstall the ``test-vllm`` deployment: +## Uninstalling the Chart -.. code-block:: console +To uninstall the `test-vllm` deployment: - helm uninstall test-vllm --namespace=ns-vllm +```console +helm uninstall test-vllm --namespace=ns-vllm +``` The command removes all the Kubernetes components associated with the chart **including persistent volumes** and deletes the release. -Architecture ------------- +## Architecture -.. image:: architecture_helm_deployment.png +```{image} architecture_helm_deployment.png +``` -Values ------- +## Values +```{eval-rst} .. 
list-table:: Values :widths: 25 25 25 25 :header-rows: 1 @@ -251,3 +248,4 @@ Values - string - test - Release name +``` diff --git a/docs/source/serving/deploying_with_k8s.md b/docs/source/serving/deploying_with_k8s.md new file mode 100644 index 0000000000000..e52ba463b3777 --- /dev/null +++ b/docs/source/serving/deploying_with_k8s.md @@ -0,0 +1,171 @@ +(deploying-with-k8s)= + +# Deploying with Kubernetes + +Using Kubernetes to deploy vLLM is a scalable and efficient way to serve machine learning models. This guide will walk you through the process of deploying vLLM with Kubernetes, including the necessary prerequisites, steps for deployment, and testing. + +## Prerequisites + +Before you begin, ensure that you have the following: + +- A running Kubernetes cluster +- NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at `https://github.com/NVIDIA/k8s-device-plugin/` +- Available GPU resources in your cluster + +## Deployment Steps + +1. **Create a PVC , Secret and Deployment for vLLM** + +PVC is used to store the model cache and it is optional, you can use hostPath or other storage options + +```yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: mistral-7b + namespace: default +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi + storageClassName: default + volumeMode: Filesystem +``` + +Secret is optional and only required for accessing gated models, you can skip this step if you are not using gated models + +```yaml +apiVersion: v1 +kind: Secret +metadata: + name: hf-token-secret + namespace: default +type: Opaque +data: + token: "REPLACE_WITH_TOKEN" +``` + +Create a deployment file for vLLM to run the model server. 
The following example deploys the `Mistral-7B-Instruct-v0.3` model: + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: mistral-7b + namespace: default + labels: + app: mistral-7b +spec: + replicas: 1 + selector: + matchLabels: + app: mistral-7b + template: + metadata: + labels: + app: mistral-7b + spec: + volumes: + - name: cache-volume + persistentVolumeClaim: + claimName: mistral-7b + # vLLM needs to access the host's shared memory for tensor parallel inference. + - name: shm + emptyDir: + medium: Memory + sizeLimit: "2Gi" + containers: + - name: mistral-7b + image: vllm/vllm-openai:latest + command: ["/bin/sh", "-c"] + args: [ + "vllm serve mistralai/Mistral-7B-Instruct-v0.3 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024" + ] + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + ports: + - containerPort: 8000 + resources: + limits: + cpu: "10" + memory: 20G + nvidia.com/gpu: "1" + requests: + cpu: "2" + memory: 6G + nvidia.com/gpu: "1" + volumeMounts: + - mountPath: /root/.cache/huggingface + name: cache-volume + - name: shm + mountPath: /dev/shm + livenessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 60 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 60 + periodSeconds: 5 +``` + +2. **Create a Kubernetes Service for vLLM** + +Next, create a Kubernetes Service file to expose the `mistral-7b` deployment: + +```yaml +apiVersion: v1 +kind: Service +metadata: + name: mistral-7b + namespace: default +spec: + ports: + - name: http-mistral-7b + port: 80 + protocol: TCP + targetPort: 8000 + # The label selector should match the deployment labels & it is useful for prefix caching feature + selector: + app: mistral-7b + sessionAffinity: None + type: ClusterIP +``` + +3. 
**Deploy and Test** + +Apply the deployment and service configurations using `kubectl apply -f `: + +```console +kubectl apply -f deployment.yaml +kubectl apply -f service.yaml +``` + +To test the deployment, run the following `curl` command: + +```console +curl http://mistral-7b.default.svc.cluster.local/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "facebook/opt-125m", + "prompt": "San Francisco is a", + "max_tokens": 7, + "temperature": 0 + }' +``` + +If the service is correctly deployed, you should receive a response from the vLLM model. + +## Conclusion + +Deploying vLLM with Kubernetes allows for efficient scaling and management of ML models leveraging GPU resources. By following the steps outlined above, you should be able to set up and test a vLLM deployment within your Kubernetes cluster. If you encounter any issues or have suggestions, please feel free to contribute to the documentation. diff --git a/docs/source/serving/deploying_with_k8s.rst b/docs/source/serving/deploying_with_k8s.rst deleted file mode 100644 index 7dc076dc709df..0000000000000 --- a/docs/source/serving/deploying_with_k8s.rst +++ /dev/null @@ -1,175 +0,0 @@ -.. _deploying_with_k8s: - -Deploying with Kubernetes -========================== - -Using Kubernetes to deploy vLLM is a scalable and efficient way to serve machine learning models. This guide will walk you through the process of deploying vLLM with Kubernetes, including the necessary prerequisites, steps for deployment, and testing. - -Prerequisites -------------- -Before you begin, ensure that you have the following: - -- A running Kubernetes cluster -- NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at `https://github.com/NVIDIA/k8s-device-plugin/` -- Available GPU resources in your cluster - -Deployment Steps ----------------- - -1. 
**Create a PVC , Secret and Deployment for vLLM** - - -PVC is used to store the model cache and it is optional, you can use hostPath or other storage options - -.. code-block:: yaml - - apiVersion: v1 - kind: PersistentVolumeClaim - metadata: - name: mistral-7b - namespace: default - spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 50Gi - storageClassName: default - volumeMode: Filesystem - -Secret is optional and only required for accessing gated models, you can skip this step if you are not using gated models - -.. code-block:: yaml - - apiVersion: v1 - kind: Secret - metadata: - name: hf-token-secret - namespace: default - type: Opaque - data: - token: "REPLACE_WITH_TOKEN" - - -Create a deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model: - -.. code-block:: yaml - - apiVersion: apps/v1 - kind: Deployment - metadata: - name: mistral-7b - namespace: default - labels: - app: mistral-7b - spec: - replicas: 1 - selector: - matchLabels: - app: mistral-7b - template: - metadata: - labels: - app: mistral-7b - spec: - volumes: - - name: cache-volume - persistentVolumeClaim: - claimName: mistral-7b - # vLLM needs to access the host's shared memory for tensor parallel inference. 
- - name: shm - emptyDir: - medium: Memory - sizeLimit: "2Gi" - containers: - - name: mistral-7b - image: vllm/vllm-openai:latest - command: ["/bin/sh", "-c"] - args: [ - "vllm serve mistralai/Mistral-7B-Instruct-v0.3 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024" - ] - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - ports: - - containerPort: 8000 - resources: - limits: - cpu: "10" - memory: 20G - nvidia.com/gpu: "1" - requests: - cpu: "2" - memory: 6G - nvidia.com/gpu: "1" - volumeMounts: - - mountPath: /root/.cache/huggingface - name: cache-volume - - name: shm - mountPath: /dev/shm - livenessProbe: - httpGet: - path: /health - port: 8000 - initialDelaySeconds: 60 - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 8000 - initialDelaySeconds: 60 - periodSeconds: 5 - -2. **Create a Kubernetes Service for vLLM** - -Next, create a Kubernetes Service file to expose the `mistral-7b` deployment: - -.. code-block:: yaml - - apiVersion: v1 - kind: Service - metadata: - name: mistral-7b - namespace: default - spec: - ports: - - name: http-mistral-7b - port: 80 - protocol: TCP - targetPort: 8000 - # The label selector should match the deployment labels & it is useful for prefix caching feature - selector: - app: mistral-7b - sessionAffinity: None - type: ClusterIP - -3. **Deploy and Test** - -Apply the deployment and service configurations using ``kubectl apply -f ``: - -.. code-block:: console - - kubectl apply -f deployment.yaml - kubectl apply -f service.yaml - -To test the deployment, run the following ``curl`` command: - -.. code-block:: console - - curl http://mistral-7b.default.svc.cluster.local/v1/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "facebook/opt-125m", - "prompt": "San Francisco is a", - "max_tokens": 7, - "temperature": 0 - }' - -If the service is correctly deployed, you should receive a response from the vLLM model. 
- -Conclusion ----------- -Deploying vLLM with Kubernetes allows for efficient scaling and management of ML models leveraging GPU resources. By following the steps outlined above, you should be able to set up and test a vLLM deployment within your Kubernetes cluster. If you encounter any issues or have suggestions, please feel free to contribute to the documentation. \ No newline at end of file diff --git a/docs/source/serving/deploying_with_kserve.md b/docs/source/serving/deploying_with_kserve.md new file mode 100644 index 0000000000000..feaeb5d0ec8a2 --- /dev/null +++ b/docs/source/serving/deploying_with_kserve.md @@ -0,0 +1,7 @@ +(deploying-with-kserve)= + +# Deploying with KServe + +vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving. + +Please see [this guide](https://kserve.github.io/website/latest/modelserving/v1beta1/llm/huggingface/) for more details on using vLLM with KServe. diff --git a/docs/source/serving/deploying_with_kserve.rst b/docs/source/serving/deploying_with_kserve.rst deleted file mode 100644 index 01d7ccc6e9300..0000000000000 --- a/docs/source/serving/deploying_with_kserve.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. _deploying_with_kserve: - -Deploying with KServe -============================ - -vLLM can be deployed with `KServe `_ on Kubernetes for highly scalable distributed model serving. - -Please see `this guide `_ for more details on using vLLM with KServe. diff --git a/docs/source/serving/deploying_with_kubeai.md b/docs/source/serving/deploying_with_kubeai.md new file mode 100644 index 0000000000000..3609d7e05acd3 --- /dev/null +++ b/docs/source/serving/deploying_with_kubeai.md @@ -0,0 +1,15 @@ +(deploying-with-kubeai)= + +# Deploying with KubeAI + +[KubeAI](https://github.com/substratusai/kubeai) is a Kubernetes operator that enables you to deploy and manage AI models on Kubernetes. It provides a simple and scalable way to deploy vLLM in production. 
Functionality such as scale-from-zero, load based autoscaling, model caching, and much more is provided out of the box with zero external dependencies. + +Please see the Installation Guides for environment specific instructions: + +- [Any Kubernetes Cluster](https://www.kubeai.org/installation/any/) +- [EKS](https://www.kubeai.org/installation/eks/) +- [GKE](https://www.kubeai.org/installation/gke/) + +Once you have KubeAI installed, you can +[configure text generation models](https://www.kubeai.org/how-to/configure-text-generation-models/) +using vLLM. diff --git a/docs/source/serving/deploying_with_kubeai.rst b/docs/source/serving/deploying_with_kubeai.rst deleted file mode 100644 index ec3c065320fd9..0000000000000 --- a/docs/source/serving/deploying_with_kubeai.rst +++ /dev/null @@ -1,17 +0,0 @@ -.. _deploying_with_kubeai: - -Deploying with KubeAI -===================== - -`KubeAI `_ is a Kubernetes operator that enables you to deploy and manage AI models on Kubernetes. It provides a simple and scalable way to deploy vLLM in production. Functionality such as scale-from-zero, load based autoscaling, model caching, and much more is provided out of the box with zero external dependencies. - - -Please see the Installation Guides for environment specific instructions: - -* `Any Kubernetes Cluster `_ -* `EKS `_ -* `GKE `_ - -Once you have KubeAI installed, you can -`configure text generation models `_ -using vLLM. \ No newline at end of file diff --git a/docs/source/serving/deploying_with_lws.md b/docs/source/serving/deploying_with_lws.md new file mode 100644 index 0000000000000..22bab419eaca3 --- /dev/null +++ b/docs/source/serving/deploying_with_lws.md @@ -0,0 +1,11 @@ +(deploying-with-lws)= + +# Deploying with LWS + +LeaderWorkerSet (LWS) is a Kubernetes API that aims to address common deployment patterns of AI/ML inference workloads. +A major use case is for multi-host/multi-node distributed inference. 
+ +vLLM can be deployed with [LWS](https://github.com/kubernetes-sigs/lws) on Kubernetes for distributed model serving. + +Please see [this guide](https://github.com/kubernetes-sigs/lws/tree/main/docs/examples/vllm) for more details on +deploying vLLM on Kubernetes using LWS. diff --git a/docs/source/serving/deploying_with_lws.rst b/docs/source/serving/deploying_with_lws.rst deleted file mode 100644 index b63a432dde0d5..0000000000000 --- a/docs/source/serving/deploying_with_lws.rst +++ /dev/null @@ -1,12 +0,0 @@ -.. _deploying_with_lws: - -Deploying with LWS -============================ - -LeaderWorkerSet (LWS) is a Kubernetes API that aims to address common deployment patterns of AI/ML inference workloads. -A major use case is for multi-host/multi-node distributed inference. - -vLLM can be deployed with `LWS `_ on Kubernetes for distributed model serving. - -Please see `this guide `_ for more details on -deploying vLLM on Kubernetes using LWS. diff --git a/docs/source/serving/deploying_with_nginx.md b/docs/source/serving/deploying_with_nginx.md new file mode 100644 index 0000000000000..e06182bf4a32b --- /dev/null +++ b/docs/source/serving/deploying_with_nginx.md @@ -0,0 +1,133 @@ +(nginxloadbalancer)= + +# Deploying with Nginx Loadbalancer + +This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers. + +Table of contents: + +1. {ref}`Build Nginx Container <nginxloadbalancer-nginx-build>` +2. {ref}`Create Simple Nginx Config file <nginxloadbalancer-nginx-conf>` +3. {ref}`Build vLLM Container <nginxloadbalancer-nginx-vllm-container>` +4. {ref}`Create Docker Network <nginxloadbalancer-nginx-docker-network>` +5. {ref}`Launch vLLM Containers <nginxloadbalancer-nginx-launch-container>` +6. {ref}`Launch Nginx <nginxloadbalancer-nginx-launch-nginx>` +7. {ref}`Verify That vLLM Servers Are Ready <nginxloadbalancer-nginx-verify-nginx>` + +(nginxloadbalancer-nginx-build)= + +## Build Nginx Container + +This guide assumes that you have just cloned the vLLM project and you're currently in the vllm root directory.
+ +```console +export vllm_root=`pwd` +``` + +Create a file named `Dockerfile.nginx`: + +```console +FROM nginx:latest +RUN rm /etc/nginx/conf.d/default.conf +EXPOSE 80 +CMD ["nginx", "-g", "daemon off;"] +``` + +Build the container: + +```console +docker build . -f Dockerfile.nginx --tag nginx-lb +``` + +(nginxloadbalancer-nginx-conf)= + +## Create Simple Nginx Config file + +Create a file named `nginx_conf/nginx.conf`. Note that you can add as many servers as you'd like. In the below example we'll start with two. To add more, add another `server vllmN:8000 max_fails=3 fail_timeout=10000s;` entry to `upstream backend`. + +```console +upstream backend { + least_conn; + server vllm0:8000 max_fails=3 fail_timeout=10000s; + server vllm1:8000 max_fails=3 fail_timeout=10000s; +} +server { + listen 80; + location / { + proxy_pass http://backend; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } +} +``` + +(nginxloadbalancer-nginx-vllm-container)= + +## Build vLLM Container + +```console +cd $vllm_root +docker build -f Dockerfile . --tag vllm +``` + +If you are behind proxy, you can pass the proxy settings to the docker build command as shown below: + +```console +cd $vllm_root +docker build -f Dockerfile . --tag vllm --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy +``` + +(nginxloadbalancer-nginx-docker-network)= + +## Create Docker Network + +```console +docker network create vllm_nginx +``` + +(nginxloadbalancer-nginx-launch-container)= + +## Launch vLLM Containers + +Notes: + +- If you have your HuggingFace models cached somewhere else, update `hf_cache_dir` below. +- If you don't have an existing HuggingFace cache you will want to start `vllm0` and wait for the model to complete downloading and the server to be ready. 
This will ensure that `vllm1` can leverage the model you just downloaded and it won't have to be downloaded again. +- The below example assumes GPU backend used. If you are using CPU backend, remove `--gpus all`, add `VLLM_CPU_KVCACHE_SPACE` and `VLLM_CPU_OMP_THREADS_BIND` environment variables to the docker run command. +- Adjust the model name that you want to use in your vLLM servers if you don't want to use `Llama-2-7b-chat-hf`. + +```console +mkdir -p ~/.cache/huggingface/hub/ +hf_cache_dir=~/.cache/huggingface/ +docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8081:8000 --name vllm0 vllm --model meta-llama/Llama-2-7b-chat-hf +docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8082:8000 --name vllm1 vllm --model meta-llama/Llama-2-7b-chat-hf +``` + +```{note} +If you are behind proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`. +``` + +(nginxloadbalancer-nginx-launch-nginx)= + +## Launch Nginx + +```console +docker run -itd -p 8000:80 --network vllm_nginx -v ./nginx_conf/:/etc/nginx/conf.d/ --name nginx-lb nginx-lb:latest +``` + +(nginxloadbalancer-nginx-verify-nginx)= + +## Verify That vLLM Servers Are Ready + +```console +docker logs vllm0 | grep Uvicorn +docker logs vllm1 | grep Uvicorn +``` + +Both outputs should look like this: + +```console +INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) +``` diff --git a/docs/source/serving/deploying_with_nginx.rst b/docs/source/serving/deploying_with_nginx.rst deleted file mode 100644 index b5dff02b6bae6..0000000000000 --- a/docs/source/serving/deploying_with_nginx.rst +++ /dev/null @@ -1,142 +0,0 @@ -.. 
_nginxloadbalancer: - -Deploying with Nginx Loadbalancer -================================= - -This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers. - -Table of contents: - -#. :ref:`Build Nginx Container ` -#. :ref:`Create Simple Nginx Config file ` -#. :ref:`Build vLLM Container ` -#. :ref:`Create Docker Network ` -#. :ref:`Launch vLLM Containers ` -#. :ref:`Launch Nginx ` -#. :ref:`Verify That vLLM Servers Are Ready ` - -.. _nginxloadbalancer_nginx_build: - -Build Nginx Container ---------------------- - -This guide assumes that you have just cloned the vLLM project and you're currently in the vllm root directory. - -.. code-block:: console - - export vllm_root=`pwd` - -Create a file named ``Dockerfile.nginx``: - -.. code-block:: console - - FROM nginx:latest - RUN rm /etc/nginx/conf.d/default.conf - EXPOSE 80 - CMD ["nginx", "-g", "daemon off;"] - -Build the container: - -.. code-block:: console - - docker build . -f Dockerfile.nginx --tag nginx-lb - -.. _nginxloadbalancer_nginx_conf: - -Create Simple Nginx Config file -------------------------------- - -Create a file named ``nginx_conf/nginx.conf``. Note that you can add as many servers as you'd like. In the below example we'll start with two. To add more, add another ``server vllmN:8000 max_fails=3 fail_timeout=10000s;`` entry to ``upstream backend``. - -.. code-block:: console - - upstream backend { - least_conn; - server vllm0:8000 max_fails=3 fail_timeout=10000s; - server vllm1:8000 max_fails=3 fail_timeout=10000s; - } - server { - listen 80; - location / { - proxy_pass http://backend; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - } - } - -.. _nginxloadbalancer_nginx_vllm_container: - -Build vLLM Container --------------------- - -.. 
code-block:: console - - cd $vllm_root - docker build -f Dockerfile . --tag vllm - - -If you are behind proxy, you can pass the proxy settings to the docker build command as shown below: - -.. code-block:: console - - cd $vllm_root - docker build -f Dockerfile . --tag vllm --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy - -.. _nginxloadbalancer_nginx_docker_network: - -Create Docker Network ---------------------- - -.. code-block:: console - - docker network create vllm_nginx - - -.. _nginxloadbalancer_nginx_launch_container: - -Launch vLLM Containers ----------------------- - -Notes: - -* If you have your HuggingFace models cached somewhere else, update ``hf_cache_dir`` below. -* If you don't have an existing HuggingFace cache you will want to start ``vllm0`` and wait for the model to complete downloading and the server to be ready. This will ensure that ``vllm1`` can leverage the model you just downloaded and it won't have to be downloaded again. -* The below example assumes GPU backend used. If you are using CPU backend, remove ``--gpus all``, add ``VLLM_CPU_KVCACHE_SPACE`` and ``VLLM_CPU_OMP_THREADS_BIND`` environment variables to the docker run command. -* Adjust the model name that you want to use in your vLLM servers if you don't want to use ``Llama-2-7b-chat-hf``. - -.. code-block:: console - - mkdir -p ~/.cache/huggingface/hub/ - hf_cache_dir=~/.cache/huggingface/ - docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8081:8000 --name vllm0 vllm --model meta-llama/Llama-2-7b-chat-hf - docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8082:8000 --name vllm1 vllm --model meta-llama/Llama-2-7b-chat-hf - -.. note:: - If you are behind proxy, you can pass the proxy settings to the docker run command via ``-e http_proxy=$http_proxy -e https_proxy=$https_proxy``. - -.. 
_nginxloadbalancer_nginx_launch_nginx: - -Launch Nginx ------------- - -.. code-block:: console - - docker run -itd -p 8000:80 --network vllm_nginx -v ./nginx_conf/:/etc/nginx/conf.d/ --name nginx-lb nginx-lb:latest - -.. _nginxloadbalancer_nginx_verify_nginx: - -Verify That vLLM Servers Are Ready ----------------------------------- - -.. code-block:: console - - docker logs vllm0 | grep Uvicorn - docker logs vllm1 | grep Uvicorn - -Both outputs should look like this: - -.. code-block:: console - - INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) diff --git a/docs/source/serving/deploying_with_triton.md b/docs/source/serving/deploying_with_triton.md new file mode 100644 index 0000000000000..9b0a6f1d54ae8 --- /dev/null +++ b/docs/source/serving/deploying_with_triton.md @@ -0,0 +1,5 @@ +(deploying-with-triton)= + +# Deploying with NVIDIA Triton + +The [Triton Inference Server](https://github.com/triton-inference-server) hosts a tutorial demonstrating how to quickly deploy a simple [facebook/opt-125m](https://huggingface.co/facebook/opt-125m) model using vLLM. Please see [Deploying a vLLM model in Triton](https://github.com/triton-inference-server/tutorials/blob/main/Quick_Deploy/vLLM/README.md#deploying-a-vllm-model-in-triton) for more details. diff --git a/docs/source/serving/deploying_with_triton.rst b/docs/source/serving/deploying_with_triton.rst deleted file mode 100644 index 5ce7c3d03dd2d..0000000000000 --- a/docs/source/serving/deploying_with_triton.rst +++ /dev/null @@ -1,6 +0,0 @@ -.. _deploying_with_triton: - -Deploying with NVIDIA Triton -============================ - -The `Triton Inference Server `_ hosts a tutorial demonstrating how to quickly deploy a simple `facebook/opt-125m `_ model using vLLM. Please see `Deploying a vLLM model in Triton `_ for more details. 
diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md new file mode 100644 index 0000000000000..5ab70a9aaaba6 --- /dev/null +++ b/docs/source/serving/distributed_serving.md @@ -0,0 +1,105 @@ +(distributed-serving)= + +# Distributed Inference and Serving + +## How to decide the distributed inference strategy? + +Before going into the details of distributed inference and serving, let's first make it clear when to use distributed inference and what are the strategies available. The common practice is: + +- **Single GPU (no distributed inference)**: If your model fits in a single GPU, you probably don't need to use distributed inference. Just use the single GPU to run the inference. +- **Single-Node Multi-GPU (tensor parallel inference)**: If your model is too large to fit in a single GPU, but it can fit in a single node with multiple GPUs, you can use tensor parallelism. The tensor parallel size is the number of GPUs you want to use. For example, if you have 4 GPUs in a single node, you can set the tensor parallel size to 4. +- **Multi-Node Multi-GPU (tensor parallel plus pipeline parallel inference)**: If your model is too large to fit in a single node, you can use tensor parallel together with pipeline parallelism. The tensor parallel size is the number of GPUs you want to use in each node, and the pipeline parallel size is the number of nodes you want to use. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2. + +In short, you should increase the number of GPUs and the number of nodes until you have enough GPU memory to hold the model. The tensor parallel size should be the number of GPUs in each node, and the pipeline parallel size should be the number of nodes. + +After adding enough GPUs and nodes to hold the model, you can run vLLM first, which will print some logs like `# GPU blocks: 790`. 
Multiply the number by `16` (the block size), and you can get roughly the maximum number of tokens that can be served on the current configuration. If this number is not satisfying, e.g. you want higher throughput, you can further increase the number of GPUs or nodes, until the number of blocks is enough. + +```{note} +There is one edge case: if the model fits in a single node with multiple GPUs, but the number of GPUs cannot divide the model size evenly, you can use pipeline parallelism, which splits the model along layers and supports uneven splits. In this case, the tensor parallel size should be 1 and the pipeline parallel size should be the number of GPUs. +``` + +## Details for Distributed Inference and Serving + +vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support [Megatron-LM's tensor parallel algorithm](https://arxiv.org/pdf/1909.08053.pdf). We manage the distributed runtime with either [Ray](https://github.com/ray-project/ray) or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inferencing currently requires Ray. + +Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured {code}`tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the {code}`LLM` class {code}`distributed-executor-backend` argument or {code}`--distributed-executor-backend` API server argument. Set it to {code}`mp` for multiprocessing or {code}`ray` for Ray. It's not required for Ray to be installed for the multiprocessing case. + +To run multi-GPU inference with the {code}`LLM` class, set the {code}`tensor_parallel_size` argument to the number of GPUs you want to use. 
For example, to run inference on 4 GPUs: + +```python +from vllm import LLM +llm = LLM("facebook/opt-13b", tensor_parallel_size=4) +output = llm.generate("San Francisco is a") +``` + +To run multi-GPU serving, pass in the {code}`--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs: + +```console +$ vllm serve facebook/opt-13b \ +$ --tensor-parallel-size 4 +``` + +You can also additionally specify {code}`--pipeline-parallel-size` to enable pipeline parallelism. For example, to run API server on 8 GPUs with pipeline parallelism and tensor parallelism: + +```console +$ vllm serve gpt2 \ +$ --tensor-parallel-size 4 \ +$ --pipeline-parallel-size 2 +``` + +## Multi-Node Inference and Serving + +If a single node does not have enough GPUs to hold the model, you can run the model using multiple nodes. It is important to make sure the execution environment is the same on all nodes, including the model path, the Python environment. The recommended way is to use docker images to ensure the same environment, and hide the heterogeneity of the host machines via mapping them into the same docker configuration. + +The first step, is to start containers and organize them into a cluster. We have provided a helper [script](https://github.com/vllm-project/vllm/tree/main/examples/run_cluster.sh) to start the cluster. + +Pick a node as the head node, and run the following command: + +```console +$ bash run_cluster.sh \ +$ vllm/vllm-openai \ +$ ip_of_head_node \ +$ --head \ +$ /path/to/the/huggingface/home/in/this/node +``` + +On the rest of the worker nodes, run the following command: + +```console +$ bash run_cluster.sh \ +$ vllm/vllm-openai \ +$ ip_of_head_node \ +$ --worker \ +$ /path/to/the/huggingface/home/in/this/node +``` + +Then you get a ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster.
In addition, please note that the argument `ip_of_head_node` should be the IP address of the head node, which is accessible by all the worker nodes. A common misunderstanding is to use the IP address of the worker node, which is not correct. + +Then, on any node, use `docker exec -it node /bin/bash` to enter the container, execute `ray status` to check the status of the Ray cluster. You should see the right number of nodes and GPUs. + +After that, on any node, you can use vLLM as usual, just as you have all the GPUs on one node. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2: + +```console +$ vllm serve /path/to/the/model/in/the/container \ +$ --tensor-parallel-size 8 \ +$ --pipeline-parallel-size 2 +``` + +You can also use tensor parallel without pipeline parallel, just set the tensor parallel size to the number of GPUs in the cluster. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 16: + +```console +$ vllm serve /path/to/the/model/in/the/container \ +$ --tensor-parallel-size 16 +``` + +To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the `run_cluster.sh` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with `NCCL_DEBUG=TRACE` environment variable set, e.g. `NCCL_DEBUG=TRACE vllm serve ...` and check the logs for the NCCL version and the network used. 
If you find `[send] via NET/Socket` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find `[send] via NET/IB/GDRDMA` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient. + +```{warning} +After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the [sanity check script](https://docs.vllm.ai/en/latest/getting_started/debugging.html) for more information. If you need to set some environment variables for the communication configuration, you can append them to the `run_cluster.sh` script, e.g. `-e NCCL_SOCKET_IFNAME=eth0`. Note that setting environment variables in the shell (e.g. `NCCL_SOCKET_IFNAME=eth0 vllm serve ...`) only works for the processes in the same node, not for the processes in the other nodes. Setting environment variables when you create the cluster is the recommended way. See the [discussion](https://github.com/vllm-project/vllm/issues/6803) for more information. +``` + +```{warning} +Please make sure you downloaded the model to all the nodes (with the same path), or the model is downloaded to some distributed file system that is accessible by all nodes. + +When you use huggingface repo id to refer to the model, you should append your huggingface token to the `run_cluster.sh` script, e.g. `-e HF_TOKEN=`. The recommended way is to download the model first, and then use the path to refer to the model. +``` diff --git a/docs/source/serving/distributed_serving.rst b/docs/source/serving/distributed_serving.rst deleted file mode 100644 index 4d57206e53a05..0000000000000 --- a/docs/source/serving/distributed_serving.rst +++ /dev/null @@ -1,107 +0,0 @@ -.. _distributed_serving: - -Distributed Inference and Serving -================================= - -How to decide the distributed inference strategy? 
-------------------------------------------------- - -Before going into the details of distributed inference and serving, let's first make it clear when to use distributed inference and what are the strategies available. The common practice is: - -- **Single GPU (no distributed inference)**: If your model fits in a single GPU, you probably don't need to use distributed inference. Just use the single GPU to run the inference. -- **Single-Node Multi-GPU (tensor parallel inference)**: If your model is too large to fit in a single GPU, but it can fit in a single node with multiple GPUs, you can use tensor parallelism. The tensor parallel size is the number of GPUs you want to use. For example, if you have 4 GPUs in a single node, you can set the tensor parallel size to 4. -- **Multi-Node Multi-GPU (tensor parallel plus pipeline parallel inference)**: If your model is too large to fit in a single node, you can use tensor parallel together with pipeline parallelism. The tensor parallel size is the number of GPUs you want to use in each node, and the pipeline parallel size is the number of nodes you want to use. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2. - -In short, you should increase the number of GPUs and the number of nodes until you have enough GPU memory to hold the model. The tensor parallel size should be the number of GPUs in each node, and the pipeline parallel size should be the number of nodes. - -After adding enough GPUs and nodes to hold the model, you can run vLLM first, which will print some logs like ``# GPU blocks: 790``. Multiply the number by ``16`` (the block size), and you can get roughly the maximum number of tokens that can be served on the current configuration. If this number is not satisfying, e.g. you want higher throughput, you can further increase the number of GPUs or nodes, until the number of blocks is enough. - -.. 
note:: - There is one edge case: if the model fits in a single node with multiple GPUs, but the number of GPUs cannot divide the model size evenly, you can use pipeline parallelism, which splits the model along layers and supports uneven splits. In this case, the tensor parallel size should be 1 and the pipeline parallel size should be the number of GPUs. - -Details for Distributed Inference and Serving ----------------------------------------------- - -vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support `Megatron-LM's tensor parallel algorithm `_. We manage the distributed runtime with either `Ray `_ or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inferencing currently requires Ray. - -Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured :code:`tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the :code:`LLM` class :code:`distributed-executor-backend` argument or :code:`--distributed-executor-backend` API server argument. Set it to :code:`mp` for multiprocessing or :code:`ray` for Ray. It's not required for Ray to be installed for the multiprocessing case. - -To run multi-GPU inference with the :code:`LLM` class, set the :code:`tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs: - -.. code-block:: python - - from vllm import LLM - llm = LLM("facebook/opt-13b", tensor_parallel_size=4) - output = llm.generate("San Franciso is a") - -To run multi-GPU serving, pass in the :code:`--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs: - -.. 
code-block:: console - - $ vllm serve facebook/opt-13b \ - $ --tensor-parallel-size 4 - -You can also additionally specify :code:`--pipeline-parallel-size` to enable pipeline parallelism. For example, to run API server on 8 GPUs with pipeline parallelism and tensor parallelism: - -.. code-block:: console - - $ vllm serve gpt2 \ - $ --tensor-parallel-size 4 \ - $ --pipeline-parallel-size 2 - -Multi-Node Inference and Serving --------------------------------- - -If a single node does not have enough GPUs to hold the model, you can run the model using multiple nodes. It is important to make sure the execution environment is the same on all nodes, including the model path, the Python environment. The recommended way is to use docker images to ensure the same environment, and hide the heterogeneity of the host machines via mapping them into the same docker configuration. - -The first step, is to start containers and organize them into a cluster. We have provided a helper `script `_ to start the cluster. - -Pick a node as the head node, and run the following command: - -.. code-block:: console - - $ bash run_cluster.sh \ - $ vllm/vllm-openai \ - $ ip_of_head_node \ - $ --head \ - $ /path/to/the/huggingface/home/in/this/node - -On the rest of the worker nodes, run the following command: - -.. code-block:: console - - $ bash run_cluster.sh \ - $ vllm/vllm-openai \ - $ ip_of_head_node \ - $ --worker \ - $ /path/to/the/huggingface/home/in/this/node - -Then you get a ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument ``ip_of_head_node`` should be the IP address of the head node, which is accessible by all the worker nodes. A common misunderstanding is to use the IP address of the worker node, which is not correct. 
- -Then, on any node, use ``docker exec -it node /bin/bash`` to enter the container, execute ``ray status`` to check the status of the Ray cluster. You should see the right number of nodes and GPUs. - -After that, on any node, you can use vLLM as usual, just as you have all the GPUs on one node. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2: - -.. code-block:: console - - $ vllm serve /path/to/the/model/in/the/container \ - $ --tensor-parallel-size 8 \ - $ --pipeline-parallel-size 2 - -You can also use tensor parallel without pipeline parallel, just set the tensor parallel size to the number of GPUs in the cluster. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 16: - -.. code-block:: console - - $ vllm serve /path/to/the/model/in/the/container \ - $ --tensor-parallel-size 16 - -To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like ``--privileged -e NCCL_IB_HCA=mlx5`` to the ``run_cluster.sh`` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with ``NCCL_DEBUG=TRACE`` environment variable set, e.g. ``NCCL_DEBUG=TRACE vllm serve ...`` and check the logs for the NCCL version and the network used. If you find ``[send] via NET/Socket`` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find ``[send] via NET/IB/GDRDMA`` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient. - -.. 
warning:: - After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the `sanity check script `_ for more information. If you need to set some environment variables for the communication configuration, you can append them to the ``run_cluster.sh`` script, e.g. ``-e NCCL_SOCKET_IFNAME=eth0``. Note that setting environment variables in the shell (e.g. ``NCCL_SOCKET_IFNAME=eth0 vllm serve ...``) only works for the processes in the same node, not for the processes in the other nodes. Setting environment variables when you create the cluster is the recommended way. See the `discussion `_ for more information. - -.. warning:: - - Please make sure you downloaded the model to all the nodes (with the same path), or the model is downloaded to some distributed file system that is accessible by all nodes. - - When you use huggingface repo id to refer to the model, you should append your huggingface token to the ``run_cluster.sh`` script, e.g. ``-e HF_TOKEN=``. The recommended way is to download the model first, and then use the path to refer to the model. diff --git a/docs/source/serving/integrations.md b/docs/source/serving/integrations.md new file mode 100644 index 0000000000000..d214c77254257 --- /dev/null +++ b/docs/source/serving/integrations.md @@ -0,0 +1,17 @@ +# Integrations + +```{toctree} +:maxdepth: 1 + +run_on_sky +deploying_with_kserve +deploying_with_kubeai +deploying_with_triton +deploying_with_bentoml +deploying_with_cerebrium +deploying_with_lws +deploying_with_dstack +serving_with_langchain +serving_with_llamaindex +serving_with_llamastack +``` diff --git a/docs/source/serving/integrations.rst b/docs/source/serving/integrations.rst deleted file mode 100644 index 0dd505a739863..0000000000000 --- a/docs/source/serving/integrations.rst +++ /dev/null @@ -1,17 +0,0 @@ -Integrations ------------- - -.. 
toctree:: - :maxdepth: 1 - - run_on_sky - deploying_with_kserve - deploying_with_kubeai - deploying_with_triton - deploying_with_bentoml - deploying_with_cerebrium - deploying_with_lws - deploying_with_dstack - serving_with_langchain - serving_with_llamaindex - serving_with_llamastack diff --git a/docs/source/serving/metrics.md b/docs/source/serving/metrics.md new file mode 100644 index 0000000000000..03b691c98b04e --- /dev/null +++ b/docs/source/serving/metrics.md @@ -0,0 +1,38 @@ +# Production Metrics + +vLLM exposes a number of metrics that can be used to monitor the health of the +system. These metrics are exposed via the `/metrics` endpoint on the vLLM +OpenAI compatible API server. + +You can start the server using Python, or using [Docker](deploying_with_docker.md): + +```console +$ vllm serve unsloth/Llama-3.2-1B-Instruct +``` + +Then query the endpoint to get the latest metrics from the server: + +```console +$ curl http://0.0.0.0:8000/metrics + +# HELP vllm:iteration_tokens_total Histogram of number of tokens per engine_step. +# TYPE vllm:iteration_tokens_total histogram +vllm:iteration_tokens_total_sum{model_name="unsloth/Llama-3.2-1B-Instruct"} 0.0 +vllm:iteration_tokens_total_bucket{le="1.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 +vllm:iteration_tokens_total_bucket{le="8.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 +vllm:iteration_tokens_total_bucket{le="16.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 +vllm:iteration_tokens_total_bucket{le="32.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 +vllm:iteration_tokens_total_bucket{le="64.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 +vllm:iteration_tokens_total_bucket{le="128.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 +vllm:iteration_tokens_total_bucket{le="256.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 +vllm:iteration_tokens_total_bucket{le="512.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 +... 
+``` + +The following metrics are exposed: + +```{literalinclude} ../../../vllm/engine/metrics.py +:end-before: end-metrics-definitions +:language: python +:start-after: begin-metrics-definitions +``` diff --git a/docs/source/serving/metrics.rst b/docs/source/serving/metrics.rst deleted file mode 100644 index 231111cd7b738..0000000000000 --- a/docs/source/serving/metrics.rst +++ /dev/null @@ -1,38 +0,0 @@ -Production Metrics -================== - -vLLM exposes a number of metrics that can be used to monitor the health of the -system. These metrics are exposed via the ``/metrics`` endpoint on the vLLM -OpenAI compatible API server. - -You can start the server using Python, or using [Docker](deploying_with_docker.rst): - -.. code-block:: console - - $ vllm serve unsloth/Llama-3.2-1B-Instruct - -Then query the endpoint to get the latest metrics from the server: - -.. code-block:: console - - $ curl http://0.0.0.0:8000/metrics - - # HELP vllm:iteration_tokens_total Histogram of number of tokens per engine_step. - # TYPE vllm:iteration_tokens_total histogram - vllm:iteration_tokens_total_sum{model_name="unsloth/Llama-3.2-1B-Instruct"} 0.0 - vllm:iteration_tokens_total_bucket{le="1.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 - vllm:iteration_tokens_total_bucket{le="8.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 - vllm:iteration_tokens_total_bucket{le="16.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 - vllm:iteration_tokens_total_bucket{le="32.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 - vllm:iteration_tokens_total_bucket{le="64.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 - vllm:iteration_tokens_total_bucket{le="128.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 - vllm:iteration_tokens_total_bucket{le="256.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 - vllm:iteration_tokens_total_bucket{le="512.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 - ... - -The following metrics are exposed: - -.. 
literalinclude:: ../../../vllm/engine/metrics.py - :language: python - :start-after: begin-metrics-definitions - :end-before: end-metrics-definitions diff --git a/docs/source/serving/run_on_sky.md b/docs/source/serving/run_on_sky.md new file mode 100644 index 0000000000000..115873ae49292 --- /dev/null +++ b/docs/source/serving/run_on_sky.md @@ -0,0 +1,345 @@ +(on-cloud)= + +# Deploying and scaling up with SkyPilot + +```{raw} html +

+ vLLM +

+``` + +vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with [SkyPilot](https://github.com/skypilot-org/skypilot), an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc, can be found in [SkyPilot AI gallery](https://skypilot.readthedocs.io/en/latest/gallery/index.html). + +## Prerequisites + +- Go to the [HuggingFace model page](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and request access to the model {code}`meta-llama/Meta-Llama-3-8B-Instruct`. +- Check that you have installed SkyPilot ([docs](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html)). +- Check that {code}`sky check` shows clouds or Kubernetes are enabled. + +```console +pip install skypilot-nightly +sky check +``` + +## Run on a single instance + +See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm/serve.yaml). + +```yaml +resources: + accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. + use_spot: True + disk_size: 512 # Ensure model checkpoints can fit. + disk_tier: best + ports: 8081 # Expose to internet traffic. + +envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + HF_TOKEN: # Change to your own huggingface token, or use --env to pass. + +setup: | + conda create -n vllm python=3.10 -y + conda activate vllm + + pip install vllm==0.4.0.post1 + # Install Gradio for web UI. + pip install gradio openai + pip install flash-attn==2.5.7 + +run: | + conda activate vllm + echo 'Starting vllm api server...' + python -u -m vllm.entrypoints.openai.api_server \ + --port 8081 \ + --model $MODEL_NAME \ + --trust-remote-code \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ + 2>&1 | tee api_server.log & + + echo 'Waiting for vllm api server to start...' + while ! 
`cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done + + echo 'Starting gradio server...' + git clone https://github.com/vllm-project/vllm.git || true + python vllm/examples/gradio_openai_chatbot_webserver.py \ + -m $MODEL_NAME \ + --port 8811 \ + --model-url http://localhost:8081/v1 \ + --stop-token-ids 128009,128001 +``` + +Start the serving the Llama-3 8B model on any of the candidate GPUs listed (L4, A10g, ...): + +```console +HF_TOKEN="your-huggingface-token" sky launch serving.yaml --env HF_TOKEN +``` + +Check the output of the command. There will be a shareable gradio link (like the last line of the following). Open it in your browser to use the LLaMA model to do the text completion. + +```console +(task, pid=7431) Running on public URL: https://.gradio.live +``` + +**Optional**: Serve the 70B model instead of the default 8B and use more GPU: + +```console +HF_TOKEN="your-huggingface-token" sky launch serving.yaml --gpus A100:8 --env HF_TOKEN --env MODEL_NAME=meta-llama/Meta-Llama-3-70B-Instruct +``` + +## Scale up to multiple replicas + +SkyPilot can scale up the service to multiple service replicas with built-in autoscaling, load-balancing and fault-tolerance. You can do it by adding a services section to the YAML file. + +```yaml +service: + replicas: 2 + # An actual request for readiness probe. + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? + max_completion_tokens: 1 +``` + +```{raw} html +
+Click to see the full recipe YAML +``` + +```yaml +service: + replicas: 2 + # An actual request for readiness probe. + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? + max_completion_tokens: 1 + +resources: + accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. + use_spot: True + disk_size: 512 # Ensure model checkpoints can fit. + disk_tier: best + ports: 8081 # Expose to internet traffic. + +envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + HF_TOKEN: # Change to your own huggingface token, or use --env to pass. + +setup: | + conda create -n vllm python=3.10 -y + conda activate vllm + + pip install vllm==0.4.0.post1 + # Install Gradio for web UI. + pip install gradio openai + pip install flash-attn==2.5.7 + +run: | + conda activate vllm + echo 'Starting vllm api server...' + python -u -m vllm.entrypoints.openai.api_server \ + --port 8081 \ + --model $MODEL_NAME \ + --trust-remote-code \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ + 2>&1 | tee api_server.log +``` + +```{raw} html +
+``` + +Start the serving the Llama-3 8B model on multiple replicas: + +```console +HF_TOKEN="your-huggingface-token" sky serve up -n vllm serving.yaml --env HF_TOKEN +``` + +Wait until the service is ready: + +```console +watch -n10 sky serve status vllm +``` + +```{raw} html +
+Example outputs: +``` + +```console +Services +NAME VERSION UPTIME STATUS REPLICAS ENDPOINT +vllm 1 35s READY 2/2 xx.yy.zz.100:30001 + +Service Replicas +SERVICE_NAME ID VERSION IP LAUNCHED RESOURCES STATUS REGION +vllm 1 1 xx.yy.zz.121 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4 +vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4 +``` + +```{raw} html +
+``` + +After the service is READY, you can find a single endpoint for the service and access the service with the endpoint: + +```console +ENDPOINT=$(sky serve status --endpoint 8081 vllm) +curl -L http://$ENDPOINT/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Meta-Llama-3-8B-Instruct", + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Who are you?" + } + ], + "stop_token_ids": [128009, 128001] + }' +``` + +To enable autoscaling, you could replace the `replicas` with the following configs in `service`: + +```yaml +service: + replica_policy: + min_replicas: 2 + max_replicas: 4 + target_qps_per_replica: 2 +``` + +This will scale the service up to when the QPS exceeds 2 for each replica. + +```{raw} html +
+Click to see the full recipe YAML +``` + +```yaml +service: + replica_policy: + min_replicas: 2 + max_replicas: 4 + target_qps_per_replica: 2 + # An actual request for readiness probe. + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? + max_completion_tokens: 1 + +resources: + accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. + use_spot: True + disk_size: 512 # Ensure model checkpoints can fit. + disk_tier: best + ports: 8081 # Expose to internet traffic. + +envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + HF_TOKEN: # Change to your own huggingface token, or use --env to pass. + +setup: | + conda create -n vllm python=3.10 -y + conda activate vllm + + pip install vllm==0.4.0.post1 + # Install Gradio for web UI. + pip install gradio openai + pip install flash-attn==2.5.7 + +run: | + conda activate vllm + echo 'Starting vllm api server...' + python -u -m vllm.entrypoints.openai.api_server \ + --port 8081 \ + --model $MODEL_NAME \ + --trust-remote-code \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ + 2>&1 | tee api_server.log +``` + +```{raw} html +
+``` + +To update the service with the new config: + +```console +HF_TOKEN="your-huggingface-token" sky serve update vllm serving.yaml --env HF_TOKEN +``` + +To stop the service: + +```console +sky serve down vllm +``` + +### **Optional**: Connect a GUI to the endpoint + +It is also possible to access the Llama-3 service with a separate GUI frontend, so the user requests send to the GUI will be load-balanced across replicas. + +```{raw} html +
+Click to see the full GUI YAML +``` + +```yaml +envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm. + +resources: + cpus: 2 + +setup: | + conda create -n vllm python=3.10 -y + conda activate vllm + + # Install Gradio for web UI. + pip install gradio openai + +run: | + conda activate vllm + export PATH=$PATH:/sbin + + echo 'Starting gradio server...' + git clone https://github.com/vllm-project/vllm.git || true + python vllm/examples/gradio_openai_chatbot_webserver.py \ + -m $MODEL_NAME \ + --port 8811 \ + --model-url http://$ENDPOINT/v1 \ + --stop-token-ids 128009,128001 | tee ~/gradio.log +``` + +```{raw} html +
+``` + +1. Start the chat web UI: + +```console +sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm) +``` + +2. Then, we can access the GUI at the returned gradio link: + +```console +| INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live +``` diff --git a/docs/source/serving/run_on_sky.rst b/docs/source/serving/run_on_sky.rst deleted file mode 100644 index 227e6fd2a7818..0000000000000 --- a/docs/source/serving/run_on_sky.rst +++ /dev/null @@ -1,366 +0,0 @@ -.. _on_cloud: - -Deploying and scaling up with SkyPilot -================================================ - -.. raw:: html - -

- vLLM -

- -vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with `SkyPilot `__, an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc, can be found in `SkyPilot AI gallery `__. - - -Prerequisites -------------- - -- Go to the `HuggingFace model page `__ and request access to the model :code:`meta-llama/Meta-Llama-3-8B-Instruct`. -- Check that you have installed SkyPilot (`docs `__). -- Check that :code:`sky check` shows clouds or Kubernetes are enabled. - -.. code-block:: console - - pip install skypilot-nightly - sky check - - -Run on a single instance ------------------------- - -See the vLLM SkyPilot YAML for serving, `serving.yaml `__. - -.. code-block:: yaml - - resources: - accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. - use_spot: True - disk_size: 512 # Ensure model checkpoints can fit. - disk_tier: best - ports: 8081 # Expose to internet traffic. - - envs: - MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct - HF_TOKEN: # Change to your own huggingface token, or use --env to pass. - - setup: | - conda create -n vllm python=3.10 -y - conda activate vllm - - pip install vllm==0.4.0.post1 - # Install Gradio for web UI. - pip install gradio openai - pip install flash-attn==2.5.7 - - run: | - conda activate vllm - echo 'Starting vllm api server...' - python -u -m vllm.entrypoints.openai.api_server \ - --port 8081 \ - --model $MODEL_NAME \ - --trust-remote-code \ - --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ - 2>&1 | tee api_server.log & - - echo 'Waiting for vllm api server to start...' - while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done - - echo 'Starting gradio server...' 
- git clone https://github.com/vllm-project/vllm.git || true - python vllm/examples/gradio_openai_chatbot_webserver.py \ - -m $MODEL_NAME \ - --port 8811 \ - --model-url http://localhost:8081/v1 \ - --stop-token-ids 128009,128001 - -Start the serving the Llama-3 8B model on any of the candidate GPUs listed (L4, A10g, ...): - -.. code-block:: console - - HF_TOKEN="your-huggingface-token" sky launch serving.yaml --env HF_TOKEN - -Check the output of the command. There will be a shareable gradio link (like the last line of the following). Open it in your browser to use the LLaMA model to do the text completion. - -.. code-block:: console - - (task, pid=7431) Running on public URL: https://.gradio.live - -**Optional**: Serve the 70B model instead of the default 8B and use more GPU: - -.. code-block:: console - - HF_TOKEN="your-huggingface-token" sky launch serving.yaml --gpus A100:8 --env HF_TOKEN --env MODEL_NAME=meta-llama/Meta-Llama-3-70B-Instruct - - -Scale up to multiple replicas ------------------------------ - -SkyPilot can scale up the service to multiple service replicas with built-in autoscaling, load-balancing and fault-tolerance. You can do it by adding a services section to the YAML file. - -.. code-block:: yaml - - service: - replicas: 2 - # An actual request for readiness probe. - readiness_probe: - path: /v1/chat/completions - post_data: - model: $MODEL_NAME - messages: - - role: user - content: Hello! What is your name? - max_completion_tokens: 1 - -.. raw:: html - -
- Click to see the full recipe YAML - - -.. code-block:: yaml - - service: - replicas: 2 - # An actual request for readiness probe. - readiness_probe: - path: /v1/chat/completions - post_data: - model: $MODEL_NAME - messages: - - role: user - content: Hello! What is your name? - max_completion_tokens: 1 - - resources: - accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. - use_spot: True - disk_size: 512 # Ensure model checkpoints can fit. - disk_tier: best - ports: 8081 # Expose to internet traffic. - - envs: - MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct - HF_TOKEN: # Change to your own huggingface token, or use --env to pass. - - setup: | - conda create -n vllm python=3.10 -y - conda activate vllm - - pip install vllm==0.4.0.post1 - # Install Gradio for web UI. - pip install gradio openai - pip install flash-attn==2.5.7 - - run: | - conda activate vllm - echo 'Starting vllm api server...' - python -u -m vllm.entrypoints.openai.api_server \ - --port 8081 \ - --model $MODEL_NAME \ - --trust-remote-code \ - --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ - 2>&1 | tee api_server.log - -.. raw:: html - -
- -Start the serving the Llama-3 8B model on multiple replicas: - -.. code-block:: console - - HF_TOKEN="your-huggingface-token" sky serve up -n vllm serving.yaml --env HF_TOKEN - - -Wait until the service is ready: - -.. code-block:: console - - watch -n10 sky serve status vllm - - -.. raw:: html - -
- Example outputs: - -.. code-block:: console - - Services - NAME VERSION UPTIME STATUS REPLICAS ENDPOINT - vllm 1 35s READY 2/2 xx.yy.zz.100:30001 - - Service Replicas - SERVICE_NAME ID VERSION IP LAUNCHED RESOURCES STATUS REGION - vllm 1 1 xx.yy.zz.121 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4 - vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4 - -.. raw:: html - -
- -After the service is READY, you can find a single endpoint for the service and access the service with the endpoint: - -.. code-block:: console - - ENDPOINT=$(sky serve status --endpoint 8081 vllm) - curl -L http://$ENDPOINT/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "meta-llama/Meta-Llama-3-8B-Instruct", - "messages": [ - { - "role": "system", - "content": "You are a helpful assistant." - }, - { - "role": "user", - "content": "Who are you?" - } - ], - "stop_token_ids": [128009, 128001] - }' - -To enable autoscaling, you could replace the `replicas` with the following configs in `service`: - -.. code-block:: yaml - - service: - replica_policy: - min_replicas: 2 - max_replicas: 4 - target_qps_per_replica: 2 - -This will scale the service up to when the QPS exceeds 2 for each replica. - - -.. raw:: html - -
- Click to see the full recipe YAML - - -.. code-block:: yaml - - service: - replica_policy: - min_replicas: 2 - max_replicas: 4 - target_qps_per_replica: 2 - # An actual request for readiness probe. - readiness_probe: - path: /v1/chat/completions - post_data: - model: $MODEL_NAME - messages: - - role: user - content: Hello! What is your name? - max_completion_tokens: 1 - - resources: - accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. - use_spot: True - disk_size: 512 # Ensure model checkpoints can fit. - disk_tier: best - ports: 8081 # Expose to internet traffic. - - envs: - MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct - HF_TOKEN: # Change to your own huggingface token, or use --env to pass. - - setup: | - conda create -n vllm python=3.10 -y - conda activate vllm - - pip install vllm==0.4.0.post1 - # Install Gradio for web UI. - pip install gradio openai - pip install flash-attn==2.5.7 - - run: | - conda activate vllm - echo 'Starting vllm api server...' - python -u -m vllm.entrypoints.openai.api_server \ - --port 8081 \ - --model $MODEL_NAME \ - --trust-remote-code \ - --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ - 2>&1 | tee api_server.log - - -.. raw:: html - -
- -To update the service with the new config: - -.. code-block:: console - - HF_TOKEN="your-huggingface-token" sky serve update vllm serving.yaml --env HF_TOKEN - - -To stop the service: - -.. code-block:: console - - sky serve down vllm - - -**Optional**: Connect a GUI to the endpoint -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - -It is also possible to access the Llama-3 service with a separate GUI frontend, so the user requests send to the GUI will be load-balanced across replicas. - -.. raw:: html - -
- Click to see the full GUI YAML - -.. code-block:: yaml - - envs: - MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct - ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm. - - resources: - cpus: 2 - - setup: | - conda create -n vllm python=3.10 -y - conda activate vllm - - # Install Gradio for web UI. - pip install gradio openai - - run: | - conda activate vllm - export PATH=$PATH:/sbin - - echo 'Starting gradio server...' - git clone https://github.com/vllm-project/vllm.git || true - python vllm/examples/gradio_openai_chatbot_webserver.py \ - -m $MODEL_NAME \ - --port 8811 \ - --model-url http://$ENDPOINT/v1 \ - --stop-token-ids 128009,128001 | tee ~/gradio.log - - -.. raw:: html - -
- -1. Start the chat web UI: - -.. code-block:: console - - sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm) - - -2. Then, we can access the GUI at the returned gradio link: - -.. code-block:: console - - | INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live - - diff --git a/docs/source/serving/serving_with_langchain.md b/docs/source/serving/serving_with_langchain.md new file mode 100644 index 0000000000000..96bd5943f3d64 --- /dev/null +++ b/docs/source/serving/serving_with_langchain.md @@ -0,0 +1,30 @@ +(run-on-langchain)= + +# Serving with Langchain + +vLLM is also available via [Langchain](https://github.com/langchain-ai/langchain) . + +To install langchain, run + +```console +$ pip install langchain langchain_community -q +``` + +To run inference on a single or multiple GPUs, use `VLLM` class from `langchain`. + +```python +from langchain_community.llms import VLLM + +llm = VLLM(model="mosaicml/mpt-7b", + trust_remote_code=True, # mandatory for hf models + max_new_tokens=128, + top_k=10, + top_p=0.95, + temperature=0.8, + # tensor_parallel_size=... # for distributed inference +) + +print(llm("What is the capital of France ?")) +``` + +Please refer to this [Tutorial](https://python.langchain.com/docs/integrations/llms/vllm) for more details. diff --git a/docs/source/serving/serving_with_langchain.rst b/docs/source/serving/serving_with_langchain.rst deleted file mode 100644 index 6440c8aad5986..0000000000000 --- a/docs/source/serving/serving_with_langchain.rst +++ /dev/null @@ -1,31 +0,0 @@ -.. _run_on_langchain: - -Serving with Langchain -============================ - -vLLM is also available via `Langchain `_ . - -To install langchain, run - -.. code-block:: console - - $ pip install langchain langchain_community -q - -To run inference on a single or multiple GPUs, use ``VLLM`` class from ``langchain``. - -.. 
code-block:: python - - from langchain_community.llms import VLLM - - llm = VLLM(model="mosaicml/mpt-7b", - trust_remote_code=True, # mandatory for hf models - max_new_tokens=128, - top_k=10, - top_p=0.95, - temperature=0.8, - # tensor_parallel_size=... # for distributed inference - ) - - print(llm("What is the capital of France ?")) - -Please refer to this `Tutorial `_ for more details. diff --git a/docs/source/serving/serving_with_llamaindex.md b/docs/source/serving/serving_with_llamaindex.md new file mode 100644 index 0000000000000..98859d8e3f828 --- /dev/null +++ b/docs/source/serving/serving_with_llamaindex.md @@ -0,0 +1,26 @@ +(run-on-llamaindex)= + +# Serving with llama_index + +vLLM is also available via [llama_index](https://github.com/run-llama/llama_index) . + +To install llamaindex, run + +```console +$ pip install llama-index-llms-vllm -q +``` + +To run inference on a single or multiple GPUs, use `Vllm` class from `llamaindex`. + +```python +from llama_index.llms.vllm import Vllm + +llm = Vllm( + model="microsoft/Orca-2-7b", + tensor_parallel_size=4, + max_new_tokens=100, + vllm_kwargs={"swap_space": 1, "gpu_memory_utilization": 0.5}, +) +``` + +Please refer to this [Tutorial](https://docs.llamaindex.ai/en/latest/examples/llm/vllm/) for more details. diff --git a/docs/source/serving/serving_with_llamaindex.rst b/docs/source/serving/serving_with_llamaindex.rst deleted file mode 100644 index 038e961344e47..0000000000000 --- a/docs/source/serving/serving_with_llamaindex.rst +++ /dev/null @@ -1,27 +0,0 @@ -.. _run_on_llamaindex: - -Serving with llama_index -============================ - -vLLM is also available via `llama_index `_ . - -To install llamaindex, run - -.. code-block:: console - - $ pip install llama-index-llms-vllm -q - -To run inference on a single or multiple GPUs, use ``Vllm`` class from ``llamaindex``. - -.. 
code-block:: python - - from llama_index.llms.vllm import Vllm - - llm = Vllm( - model="microsoft/Orca-2-7b", - tensor_parallel_size=4, - max_new_tokens=100, - vllm_kwargs={"swap_space": 1, "gpu_memory_utilization": 0.5}, - ) - -Please refer to this `Tutorial `_ for more details. diff --git a/docs/source/serving/serving_with_llamastack.md b/docs/source/serving/serving_with_llamastack.md new file mode 100644 index 0000000000000..d31ea0be38e41 --- /dev/null +++ b/docs/source/serving/serving_with_llamastack.md @@ -0,0 +1,38 @@ +(run-on-llamastack)= + +# Serving with Llama Stack + +vLLM is also available via [Llama Stack](https://github.com/meta-llama/llama-stack) . + +To install Llama Stack, run + +```console +$ pip install llama-stack -q +``` + +## Inference using OpenAI Compatible API + +Then start Llama Stack server pointing to your vLLM server with the following configuration: + +```yaml +inference: + - provider_id: vllm0 + provider_type: remote::vllm + config: + url: http://127.0.0.1:8000 +``` + +Please refer to [this guide](https://github.com/meta-llama/llama-stack/blob/main/docs/source/getting_started/distributions/self_hosted_distro/remote_vllm.md) for more details on this remote vLLM provider. + +## Inference via Embedded vLLM + +An [inline vLLM provider](https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/inline/inference/vllm) +is also available. This is a sample of configuration using that method: + +```yaml +inference + - provider_type: vllm + config: + model: Llama3.1-8B-Instruct + tensor_parallel_size: 4 +``` diff --git a/docs/source/serving/serving_with_llamastack.rst b/docs/source/serving/serving_with_llamastack.rst deleted file mode 100644 index 8ef96c4e54369..0000000000000 --- a/docs/source/serving/serving_with_llamastack.rst +++ /dev/null @@ -1,42 +0,0 @@ -.. _run_on_llamastack: - -Serving with Llama Stack -============================ - -vLLM is also available via `Llama Stack `_ . - -To install Llama Stack, run - -.. 
code-block:: console - - $ pip install llama-stack -q - -Inference using OpenAI Compatible API -------------------------------------- - -Then start Llama Stack server pointing to your vLLM server with the following configuration: - -.. code-block:: yaml - - inference: - - provider_id: vllm0 - provider_type: remote::vllm - config: - url: http://127.0.0.1:8000 - -Please refer to `this guide `_ for more details on this remote vLLM provider. - -Inference via Embedded vLLM ---------------------------- - -An `inline vLLM provider -`_ -is also available. This is a sample of configuration using that method: - -.. code-block:: yaml - - inference - - provider_type: vllm - config: - model: Llama3.1-8B-Instruct - tensor_parallel_size: 4 diff --git a/docs/source/serving/tensorizer.md b/docs/source/serving/tensorizer.md new file mode 100644 index 0000000000000..d3dd29d48f730 --- /dev/null +++ b/docs/source/serving/tensorizer.md @@ -0,0 +1,16 @@ +(tensorizer)= + +# Loading Models with CoreWeave's Tensorizer + +vLLM supports loading models with [CoreWeave's Tensorizer](https://docs.coreweave.com/coreweave-machine-learning-and-ai/inference/tensorizer). +vLLM model tensors that have been serialized to disk, an HTTP/HTTPS endpoint, or S3 endpoint can be deserialized +at runtime extremely quickly directly to the GPU, resulting in significantly +shorter Pod startup times and CPU memory usage. Tensor encryption is also supported. + +For more information on CoreWeave's Tensorizer, please refer to +[CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see +the [vLLM example script](https://docs.vllm.ai/en/stable/getting_started/examples/tensorize_vllm_model.html). + +```{note} +Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`. 
+``` diff --git a/docs/source/serving/tensorizer.rst b/docs/source/serving/tensorizer.rst deleted file mode 100644 index 96a93db94871b..0000000000000 --- a/docs/source/serving/tensorizer.rst +++ /dev/null @@ -1,15 +0,0 @@ -.. _tensorizer: - -Loading Models with CoreWeave's Tensorizer -========================================== -vLLM supports loading models with `CoreWeave's Tensorizer `_. -vLLM model tensors that have been serialized to disk, an HTTP/HTTPS endpoint, or S3 endpoint can be deserialized -at runtime extremely quickly directly to the GPU, resulting in significantly -shorter Pod startup times and CPU memory usage. Tensor encryption is also supported. - -For more information on CoreWeave's Tensorizer, please refer to -`CoreWeave's Tensorizer documentation `_. For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see -the `vLLM example script `_. - -.. note:: - Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`. diff --git a/docs/source/usage/compatibility_matrix.rst b/docs/source/usage/compatibility_matrix.md similarity index 72% rename from docs/source/usage/compatibility_matrix.rst rename to docs/source/usage/compatibility_matrix.md index 04dd72b1e3527..f339a84ed12e9 100644 --- a/docs/source/usage/compatibility_matrix.rst +++ b/docs/source/usage/compatibility_matrix.md @@ -1,33 +1,31 @@ -.. _compatibility_matrix: +(compatibility-matrix)= -Compatibility Matrix -==================== +# Compatibility Matrix -The tables below show mutually exclusive features and the support on some hardware. +The tables below show mutually exclusive features and the support on some hardware. -.. note:: +```{note} +Check the '✗' with links to see tracking issue for unsupported feature/hardware combination. +``` - Check the '✗' with links to see tracking issue for unsupported feature/hardware combination. 
+## Feature x Feature -Feature x Feature ------------------ +```{raw} html + +``` -.. raw:: html - - - +```{eval-rst} .. list-table:: :header-rows: 1 :widths: auto @@ -50,192 +48,192 @@ Feature x Feature - beam-search - :abbr:`guided dec (Guided Decoding)` * - :ref:`CP ` - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - * - :ref:`APC ` - ✅ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - * - :ref:`LoRA ` - - `✗ `__ - - ✅ - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - `✗ `__ + - ✅ + - + - + - + - + - + - + - + - + - + - + - + - + - + - * - :abbr:`prmpt adptr (Prompt Adapter)` - ✅ - ✅ - ✅ - - - - - - - - - - - - - - - - - - - - - - - - - - + - + - + - + - + - + - + - + - + - + - + - + - + - * - :ref:`SD ` - ✅ - ✅ - ✗ - ✅ - - - - - - - - - - - - - - - - - - - - - - - - + - + - + - + - + - + - + - + - + - + - + - + - * - CUDA graph - ✅ - ✅ - ✅ - ✅ - ✅ - - - - - - - - - - - - - - - - - - - - - - + - + - + - + - + - + - + - + - + - + - + - * - :abbr:`pooling (Pooling Models)` - ✗ - ✗ - - ✗ - ✗ - ✗ - ✗ - - - - - - - - - - - - - - - - - - - - + - ✗ + - + - + - + - + - + - + - + - + - + - * - :abbr:`enc-dec (Encoder-Decoder Models)` - ✗ - - `✗ `__ - - ✗ + - `✗ `__ + - ✗ - ✗ - - `✗ `__ - - ✅ - - ✅ - - - - - - - - - - - - - - - - - - + - `✗ `__ + - ✅ + - ✅ + - + - + - + - + - + - + - + - + - * - :abbr:`logP (Logprobs)` - ✅ - ✅ - ✅ - ✅ - ✅ - - ✅ + - ✅ - ✗ - ✅ - - - - - - - - - - - - - - - - + - + - + - + - + - + - + - + - * - :abbr:`prmpt logP (Prompt Logprobs)` - ✅ - ✅ - ✅ - ✅ - - `✗ `__ + - `✗ `__ - ✅ - ✗ - - ✅ - - ✅ - - - - - - - - - - - - - - + - ✅ + - ✅ + - + - + - + - + - + - + - * - :abbr:`async output (Async Output Processing)` - ✅ - ✅ - ✅ - ✅ - ✗ - - ✅ - - ✗ + - ✅ + - ✗ - ✗ - ✅ - ✅ - - - - - - - - - - - - + - + - + - + - + - + - * - multi-step - ✗ - ✅ @@ -243,20 +241,20 @@ Feature x Feature - ✅ - ✗ - ✅ - - ✗ 
+ - ✗ - ✗ - ✅ - - `✗ `__ + - `✗ `__ - ✅ - - - - - - - - - - + - + - + - + - + - * - :abbr:`mm (Multimodal Inputs)` - ✅ - - `✗ `__ - - `✗ `__ + - `✗ `__ + - `✗ `__ - ? - ? - ✅ @@ -266,44 +264,44 @@ Feature x Feature - ✅ - ✅ - ? - - - - - - - - + - + - + - + - * - best-of - ✅ - ✅ - ✅ - ✅ - - `✗ `__ + - `✗ `__ - ✅ - ✗ - ✅ - ✅ - ✅ - ? - - `✗ `__ + - `✗ `__ - ✅ - - - - - - + - + - + - * - beam-search - ✅ - ✅ - ✅ - ✅ - - `✗ `__ + - `✗ `__ - ✅ - ✗ - ✅ - ✅ - ✅ - ? - - `✗ `__ + - `✗ `__ - ? - ✅ - - - - + - + - * - :abbr:`guided dec (Guided Decoding)` - ✅ - ✅ @@ -316,16 +314,17 @@ Feature x Feature - ✅ - ✅ - ✅ - - `✗ `__ + - `✗ `__ - ? - ✅ - ✅ - - + - +``` -Feature x Hardware -^^^^^^^^^^^^^^^^^^ +### Feature x Hardware +```{eval-rst} .. list-table:: :header-rows: 1 :widths: auto @@ -339,7 +338,7 @@ Feature x Hardware - CPU - AMD * - :ref:`CP ` - - `✗ `__ + - `✗ `__ - ✅ - ✅ - ✅ @@ -347,7 +346,7 @@ Feature x Hardware - ✅ - ✅ * - :ref:`APC ` - - `✗ `__ + - `✗ `__ - ✅ - ✅ - ✅ @@ -360,7 +359,7 @@ Feature x Hardware - ✅ - ✅ - ✅ - - `✗ `__ + - `✗ `__ - ✅ * - :abbr:`prmpt adptr (Prompt Adapter)` - ✅ @@ -368,7 +367,7 @@ Feature x Hardware - ✅ - ✅ - ✅ - - `✗ `__ + - `✗ `__ - ✅ * - :ref:`SD ` - ✅ @@ -440,7 +439,7 @@ Feature x Hardware - ✅ - ✅ - ✅ - - `✗ `__ + - `✗ `__ - ✅ * - best-of - ✅ @@ -466,3 +465,4 @@ Feature x Hardware - ✅ - ✅ - ✅ +``` diff --git a/docs/source/usage/engine_args.rst b/docs/source/usage/engine_args.md similarity index 76% rename from docs/source/usage/engine_args.rst rename to docs/source/usage/engine_args.md index e7ce8cdcabe88..cd3c6a430b7fa 100644 --- a/docs/source/usage/engine_args.rst +++ b/docs/source/usage/engine_args.md @@ -1,23 +1,25 @@ -.. _engine_args: +(engine-args)= -Engine Arguments -================ +# Engine Arguments Below, you can find an explanation of every engine argument for vLLM: +```{eval-rst} .. 
argparse:: :module: vllm.engine.arg_utils :func: _engine_args_parser :prog: vllm serve :nodefaultconst: +``` -Async Engine Arguments ----------------------- +## Async Engine Arguments Below are the additional arguments related to the asynchronous engine: +```{eval-rst} .. argparse:: :module: vllm.engine.arg_utils :func: _async_engine_args_parser :prog: vllm serve - :nodefaultconst: \ No newline at end of file + :nodefaultconst: +``` diff --git a/docs/source/usage/env_vars.md b/docs/source/usage/env_vars.md new file mode 100644 index 0000000000000..f9b08077a03b4 --- /dev/null +++ b/docs/source/usage/env_vars.md @@ -0,0 +1,15 @@ +# Environment Variables + +vLLM uses the following environment variables to configure the system: + +```{warning} +Please note that `VLLM_PORT` and `VLLM_HOST_IP` set the port and ip for vLLM's **internal usage**. It is not the port and ip for the API server. If you use `--host $VLLM_HOST_IP` and `--port $VLLM_PORT` to start the API server, it will not work. + +All environment variables used by vLLM are prefixed with `VLLM_`. **Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables). +``` + +```{literalinclude} ../../../vllm/envs.py +:end-before: end-env-vars-definition +:language: python +:start-after: begin-env-vars-definition +``` diff --git a/docs/source/usage/env_vars.rst b/docs/source/usage/env_vars.rst deleted file mode 100644 index ff2259c0da3f1..0000000000000 --- a/docs/source/usage/env_vars.rst +++ /dev/null @@ -1,14 +0,0 @@ -Environment Variables -======================== - -vLLM uses the following environment variables to configure the system: - -.. 
warning:: - Please note that ``VLLM_PORT`` and ``VLLM_HOST_IP`` set the port and ip for vLLM's **internal usage**. It is not the port and ip for the API server. If you use ``--host $VLLM_HOST_IP`` and ``--port $VLLM_PORT`` to start the API server, it will not work. - - All environment variables used by vLLM are prefixed with ``VLLM_``. **Special care should be taken for Kubernetes users**: please do not name the service as ``vllm``, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because `Kubernetes sets environment variables for each service with the capitalized service name as the prefix `_. - -.. literalinclude:: ../../../vllm/envs.py - :language: python - :start-after: begin-env-vars-definition - :end-before: end-env-vars-definition diff --git a/docs/source/usage/faq.rst b/docs/source/usage/faq.md similarity index 61% rename from docs/source/usage/faq.rst rename to docs/source/usage/faq.md index d88da32092924..c388939c8dd71 100644 --- a/docs/source/usage/faq.rst +++ b/docs/source/usage/faq.md @@ -1,34 +1,33 @@ -.. _faq: +(faq)= -Frequently Asked Questions -=========================== +# Frequently Asked Questions - Q: How can I serve multiple models on a single port using the OpenAI API? +> Q: How can I serve multiple models on a single port using the OpenAI API? A: Assuming that you're referring to using OpenAI compatible server to serve multiple models at once, that is not currently supported, you can run multiple instances of the server (each serving a different model) at the same time, and have another layer to route the incoming request to the correct server accordingly. ----------------------------------------- +______________________________________________________________________ - Q: Which model to use for offline inference embedding? +> Q: Which model to use for offline inference embedding? -A: You can try `e5-mistral-7b-instruct `__ and `BAAI/bge-base-en-v1.5 `__; -more are listed :ref:`here `. 
+A: You can try [e5-mistral-7b-instruct](https://huggingface.co/intfloat/e5-mistral-7b-instruct) and [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5); +more are listed {ref}`here `. -By extracting hidden states, vLLM can automatically convert text generation models like `Llama-3-8B `__, -`Mistral-7B-Instruct-v0.3 `__ into embedding models, +By extracting hidden states, vLLM can automatically convert text generation models like [Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B), +[Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) into embedding models, but they are expected be inferior to models that are specifically trained on embedding tasks. ----------------------------------------- +______________________________________________________________________ - Q: Can the output of a prompt vary across runs in vLLM? +> Q: Can the output of a prompt vary across runs in vLLM? A: Yes, it can. vLLM does not guarantee stable log probabilities (logprobs) for the output tokens. Variations in logprobs may occur due to -numerical instability in Torch operations or non-deterministic behavior in batched Torch operations when batching changes. For more details, -see the `Numerical Accuracy section `_. +numerical instability in Torch operations or non-deterministic behavior in batched Torch operations when batching changes. For more details, +see the [Numerical Accuracy section](https://pytorch.org/docs/stable/notes/numerical_accuracy.html#batched-computations-or-slice-computations). In vLLM, the same requests might be batched differently due to factors such as other concurrent requests, -changes in batch size, or batch expansion in speculative decoding. These batching variations, combined with numerical instability of Torch operations, -can lead to slightly different logit/logprob values at each step. 
Such differences can accumulate, potentially resulting in +changes in batch size, or batch expansion in speculative decoding. These batching variations, combined with numerical instability of Torch operations, +can lead to slightly different logit/logprob values at each step. Such differences can accumulate, potentially resulting in different tokens being sampled. Once a different token is sampled, further divergence is likely. **Mitigation Strategies** diff --git a/docs/source/usage/lora.md b/docs/source/usage/lora.md new file mode 100644 index 0000000000000..a7bb881951abf --- /dev/null +++ b/docs/source/usage/lora.md @@ -0,0 +1,215 @@ +(lora)= + +# LoRA Adapters + +This document shows you how to use [LoRA adapters](https://arxiv.org/abs/2106.09685) with vLLM on top of a base model. + +LoRA adapters can be used with any vLLM model that implements {class}`~vllm.model_executor.models.interfaces.SupportsLoRA`. + +Adapters can be efficiently served on a per request basis with minimal overhead. First we download the adapter(s) and save +them locally with + +```python +from huggingface_hub import snapshot_download + +sql_lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") +``` + +Then we instantiate the base model and pass in the `enable_lora=True` flag: + +```python +from vllm import LLM, SamplingParams +from vllm.lora.request import LoRARequest + +llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_lora=True) +``` + +We can now submit the prompts and call `llm.generate` with the `lora_request` parameter. The first parameter +of `LoRARequest` is a human identifiable name, the second parameter is a globally unique ID for the adapter and +the third parameter is the path to the LoRA adapter. 
+ +```python +sampling_params = SamplingParams( + temperature=0, + max_tokens=256, + stop=["[/assistant]"] +) + +prompts = [ + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", +] + +outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest("sql_adapter", 1, sql_lora_path) +) +``` + +Check out [examples/multilora_inference.py](https://github.com/vllm-project/vllm/blob/main/examples/multilora_inference.py) +for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. + +## Serving LoRA Adapters + +LoRA adapted models can also be served with the Open-AI compatible vLLM server. To do so, we use +`--lora-modules {name}={path} {name}={path}` to specify each LoRA module when we kickoff the server: + +```bash +vllm serve meta-llama/Llama-2-7b-hf \ + --enable-lora \ + --lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/ +``` + +```{note} +The commit ID `0dfa347e8877a4d4ed19ee56c140fa518470028c` may change over time. Please check the latest commit ID in your environment to ensure you are using the correct one. +``` + +The server entrypoint accepts all other LoRA configuration parameters (`max_loras`, `max_lora_rank`, `max_cpu_loras`, +etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along +with its base model: + +```bash +curl localhost:8000/v1/models | jq . 
+{ + "object": "list", + "data": [ + { + "id": "meta-llama/Llama-2-7b-hf", + "object": "model", + ... + }, + { + "id": "sql-lora", + "object": "model", + ... + } + ] +} +``` + +Requests can specify the LoRA adapter as if it were any other model via the `model` request parameter. The requests will be +processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other +LoRA adapter requests if they were provided and `max_loras` is set high enough). + +The following is an example request + +```bash +curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "sql-lora", + "prompt": "San Francisco is a", + "max_tokens": 7, + "temperature": 0 + }' | jq +``` + +## Dynamically serving LoRA Adapters + +In addition to serving LoRA adapters at server startup, the vLLM server now supports dynamically loading and unloading +LoRA adapters at runtime through dedicated API endpoints. This feature can be particularly useful when the flexibility +to change models on-the-fly is needed. + +Note: Enabling this feature in production environments is risky as user may participate model adapter management. + +To enable dynamic LoRA loading and unloading, ensure that the environment variable `VLLM_ALLOW_RUNTIME_LORA_UPDATING` +is set to `True`. When this option is enabled, the API server will log a warning to indicate that dynamic loading is active. + +```bash +export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True +``` + +Loading a LoRA Adapter: + +To dynamically load a LoRA adapter, send a POST request to the `/v1/load_lora_adapter` endpoint with the necessary +details of the adapter to be loaded. The request payload should include the name and path to the LoRA adapter. 
+ +Example request to load a LoRA adapter: + +```bash +curl -X POST http://localhost:8000/v1/load_lora_adapter \ +-H "Content-Type: application/json" \ +-d '{ + "lora_name": "sql_adapter", + "lora_path": "/path/to/sql-lora-adapter" +}' +``` + +Upon a successful request, the API will respond with a 200 OK status code. If an error occurs, such as if the adapter +cannot be found or loaded, an appropriate error message will be returned. + +Unloading a LoRA Adapter: + +To unload a LoRA adapter that has been previously loaded, send a POST request to the `/v1/unload_lora_adapter` endpoint +with the name or ID of the adapter to be unloaded. + +Example request to unload a LoRA adapter: + +```bash +curl -X POST http://localhost:8000/v1/unload_lora_adapter \ +-H "Content-Type: application/json" \ +-d '{ + "lora_name": "sql_adapter" +}' +``` + +## New format for `--lora-modules` + +In the previous version, users would provide LoRA modules via the following format, either as a key-value pair or in JSON format. For example: + +```bash +--lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/ +``` + +This would only include the `name` and `path` for each LoRA module, but did not provide a way to specify a `base_model_name`. +Now, you can specify a base_model_name alongside the name and path using JSON format. For example: + +```bash +--lora-modules '{"name": "sql-lora", "path": "/path/to/lora", "base_model_name": "meta-llama/Llama-2-7b"}' +``` + +To provide the backward compatibility support, you can still use the old key-value format (name=path), but the `base_model_name` will remain unspecified in that case. + +## Lora model lineage in model card + +The new format of `--lora-modules` is mainly to support the display of parent model information in the model card. 
Here's an explanation of how your current response supports this: + +- The `parent` field of LoRA model `sql-lora` now links to its base model `meta-llama/Llama-2-7b-hf`. This correctly reflects the hierarchical relationship between the base model and the LoRA adapter. +- The `root` field points to the artifact location of the lora adapter. + +```bash +$ curl http://localhost:8000/v1/models + +{ + "object": "list", + "data": [ + { + "id": "meta-llama/Llama-2-7b-hf", + "object": "model", + "created": 1715644056, + "owned_by": "vllm", + "root": "~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/", + "parent": null, + "permission": [ + { + ..... + } + ] + }, + { + "id": "sql-lora", + "object": "model", + "created": 1715644056, + "owned_by": "vllm", + "root": "~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/", + "parent": meta-llama/Llama-2-7b-hf, + "permission": [ + { + .... + } + ] + } + ] +} +``` diff --git a/docs/source/usage/lora.rst b/docs/source/usage/lora.rst deleted file mode 100644 index c2c6fa2aebfaf..0000000000000 --- a/docs/source/usage/lora.rst +++ /dev/null @@ -1,225 +0,0 @@ -.. _lora: - -LoRA Adapters -============= - -This document shows you how to use `LoRA adapters `_ with vLLM on top of a base model. - -LoRA adapters can be used with any vLLM model that implements :class:`~vllm.model_executor.models.interfaces.SupportsLoRA`. - -Adapters can be efficiently served on a per request basis with minimal overhead. First we download the adapter(s) and save -them locally with - -.. code-block:: python - - from huggingface_hub import snapshot_download - - sql_lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") - - -Then we instantiate the base model and pass in the ``enable_lora=True`` flag: - -.. 
code-block:: python - - from vllm import LLM, SamplingParams - from vllm.lora.request import LoRARequest - - llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_lora=True) - - -We can now submit the prompts and call ``llm.generate`` with the ``lora_request`` parameter. The first parameter -of ``LoRARequest`` is a human identifiable name, the second parameter is a globally unique ID for the adapter and -the third parameter is the path to the LoRA adapter. - -.. code-block:: python - - sampling_params = SamplingParams( - temperature=0, - max_tokens=256, - stop=["[/assistant]"] - ) - - prompts = [ - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", - ] - - outputs = llm.generate( - prompts, - sampling_params, - lora_request=LoRARequest("sql_adapter", 1, sql_lora_path) - ) - - -Check out `examples/multilora_inference.py `_ -for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. - -Serving LoRA Adapters ---------------------- -LoRA adapted models can also be served with the Open-AI compatible vLLM server. To do so, we use -``--lora-modules {name}={path} {name}={path}`` to specify each LoRA module when we kickoff the server: - -.. code-block:: bash - - vllm serve meta-llama/Llama-2-7b-hf \ - --enable-lora \ - --lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/ - -.. note:: - The commit ID `0dfa347e8877a4d4ed19ee56c140fa518470028c` may change over time. 
Please check the latest commit ID in your environment to ensure you are using the correct one. - -The server entrypoint accepts all other LoRA configuration parameters (``max_loras``, ``max_lora_rank``, ``max_cpu_loras``, -etc.), which will apply to all forthcoming requests. Upon querying the ``/models`` endpoint, we should see our LoRA along -with its base model: - -.. code-block:: bash - - curl localhost:8000/v1/models | jq . - { - "object": "list", - "data": [ - { - "id": "meta-llama/Llama-2-7b-hf", - "object": "model", - ... - }, - { - "id": "sql-lora", - "object": "model", - ... - } - ] - } - -Requests can specify the LoRA adapter as if it were any other model via the ``model`` request parameter. The requests will be -processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other -LoRA adapter requests if they were provided and ``max_loras`` is set high enough). - -The following is an example request - -.. code-block:: bash - - curl http://localhost:8000/v1/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "sql-lora", - "prompt": "San Francisco is a", - "max_tokens": 7, - "temperature": 0 - }' | jq - - -Dynamically serving LoRA Adapters ---------------------------------- - -In addition to serving LoRA adapters at server startup, the vLLM server now supports dynamically loading and unloading -LoRA adapters at runtime through dedicated API endpoints. This feature can be particularly useful when the flexibility -to change models on-the-fly is needed. - -Note: Enabling this feature in production environments is risky as user may participate model adapter management. - -To enable dynamic LoRA loading and unloading, ensure that the environment variable `VLLM_ALLOW_RUNTIME_LORA_UPDATING` -is set to `True`. When this option is enabled, the API server will log a warning to indicate that dynamic loading is active. - -.. 
code-block:: bash - - export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True - - -Loading a LoRA Adapter: - -To dynamically load a LoRA adapter, send a POST request to the `/v1/load_lora_adapter` endpoint with the necessary -details of the adapter to be loaded. The request payload should include the name and path to the LoRA adapter. - -Example request to load a LoRA adapter: - -.. code-block:: bash - - curl -X POST http://localhost:8000/v1/load_lora_adapter \ - -H "Content-Type: application/json" \ - -d '{ - "lora_name": "sql_adapter", - "lora_path": "/path/to/sql-lora-adapter" - }' - -Upon a successful request, the API will respond with a 200 OK status code. If an error occurs, such as if the adapter -cannot be found or loaded, an appropriate error message will be returned. - -Unloading a LoRA Adapter: - -To unload a LoRA adapter that has been previously loaded, send a POST request to the `/v1/unload_lora_adapter` endpoint -with the name or ID of the adapter to be unloaded. - -Example request to unload a LoRA adapter: - -.. code-block:: bash - - curl -X POST http://localhost:8000/v1/unload_lora_adapter \ - -H "Content-Type: application/json" \ - -d '{ - "lora_name": "sql_adapter" - }' - - -New format for `--lora-modules` -------------------------------- - -In the previous version, users would provide LoRA modules via the following format, either as a key-value pair or in JSON format. For example: - -.. code-block:: bash - - --lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/ - -This would only include the `name` and `path` for each LoRA module, but did not provide a way to specify a `base_model_name`. -Now, you can specify a base_model_name alongside the name and path using JSON format. For example: - -.. 
code-block:: bash - - --lora-modules '{"name": "sql-lora", "path": "/path/to/lora", "base_model_name": "meta-llama/Llama-2-7b"}' - -To provide the backward compatibility support, you can still use the old key-value format (name=path), but the `base_model_name` will remain unspecified in that case. - - -Lora model lineage in model card --------------------------------- - -The new format of `--lora-modules` is mainly to support the display of parent model information in the model card. Here's an explanation of how your current response supports this: - -- The `parent` field of LoRA model `sql-lora` now links to its base model `meta-llama/Llama-2-7b-hf`. This correctly reflects the hierarchical relationship between the base model and the LoRA adapter. -- The `root` field points to the artifact location of the lora adapter. - -.. code-block:: bash - - $ curl http://localhost:8000/v1/models - - { - "object": "list", - "data": [ - { - "id": "meta-llama/Llama-2-7b-hf", - "object": "model", - "created": 1715644056, - "owned_by": "vllm", - "root": "~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/", - "parent": null, - "permission": [ - { - ..... - } - ] - }, - { - "id": "sql-lora", - "object": "model", - "created": 1715644056, - "owned_by": "vllm", - "root": "~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/", - "parent": meta-llama/Llama-2-7b-hf, - "permission": [ - { - .... - } - ] - } - ] - } diff --git a/docs/source/usage/multimodal_inputs.md b/docs/source/usage/multimodal_inputs.md new file mode 100644 index 0000000000000..86a003153b633 --- /dev/null +++ b/docs/source/usage/multimodal_inputs.md @@ -0,0 +1,398 @@ +(multimodal-inputs)= + +# Multimodal Inputs + +This page teaches you how to pass multi-modal inputs to {ref}`multi-modal models ` in vLLM. + +```{note} +We are actively iterating on multi-modal support. 
See [this RFC](https://github.com/vllm-project/vllm/issues/4194) for upcoming changes, +and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) if you have any feedback or feature requests. +``` + +## Offline Inference + +To input multi-modal data, follow this schema in {class}`vllm.inputs.PromptType`: + +- `prompt`: The prompt should follow the format that is documented on HuggingFace. +- `multi_modal_data`: This is a dictionary that follows the schema defined in {class}`vllm.multimodal.MultiModalDataDict`. + +### Image + +You can pass a single image to the {code}`'image'` field of the multi-modal dictionary, as shown in the following examples: + +```python +llm = LLM(model="llava-hf/llava-1.5-7b-hf") + +# Refer to the HuggingFace repo for the correct format to use +prompt = "USER: \nWhat is the content of this image?\nASSISTANT:" + +# Load the image using PIL.Image +image = PIL.Image.open(...) + +# Single prompt inference +outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": {"image": image}, +}) + +for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + +# Batch inference +image_1 = PIL.Image.open(...) +image_2 = PIL.Image.open(...) +outputs = llm.generate( + [ + { + "prompt": "USER: \nWhat is the content of this image?\nASSISTANT:", + "multi_modal_data": {"image": image_1}, + }, + { + "prompt": "USER: \nWhat's the color of this image?\nASSISTANT:", + "multi_modal_data": {"image": image_2}, + } + ] +) + +for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) +``` + +A code example can be found in [examples/offline_inference_vision_language.py](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language.py). 
+ +To substitute multiple images inside the same text prompt, you can pass in a list of images instead: + +```python +llm = LLM( + model="microsoft/Phi-3.5-vision-instruct", + trust_remote_code=True, # Required to load Phi-3.5-vision + max_model_len=4096, # Otherwise, it may not fit in smaller GPUs + limit_mm_per_prompt={"image": 2}, # The maximum number to accept +) + +# Refer to the HuggingFace repo for the correct format to use +prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n" + +# Load the images using PIL.Image +image1 = PIL.Image.open(...) +image2 = PIL.Image.open(...) + +outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": { + "image": [image1, image2] + }, +}) + +for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) +``` + +A code example can be found in [examples/offline_inference_vision_language_multi_image.py](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language_multi_image.py). + +Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos: + +```python +# Specify the maximum number of frames per video to be 4. This can be changed. +llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4}) + +# Create the request payload. +video_frames = ... # load your video making sure it only has the number of frames specified earlier. +message = { + "role": "user", + "content": [ + {"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."}, + ], +} +for i in range(len(video_frames)): + base64_image = encode_image(video_frames[i]) # base64 encoding. + new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}} + message["content"].append(new_image) + +# Perform inference and log output. 
+outputs = llm.chat([message]) + +for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) +``` + +### Video + +You can pass a list of NumPy arrays directly to the {code}`'video'` field of the multi-modal dictionary +instead of using multi-image input. + +Please refer to [examples/offline_inference_vision_language.py](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language.py) for more details. + +### Audio + +You can pass a tuple {code}`(array, sampling_rate)` to the {code}`'audio'` field of the multi-modal dictionary. + +Please refer to [examples/offline_inference_audio_language.py](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_audio_language.py) for more details. + +### Embedding + +To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model, +pass a tensor of shape {code}`(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary. + +```python +# Inference with image embeddings as input +llm = LLM(model="llava-hf/llava-1.5-7b-hf") + +# Refer to the HuggingFace repo for the correct format to use +prompt = "USER: \nWhat is the content of this image?\nASSISTANT:" + +# Embeddings for single image +# torch.Tensor of shape (1, image_feature_size, hidden_size of LM) +image_embeds = torch.load(...) + +outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": {"image": image_embeds}, +}) + +for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) +``` + +For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embeddings: + +```python +# Construct the prompt based on your model +prompt = ... + +# Embeddings for multiple images +# torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM) +image_embeds = torch.load(...) 
+ +# Qwen2-VL +llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4}) +mm_data = { + "image": { + "image_embeds": image_embeds, + # image_grid_thw is needed to calculate positional encoding. + "image_grid_thw": torch.load(...), # torch.Tensor of shape (1, 3), + } +} + +# MiniCPM-V +llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4}) +mm_data = { + "image": { + "image_embeds": image_embeds, + # image_size_list is needed to calculate details of the sliced image. + "image_size_list": [image.size for image in images], # list of image sizes + } +} + +outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": mm_data, +}) + +for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) +``` + +## Online Inference + +Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat). + +```{important} +A chat template is **required** to use Chat Completions API. + +Although most models come with a chat template, for others you have to define one yourself. +The chat template can be inferred based on the documentation on the model's HuggingFace repo. +For example, LLaVA-1.5 (`llava-hf/llava-1.5-7b-hf`) requires a chat template that can be found [here](https://github.com/vllm-project/vllm/blob/main/examples/template_llava.jinja). +``` + +### Image + +Image input is supported according to [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). +Here is a simple example using Phi-3.5-Vision. 
+ +First, launch the OpenAI-compatible server: + +```bash +vllm serve microsoft/Phi-3.5-vision-instruct --task generate \ + --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2 +``` + +Then, you can use the OpenAI client as follows: + +```python +from openai import OpenAI + +openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + +client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, +) + +# Single-image input inference +image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + +chat_response = client.chat.completions.create( + model="microsoft/Phi-3.5-vision-instruct", + messages=[{ + "role": "user", + "content": [ + # NOTE: The prompt formatting with the image token `` is not needed + # since the prompt will be processed automatically by the API server. + {"type": "text", "text": "What’s in this image?"}, + {"type": "image_url", "image_url": {"url": image_url}}, + ], + }], +) +print("Chat completion output:", chat_response.choices[0].message.content) + +# Multi-image input inference +image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg" +image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg" + +chat_response = client.chat.completions.create( + model="microsoft/Phi-3.5-vision-instruct", + messages=[{ + "role": "user", + "content": [ + {"type": "text", "text": "What are the animals in these images?"}, + {"type": "image_url", "image_url": {"url": image_url_duck}}, + {"type": "image_url", "image_url": {"url": image_url_lion}}, + ], + }], +) +print("Chat completion output:", chat_response.choices[0].message.content) +``` + +A full code example can be found in 
[examples/openai_chat_completion_client_for_multimodal.py](https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_completion_client_for_multimodal.py). + +```{tip} +Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine, +and pass the file path as `url` in the API request. +``` + +```{tip} +There is no need to place image placeholders in the text content of the API request - they are already represented by the image content. +In fact, you can place image placeholders in the middle of the text by interleaving text and image content. +``` + +````{note} +By default, the timeout for fetching images through HTTP URL is `5` seconds. +You can override this by setting the environment variable: + +```console +$ export VLLM_IMAGE_FETCH_TIMEOUT=<timeout> +``` +```` + +### Video + +Instead of {code}`image_url`, you can pass a video file via {code}`video_url`. + +You can use [these tests](https://github.com/vllm-project/vllm/blob/main/tests/entrypoints/openai/test_video.py) as reference. + +````{note} +By default, the timeout for fetching videos through HTTP URL is `30` seconds. +You can override this by setting the environment variable: + +```console +$ export VLLM_VIDEO_FETCH_TIMEOUT=<timeout> +``` +```` + +### Audio + +Instead of {code}`image_url`, you can pass an audio file via {code}`audio_url`. + +A full code example can be found in [examples/openai_chat_completion_client_for_multimodal.py](https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_completion_client_for_multimodal.py). + +````{note} +By default, the timeout for fetching audios through HTTP URL is `10` seconds.
+You can override this by setting the environment variable: + +```console +$ export VLLM_AUDIO_FETCH_TIMEOUT=<timeout> +``` +```` + +### Embedding + +vLLM's Embeddings API is a superset of OpenAI's [Embeddings API](https://platform.openai.com/docs/api-reference/embeddings), +where a list of chat `messages` can be passed instead of batched `inputs`. This enables multi-modal inputs to be passed to embedding models. + +```{tip} +The schema of `messages` is exactly the same as in Chat Completions API. +You can refer to the above tutorials for more details on how to pass each type of multi-modal data. +``` + +Usually, embedding models do not expect chat-based input, so we need to use a custom chat template to format the text and images. +Refer to the examples below for illustration. + +Here is an end-to-end example using VLM2Vec. To serve the model: + +```bash +vllm serve TIGER-Lab/VLM2Vec-Full --task embedding \ + --trust-remote-code --max-model-len 4096 --chat-template examples/template_vlm2vec.jinja +``` + +```{important} +Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass `--task embedding` +to run this model in embedding mode instead of text generation mode. + +The custom chat template is completely different from the original one for this model, +and can be found [here](https://github.com/vllm-project/vllm/blob/main/examples/template_vlm2vec.jinja).
+``` + +Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library: + +```python +import requests + +image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + +response = requests.post( + "http://localhost:8000/v1/embeddings", + json={ + "model": "TIGER-Lab/VLM2Vec-Full", + "messages": [{ + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "Represent the given image."}, + ], + }], + "encoding_format": "float", + }, +) +response.raise_for_status() +response_json = response.json() +print("Embedding output:", response_json["data"][0]["embedding"]) +``` + +Below is another example, this time using the `MrLight/dse-qwen2-2b-mrl-v1` model. + +```bash +vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embedding \ + --trust-remote-code --max-model-len 8192 --chat-template examples/template_dse_qwen2_vl.jinja +``` + +```{important} +Like with VLM2Vec, we have to explicitly pass `--task embedding`. + +Additionally, `MrLight/dse-qwen2-2b-mrl-v1` requires an EOS token for embeddings, which is handled +by [this custom chat template](https://github.com/vllm-project/vllm/blob/main/examples/template_dse_qwen2_vl.jinja). +``` + +```{important} +Also important, `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code +example below for details. +``` + +A full code example can be found in [examples/openai_chat_embedding_client_for_multimodal.py](https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_embedding_client_for_multimodal.py). 
diff --git a/docs/source/usage/multimodal_inputs.rst b/docs/source/usage/multimodal_inputs.rst deleted file mode 100644 index c93f65327e31b..0000000000000 --- a/docs/source/usage/multimodal_inputs.rst +++ /dev/null @@ -1,404 +0,0 @@ -.. _multimodal_inputs: - -Multimodal Inputs -================= - -This page teaches you how to pass multi-modal inputs to :ref:`multi-modal models ` in vLLM. - -.. note:: - We are actively iterating on multi-modal support. See `this RFC `_ for upcoming changes, - and `open an issue on GitHub `_ if you have any feedback or feature requests. - -Offline Inference ------------------ - -To input multi-modal data, follow this schema in :class:`vllm.inputs.PromptType`: - -* ``prompt``: The prompt should follow the format that is documented on HuggingFace. -* ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`vllm.multimodal.MultiModalDataDict`. - -Image -^^^^^ - -You can pass a single image to the :code:`'image'` field of the multi-modal dictionary, as shown in the following examples: - -.. code-block:: python - - llm = LLM(model="llava-hf/llava-1.5-7b-hf") - - # Refer to the HuggingFace repo for the correct format to use - prompt = "USER: \nWhat is the content of this image?\nASSISTANT:" - - # Load the image using PIL.Image - image = PIL.Image.open(...) - - # Single prompt inference - outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": {"image": image}, - }) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - - # Batch inference - image_1 = PIL.Image.open(...) - image_2 = PIL.Image.open(...) 
- outputs = llm.generate( - [ - { - "prompt": "USER: \nWhat is the content of this image?\nASSISTANT:", - "multi_modal_data": {"image": image_1}, - }, - { - "prompt": "USER: \nWhat's the color of this image?\nASSISTANT:", - "multi_modal_data": {"image": image_2}, - } - ] - ) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - -A code example can be found in `examples/offline_inference_vision_language.py `_. - -To substitute multiple images inside the same text prompt, you can pass in a list of images instead: - -.. code-block:: python - - llm = LLM( - model="microsoft/Phi-3.5-vision-instruct", - trust_remote_code=True, # Required to load Phi-3.5-vision - max_model_len=4096, # Otherwise, it may not fit in smaller GPUs - limit_mm_per_prompt={"image": 2}, # The maximum number to accept - ) - - # Refer to the HuggingFace repo for the correct format to use - prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n" - - # Load the images using PIL.Image - image1 = PIL.Image.open(...) - image2 = PIL.Image.open(...) - - outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": { - "image": [image1, image2] - }, - }) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - -A code example can be found in `examples/offline_inference_vision_language_multi_image.py `_. - -Multi-image input can be extended to perform video captioning. We show this with `Qwen2-VL `_ as it supports videos: - -.. code-block:: python - - # Specify the maximum number of frames per video to be 4. This can be changed. - llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4}) - - # Create the request payload. - video_frames = ... # load your video making sure it only has the number of frames specified earlier. - message = { - "role": "user", - "content": [ - {"type": "text", "text": "Describe this set of frames. 
Consider the frames to be a part of the same video."}, - ], - } - for i in range(len(video_frames)): - base64_image = encode_image(video_frames[i]) # base64 encoding. - new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}} - message["content"].append(new_image) - - # Perform inference and log output. - outputs = llm.chat([message]) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - -Video -^^^^^ - -You can pass a list of NumPy arrays directly to the :code:`'video'` field of the multi-modal dictionary -instead of using multi-image input. - -Please refer to `examples/offline_inference_vision_language.py `_ for more details. - -Audio -^^^^^ - -You can pass a tuple :code:`(array, sampling_rate)` to the :code:`'audio'` field of the multi-modal dictionary. - -Please refer to `examples/offline_inference_audio_language.py `_ for more details. - -Embedding -^^^^^^^^^ - -To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model, -pass a tensor of shape :code:`(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary. - -.. code-block:: python - - # Inference with image embeddings as input - llm = LLM(model="llava-hf/llava-1.5-7b-hf") - - # Refer to the HuggingFace repo for the correct format to use - prompt = "USER: \nWhat is the content of this image?\nASSISTANT:" - - # Embeddings for single image - # torch.Tensor of shape (1, image_feature_size, hidden_size of LM) - image_embeds = torch.load(...) - - outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": {"image": image_embeds}, - }) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - -For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embeddings: - -.. code-block:: python - - # Construct the prompt based on your model - prompt = ... 
- - # Embeddings for multiple images - # torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM) - image_embeds = torch.load(...) - - # Qwen2-VL - llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4}) - mm_data = { - "image": { - "image_embeds": image_embeds, - # image_grid_thw is needed to calculate positional encoding. - "image_grid_thw": torch.load(...), # torch.Tensor of shape (1, 3), - } - } - - # MiniCPM-V - llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4}) - mm_data = { - "image": { - "image_embeds": image_embeds, - # image_size_list is needed to calculate details of the sliced image. - "image_size_list": [image.size for image in images], # list of image sizes - } - } - - outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": mm_data, - }) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - -Online Inference ----------------- - -Our OpenAI-compatible server accepts multi-modal data via the `Chat Completions API `_. - -.. important:: - A chat template is **required** to use Chat Completions API. - - Although most models come with a chat template, for others you have to define one yourself. - The chat template can be inferred based on the documentation on the model's HuggingFace repo. - For example, LLaVA-1.5 (``llava-hf/llava-1.5-7b-hf``) requires a chat template that can be found `here `__. - -Image -^^^^^ - -Image input is supported according to `OpenAI Vision API `_. -Here is a simple example using Phi-3.5-Vision. - -First, launch the OpenAI-compatible server: - -.. code-block:: bash - - vllm serve microsoft/Phi-3.5-vision-instruct --task generate \ - --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2 - -Then, you can use the OpenAI client as follows: - -.. 
code-block:: python - - from openai import OpenAI - - openai_api_key = "EMPTY" - openai_api_base = "http://localhost:8000/v1" - - client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, - ) - - # Single-image input inference - image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" - - chat_response = client.chat.completions.create( - model="microsoft/Phi-3.5-vision-instruct", - messages=[{ - "role": "user", - "content": [ - # NOTE: The prompt formatting with the image token `` is not needed - # since the prompt will be processed automatically by the API server. - {"type": "text", "text": "What’s in this image?"}, - {"type": "image_url", "image_url": {"url": image_url}}, - ], - }], - ) - print("Chat completion output:", chat_response.choices[0].message.content) - - # Multi-image input inference - image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg" - image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg" - - chat_response = client.chat.completions.create( - model="microsoft/Phi-3.5-vision-instruct", - messages=[{ - "role": "user", - "content": [ - {"type": "text", "text": "What are the animals in these images?"}, - {"type": "image_url", "image_url": {"url": image_url_duck}}, - {"type": "image_url", "image_url": {"url": image_url_lion}}, - ], - }], - ) - print("Chat completion output:", chat_response.choices[0].message.content) - -A full code example can be found in `examples/openai_chat_completion_client_for_multimodal.py `_. - -.. tip:: - Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via ``--allowed-local-media-path`` when launching the API server/engine, - and pass the file path as ``url`` in the API request. 
- -.. tip:: - There is no need to place image placeholders in the text content of the API request - they are already represented by the image content. - In fact, you can place image placeholders in the middle of the text by interleaving text and image content. - -.. note:: - - By default, the timeout for fetching images through HTTP URL is ``5`` seconds. - You can override this by setting the environment variable: - - .. code-block:: console - - $ export VLLM_IMAGE_FETCH_TIMEOUT= - -Video -^^^^^ - -Instead of :code:`image_url`, you can pass a video file via :code:`video_url`. - -You can use `these tests `_ as reference. - -.. note:: - - By default, the timeout for fetching videos through HTTP URL url is ``30`` seconds. - You can override this by setting the environment variable: - - .. code-block:: console - - $ export VLLM_VIDEO_FETCH_TIMEOUT= - -Audio -^^^^^ - -Instead of :code:`image_url`, you can pass an audio file via :code:`audio_url`. - -A full code example can be found in `examples/openai_chat_completion_client_for_multimodal.py `_. - -.. note:: - - By default, the timeout for fetching audios through HTTP URL is ``10`` seconds. - You can override this by setting the environment variable: - - .. code-block:: console - - $ export VLLM_AUDIO_FETCH_TIMEOUT= - -Embedding -^^^^^^^^^ - -vLLM's Embeddings API is a superset of OpenAI's `Embeddings API `_, -where a list of chat ``messages`` can be passed instead of batched ``inputs``. This enables multi-modal inputs to be passed to embedding models. - -.. tip:: - The schema of ``messages`` is exactly the same as in Chat Completions API. - You can refer to the above tutorials for more details on how to pass each type of multi-modal data. - -Usually, embedding models do not expect chat-based input, so we need to use a custom chat template to format the text and images. -Refer to the examples below for illustration. - -Here is an end-to-end example using VLM2Vec. To serve the model: - -.. 
code-block:: bash - - vllm serve TIGER-Lab/VLM2Vec-Full --task embedding \ - --trust-remote-code --max-model-len 4096 --chat-template examples/template_vlm2vec.jinja - -.. important:: - - Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass ``--task embedding`` - to run this model in embedding mode instead of text generation mode. - - The custom chat template is completely different from the original one for this model, - and can be found `here `__. - -Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level ``requests`` library: - -.. code-block:: python - - import requests - - image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" - - response = requests.post( - "http://localhost:8000/v1/embeddings", - json={ - "model": "TIGER-Lab/VLM2Vec-Full", - "messages": [{ - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": image_url}}, - {"type": "text", "text": "Represent the given image."}, - ], - }], - "encoding_format": "float", - }, - ) - response.raise_for_status() - response_json = response.json() - print("Embedding output:", response_json["data"][0]["embedding"]) - -Below is another example, this time using the ``MrLight/dse-qwen2-2b-mrl-v1`` model. - -.. code-block:: bash - - vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embedding \ - --trust-remote-code --max-model-len 8192 --chat-template examples/template_dse_qwen2_vl.jinja - -.. important:: - - Like with VLM2Vec, we have to explicitly pass ``--task embedding``. - - Additionally, ``MrLight/dse-qwen2-2b-mrl-v1`` requires an EOS token for embeddings, which is handled - by `this custom chat template `__. - -.. important:: - - Also important, ``MrLight/dse-qwen2-2b-mrl-v1`` requires a placeholder image of the minimum image size for text query embeddings. 
See the full code - example below for details. - -A full code example can be found in `examples/openai_chat_embedding_client_for_multimodal.py `_. diff --git a/docs/source/usage/performance.rst b/docs/source/usage/performance.md similarity index 61% rename from docs/source/usage/performance.rst rename to docs/source/usage/performance.md index 23b5ab79a7378..4a4b0d52f9eb1 100644 --- a/docs/source/usage/performance.rst +++ b/docs/source/usage/performance.md @@ -1,17 +1,16 @@ -.. _performance: +(performance)= -Performance and Tuning -====================== +# Performance and Tuning + +## Preemption -Preemption ----------- Due to the auto-regressive nature of transformer architecture, there are times when KV cache space is insufficient to handle all batched requests. The vLLM can preempt requests to free up KV cache space for other requests. Preempted requests are recomputed when sufficient KV cache space becomes available again. When this occurs, the following warning is printed: -``` +`` ` WARNING 05-09 00:49:33 scheduler.py:1057] Sequence group 0 is preempted by PreemptionMode.SWAP mode because there is not enough KV cache space. This can affect the end-to-end performance. Increase gpu_memory_utilization or tensor_parallel_size to provide more KV cache memory. total_cumulative_preemption_cnt=1 -``` +` `` While this mechanism ensures system robustness, preemption and recomputation can adversely affect end-to-end latency. If you frequently encounter preemptions from the vLLM engine, consider the following actions: @@ -22,44 +21,44 @@ If you frequently encounter preemptions from the vLLM engine, consider the follo You can also monitor the number of preemption requests through Prometheus metrics exposed by the vLLM. Additionally, you can log the cumulative number of preemption requests by setting disable_log_stats=False. -.. _chunked-prefill: +(chunked-prefill)= -Chunked Prefill ---------------- -vLLM supports an experimental feature chunked prefill. 
Chunked prefill allows to chunk large prefills into smaller chunks and batch them together with decode requests. +## Chunked Prefill -You can enable the feature by specifying ``--enable-chunked-prefill`` in the command line or setting ``enable_chunked_prefill=True`` in the LLM constructor. +vLLM supports an experimental feature chunked prefill. Chunked prefill allows to chunk large prefills into smaller chunks and batch them together with decode requests. -.. code-block:: python +You can enable the feature by specifying `--enable-chunked-prefill` in the command line or setting `enable_chunked_prefill=True` in the LLM constructor. - llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True) - # Set max_num_batched_tokens to tune performance. - # NOTE: 512 is the default max_num_batched_tokens for chunked prefill. - # llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True, max_num_batched_tokens=512) +```python +llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True) +# Set max_num_batched_tokens to tune performance. +# NOTE: 512 is the default max_num_batched_tokens for chunked prefill. +# llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True, max_num_batched_tokens=512) +``` By default, vLLM scheduler prioritizes prefills and doesn't batch prefill and decode to the same batch. This policy optimizes the TTFT (time to the first token), but incurs slower ITL (inter token latency) and inefficient GPU utilization. Once chunked prefill is enabled, the policy is changed to prioritize decode requests. It batches all pending decode requests to the batch before scheduling any prefill. -When there are available token_budget (``max_num_batched_tokens``), it schedules pending prefills. -If a last pending prefill request cannot fit into ``max_num_batched_tokens``, it chunks it. +When there are available token_budget (`max_num_batched_tokens`), it schedules pending prefills. 
+If a last pending prefill request cannot fit into `max_num_batched_tokens`, it chunks it. This policy has two benefits: - It improves ITL and generation decode because decode requests are prioritized. - It helps achieve better GPU utilization by locating compute-bound (prefill) and memory-bound (decode) requests to the same batch. -You can tune the performance by changing ``max_num_batched_tokens``. +You can tune the performance by changing `max_num_batched_tokens`. By default, it is set to 512, which has the best ITL on A100 in the initial benchmark (llama 70B and mixtral 8x22B). -Smaller ``max_num_batched_tokens`` achieves better ITL because there are fewer prefills interrupting decodes. -Higher ``max_num_batched_tokens`` achieves better TTFT as you can put more prefill to the batch. +Smaller `max_num_batched_tokens` achieves better ITL because there are fewer prefills interrupting decodes. +Higher `max_num_batched_tokens` achieves better TTFT as you can put more prefill to the batch. -- If ``max_num_batched_tokens`` is the same as ``max_model_len``, that's almost the equivalent to the default scheduling policy (except that it still prioritizes decodes). -- Note that the default value (512) of ``max_num_batched_tokens`` is optimized for ITL, and it may have lower throughput than the default scheduler. +- If `max_num_batched_tokens` is the same as `max_model_len`, that's almost the equivalent to the default scheduling policy (except that it still prioritizes decodes). +- Note that the default value (512) of `max_num_batched_tokens` is optimized for ITL, and it may have lower throughput than the default scheduler. -We recommend you set ``max_num_batched_tokens > 2048`` for throughput. +We recommend you set `max_num_batched_tokens > 2048` for throughput. -See related papers for more details (https://arxiv.org/pdf/2401.08671 or https://arxiv.org/pdf/2308.16369). +See related papers for more details (<https://arxiv.org/pdf/2401.08671> or <https://arxiv.org/pdf/2308.16369>).
-Please try out this feature and let us know your feedback via GitHub issues! \ No newline at end of file +Please try out this feature and let us know your feedback via GitHub issues! diff --git a/docs/source/usage/spec_decode.md b/docs/source/usage/spec_decode.md new file mode 100644 index 0000000000000..77e35c437de30 --- /dev/null +++ b/docs/source/usage/spec_decode.md @@ -0,0 +1,205 @@ +(spec-decode)= + +# Speculative decoding + +```{warning} +Please note that speculative decoding in vLLM is not yet optimized and does +not usually yield inter-token latency reductions for all prompt datasets or sampling parameters. The work +to optimize it is ongoing and can be followed in [this issue.](https://github.com/vllm-project/vllm/issues/4630) +``` + +```{warning} +Currently, speculative decoding in vLLM is not compatible with pipeline parallelism. +``` + +This document shows how to use [Speculative Decoding](https://x.com/karpathy/status/1697318534555336961) with vLLM. +Speculative decoding is a technique which improves inter-token latency in memory-bound LLM inference. + +## Speculating with a draft model + +The following code configures vLLM in an offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time. 
+ +```python +from vllm import LLM, SamplingParams + +prompts = [ + "The future of AI is", +] +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +llm = LLM( + model="facebook/opt-6.7b", + tensor_parallel_size=1, + speculative_model="facebook/opt-125m", + num_speculative_tokens=5, +) +outputs = llm.generate(prompts, sampling_params) + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` + +To perform the same with an online mode launch the server: + +```bash +python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model facebook/opt-6.7b \ + --seed 42 -tp 1 --speculative_model facebook/opt-125m --use-v2-block-manager \ + --num_speculative_tokens 5 --gpu_memory_utilization 0.8 +``` + +Then use a client: + +```python +from openai import OpenAI + +# Modify OpenAI's API key and API base to use vLLM's API server. +openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + +client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key=openai_api_key, + base_url=openai_api_base, +) + +models = client.models.list() +model = models.data[0].id + +# Completion API +stream = False +completion = client.completions.create( + model=model, + prompt="The future of AI is", + echo=False, + n=1, + stream=stream, +) + +print("Completion results:") +if stream: + for c in completion: + print(c) +else: + print(completion) +``` + +## Speculating by matching n-grams in the prompt + +The following code configures vLLM to use speculative decoding where proposals are generated by +matching n-grams in the prompt. 
For more information read [this thread.](https://x.com/joao_gante/status/1747322413006643259) + +```python +from vllm import LLM, SamplingParams + +prompts = [ + "The future of AI is", +] +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +llm = LLM( + model="facebook/opt-6.7b", + tensor_parallel_size=1, + speculative_model="[ngram]", + num_speculative_tokens=5, + ngram_prompt_lookup_max=4, +) +outputs = llm.generate(prompts, sampling_params) + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` + +## Speculating using MLP speculators + +The following code configures vLLM to use speculative decoding where proposals are generated by +draft models that condition draft predictions on both context vectors and sampled tokens. +For more information see [this blog](https://pytorch.org/blog/hitchhikers-guide-speculative-decoding/) or +[this technical report](https://arxiv.org/abs/2404.19124). + +```python +from vllm import LLM, SamplingParams + +prompts = [ + "The future of AI is", +] +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +llm = LLM( + model="meta-llama/Meta-Llama-3.1-70B-Instruct", + tensor_parallel_size=4, + speculative_model="ibm-fms/llama3-70b-accelerator", + speculative_draft_tensor_parallel_size=1, +) +outputs = llm.generate(prompts, sampling_params) + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` + +Note that these speculative models currently need to be run without tensor parallelism, although +it is possible to run the main model using tensor parallelism (see example above). Since the +speculative models are relatively small, we still see significant speedups. However, this +limitation will be fixed in a future release.
+ +A variety of speculative models of this type are available on HF hub: + +- [llama-13b-accelerator](https://huggingface.co/ibm-fms/llama-13b-accelerator) +- [llama3-8b-accelerator](https://huggingface.co/ibm-fms/llama3-8b-accelerator) +- [codellama-34b-accelerator](https://huggingface.co/ibm-fms/codellama-34b-accelerator) +- [llama2-70b-accelerator](https://huggingface.co/ibm-fms/llama2-70b-accelerator) +- [llama3-70b-accelerator](https://huggingface.co/ibm-fms/llama3-70b-accelerator) +- [granite-3b-code-instruct-accelerator](https://huggingface.co/ibm-granite/granite-3b-code-instruct-accelerator) +- [granite-8b-code-instruct-accelerator](https://huggingface.co/ibm-granite/granite-8b-code-instruct-accelerator) +- [granite-7b-instruct-accelerator](https://huggingface.co/ibm-granite/granite-7b-instruct-accelerator) +- [granite-20b-code-instruct-accelerator](https://huggingface.co/ibm-granite/granite-20b-code-instruct-accelerator) + +## Lossless guarantees of Speculative Decoding + +In vLLM, speculative decoding aims to enhance inference efficiency while maintaining accuracy. This section addresses the lossless guarantees of +speculative decoding, breaking down the guarantees into three key areas: + +1. **Theoretical Losslessness** + \- Speculative decoding sampling is theoretically lossless up to the precision limits of hardware numerics. Floating-point errors might + cause slight variations in output distributions, as discussed + in [Accelerating Large Language Model Decoding with Speculative Sampling](https://arxiv.org/pdf/2302.01318) + +2. **Algorithmic Losslessness** + \- vLLM’s implementation of speculative decoding is algorithmically validated to be lossless. Key validation tests include: + + > - **Rejection Sampler Convergence**: Ensures that samples from vLLM’s rejection sampler align with the target + > distribution. 
[View Test Code](https://github.com/vllm-project/vllm/blob/47b65a550866c7ffbd076ecb74106714838ce7da/tests/samplers/test_rejection_sampler.py#L252) + > - **Greedy Sampling Equality**: Confirms that greedy sampling with speculative decoding matches greedy sampling + > without it. This verifies that vLLM's speculative decoding framework, when integrated with the vLLM forward pass and the vLLM rejection sampler, + > provides a lossless guarantee. Almost all of the tests in [this directory](https://github.com/vllm-project/vllm/tree/b67ae00cdbbe1a58ffc8ff170f0c8d79044a684a/tests/spec_decode/e2e) + > verify this property using [this assertion implementation](https://github.com/vllm-project/vllm/blob/b67ae00cdbbe1a58ffc8ff170f0c8d79044a684a/tests/spec_decode/e2e/conftest.py#L291) + +3. **vLLM Logprob Stability** + \- vLLM does not currently guarantee stable token log probabilities (logprobs). This can result in different outputs for the + same request across runs. For more details, see the FAQ section + titled *Can the output of a prompt vary across runs in vLLM?* in the {ref}`FAQs <faq>`. + +**Conclusion** + +While vLLM strives to ensure losslessness in speculative decoding, variations in generated outputs with and without speculative decoding +can occur due to the following factors: + +- **Floating-Point Precision**: Differences in hardware numerical precision may lead to slight discrepancies in the output distribution. +- **Batch Size and Numerical Stability**: Changes in batch size may cause variations in logprobs and output probabilities, potentially + due to non-deterministic behavior in batched operations or numerical instability. + +**Mitigation Strategies** + +For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the {ref}`FAQs <faq>`.
+ +## Resources for vLLM contributors + +- [A Hacker's Guide to Speculative Decoding in vLLM](https://www.youtube.com/watch?v=9wNAgpX6z_4) +- [What is Lookahead Scheduling in vLLM?](https://docs.google.com/document/d/1Z9TvqzzBPnh5WHcRwjvK2UEeFeq5zMZb5mFE8jR0HCs/edit#heading=h.1fjfb0donq5a) +- [Information on batch expansion](https://docs.google.com/document/d/1T-JaS2T1NRfdP51qzqpyakoCXxSXTtORppiwaj5asxA/edit#heading=h.kk7dq05lc6q8) +- [Dynamic speculative decoding](https://github.com/vllm-project/vllm/issues/4565) diff --git a/docs/source/usage/spec_decode.rst b/docs/source/usage/spec_decode.rst deleted file mode 100644 index f1f1917f974bb..0000000000000 --- a/docs/source/usage/spec_decode.rst +++ /dev/null @@ -1,210 +0,0 @@ -.. _spec_decode: - -Speculative decoding -==================== - -.. warning:: - Please note that speculative decoding in vLLM is not yet optimized and does - not usually yield inter-token latency reductions for all prompt datasets or sampling parameters. The work - to optimize it is ongoing and can be followed in `this issue. `_ - -.. warning:: - Currently, speculative decoding in vLLM is not compatible with pipeline parallelism. - -This document shows how to use `Speculative Decoding `_ with vLLM. -Speculative decoding is a technique which improves inter-token latency in memory-bound LLM inference. - -Speculating with a draft model ------------------------------- - -The following code configures vLLM in an offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time. - -.. 
code-block:: python - - from vllm import LLM, SamplingParams - - prompts = [ - "The future of AI is", - ] - sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - - llm = LLM( - model="facebook/opt-6.7b", - tensor_parallel_size=1, - speculative_model="facebook/opt-125m", - num_speculative_tokens=5, - ) - outputs = llm.generate(prompts, sampling_params) - - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -To perform the same with an online mode launch the server: - -.. code-block:: bash - - python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model facebook/opt-6.7b \ - --seed 42 -tp 1 --speculative_model facebook/opt-125m --use-v2-block-manager \ - --num_speculative_tokens 5 --gpu_memory_utilization 0.8 - -Then use a client: - -.. code-block:: python - - from openai import OpenAI - - # Modify OpenAI's API key and API base to use vLLM's API server. - openai_api_key = "EMPTY" - openai_api_base = "http://localhost:8000/v1" - - client = OpenAI( - # defaults to os.environ.get("OPENAI_API_KEY") - api_key=openai_api_key, - base_url=openai_api_base, - ) - - models = client.models.list() - model = models.data[0].id - - # Completion API - stream = False - completion = client.completions.create( - model=model, - prompt="The future of AI is", - echo=False, - n=1, - stream=stream, - ) - - print("Completion results:") - if stream: - for c in completion: - print(c) - else: - print(completion) - -Speculating by matching n-grams in the prompt ---------------------------------------------- - -The following code configures vLLM to use speculative decoding where proposals are generated by -matching n-grams in the prompt. For more information read `this thread. `_ - -.. 
code-block:: python - - from vllm import LLM, SamplingParams - - prompts = [ - "The future of AI is", - ] - sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - - llm = LLM( - model="facebook/opt-6.7b", - tensor_parallel_size=1, - speculative_model="[ngram]", - num_speculative_tokens=5, - ngram_prompt_lookup_max=4, - ) - outputs = llm.generate(prompts, sampling_params) - - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -Speculating using MLP speculators ---------------------------------- - -The following code configures vLLM to use speculative decoding where proposals are generated by -draft models that conditioning draft predictions on both context vectors and sampled tokens. -For more information see `this blog `_ or -`this technical report `_. - -.. code-block:: python - - from vllm import LLM, SamplingParams - - prompts = [ - "The future of AI is", - ] - sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - - llm = LLM( - model="meta-llama/Meta-Llama-3.1-70B-Instruct", - tensor_parallel_size=4, - speculative_model="ibm-fms/llama3-70b-accelerator", - speculative_draft_tensor_parallel_size=1, - ) - outputs = llm.generate(prompts, sampling_params) - - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -Note that these speculative models currently need to be run without tensor parallelism, although -it is possible to run the main model using tensor parallelism (see example above). Since the -speculative models are relatively small, we still see significant speedups. However, this -limitation will be fixed in a future release. 
- -A variety of speculative models of this type are available on HF hub: - -* `llama-13b-accelerator `_ -* `llama3-8b-accelerator `_ -* `codellama-34b-accelerator `_ -* `llama2-70b-accelerator `_ -* `llama3-70b-accelerator `_ -* `granite-3b-code-instruct-accelerator `_ -* `granite-8b-code-instruct-accelerator `_ -* `granite-7b-instruct-accelerator `_ -* `granite-20b-code-instruct-accelerator `_ - -Lossless guarantees of Speculative Decoding -------------------------------------------- -In vLLM, speculative decoding aims to enhance inference efficiency while maintaining accuracy. This section addresses the lossless guarantees of -speculative decoding, breaking down the guarantees into three key areas: - -1. **Theoretical Losslessness** - - Speculative decoding sampling is theoretically lossless up to the precision limits of hardware numerics. Floating-point errors might - cause slight variations in output distributions, as discussed - in `Accelerating Large Language Model Decoding with Speculative Sampling `_ - -2. **Algorithmic Losslessness** - - vLLM’s implementation of speculative decoding is algorithmically validated to be lossless. Key validation tests include: - - - **Rejection Sampler Convergence**: Ensures that samples from vLLM’s rejection sampler align with the target - distribution. `View Test Code `_ - - - **Greedy Sampling Equality**: Confirms that greedy sampling with speculative decoding matches greedy sampling - without it. This verifies that vLLM's speculative decoding framework, when integrated with the vLLM forward pass and the vLLM rejection sampler, - provides a lossless guarantee. Almost all of the tests in `this directory `_ - verify this property using `this assertion implementation `_ - -3. **vLLM Logprob Stability** - - vLLM does not currently guarantee stable token log probabilities (logprobs). This can result in different outputs for the - same request across runs. 
For more details, see the FAQ section - titled *Can the output of a prompt vary across runs in vLLM?* in the :ref:`FAQs `. - - -**Conclusion** - -While vLLM strives to ensure losslessness in speculative decoding, variations in generated outputs with and without speculative decoding -can occur due to following factors: - -- **Floating-Point Precision**: Differences in hardware numerical precision may lead to slight discrepancies in the output distribution. - -- **Batch Size and Numerical Stability**: Changes in batch size may cause variations in logprobs and output probabilities, potentially - due to non-deterministic behavior in batched operations or numerical instability. - -**Mitigation Strategies** - -For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the :ref:`FAQs `. - -Resources for vLLM contributors -------------------------------- -* `A Hacker's Guide to Speculative Decoding in vLLM `_ -* `What is Lookahead Scheduling in vLLM? `_ -* `Information on batch expansion `_ -* `Dynamic speculative decoding `_ diff --git a/docs/source/usage/structured_outputs.md b/docs/source/usage/structured_outputs.md new file mode 100644 index 0000000000000..23104817a2711 --- /dev/null +++ b/docs/source/usage/structured_outputs.md @@ -0,0 +1,260 @@ +(structured-outputs)= + +# Structured Outputs + +vLLM supports the generation of structured outputs using [outlines](https://github.com/dottxt-ai/outlines) or [lm-format-enforcer](https://github.com/noamgat/lm-format-enforcer) as backends for the guided decoding. +This document shows you some examples of the different options that are available to generate structured outputs. + +## Online Inference (OpenAI API) + +You can generate structured outputs using the OpenAI's [Completions](https://platform.openai.com/docs/api-reference/completions) and [Chat](https://platform.openai.com/docs/api-reference/chat) API. 
+
+The following parameters are supported, which must be added as extra parameters:
+
+- `guided_choice`: the output will be exactly one of the choices.
+- `guided_regex`: the output will follow the regex pattern.
+- `guided_json`: the output will follow the JSON schema.
+- `guided_grammar`: the output will follow the context free grammar.
+- `guided_whitespace_pattern`: used to override the default whitespace pattern for guided json decoding.
+- `guided_decoding_backend`: used to select the guided decoding backend to use.
+
+You can see the complete list of supported parameters on the [OpenAI Compatible Server](/../serving/openai_compatible_server.html) page.
+
+Now let's see an example for each of the cases, starting with the `guided_choice`, as it's the easiest one:
+
+```python
+from openai import OpenAI
+client = OpenAI(
+    base_url="http://localhost:8000/v1",
+    api_key="-",
+)
+
+completion = client.chat.completions.create(
+    model="Qwen/Qwen2.5-3B-Instruct",
+    messages=[
+        {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
+    ],
+    extra_body={"guided_choice": ["positive", "negative"]},
+)
+print(completion.choices[0].message.content)
+```
+
+The next example shows how to use the `guided_regex`. The idea is to generate an email address, given a simple regex template:
+
+```python
+completion = client.chat.completions.create(
+    model="Qwen/Qwen2.5-3B-Instruct",
+    messages=[
+        {
+            "role": "user",
+            "content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n",
+        }
+    ],
+    extra_body={"guided_regex": "\w+@\w+\.com\n", "stop": ["\n"]},
+)
+print(completion.choices[0].message.content)
+```
+
+One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats.
+
+For this we can use the `guided_json` parameter in two different ways:
+
+- Using directly a [JSON Schema](https://json-schema.org/)
+- Defining a [Pydantic model](https://docs.pydantic.dev/latest/) and then extracting the JSON Schema from it (which is normally an easier option).
+
+The next example shows how to use the `guided_json` parameter with a Pydantic model:
+
+```python
+from pydantic import BaseModel
+from enum import Enum
+
+class CarType(str, Enum):
+    sedan = "sedan"
+    suv = "SUV"
+    truck = "Truck"
+    coupe = "Coupe"
+
+
+class CarDescription(BaseModel):
+    brand: str
+    model: str
+    car_type: CarType
+
+
+json_schema = CarDescription.model_json_schema()
+
+completion = client.chat.completions.create(
+    model="Qwen/Qwen2.5-3B-Instruct",
+    messages=[
+        {
+            "role": "user",
+            "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's",
+        }
+    ],
+    extra_body={"guided_json": json_schema},
+)
+print(completion.choices[0].message.content)
+```
+
+```{tip}
+While not strictly necessary, normally it's better to indicate in the prompt that a JSON needs to be generated and which fields and how the LLM should fill them.
+This can improve the results notably in most cases.
+```
+
+Finally we have the `guided_grammar`, which probably is the most difficult one to use but it's really powerful, as it allows us to define complete languages like SQL queries.
+It works by using a context free EBNF grammar, which for example we can use to define a specific format of simplified SQL queries, like in the example below: + +```python +simplified_sql_grammar = """ + ?start: select_statement + + ?select_statement: "SELECT " column_list " FROM " table_name + + ?column_list: column_name ("," column_name)* + + ?table_name: identifier + + ?column_name: identifier + + ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/ +""" + +completion = client.chat.completions.create( + model="Qwen/Qwen2.5-3B-Instruct", + messages=[ + { + "role": "user", + "content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.", + } + ], + extra_body={"guided_grammar": simplified_sql_grammar}, +) +print(completion.choices[0].message.content) +``` + +The complete code of the examples can be found on [examples/openai_chat_completion_structured_outputs.py](https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_completion_structured_outputs.py). + +## Experimental Automatic Parsing (OpenAI API) + +This section covers the OpenAI beta wrapper over the `client.chat.completions.create()` method that provides richer integrations with Python specific types. + +At the time of writing (`openai==1.54.4`), this is a "beta" feature in the OpenAI client library. Code reference can be found [here](https://github.com/openai/openai-python/blob/52357cff50bee57ef442e94d78a0de38b4173fc2/src/openai/resources/beta/chat/completions.py#L100-L104). 
+
+For the following examples, vLLM was set up using `vllm serve meta-llama/Llama-3.1-8B-Instruct`
+
+Here is a simple example demonstrating how to get structured output using Pydantic models:
+
+```python
+from pydantic import BaseModel
+from openai import OpenAI
+
+
+class Info(BaseModel):
+    name: str
+    age: int
+
+
+client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy")
+completion = client.beta.chat.completions.parse(
+    model="meta-llama/Llama-3.1-8B-Instruct",
+    messages=[
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "My name is Cameron, I'm 28. What's my name and age?"},
+    ],
+    response_format=Info,
+    extra_body=dict(guided_decoding_backend="outlines"),
+)
+
+message = completion.choices[0].message
+print(message)
+assert message.parsed
+print("Name:", message.parsed.name)
+print("Age:", message.parsed.age)
+```
+
+Output:
+
+```console
+ParsedChatCompletionMessage[Info](content='{"name": "Cameron", "age": 28}', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=Info(name='Cameron', age=28))
+Name: Cameron
+Age: 28
+```
+
+Here is a more complex example using nested Pydantic models to handle a step-by-step math solution:
+
+```python
+from typing import List
+from pydantic import BaseModel
+from openai import OpenAI
+
+
+class Step(BaseModel):
+    explanation: str
+    output: str
+
+
+class MathResponse(BaseModel):
+    steps: List[Step]
+    final_answer: str
+
+
+client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy")
+completion = client.beta.chat.completions.parse(
+    model="meta-llama/Llama-3.1-8B-Instruct",
+    messages=[
+        {"role": "system", "content": "You are a helpful expert math tutor."},
+        {"role": "user", "content": "Solve 8x + 31 = 2."},
+    ],
+    response_format=MathResponse,
+    extra_body=dict(guided_decoding_backend="outlines"),
+)
+
+message = completion.choices[0].message
+print(message)
+assert message.parsed
+for i, step in enumerate(message.parsed.steps):
+    print(f"Step #{i}:", step)
+print("Answer:", message.parsed.final_answer)
+```
+
+Output:
+
+```console
+ParsedChatCompletionMessage[MathResponse](content='{ "steps": [{ "explanation": "First, let\'s isolate the term with the variable \'x\'. To do this, we\'ll subtract 31 from both sides of the equation.", "output": "8x + 31 - 31 = 2 - 31"}, { "explanation": "By subtracting 31 from both sides, we simplify the equation to 8x = -29.", "output": "8x = -29"}, { "explanation": "Next, let\'s isolate \'x\' by dividing both sides of the equation by 8.", "output": "8x / 8 = -29 / 8"}], "final_answer": "x = -29/8" }', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=MathResponse(steps=[Step(explanation="First, let's isolate the term with the variable 'x'. To do this, we'll subtract 31 from both sides of the equation.", output='8x + 31 - 31 = 2 - 31'), Step(explanation='By subtracting 31 from both sides, we simplify the equation to 8x = -29.', output='8x = -29'), Step(explanation="Next, let's isolate 'x' by dividing both sides of the equation by 8.", output='8x / 8 = -29 / 8')], final_answer='x = -29/8'))
+Step #0: explanation="First, let's isolate the term with the variable 'x'. To do this, we'll subtract 31 from both sides of the equation." output='8x + 31 - 31 = 2 - 31'
+Step #1: explanation='By subtracting 31 from both sides, we simplify the equation to 8x = -29.' output='8x = -29'
+Step #2: explanation="Next, let's isolate 'x' by dividing both sides of the equation by 8." output='8x / 8 = -29 / 8'
+Answer: x = -29/8
+```
+
+## Offline Inference
+
+Offline inference allows for the same types of guided decoding.
+To use it, we'll need to configure the guided decoding using the class `GuidedDecodingParams` inside `SamplingParams`.
+The main available options inside `GuidedDecodingParams` are: + +- `json` +- `regex` +- `choice` +- `grammar` +- `backend` +- `whitespace_pattern` + +These parameters can be used in the same way as the parameters from the Online Inference examples above. +One example for the usage of the `choices` parameter is shown below: + +```python +from vllm import LLM, SamplingParams +from vllm.sampling_params import GuidedDecodingParams + +llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct") + +guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"]) +sampling_params = SamplingParams(guided_decoding=guided_decoding_params) +outputs = llm.generate( + prompts="Classify this sentiment: vLLM is wonderful!", + sampling_params=sampling_params, +) +print(outputs[0].outputs[0].text) +``` + +A complete example with all options can be found in [examples/offline_inference_structured_outputs.py](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_structured_outputs.py). diff --git a/docs/source/usage/structured_outputs.rst b/docs/source/usage/structured_outputs.rst deleted file mode 100644 index 484e1f17d191e..0000000000000 --- a/docs/source/usage/structured_outputs.rst +++ /dev/null @@ -1,267 +0,0 @@ -.. _structured_outputs: - -Structured Outputs -================== - -vLLM supports the generation of structured outputs using `outlines `_ or `lm-format-enforcer `_ as backends for the guided decoding. -This document shows you some examples of the different options that are available to generate structured outputs. - - -Online Inference (OpenAI API) ------------------------------ - -You can generate structured outputs using the OpenAI's `Completions `_ and `Chat `_ API. - -The following parameters are supported, which must be added as extra parameters: - -- ``guided_choice``: the output will be exactly one of the choices. -- ``guided_regex``: the output will follow the regex pattern. -- ``guided_json``: the output will follow the JSON schema. 
-- ``guided_grammar``: the output will follow the context free grammar. -- ``guided_whitespace_pattern``: used to override the default whitespace pattern for guided json decoding. -- ``guided_decoding_backend``: used to select the guided decoding backend to use. - -You can see the complete list of supported parameters on the `OpenAI Compatible Server `_ page. - -Now let´s see an example for each of the cases, starting with the ``guided_choice``, as it´s the easiest one: - -.. code-block:: python - - from openai import OpenAI - client = OpenAI( - base_url="http://localhost:8000/v1", - api_key="-", - ) - - completion = client.chat.completions.create( - model="Qwen/Qwen2.5-3B-Instruct", - messages=[ - {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} - ], - extra_body={"guided_choice": ["positive", "negative"]}, - ) - print(completion.choices[0].message.content) - - -The next example shows how to use the ``guided_regex``. The idea is to generate an email address, given a simple regex template: - -.. code-block:: python - - completion = client.chat.completions.create( - model="Qwen/Qwen2.5-3B-Instruct", - messages=[ - { - "role": "user", - "content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n", - } - ], - extra_body={"guided_regex": "\w+@\w+\.com\n", "stop": ["\n"]}, - ) - print(completion.choices[0].message.content) - -One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats. -For this we can use the ``guided_json`` parameter in two different ways: - -- Using directly a `JSON Schema `_ -- Defining a `Pydantic model `_ and then extracting the JSON Schema from it (which is normally an easier option). - -The next example shows how to use the ``guided_json`` parameter with a Pydantic model: - -.. 
code-block:: python - - from pydantic import BaseModel - from enum import Enum - - class CarType(str, Enum): - sedan = "sedan" - suv = "SUV" - truck = "Truck" - coupe = "Coupe" - - - class CarDescription(BaseModel): - brand: str - model: str - car_type: CarType - - - json_schema = CarDescription.model_json_schema() - - completion = client.chat.completions.create( - model="Qwen/Qwen2.5-3B-Instruct", - messages=[ - { - "role": "user", - "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's", - } - ], - extra_body={"guided_json": json_schema}, - ) - print(completion.choices[0].message.content) - -.. tip:: - While not strictly necessary, normally it´s better to indicate in the prompt that a JSON needs to be generated and which fields and how should the LLM fill them. - This can improve the results notably in most cases. - - -Finally we have the ``guided_grammar``, which probably is the most difficult one to use but it´s really powerful, as it allows us to define complete languages like SQL queries. -It works by using a context free EBNF grammar, which for example we can use to define a specific format of simplified SQL queries, like in the example below: - -.. code-block:: python - - simplified_sql_grammar = """ - ?start: select_statement - - ?select_statement: "SELECT " column_list " FROM " table_name - - ?column_list: column_name ("," column_name)* - - ?table_name: identifier - - ?column_name: identifier - - ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/ - """ - - completion = client.chat.completions.create( - model="Qwen/Qwen2.5-3B-Instruct", - messages=[ - { - "role": "user", - "content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.", - } - ], - extra_body={"guided_grammar": simplified_sql_grammar}, - ) - print(completion.choices[0].message.content) - -The complete code of the examples can be found on `examples/openai_chat_completion_structured_outputs.py `_. 
- -Experimental Automatic Parsing (OpenAI API) --------------------------------------------- - -This section covers the OpenAI beta wrapper over the ``client.chat.completions.create()`` method that provides richer integrations with Python specific types. - -At the time of writing (``openai==1.54.4``), this is a "beta" feature in the OpenAI client library. Code reference can be found `here `_. - -For the following examples, vLLM was setup using ``vllm serve meta-llama/Llama-3.1-8B-Instruct`` - -Here is a simple example demonstrating how to get structured output using Pydantic models: - -.. code-block:: python - - from pydantic import BaseModel - from openai import OpenAI - - - class Info(BaseModel): - name: str - age: int - - - client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy") - completion = client.beta.chat.completions.parse( - model="meta-llama/Llama-3.1-8B-Instruct", - messages=[ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "My name is Cameron, I'm 28. What's my name and age?"}, - ], - response_format=Info, - extra_body=dict(guided_decoding_backend="outlines"), - ) - - message = completion.choices[0].message - print(message) - assert message.parsed - print("Name:", message.parsed.name) - print("Age:", message.parsed.age) - -Output: - -.. code-block:: console - - ParsedChatCompletionMessage[Testing](content='{"name": "Cameron", "age": 28}', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=Testing(name='Cameron', age=28)) - Name: Cameron - Age: 28 - - -Here is a more complex example using nested Pydantic models to handle a step-by-step math solution: - -.. 
code-block:: python - - from typing import List - from pydantic import BaseModel - from openai import OpenAI - - - class Step(BaseModel): - explanation: str - output: str - - - class MathResponse(BaseModel): - steps: List[Step] - final_answer: str - - - client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy") - completion = client.beta.chat.completions.parse( - model="meta-llama/Llama-3.1-8B-Instruct", - messages=[ - {"role": "system", "content": "You are a helpful expert math tutor."}, - {"role": "user", "content": "Solve 8x + 31 = 2."}, - ], - response_format=MathResponse, - extra_body=dict(guided_decoding_backend="outlines"), - ) - - message = completion.choices[0].message - print(message) - assert message.parsed - for i, step in enumerate(message.parsed.steps): - print(f"Step #{i}:", step) - print("Answer:", message.parsed.final_answer) - -Output: - -.. code-block:: console - - ParsedChatCompletionMessage[MathResponse](content='{ "steps": [{ "explanation": "First, let\'s isolate the term with the variable \'x\'. To do this, we\'ll subtract 31 from both sides of the equation.", "output": "8x + 31 - 31 = 2 - 31"}, { "explanation": "By subtracting 31 from both sides, we simplify the equation to 8x = -29.", "output": "8x = -29"}, { "explanation": "Next, let\'s isolate \'x\' by dividing both sides of the equation by 8.", "output": "8x / 8 = -29 / 8"}], "final_answer": "x = -29/8" }', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=MathResponse(steps=[Step(explanation="First, let's isolate the term with the variable 'x'. 
To do this, we'll subtract 31 from both sides of the equation.", output='8x + 31 - 31 = 2 - 31'), Step(explanation='By subtracting 31 from both sides, we simplify the equation to 8x = -29.', output='8x = -29'), Step(explanation="Next, let's isolate 'x' by dividing both sides of the equation by 8.", output='8x / 8 = -29 / 8')], final_answer='x = -29/8')) - Step #0: explanation="First, let's isolate the term with the variable 'x'. To do this, we'll subtract 31 from both sides of the equation." output='8x + 31 - 31 = 2 - 31' - Step #1: explanation='By subtracting 31 from both sides, we simplify the equation to 8x = -29.' output='8x = -29' - Step #2: explanation="Next, let's isolate 'x' by dividing both sides of the equation by 8." output='8x / 8 = -29 / 8' - Answer: x = -29/8 - -Offline Inference ------------------ - -Offline inference allows for the same types of guided decoding. -To use it, we´ll need to configure the guided decoding using the class ``GuidedDecodingParams`` inside ``SamplingParams``. -The main available options inside ``GuidedDecodingParams`` are: - -- ``json`` -- ``regex`` -- ``choice`` -- ``grammar`` -- ``backend`` -- ``whitespace_pattern`` - -These parameters can be used in the same way as the parameters from the Online Inference examples above. -One example for the usage of the ``choices`` parameter is shown below: - -.. code-block:: python - - from vllm import LLM, SamplingParams - from vllm.sampling_params import GuidedDecodingParams - - llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct") - - guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"]) - sampling_params = SamplingParams(guided_decoding=guided_decoding_params) - outputs = llm.generate( - prompts="Classify this sentiment: vLLM is wonderful!", - sampling_params=sampling_params, - ) - print(outputs[0].outputs[0].text) - -A complete example with all options can be found in `examples/offline_inference_structured_outputs.py `_. 
From df9919c9aabeb6a8366dcb097b446fa11a2eb7a3 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Mon, 16 Dec 2024 14:43:26 -0500 Subject: [PATCH 02/27] Update myst-parser version Signed-off-by: Rafael Vasquez --- docs/requirements-docs.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index ca2da4cd66d2d..4859c8ac08bea 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -1,7 +1,7 @@ sphinx==6.2.1 sphinx-book-theme==1.0.1 sphinx-copybutton==0.5.2 -myst-parser==2.0.0 +myst-parser==3.0.1 sphinx-argparse==0.4.0 msgspec cloudpickle From 10cbcf2a3deacf3f5a6286c56fff1c04ce324e06 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Mon, 16 Dec 2024 14:44:15 -0500 Subject: [PATCH 03/27] Update docs, refs to .rst Signed-off-by: Rafael Vasquez --- .gitignore | 2 + docs/source/conf.py | 2 +- docs/source/generate_examples.py | 6 +- .../getting_started/examples/api_client.md | 12 +-- .../getting_started/examples/aqlm_example.md | 12 +-- .../getting_started/examples/cpu_offload.md | 12 +-- .../examples/examples_index.md | 76 ++++++++++--------- .../examples/florence2_inference.md | 12 +-- .../examples/gguf_inference.md | 12 +-- .../gradio_openai_chatbot_webserver.md | 12 +-- .../examples/gradio_webserver.md | 12 +-- .../examples/llm_engine_example.md | 12 +-- .../lora_with_quantization_inference.md | 12 +-- .../examples/multilora_inference.md | 12 +-- .../examples/offline_chat_with_tools.md | 12 +-- .../examples/offline_inference.md | 12 +-- .../examples/offline_inference_arctic.md | 12 +-- .../offline_inference_audio_language.md | 12 +-- .../examples/offline_inference_chat.md | 12 +-- .../examples/offline_inference_distributed.md | 12 +-- .../examples/offline_inference_embedding.md | 12 +-- .../offline_inference_encoder_decoder.md | 12 +-- .../offline_inference_mlpspeculator.md | 12 +-- .../examples/offline_inference_neuron.md | 12 +-- ...line_inference_neuron_int8_quantization.md 
| 12 +-- .../examples/offline_inference_pixtral.md | 12 +-- .../examples/offline_inference_tpu.md | 12 +-- .../offline_inference_vision_language.md | 12 +-- ...ine_inference_vision_language_embedding.md | 12 +-- ...e_inference_vision_language_multi_image.md | 12 +-- .../examples/offline_inference_with_prefix.md | 12 +-- .../offline_inference_with_profiler.md | 12 +-- .../examples/offline_profile.md | 12 +-- .../examples/openai_chat_completion_client.md | 12 +-- ...i_chat_completion_client_for_multimodal.md | 12 +-- ...penai_chat_completion_client_with_tools.md | 12 +-- ...ai_chat_embedding_client_for_multimodal.md | 12 +-- .../examples/openai_completion_client.md | 12 +-- .../examples/openai_embedding_client.md | 12 +-- .../examples/save_sharded_state.md | 12 +-- .../examples/tensorize_vllm_model.md | 12 +-- .../serving/openai_compatible_server.md | 12 +-- vllm/attention/backends/rocm_flash_attn.py | 2 +- vllm/config.py | 6 +- vllm/engine/arg_utils.py | 2 +- vllm/engine/output_processor/multi_step.py | 2 +- vllm/executor/cpu_executor.py | 2 +- vllm/platforms/cpu.py | 2 +- vllm/spec_decode/spec_decode_worker.py | 2 +- vllm/utils.py | 2 +- vllm/worker/multi_step_model_runner.py | 2 +- vllm/worker/utils.py | 2 +- 52 files changed, 286 insertions(+), 280 deletions(-) diff --git a/.gitignore b/.gitignore index ceef6a5fba456..bb7e4d5b244a8 100644 --- a/.gitignore +++ b/.gitignore @@ -81,6 +81,8 @@ instance/ docs/_build/ docs/source/getting_started/examples/*.rst !**/*.template.rst +docs/source/getting_started/examples/*.md +!**/*.template.md # PyBuilder .pybuilder/ diff --git a/docs/source/conf.py b/docs/source/conf.py index e9d9ac68c9560..6f1d1842fe686 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -51,7 +51,7 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. 
-exclude_patterns: List[str] = ["**/*.template.rst"] +exclude_patterns: List[str] = ["**/*.template.md"] # Exclude the prompt "$" when copying code copybutton_prompt_text = r"\$ " diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py index 79b49a186236a..8e43870246ae6 100644 --- a/docs/source/generate_examples.py +++ b/docs/source/generate_examples.py @@ -38,7 +38,7 @@ def generate_examples(): # Destination paths doc_dir = root_dir / "docs/source/getting_started/examples" - doc_paths = [doc_dir / f"{path.stem}.rst" for path in script_paths] + doc_paths = [doc_dir / f"{path.stem}.md" for path in script_paths] # Generate the example docs for each example script for script_path, doc_path in zip(script_paths, doc_paths): @@ -54,8 +54,8 @@ def generate_examples(): f.write(content) # Generate the toctree for the example scripts - with open(doc_dir / "examples_index.template.rst") as f: + with open(doc_dir / "examples_index.template.md") as f: examples_index = f.read() - with open(doc_dir / "examples_index.rst", "w+") as f: + with open(doc_dir / "examples_index.md", "w+") as f: example_docs = "\n ".join(path.stem for path in script_paths) f.write(examples_index.replace(r"%EXAMPLE_DOCS%", example_docs)) diff --git a/docs/source/getting_started/examples/api_client.md b/docs/source/getting_started/examples/api_client.md index 925f74cdc7eb4..31c984671fb9d 100644 --- a/docs/source/getting_started/examples/api_client.md +++ b/docs/source/getting_started/examples/api_client.md @@ -1,8 +1,8 @@ -# API Client +API Client +========== -Source . +Source https://github.com/vllm-project/vllm/blob/main/examples/api_client.py. -```{literalinclude} ../../../../examples/api_client.py -:language: python -:linenos: true -``` +.. 
literalinclude:: ../../../../examples/api_client.py + :language: python + :linenos: diff --git a/docs/source/getting_started/examples/aqlm_example.md b/docs/source/getting_started/examples/aqlm_example.md index 51ed19a3c61fd..455aff5645459 100644 --- a/docs/source/getting_started/examples/aqlm_example.md +++ b/docs/source/getting_started/examples/aqlm_example.md @@ -1,8 +1,8 @@ -# Aqlm Example +Aqlm Example +============ -Source . +Source https://github.com/vllm-project/vllm/blob/main/examples/aqlm_example.py. -```{literalinclude} ../../../../examples/aqlm_example.py -:language: python -:linenos: true -``` +.. literalinclude:: ../../../../examples/aqlm_example.py + :language: python + :linenos: diff --git a/docs/source/getting_started/examples/cpu_offload.md b/docs/source/getting_started/examples/cpu_offload.md index f5b0d58b743a2..df28306947873 100644 --- a/docs/source/getting_started/examples/cpu_offload.md +++ b/docs/source/getting_started/examples/cpu_offload.md @@ -1,8 +1,8 @@ -# Cpu Offload +Cpu Offload +=========== -Source . +Source https://github.com/vllm-project/vllm/blob/main/examples/cpu_offload.py. -```{literalinclude} ../../../../examples/cpu_offload.py -:language: python -:linenos: true -``` +.. 
literalinclude:: ../../../../examples/cpu_offload.py + :language: python + :linenos: diff --git a/docs/source/getting_started/examples/examples_index.md b/docs/source/getting_started/examples/examples_index.md index d074aae3edb8c..690a557fd9d68 100644 --- a/docs/source/getting_started/examples/examples_index.md +++ b/docs/source/getting_started/examples/examples_index.md @@ -5,40 +5,44 @@ :maxdepth: 1 api_client -aqlm_example -cpu_offload -florence2_inference -gguf_inference -gradio_openai_chatbot_webserver -gradio_webserver -llm_engine_example -lora_with_quantization_inference -multilora_inference -offline_chat_with_tools -offline_inference -offline_inference_arctic -offline_inference_audio_language -offline_inference_chat -offline_inference_distributed -offline_inference_embedding -offline_inference_encoder_decoder -offline_inference_mlpspeculator -offline_inference_neuron -offline_inference_neuron_int8_quantization -offline_inference_pixtral -offline_inference_tpu -offline_inference_vision_language -offline_inference_vision_language_embedding -offline_inference_vision_language_multi_image -offline_inference_with_prefix -offline_inference_with_profiler -offline_profile -openai_chat_completion_client -openai_chat_completion_client_for_multimodal -openai_chat_completion_client_with_tools -openai_chat_embedding_client_for_multimodal -openai_completion_client -openai_embedding_client -save_sharded_state -tensorize_vllm_model + aqlm_example + cpu_offload + florence2_inference + gguf_inference + gradio_openai_chatbot_webserver + gradio_webserver + llm_engine_example + lora_with_quantization_inference + multilora_inference + offline_chat_with_tools + offline_inference + offline_inference_arctic + offline_inference_audio_language + offline_inference_chat + offline_inference_cli + offline_inference_distributed + offline_inference_embedding + offline_inference_encoder_decoder + offline_inference_mlpspeculator + offline_inference_neuron + 
offline_inference_neuron_int8_quantization + offline_inference_pixtral + offline_inference_structured_outputs + offline_inference_tpu + offline_inference_vision_language + offline_inference_vision_language_embedding + offline_inference_vision_language_multi_image + offline_inference_with_prefix + offline_inference_with_profiler + offline_profile + openai_chat_completion_client + openai_chat_completion_client_for_multimodal + openai_chat_completion_client_with_tools + openai_chat_completion_structured_outputs + openai_chat_embedding_client_for_multimodal + openai_completion_client + openai_cross_encoder_score + openai_embedding_client + save_sharded_state + tensorize_vllm_model ``` diff --git a/docs/source/getting_started/examples/florence2_inference.md b/docs/source/getting_started/examples/florence2_inference.md index 3805648736b7c..e8e3a99d6d1b1 100644 --- a/docs/source/getting_started/examples/florence2_inference.md +++ b/docs/source/getting_started/examples/florence2_inference.md @@ -1,8 +1,8 @@ -# Florence2 Inference +Florence2 Inference +=================== -Source . +Source https://github.com/vllm-project/vllm/blob/main/examples/florence2_inference.py. -```{literalinclude} ../../../../examples/florence2_inference.py -:language: python -:linenos: true -``` +.. literalinclude:: ../../../../examples/florence2_inference.py + :language: python + :linenos: diff --git a/docs/source/getting_started/examples/gguf_inference.md b/docs/source/getting_started/examples/gguf_inference.md index 96d6da400f4f6..dbac9dd722ed7 100644 --- a/docs/source/getting_started/examples/gguf_inference.md +++ b/docs/source/getting_started/examples/gguf_inference.md @@ -1,8 +1,8 @@ -# Gguf Inference +Gguf Inference +============== -Source . +Source https://github.com/vllm-project/vllm/blob/main/examples/gguf_inference.py. -```{literalinclude} ../../../../examples/gguf_inference.py -:language: python -:linenos: true -``` +.. 
literalinclude:: ../../../../examples/gguf_inference.py + :language: python + :linenos: diff --git a/docs/source/getting_started/examples/gradio_openai_chatbot_webserver.md b/docs/source/getting_started/examples/gradio_openai_chatbot_webserver.md index 926c10d95efe1..7fca8b17c7660 100644 --- a/docs/source/getting_started/examples/gradio_openai_chatbot_webserver.md +++ b/docs/source/getting_started/examples/gradio_openai_chatbot_webserver.md @@ -1,8 +1,8 @@ -# Gradio OpenAI Chatbot Webserver +Gradio OpenAI Chatbot Webserver +=============================== -Source . +Source https://github.com/vllm-project/vllm/blob/main/examples/gradio_openai_chatbot_webserver.py. -```{literalinclude} ../../../../examples/gradio_openai_chatbot_webserver.py -:language: python -:linenos: true -``` +.. literalinclude:: ../../../../examples/gradio_openai_chatbot_webserver.py + :language: python + :linenos: diff --git a/docs/source/getting_started/examples/gradio_webserver.md b/docs/source/getting_started/examples/gradio_webserver.md index c5f8f7a739da6..7ec4576a301c9 100644 --- a/docs/source/getting_started/examples/gradio_webserver.md +++ b/docs/source/getting_started/examples/gradio_webserver.md @@ -1,8 +1,8 @@ -# Gradio Webserver +Gradio Webserver +================ -Source . +Source https://github.com/vllm-project/vllm/blob/main/examples/gradio_webserver.py. -```{literalinclude} ../../../../examples/gradio_webserver.py -:language: python -:linenos: true -``` +.. literalinclude:: ../../../../examples/gradio_webserver.py + :language: python + :linenos: diff --git a/docs/source/getting_started/examples/llm_engine_example.md b/docs/source/getting_started/examples/llm_engine_example.md index 909b730a5b143..3c5c4f99dcee1 100644 --- a/docs/source/getting_started/examples/llm_engine_example.md +++ b/docs/source/getting_started/examples/llm_engine_example.md @@ -1,8 +1,8 @@ -# LLM Engine Example +LLM Engine Example +================== -Source . 
+Source https://github.com/vllm-project/vllm/blob/main/examples/llm_engine_example.py. -```{literalinclude} ../../../../examples/llm_engine_example.py -:language: python -:linenos: true -``` +.. literalinclude:: ../../../../examples/llm_engine_example.py + :language: python + :linenos: diff --git a/docs/source/getting_started/examples/lora_with_quantization_inference.md b/docs/source/getting_started/examples/lora_with_quantization_inference.md index ba9af0bf3aa1e..313c7914cf224 100644 --- a/docs/source/getting_started/examples/lora_with_quantization_inference.md +++ b/docs/source/getting_started/examples/lora_with_quantization_inference.md @@ -1,8 +1,8 @@ -# Lora With Quantization Inference +Lora With Quantization Inference +================================ -Source . +Source https://github.com/vllm-project/vllm/blob/main/examples/lora_with_quantization_inference.py. -```{literalinclude} ../../../../examples/lora_with_quantization_inference.py -:language: python -:linenos: true -``` +.. literalinclude:: ../../../../examples/lora_with_quantization_inference.py + :language: python + :linenos: diff --git a/docs/source/getting_started/examples/multilora_inference.md b/docs/source/getting_started/examples/multilora_inference.md index 8c5213c238519..f1b4a9587d95b 100644 --- a/docs/source/getting_started/examples/multilora_inference.md +++ b/docs/source/getting_started/examples/multilora_inference.md @@ -1,8 +1,8 @@ -# MultiLoRA Inference +MultiLoRA Inference +=================== -Source . +Source https://github.com/vllm-project/vllm/blob/main/examples/multilora_inference.py. -```{literalinclude} ../../../../examples/multilora_inference.py -:language: python -:linenos: true -``` +.. 
literalinclude:: ../../../../examples/multilora_inference.py + :language: python + :linenos: diff --git a/docs/source/getting_started/examples/offline_chat_with_tools.md b/docs/source/getting_started/examples/offline_chat_with_tools.md index dc126d1efc522..6bd28e2c0a1e5 100644 --- a/docs/source/getting_started/examples/offline_chat_with_tools.md +++ b/docs/source/getting_started/examples/offline_chat_with_tools.md @@ -1,8 +1,8 @@ -# Offline Chat With Tools +Offline Chat With Tools +======================= -Source . +Source https://github.com/vllm-project/vllm/blob/main/examples/offline_chat_with_tools.py. -```{literalinclude} ../../../../examples/offline_chat_with_tools.py -:language: python -:linenos: true -``` +.. literalinclude:: ../../../../examples/offline_chat_with_tools.py + :language: python + :linenos: diff --git a/docs/source/getting_started/examples/offline_inference.md b/docs/source/getting_started/examples/offline_inference.md index 20c8e848089a1..3826ed0bcab44 100644 --- a/docs/source/getting_started/examples/offline_inference.md +++ b/docs/source/getting_started/examples/offline_inference.md @@ -1,8 +1,8 @@ -# Offline Inference +Offline Inference +================= -Source . +Source https://github.com/vllm-project/vllm/blob/main/examples/offline_inference.py. -```{literalinclude} ../../../../examples/offline_inference.py -:language: python -:linenos: true -``` +.. literalinclude:: ../../../../examples/offline_inference.py + :language: python + :linenos: diff --git a/docs/source/getting_started/examples/offline_inference_arctic.md b/docs/source/getting_started/examples/offline_inference_arctic.md index 4ec02315dc0b1..b9d4f7ba9f7de 100644 --- a/docs/source/getting_started/examples/offline_inference_arctic.md +++ b/docs/source/getting_started/examples/offline_inference_arctic.md @@ -1,8 +1,8 @@ -# Offline Inference Arctic +Offline Inference Arctic +======================== -Source . 
+Source https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_arctic.py. -```{literalinclude} ../../../../examples/offline_inference_arctic.py -:language: python -:linenos: true -``` +.. literalinclude:: ../../../../examples/offline_inference_arctic.py + :language: python + :linenos: diff --git a/docs/source/getting_started/examples/offline_inference_audio_language.md b/docs/source/getting_started/examples/offline_inference_audio_language.md index fc14e4d6c6f60..b9dde35e2d810 100644 --- a/docs/source/getting_started/examples/offline_inference_audio_language.md +++ b/docs/source/getting_started/examples/offline_inference_audio_language.md @@ -1,8 +1,8 @@ -# Offline Inference Audio Language +Offline Inference Audio Language +================================ -Source . +Source https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_audio_language.py. -```{literalinclude} ../../../../examples/offline_inference_audio_language.py -:language: python -:linenos: true -``` +.. literalinclude:: ../../../../examples/offline_inference_audio_language.py + :language: python + :linenos: diff --git a/docs/source/getting_started/examples/offline_inference_chat.md b/docs/source/getting_started/examples/offline_inference_chat.md index 46f6eb4faa84c..fff3c7486a387 100644 --- a/docs/source/getting_started/examples/offline_inference_chat.md +++ b/docs/source/getting_started/examples/offline_inference_chat.md @@ -1,8 +1,8 @@ -# Offline Inference Chat +Offline Inference Chat +====================== -Source . +Source https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_chat.py. -```{literalinclude} ../../../../examples/offline_inference_chat.py -:language: python -:linenos: true -``` +.. 
literalinclude:: ../../../../examples/offline_inference_chat.py + :language: python + :linenos: diff --git a/docs/source/getting_started/examples/offline_inference_distributed.md b/docs/source/getting_started/examples/offline_inference_distributed.md index e9c07c6fd9877..eb4351892954c 100644 --- a/docs/source/getting_started/examples/offline_inference_distributed.md +++ b/docs/source/getting_started/examples/offline_inference_distributed.md @@ -1,8 +1,8 @@ -# Offline Inference Distributed +Offline Inference Distributed +============================= -Source . +Source https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_distributed.py. -```{literalinclude} ../../../../examples/offline_inference_distributed.py -:language: python -:linenos: true -``` +.. literalinclude:: ../../../../examples/offline_inference_distributed.py + :language: python + :linenos: diff --git a/docs/source/getting_started/examples/offline_inference_embedding.md b/docs/source/getting_started/examples/offline_inference_embedding.md index ea4a37a83714a..c744c79e62e84 100644 --- a/docs/source/getting_started/examples/offline_inference_embedding.md +++ b/docs/source/getting_started/examples/offline_inference_embedding.md @@ -1,8 +1,8 @@ -# Offline Inference Embedding +Offline Inference Embedding +=========================== -Source . +Source https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_embedding.py. -```{literalinclude} ../../../../examples/offline_inference_embedding.py -:language: python -:linenos: true -``` +.. 
literalinclude:: ../../../../examples/offline_inference_embedding.py + :language: python + :linenos: diff --git a/docs/source/getting_started/examples/offline_inference_encoder_decoder.md b/docs/source/getting_started/examples/offline_inference_encoder_decoder.md index f18a6eba70796..c044a47abfa37 100644 --- a/docs/source/getting_started/examples/offline_inference_encoder_decoder.md +++ b/docs/source/getting_started/examples/offline_inference_encoder_decoder.md @@ -1,8 +1,8 @@ -# Offline Inference Encoder Decoder +Offline Inference Encoder Decoder +================================= -Source . +Source https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_encoder_decoder.py. -```{literalinclude} ../../../../examples/offline_inference_encoder_decoder.py -:language: python -:linenos: true -``` +.. literalinclude:: ../../../../examples/offline_inference_encoder_decoder.py + :language: python + :linenos: diff --git a/docs/source/getting_started/examples/offline_inference_mlpspeculator.md b/docs/source/getting_started/examples/offline_inference_mlpspeculator.md index 27481d5b6f9e2..3e22585549144 100644 --- a/docs/source/getting_started/examples/offline_inference_mlpspeculator.md +++ b/docs/source/getting_started/examples/offline_inference_mlpspeculator.md @@ -1,8 +1,8 @@ -# Offline Inference Mlpspeculator +Offline Inference Mlpspeculator +=============================== -Source . +Source https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_mlpspeculator.py. -```{literalinclude} ../../../../examples/offline_inference_mlpspeculator.py -:language: python -:linenos: true -``` +.. 
literalinclude:: ../../../../examples/offline_inference_mlpspeculator.py + :language: python + :linenos: diff --git a/docs/source/getting_started/examples/offline_inference_neuron.md b/docs/source/getting_started/examples/offline_inference_neuron.md index 943c9da23d936..c29f6831ee2c0 100644 --- a/docs/source/getting_started/examples/offline_inference_neuron.md +++ b/docs/source/getting_started/examples/offline_inference_neuron.md @@ -1,8 +1,8 @@ -# Offline Inference Neuron +Offline Inference Neuron +======================== -Source . +Source https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_neuron.py. -```{literalinclude} ../../../../examples/offline_inference_neuron.py -:language: python -:linenos: true -``` +.. literalinclude:: ../../../../examples/offline_inference_neuron.py + :language: python + :linenos: diff --git a/docs/source/getting_started/examples/offline_inference_neuron_int8_quantization.md b/docs/source/getting_started/examples/offline_inference_neuron_int8_quantization.md index 69992f85c4f46..19d13d14771c6 100644 --- a/docs/source/getting_started/examples/offline_inference_neuron_int8_quantization.md +++ b/docs/source/getting_started/examples/offline_inference_neuron_int8_quantization.md @@ -1,8 +1,8 @@ -# Offline Inference Neuron Int8 Quantization +Offline Inference Neuron Int8 Quantization +========================================== -Source . +Source https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_neuron_int8_quantization.py. -```{literalinclude} ../../../../examples/offline_inference_neuron_int8_quantization.py -:language: python -:linenos: true -``` +.. 
literalinclude:: ../../../../examples/offline_inference_neuron_int8_quantization.py + :language: python + :linenos: diff --git a/docs/source/getting_started/examples/offline_inference_pixtral.md b/docs/source/getting_started/examples/offline_inference_pixtral.md index 7d141c2894d7f..bbcb8736f4917 100644 --- a/docs/source/getting_started/examples/offline_inference_pixtral.md +++ b/docs/source/getting_started/examples/offline_inference_pixtral.md @@ -1,8 +1,8 @@ -# Offline Inference Pixtral +Offline Inference Pixtral +========================= -Source . +Source https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_pixtral.py. -```{literalinclude} ../../../../examples/offline_inference_pixtral.py -:language: python -:linenos: true -``` +.. literalinclude:: ../../../../examples/offline_inference_pixtral.py + :language: python + :linenos: diff --git a/docs/source/getting_started/examples/offline_inference_tpu.md b/docs/source/getting_started/examples/offline_inference_tpu.md index ad7a5482ac67b..fd7d0dfba62d6 100644 --- a/docs/source/getting_started/examples/offline_inference_tpu.md +++ b/docs/source/getting_started/examples/offline_inference_tpu.md @@ -1,8 +1,8 @@ -# Offline Inference Tpu +Offline Inference Tpu +===================== -Source . +Source https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_tpu.py. -```{literalinclude} ../../../../examples/offline_inference_tpu.py -:language: python -:linenos: true -``` +.. 
literalinclude:: ../../../../examples/offline_inference_tpu.py + :language: python + :linenos: diff --git a/docs/source/getting_started/examples/offline_inference_vision_language.md b/docs/source/getting_started/examples/offline_inference_vision_language.md index ce4549c8f47b1..fce4d38fd8506 100644 --- a/docs/source/getting_started/examples/offline_inference_vision_language.md +++ b/docs/source/getting_started/examples/offline_inference_vision_language.md @@ -1,8 +1,8 @@ -# Offline Inference Vision Language +Offline Inference Vision Language +================================= -Source . +Source https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language.py. -```{literalinclude} ../../../../examples/offline_inference_vision_language.py -:language: python -:linenos: true -``` +.. literalinclude:: ../../../../examples/offline_inference_vision_language.py + :language: python + :linenos: diff --git a/docs/source/getting_started/examples/offline_inference_vision_language_embedding.md b/docs/source/getting_started/examples/offline_inference_vision_language_embedding.md index ef17dcbff8fa6..6b400657290f2 100644 --- a/docs/source/getting_started/examples/offline_inference_vision_language_embedding.md +++ b/docs/source/getting_started/examples/offline_inference_vision_language_embedding.md @@ -1,8 +1,8 @@ -# Offline Inference Vision Language Embedding +Offline Inference Vision Language Embedding +=========================================== -Source . +Source https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language_embedding.py. -```{literalinclude} ../../../../examples/offline_inference_vision_language_embedding.py -:language: python -:linenos: true -``` +.. 
literalinclude:: ../../../../examples/offline_inference_vision_language_embedding.py + :language: python + :linenos: diff --git a/docs/source/getting_started/examples/offline_inference_vision_language_multi_image.md b/docs/source/getting_started/examples/offline_inference_vision_language_multi_image.md index c6f38ac8329bc..1da69a6772817 100644 --- a/docs/source/getting_started/examples/offline_inference_vision_language_multi_image.md +++ b/docs/source/getting_started/examples/offline_inference_vision_language_multi_image.md @@ -1,8 +1,8 @@ -# Offline Inference Vision Language Multi Image +Offline Inference Vision Language Multi Image +============================================= -Source . +Source https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language_multi_image.py. -```{literalinclude} ../../../../examples/offline_inference_vision_language_multi_image.py -:language: python -:linenos: true -``` +.. literalinclude:: ../../../../examples/offline_inference_vision_language_multi_image.py + :language: python + :linenos: diff --git a/docs/source/getting_started/examples/offline_inference_with_prefix.md b/docs/source/getting_started/examples/offline_inference_with_prefix.md index 8c7bff2cc649a..001238436b1f5 100644 --- a/docs/source/getting_started/examples/offline_inference_with_prefix.md +++ b/docs/source/getting_started/examples/offline_inference_with_prefix.md @@ -1,8 +1,8 @@ -# Offline Inference With Prefix +Offline Inference With Prefix +============================= -Source . +Source https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_with_prefix.py. -```{literalinclude} ../../../../examples/offline_inference_with_prefix.py -:language: python -:linenos: true -``` +.. 
literalinclude:: ../../../../examples/offline_inference_with_prefix.py + :language: python + :linenos: diff --git a/docs/source/getting_started/examples/offline_inference_with_profiler.md b/docs/source/getting_started/examples/offline_inference_with_profiler.md index cf4b5ae7d1336..25c7b36977708 100644 --- a/docs/source/getting_started/examples/offline_inference_with_profiler.md +++ b/docs/source/getting_started/examples/offline_inference_with_profiler.md @@ -1,8 +1,8 @@ -# Offline Inference With Profiler +Offline Inference With Profiler +=============================== -Source . +Source https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_with_profiler.py. -```{literalinclude} ../../../../examples/offline_inference_with_profiler.py -:language: python -:linenos: true -``` +.. literalinclude:: ../../../../examples/offline_inference_with_profiler.py + :language: python + :linenos: diff --git a/docs/source/getting_started/examples/offline_profile.md b/docs/source/getting_started/examples/offline_profile.md index c035fd8180a1f..a048261d20223 100644 --- a/docs/source/getting_started/examples/offline_profile.md +++ b/docs/source/getting_started/examples/offline_profile.md @@ -1,8 +1,8 @@ -# Offline Profile +Offline Profile +=============== -Source . +Source https://github.com/vllm-project/vllm/blob/main/examples/offline_profile.py. -```{literalinclude} ../../../../examples/offline_profile.py -:language: python -:linenos: true -``` +.. 
literalinclude:: ../../../../examples/offline_profile.py + :language: python + :linenos: diff --git a/docs/source/getting_started/examples/openai_chat_completion_client.md b/docs/source/getting_started/examples/openai_chat_completion_client.md index 62527126ea015..07341b6ba3768 100644 --- a/docs/source/getting_started/examples/openai_chat_completion_client.md +++ b/docs/source/getting_started/examples/openai_chat_completion_client.md @@ -1,8 +1,8 @@ -# OpenAI Chat Completion Client +OpenAI Chat Completion Client +============================= -Source . +Source https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_completion_client.py. -```{literalinclude} ../../../../examples/openai_chat_completion_client.py -:language: python -:linenos: true -``` +.. literalinclude:: ../../../../examples/openai_chat_completion_client.py + :language: python + :linenos: diff --git a/docs/source/getting_started/examples/openai_chat_completion_client_for_multimodal.md b/docs/source/getting_started/examples/openai_chat_completion_client_for_multimodal.md index 7e10bcd402258..d9d8ef875edc2 100644 --- a/docs/source/getting_started/examples/openai_chat_completion_client_for_multimodal.md +++ b/docs/source/getting_started/examples/openai_chat_completion_client_for_multimodal.md @@ -1,8 +1,8 @@ -# OpenAI Chat Completion Client For Multimodal +OpenAI Chat Completion Client For Multimodal +============================================ -Source . +Source https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_completion_client_for_multimodal.py. -```{literalinclude} ../../../../examples/openai_chat_completion_client_for_multimodal.py -:language: python -:linenos: true -``` +.. 
literalinclude:: ../../../../examples/openai_chat_completion_client_for_multimodal.py + :language: python + :linenos: diff --git a/docs/source/getting_started/examples/openai_chat_completion_client_with_tools.md b/docs/source/getting_started/examples/openai_chat_completion_client_with_tools.md index 699b66cbf9878..dc9962c681bd2 100644 --- a/docs/source/getting_started/examples/openai_chat_completion_client_with_tools.md +++ b/docs/source/getting_started/examples/openai_chat_completion_client_with_tools.md @@ -1,8 +1,8 @@ -# OpenAI Chat Completion Client With Tools +OpenAI Chat Completion Client With Tools +======================================== -Source . +Source https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_completion_client_with_tools.py. -```{literalinclude} ../../../../examples/openai_chat_completion_client_with_tools.py -:language: python -:linenos: true -``` +.. literalinclude:: ../../../../examples/openai_chat_completion_client_with_tools.py + :language: python + :linenos: diff --git a/docs/source/getting_started/examples/openai_chat_embedding_client_for_multimodal.md b/docs/source/getting_started/examples/openai_chat_embedding_client_for_multimodal.md index ee4496e851b3d..c3acaf8c5f8bb 100644 --- a/docs/source/getting_started/examples/openai_chat_embedding_client_for_multimodal.md +++ b/docs/source/getting_started/examples/openai_chat_embedding_client_for_multimodal.md @@ -1,8 +1,8 @@ -# OpenAI Chat Embedding Client For Multimodal +OpenAI Chat Embedding Client For Multimodal +=========================================== -Source . +Source https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_embedding_client_for_multimodal.py. -```{literalinclude} ../../../../examples/openai_chat_embedding_client_for_multimodal.py -:language: python -:linenos: true -``` +.. 
literalinclude:: ../../../../examples/openai_chat_embedding_client_for_multimodal.py + :language: python + :linenos: diff --git a/docs/source/getting_started/examples/openai_completion_client.md b/docs/source/getting_started/examples/openai_completion_client.md index d0e6fb1e26e9c..7962f3d1054c1 100644 --- a/docs/source/getting_started/examples/openai_completion_client.md +++ b/docs/source/getting_started/examples/openai_completion_client.md @@ -1,8 +1,8 @@ -# OpenAI Completion Client +OpenAI Completion Client +======================== -Source . +Source https://github.com/vllm-project/vllm/blob/main/examples/openai_completion_client.py. -```{literalinclude} ../../../../examples/openai_completion_client.py -:language: python -:linenos: true -``` +.. literalinclude:: ../../../../examples/openai_completion_client.py + :language: python + :linenos: diff --git a/docs/source/getting_started/examples/openai_embedding_client.md b/docs/source/getting_started/examples/openai_embedding_client.md index 730e4656f2c81..9024f84d01c97 100644 --- a/docs/source/getting_started/examples/openai_embedding_client.md +++ b/docs/source/getting_started/examples/openai_embedding_client.md @@ -1,8 +1,8 @@ -# OpenAI Embedding Client +OpenAI Embedding Client +======================= -Source . +Source https://github.com/vllm-project/vllm/blob/main/examples/openai_embedding_client.py. -```{literalinclude} ../../../../examples/openai_embedding_client.py -:language: python -:linenos: true -``` +.. literalinclude:: ../../../../examples/openai_embedding_client.py + :language: python + :linenos: diff --git a/docs/source/getting_started/examples/save_sharded_state.md b/docs/source/getting_started/examples/save_sharded_state.md index acaf64f9f2ab2..4d99b7123f052 100644 --- a/docs/source/getting_started/examples/save_sharded_state.md +++ b/docs/source/getting_started/examples/save_sharded_state.md @@ -1,8 +1,8 @@ -# Save Sharded State +Save Sharded State +================== -Source . 
+Source https://github.com/vllm-project/vllm/blob/main/examples/save_sharded_state.py. -```{literalinclude} ../../../../examples/save_sharded_state.py -:language: python -:linenos: true -``` +.. literalinclude:: ../../../../examples/save_sharded_state.py + :language: python + :linenos: diff --git a/docs/source/getting_started/examples/tensorize_vllm_model.md b/docs/source/getting_started/examples/tensorize_vllm_model.md index 5ceb8ab492f0a..cc9d87f8baa9b 100644 --- a/docs/source/getting_started/examples/tensorize_vllm_model.md +++ b/docs/source/getting_started/examples/tensorize_vllm_model.md @@ -1,8 +1,8 @@ -# Tensorize vLLM Model +Tensorize vLLM Model +==================== -Source . +Source https://github.com/vllm-project/vllm/blob/main/examples/tensorize_vllm_model.py. -```{literalinclude} ../../../../examples/tensorize_vllm_model.py -:language: python -:linenos: true -``` +.. literalinclude:: ../../../../examples/tensorize_vllm_model.py + :language: python + :linenos: diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index f75653106cf66..f9271bdd2e183 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -2,7 +2,7 @@ vLLM provides an HTTP server that implements OpenAI's [Completions](https://platform.openai.com/docs/api-reference/completions) and [Chat](https://platform.openai.com/docs/api-reference/chat) API. 
-You can start the server using Python, or using [Docker](deploying_with_docker.rst): +You can start the server using Python, or using [Docker](deploying_with_docker.md): ```bash vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123 ``` @@ -32,7 +32,7 @@ We currently support the following OpenAI APIs: - [Completions API](https://platform.openai.com/docs/api-reference/completions) - *Note: `suffix` parameter is not supported.* - [Chat Completions API](https://platform.openai.com/docs/api-reference/chat) - - [Vision](https://platform.openai.com/docs/guides/vision)-related parameters are supported; see [Multimodal Inputs](../usage/multimodal_inputs.rst). + - [Vision](https://platform.openai.com/docs/guides/vision)-related parameters are supported; see [Multimodal Inputs](../usage/multimodal_inputs.md). - *Note: `image_url.detail` parameter is not supported.* - We also support `audio_url` content type for audio files. - Refer to [vllm.entrypoints.chat_utils](https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/chat_utils.py) for the exact schema. @@ -41,7 +41,7 @@ We currently support the following OpenAI APIs: - [Embeddings API](https://platform.openai.com/docs/api-reference/embeddings) - Instead of `inputs`, you can pass in a list of `messages` (same schema as Chat Completions API), which will be treated as a single prompt to the model according to its chat template. - - This enables multi-modal inputs to be passed to embedding models, see [this page](../usage/multimodal_inputs.rst) for details. + - This enables multi-modal inputs to be passed to embedding models, see [this page](../usage/multimodal_inputs.md) for details. 
- *Note: You should run `vllm serve` with `--task embedding` to ensure that the model is being run in embedding mode.* ## Score API for Cross Encoder Models @@ -232,7 +232,7 @@ print(completion._request_id) ### Extra Parameters for Completions API -The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. +The following [sampling parameters (click through to see documentation)](../dev/sampling_params.md) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python @@ -250,7 +250,7 @@ The following extra parameters are supported: ### Extra Parameters for Chat Completions API -The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. +The following [sampling parameters (click through to see documentation)](../dev/sampling_params.md) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python @@ -268,7 +268,7 @@ The following extra parameters are supported: ### Extra Parameters for Embeddings API -The following [pooling parameters (click through to see documentation)](../dev/pooling_params.rst) are supported. +The following [pooling parameters (click through to see documentation)](../dev/pooling_params.md) are supported. 
```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 19daeb729ee61..480901f71047f 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -430,7 +430,7 @@ def forward( Returns: shape = [num_tokens, num_heads * head_size] """ - # Reminder: Please update docs/source/usage/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid if attn_type != AttentionType.DECODER: raise NotImplementedError("Encoder self-attention and " diff --git a/vllm/config.py b/vllm/config.py index 322c8f8990a40..ff8e508a2b0e1 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -561,7 +561,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config, self.use_async_output_proc = False return - # Reminder: Please update docs/source/usage/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid if not current_platform.is_async_output_supported(self.enforce_eager): logger.warning( @@ -581,7 +581,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config, if self.runner_type == "pooling": self.use_async_output_proc = False - # Reminder: Please update docs/source/usage/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid if speculative_config: logger.warning("Async output processing is not supported with" @@ -1778,7 +1778,7 @@ def verify_with_model_config(self, model_config: ModelConfig): model_config.quantization) def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig): - # Reminder: Please update docs/source/usage/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become 
valid if scheduler_config.chunked_prefill_enabled: logger.warning("LoRA with chunked prefill is still experimental " diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 7337522bc9952..ba11600b602e4 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1116,7 +1116,7 @@ def create_engine_config(self, disable_logprobs=self.disable_logprobs_during_spec_decoding, ) - # Reminder: Please update docs/source/usage/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid if self.num_scheduler_steps > 1: if speculative_config is not None: diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index a9b638ed02a1e..1c6f735f39e04 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -65,7 +65,7 @@ def process_prompt_logprob(self, seq_group: SequenceGroup, @staticmethod @functools.lru_cache def _log_prompt_logprob_unsupported_warning_once(): - # Reminder: Please update docs/source/usage/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid logger.warning( "Prompt logprob is not supported by multi step workers. 
" diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 2816b5c5c1f88..5495bc50ede83 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -22,7 +22,7 @@ class CPUExecutor(ExecutorBase): def _init_executor(self) -> None: assert self.device_config.device_type == "cpu" - # Reminder: Please update docs/source/usage/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid assert self.lora_config is None, "cpu backend doesn't support LoRA" diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index e5142b985d1f2..a1f2beaf22867 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -50,7 +50,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: import vllm.envs as envs from vllm.utils import GiB_bytes model_config = vllm_config.model_config - # Reminder: Please update docs/source/usage/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid if not model_config.enforce_eager: logger.warning( diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 2689802161987..de593113b938b 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -108,7 +108,7 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker": return spec_decode_worker -# Reminder: Please update docs/source/usage/compatibility_matrix.rst +# Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid class SpecDecodeWorker(LoraNotSupportedWorkerBase): """Worker which implements speculative decoding. 
diff --git a/vllm/utils.py b/vllm/utils.py index 1882264c19775..c0ae7c72592d8 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -51,7 +51,7 @@ # Exception strings for non-implemented encoder/decoder scenarios -# Reminder: Please update docs/source/usage/compatibility_matrix.rst +# Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid STR_NOT_IMPL_ENC_DEC_SWA = \ diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index e08a61e31fe42..5188d60e5846c 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -819,7 +819,7 @@ def _pythonize_sampler_output( for sgdx, (seq_group, sample_result) in enumerate(zip(seq_groups, samples_list)): - # Reminder: Please update docs/source/usage/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid # (Check for Guided Decoding) if seq_group.sampling_params.logits_processors: diff --git a/vllm/worker/utils.py b/vllm/worker/utils.py index 5f71ec0c14df8..8f2d343440d3e 100644 --- a/vllm/worker/utils.py +++ b/vllm/worker/utils.py @@ -13,7 +13,7 @@ def assert_enc_dec_mr_supported_scenario( a supported scenario. ''' - # Reminder: Please update docs/source/usage/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid if enc_dec_mr.cache_config.enable_prefix_caching: From 6789c5175c3c76df1203efb9ee883bdef2596518 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Mon, 16 Dec 2024 14:44:28 -0500 Subject: [PATCH 04/27] Update ref to .rst Signed-off-by: Rafael Vasquez --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 682f046d4b6ec..b38113f524a17 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ # to run the OpenAI compatible server. 
# Please update any changes made here to -# docs/source/dev/dockerfile/dockerfile.rst and +# docs/source/dev/dockerfile/dockerfile.md and # docs/source/assets/dev/dockerfile-stages-dependency.png ARG CUDA_VERSION=12.4.1 From d86a856003665d65f4681816fd017b02f9b52def Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Mon, 16 Dec 2024 16:04:36 -0500 Subject: [PATCH 05/27] Remove generated exmpls, fix md formatting Signed-off-by: Rafael Vasquez --- docs/source/generate_examples.py | 16 +- .../getting_started/examples/api_client.md | 8 - .../getting_started/examples/aqlm_example.md | 8 - .../getting_started/examples/cpu_offload.md | 8 - .../examples/examples_index.md | 48 -- .../examples/examples_index.template.md | 4 +- .../examples/florence2_inference.md | 8 - .../examples/gguf_inference.md | 8 - .../gradio_openai_chatbot_webserver.md | 8 - .../examples/gradio_webserver.md | 8 - .../examples/llm_engine_example.md | 8 - .../lora_with_quantization_inference.md | 8 - .../examples/multilora_inference.md | 8 - .../examples/offline_chat_with_tools.md | 8 - .../examples/offline_inference.md | 8 - .../examples/offline_inference_arctic.md | 8 - .../offline_inference_audio_language.md | 8 - .../examples/offline_inference_chat.md | 8 - .../examples/offline_inference_distributed.md | 8 - .../examples/offline_inference_embedding.md | 8 - .../offline_inference_encoder_decoder.md | 8 - .../offline_inference_mlpspeculator.md | 8 - .../examples/offline_inference_neuron.md | 8 - ...line_inference_neuron_int8_quantization.md | 8 - .../examples/offline_inference_pixtral.md | 8 - .../examples/offline_inference_tpu.md | 8 - .../offline_inference_vision_language.md | 8 - ...ine_inference_vision_language_embedding.md | 8 - ...e_inference_vision_language_multi_image.md | 8 - .../examples/offline_inference_with_prefix.md | 8 - .../offline_inference_with_profiler.md | 8 - .../examples/offline_profile.md | 8 - .../examples/openai_chat_completion_client.md | 8 - 
...i_chat_completion_client_for_multimodal.md | 8 - ...penai_chat_completion_client_with_tools.md | 8 - ...ai_chat_embedding_client_for_multimodal.md | 8 - .../examples/openai_completion_client.md | 8 - .../examples/openai_embedding_client.md | 8 - .../examples/save_sharded_state.md | 8 - .../examples/tensorize_vllm_model.md | 8 - docs/source/models/supported_models.md | 2 + docs/source/models/supported_models.rst | 812 ------------------ docs/source/serving/metrics.md | 2 +- 43 files changed, 10 insertions(+), 1170 deletions(-) delete mode 100644 docs/source/getting_started/examples/api_client.md delete mode 100644 docs/source/getting_started/examples/aqlm_example.md delete mode 100644 docs/source/getting_started/examples/cpu_offload.md delete mode 100644 docs/source/getting_started/examples/examples_index.md delete mode 100644 docs/source/getting_started/examples/florence2_inference.md delete mode 100644 docs/source/getting_started/examples/gguf_inference.md delete mode 100644 docs/source/getting_started/examples/gradio_openai_chatbot_webserver.md delete mode 100644 docs/source/getting_started/examples/gradio_webserver.md delete mode 100644 docs/source/getting_started/examples/llm_engine_example.md delete mode 100644 docs/source/getting_started/examples/lora_with_quantization_inference.md delete mode 100644 docs/source/getting_started/examples/multilora_inference.md delete mode 100644 docs/source/getting_started/examples/offline_chat_with_tools.md delete mode 100644 docs/source/getting_started/examples/offline_inference.md delete mode 100644 docs/source/getting_started/examples/offline_inference_arctic.md delete mode 100644 docs/source/getting_started/examples/offline_inference_audio_language.md delete mode 100644 docs/source/getting_started/examples/offline_inference_chat.md delete mode 100644 docs/source/getting_started/examples/offline_inference_distributed.md delete mode 100644 docs/source/getting_started/examples/offline_inference_embedding.md delete mode 
100644 docs/source/getting_started/examples/offline_inference_encoder_decoder.md delete mode 100644 docs/source/getting_started/examples/offline_inference_mlpspeculator.md delete mode 100644 docs/source/getting_started/examples/offline_inference_neuron.md delete mode 100644 docs/source/getting_started/examples/offline_inference_neuron_int8_quantization.md delete mode 100644 docs/source/getting_started/examples/offline_inference_pixtral.md delete mode 100644 docs/source/getting_started/examples/offline_inference_tpu.md delete mode 100644 docs/source/getting_started/examples/offline_inference_vision_language.md delete mode 100644 docs/source/getting_started/examples/offline_inference_vision_language_embedding.md delete mode 100644 docs/source/getting_started/examples/offline_inference_vision_language_multi_image.md delete mode 100644 docs/source/getting_started/examples/offline_inference_with_prefix.md delete mode 100644 docs/source/getting_started/examples/offline_inference_with_profiler.md delete mode 100644 docs/source/getting_started/examples/offline_profile.md delete mode 100644 docs/source/getting_started/examples/openai_chat_completion_client.md delete mode 100644 docs/source/getting_started/examples/openai_chat_completion_client_for_multimodal.md delete mode 100644 docs/source/getting_started/examples/openai_chat_completion_client_with_tools.md delete mode 100644 docs/source/getting_started/examples/openai_chat_embedding_client_for_multimodal.md delete mode 100644 docs/source/getting_started/examples/openai_completion_client.md delete mode 100644 docs/source/getting_started/examples/openai_embedding_client.md delete mode 100644 docs/source/getting_started/examples/save_sharded_state.md delete mode 100644 docs/source/getting_started/examples/tensorize_vllm_model.md delete mode 100644 docs/source/models/supported_models.rst diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py index 8e43870246ae6..90694833e6357 100644 --- 
a/docs/source/generate_examples.py +++ b/docs/source/generate_examples.py @@ -15,18 +15,12 @@ def fix_case(text: str) -> str: return text -def underline(title: str, character: str = "=") -> str: - return f"{title}\n{character * len(title)}" - - def generate_title(filename: str) -> str: # Turn filename into a title title = filename.replace("_", " ").title() # Handle acronyms and names title = fix_case(title) - # Underline title - title = underline(title) - return title + return f"# {title}" def generate_examples(): @@ -47,9 +41,9 @@ def generate_examples(): include_path = '../../../..' / script_path.relative_to(root_dir) content = (f"{generate_title(doc_path.stem)}\n\n" f"Source {script_url}.\n\n" - f".. literalinclude:: {include_path}\n" - " :language: python\n" - " :linenos:\n") + f"```{{literalinclude}} {include_path}\n" + ":language: python\n" + ":linenos:\n```") with open(doc_path, "w+") as f: f.write(content) @@ -57,5 +51,5 @@ def generate_examples(): with open(doc_dir / "examples_index.template.md") as f: examples_index = f.read() with open(doc_dir / "examples_index.md", "w+") as f: - example_docs = "\n ".join(path.stem for path in script_paths) + example_docs = "\n".join(path.stem + ".md" for path in script_paths) f.write(examples_index.replace(r"%EXAMPLE_DOCS%", example_docs)) diff --git a/docs/source/getting_started/examples/api_client.md b/docs/source/getting_started/examples/api_client.md deleted file mode 100644 index 31c984671fb9d..0000000000000 --- a/docs/source/getting_started/examples/api_client.md +++ /dev/null @@ -1,8 +0,0 @@ -API Client -========== - -Source https://github.com/vllm-project/vllm/blob/main/examples/api_client.py. - -.. 
literalinclude:: ../../../../examples/api_client.py - :language: python - :linenos: diff --git a/docs/source/getting_started/examples/aqlm_example.md b/docs/source/getting_started/examples/aqlm_example.md deleted file mode 100644 index 455aff5645459..0000000000000 --- a/docs/source/getting_started/examples/aqlm_example.md +++ /dev/null @@ -1,8 +0,0 @@ -Aqlm Example -============ - -Source https://github.com/vllm-project/vllm/blob/main/examples/aqlm_example.py. - -.. literalinclude:: ../../../../examples/aqlm_example.py - :language: python - :linenos: diff --git a/docs/source/getting_started/examples/cpu_offload.md b/docs/source/getting_started/examples/cpu_offload.md deleted file mode 100644 index df28306947873..0000000000000 --- a/docs/source/getting_started/examples/cpu_offload.md +++ /dev/null @@ -1,8 +0,0 @@ -Cpu Offload -=========== - -Source https://github.com/vllm-project/vllm/blob/main/examples/cpu_offload.py. - -.. literalinclude:: ../../../../examples/cpu_offload.py - :language: python - :linenos: diff --git a/docs/source/getting_started/examples/examples_index.md b/docs/source/getting_started/examples/examples_index.md deleted file mode 100644 index 690a557fd9d68..0000000000000 --- a/docs/source/getting_started/examples/examples_index.md +++ /dev/null @@ -1,48 +0,0 @@ -# Examples - -```{toctree} -:caption: Scripts -:maxdepth: 1 - -api_client - aqlm_example - cpu_offload - florence2_inference - gguf_inference - gradio_openai_chatbot_webserver - gradio_webserver - llm_engine_example - lora_with_quantization_inference - multilora_inference - offline_chat_with_tools - offline_inference - offline_inference_arctic - offline_inference_audio_language - offline_inference_chat - offline_inference_cli - offline_inference_distributed - offline_inference_embedding - offline_inference_encoder_decoder - offline_inference_mlpspeculator - offline_inference_neuron - offline_inference_neuron_int8_quantization - offline_inference_pixtral - 
offline_inference_structured_outputs - offline_inference_tpu - offline_inference_vision_language - offline_inference_vision_language_embedding - offline_inference_vision_language_multi_image - offline_inference_with_prefix - offline_inference_with_profiler - offline_profile - openai_chat_completion_client - openai_chat_completion_client_for_multimodal - openai_chat_completion_client_with_tools - openai_chat_completion_structured_outputs - openai_chat_embedding_client_for_multimodal - openai_completion_client - openai_cross_encoder_score - openai_embedding_client - save_sharded_state - tensorize_vllm_model -``` diff --git a/docs/source/getting_started/examples/examples_index.template.md b/docs/source/getting_started/examples/examples_index.template.md index f8e57d9e3d64e..de7a91c0ffa48 100644 --- a/docs/source/getting_started/examples/examples_index.template.md +++ b/docs/source/getting_started/examples/examples_index.template.md @@ -1,8 +1,8 @@ # Examples ```{toctree} -:caption: Scripts :maxdepth: 1 +:caption: Scripts %EXAMPLE_DOCS% -``` +``` \ No newline at end of file diff --git a/docs/source/getting_started/examples/florence2_inference.md b/docs/source/getting_started/examples/florence2_inference.md deleted file mode 100644 index e8e3a99d6d1b1..0000000000000 --- a/docs/source/getting_started/examples/florence2_inference.md +++ /dev/null @@ -1,8 +0,0 @@ -Florence2 Inference -=================== - -Source https://github.com/vllm-project/vllm/blob/main/examples/florence2_inference.py. - -.. 
literalinclude:: ../../../../examples/florence2_inference.py - :language: python - :linenos: diff --git a/docs/source/getting_started/examples/gguf_inference.md b/docs/source/getting_started/examples/gguf_inference.md deleted file mode 100644 index dbac9dd722ed7..0000000000000 --- a/docs/source/getting_started/examples/gguf_inference.md +++ /dev/null @@ -1,8 +0,0 @@ -Gguf Inference -============== - -Source https://github.com/vllm-project/vllm/blob/main/examples/gguf_inference.py. - -.. literalinclude:: ../../../../examples/gguf_inference.py - :language: python - :linenos: diff --git a/docs/source/getting_started/examples/gradio_openai_chatbot_webserver.md b/docs/source/getting_started/examples/gradio_openai_chatbot_webserver.md deleted file mode 100644 index 7fca8b17c7660..0000000000000 --- a/docs/source/getting_started/examples/gradio_openai_chatbot_webserver.md +++ /dev/null @@ -1,8 +0,0 @@ -Gradio OpenAI Chatbot Webserver -=============================== - -Source https://github.com/vllm-project/vllm/blob/main/examples/gradio_openai_chatbot_webserver.py. - -.. literalinclude:: ../../../../examples/gradio_openai_chatbot_webserver.py - :language: python - :linenos: diff --git a/docs/source/getting_started/examples/gradio_webserver.md b/docs/source/getting_started/examples/gradio_webserver.md deleted file mode 100644 index 7ec4576a301c9..0000000000000 --- a/docs/source/getting_started/examples/gradio_webserver.md +++ /dev/null @@ -1,8 +0,0 @@ -Gradio Webserver -================ - -Source https://github.com/vllm-project/vllm/blob/main/examples/gradio_webserver.py. - -.. 
literalinclude:: ../../../../examples/gradio_webserver.py - :language: python - :linenos: diff --git a/docs/source/getting_started/examples/llm_engine_example.md b/docs/source/getting_started/examples/llm_engine_example.md deleted file mode 100644 index 3c5c4f99dcee1..0000000000000 --- a/docs/source/getting_started/examples/llm_engine_example.md +++ /dev/null @@ -1,8 +0,0 @@ -LLM Engine Example -================== - -Source https://github.com/vllm-project/vllm/blob/main/examples/llm_engine_example.py. - -.. literalinclude:: ../../../../examples/llm_engine_example.py - :language: python - :linenos: diff --git a/docs/source/getting_started/examples/lora_with_quantization_inference.md b/docs/source/getting_started/examples/lora_with_quantization_inference.md deleted file mode 100644 index 313c7914cf224..0000000000000 --- a/docs/source/getting_started/examples/lora_with_quantization_inference.md +++ /dev/null @@ -1,8 +0,0 @@ -Lora With Quantization Inference -================================ - -Source https://github.com/vllm-project/vllm/blob/main/examples/lora_with_quantization_inference.py. - -.. literalinclude:: ../../../../examples/lora_with_quantization_inference.py - :language: python - :linenos: diff --git a/docs/source/getting_started/examples/multilora_inference.md b/docs/source/getting_started/examples/multilora_inference.md deleted file mode 100644 index f1b4a9587d95b..0000000000000 --- a/docs/source/getting_started/examples/multilora_inference.md +++ /dev/null @@ -1,8 +0,0 @@ -MultiLoRA Inference -=================== - -Source https://github.com/vllm-project/vllm/blob/main/examples/multilora_inference.py. - -.. 
literalinclude:: ../../../../examples/multilora_inference.py - :language: python - :linenos: diff --git a/docs/source/getting_started/examples/offline_chat_with_tools.md b/docs/source/getting_started/examples/offline_chat_with_tools.md deleted file mode 100644 index 6bd28e2c0a1e5..0000000000000 --- a/docs/source/getting_started/examples/offline_chat_with_tools.md +++ /dev/null @@ -1,8 +0,0 @@ -Offline Chat With Tools -======================= - -Source https://github.com/vllm-project/vllm/blob/main/examples/offline_chat_with_tools.py. - -.. literalinclude:: ../../../../examples/offline_chat_with_tools.py - :language: python - :linenos: diff --git a/docs/source/getting_started/examples/offline_inference.md b/docs/source/getting_started/examples/offline_inference.md deleted file mode 100644 index 3826ed0bcab44..0000000000000 --- a/docs/source/getting_started/examples/offline_inference.md +++ /dev/null @@ -1,8 +0,0 @@ -Offline Inference -================= - -Source https://github.com/vllm-project/vllm/blob/main/examples/offline_inference.py. - -.. literalinclude:: ../../../../examples/offline_inference.py - :language: python - :linenos: diff --git a/docs/source/getting_started/examples/offline_inference_arctic.md b/docs/source/getting_started/examples/offline_inference_arctic.md deleted file mode 100644 index b9d4f7ba9f7de..0000000000000 --- a/docs/source/getting_started/examples/offline_inference_arctic.md +++ /dev/null @@ -1,8 +0,0 @@ -Offline Inference Arctic -======================== - -Source https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_arctic.py. - -.. 
literalinclude:: ../../../../examples/offline_inference_arctic.py - :language: python - :linenos: diff --git a/docs/source/getting_started/examples/offline_inference_audio_language.md b/docs/source/getting_started/examples/offline_inference_audio_language.md deleted file mode 100644 index b9dde35e2d810..0000000000000 --- a/docs/source/getting_started/examples/offline_inference_audio_language.md +++ /dev/null @@ -1,8 +0,0 @@ -Offline Inference Audio Language -================================ - -Source https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_audio_language.py. - -.. literalinclude:: ../../../../examples/offline_inference_audio_language.py - :language: python - :linenos: diff --git a/docs/source/getting_started/examples/offline_inference_chat.md b/docs/source/getting_started/examples/offline_inference_chat.md deleted file mode 100644 index fff3c7486a387..0000000000000 --- a/docs/source/getting_started/examples/offline_inference_chat.md +++ /dev/null @@ -1,8 +0,0 @@ -Offline Inference Chat -====================== - -Source https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_chat.py. - -.. literalinclude:: ../../../../examples/offline_inference_chat.py - :language: python - :linenos: diff --git a/docs/source/getting_started/examples/offline_inference_distributed.md b/docs/source/getting_started/examples/offline_inference_distributed.md deleted file mode 100644 index eb4351892954c..0000000000000 --- a/docs/source/getting_started/examples/offline_inference_distributed.md +++ /dev/null @@ -1,8 +0,0 @@ -Offline Inference Distributed -============================= - -Source https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_distributed.py. - -.. 
literalinclude:: ../../../../examples/offline_inference_distributed.py - :language: python - :linenos: diff --git a/docs/source/getting_started/examples/offline_inference_embedding.md b/docs/source/getting_started/examples/offline_inference_embedding.md deleted file mode 100644 index c744c79e62e84..0000000000000 --- a/docs/source/getting_started/examples/offline_inference_embedding.md +++ /dev/null @@ -1,8 +0,0 @@ -Offline Inference Embedding -=========================== - -Source https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_embedding.py. - -.. literalinclude:: ../../../../examples/offline_inference_embedding.py - :language: python - :linenos: diff --git a/docs/source/getting_started/examples/offline_inference_encoder_decoder.md b/docs/source/getting_started/examples/offline_inference_encoder_decoder.md deleted file mode 100644 index c044a47abfa37..0000000000000 --- a/docs/source/getting_started/examples/offline_inference_encoder_decoder.md +++ /dev/null @@ -1,8 +0,0 @@ -Offline Inference Encoder Decoder -================================= - -Source https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_encoder_decoder.py. - -.. literalinclude:: ../../../../examples/offline_inference_encoder_decoder.py - :language: python - :linenos: diff --git a/docs/source/getting_started/examples/offline_inference_mlpspeculator.md b/docs/source/getting_started/examples/offline_inference_mlpspeculator.md deleted file mode 100644 index 3e22585549144..0000000000000 --- a/docs/source/getting_started/examples/offline_inference_mlpspeculator.md +++ /dev/null @@ -1,8 +0,0 @@ -Offline Inference Mlpspeculator -=============================== - -Source https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_mlpspeculator.py. - -.. 
literalinclude:: ../../../../examples/offline_inference_mlpspeculator.py - :language: python - :linenos: diff --git a/docs/source/getting_started/examples/offline_inference_neuron.md b/docs/source/getting_started/examples/offline_inference_neuron.md deleted file mode 100644 index c29f6831ee2c0..0000000000000 --- a/docs/source/getting_started/examples/offline_inference_neuron.md +++ /dev/null @@ -1,8 +0,0 @@ -Offline Inference Neuron -======================== - -Source https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_neuron.py. - -.. literalinclude:: ../../../../examples/offline_inference_neuron.py - :language: python - :linenos: diff --git a/docs/source/getting_started/examples/offline_inference_neuron_int8_quantization.md b/docs/source/getting_started/examples/offline_inference_neuron_int8_quantization.md deleted file mode 100644 index 19d13d14771c6..0000000000000 --- a/docs/source/getting_started/examples/offline_inference_neuron_int8_quantization.md +++ /dev/null @@ -1,8 +0,0 @@ -Offline Inference Neuron Int8 Quantization -========================================== - -Source https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_neuron_int8_quantization.py. - -.. literalinclude:: ../../../../examples/offline_inference_neuron_int8_quantization.py - :language: python - :linenos: diff --git a/docs/source/getting_started/examples/offline_inference_pixtral.md b/docs/source/getting_started/examples/offline_inference_pixtral.md deleted file mode 100644 index bbcb8736f4917..0000000000000 --- a/docs/source/getting_started/examples/offline_inference_pixtral.md +++ /dev/null @@ -1,8 +0,0 @@ -Offline Inference Pixtral -========================= - -Source https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_pixtral.py. - -.. 
literalinclude:: ../../../../examples/offline_inference_pixtral.py - :language: python - :linenos: diff --git a/docs/source/getting_started/examples/offline_inference_tpu.md b/docs/source/getting_started/examples/offline_inference_tpu.md deleted file mode 100644 index fd7d0dfba62d6..0000000000000 --- a/docs/source/getting_started/examples/offline_inference_tpu.md +++ /dev/null @@ -1,8 +0,0 @@ -Offline Inference Tpu -===================== - -Source https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_tpu.py. - -.. literalinclude:: ../../../../examples/offline_inference_tpu.py - :language: python - :linenos: diff --git a/docs/source/getting_started/examples/offline_inference_vision_language.md b/docs/source/getting_started/examples/offline_inference_vision_language.md deleted file mode 100644 index fce4d38fd8506..0000000000000 --- a/docs/source/getting_started/examples/offline_inference_vision_language.md +++ /dev/null @@ -1,8 +0,0 @@ -Offline Inference Vision Language -================================= - -Source https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language.py. - -.. literalinclude:: ../../../../examples/offline_inference_vision_language.py - :language: python - :linenos: diff --git a/docs/source/getting_started/examples/offline_inference_vision_language_embedding.md b/docs/source/getting_started/examples/offline_inference_vision_language_embedding.md deleted file mode 100644 index 6b400657290f2..0000000000000 --- a/docs/source/getting_started/examples/offline_inference_vision_language_embedding.md +++ /dev/null @@ -1,8 +0,0 @@ -Offline Inference Vision Language Embedding -=========================================== - -Source https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language_embedding.py. - -.. 
literalinclude:: ../../../../examples/offline_inference_vision_language_embedding.py - :language: python - :linenos: diff --git a/docs/source/getting_started/examples/offline_inference_vision_language_multi_image.md b/docs/source/getting_started/examples/offline_inference_vision_language_multi_image.md deleted file mode 100644 index 1da69a6772817..0000000000000 --- a/docs/source/getting_started/examples/offline_inference_vision_language_multi_image.md +++ /dev/null @@ -1,8 +0,0 @@ -Offline Inference Vision Language Multi Image -============================================= - -Source https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language_multi_image.py. - -.. literalinclude:: ../../../../examples/offline_inference_vision_language_multi_image.py - :language: python - :linenos: diff --git a/docs/source/getting_started/examples/offline_inference_with_prefix.md b/docs/source/getting_started/examples/offline_inference_with_prefix.md deleted file mode 100644 index 001238436b1f5..0000000000000 --- a/docs/source/getting_started/examples/offline_inference_with_prefix.md +++ /dev/null @@ -1,8 +0,0 @@ -Offline Inference With Prefix -============================= - -Source https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_with_prefix.py. - -.. literalinclude:: ../../../../examples/offline_inference_with_prefix.py - :language: python - :linenos: diff --git a/docs/source/getting_started/examples/offline_inference_with_profiler.md b/docs/source/getting_started/examples/offline_inference_with_profiler.md deleted file mode 100644 index 25c7b36977708..0000000000000 --- a/docs/source/getting_started/examples/offline_inference_with_profiler.md +++ /dev/null @@ -1,8 +0,0 @@ -Offline Inference With Profiler -=============================== - -Source https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_with_profiler.py. - -.. 
literalinclude:: ../../../../examples/offline_inference_with_profiler.py - :language: python - :linenos: diff --git a/docs/source/getting_started/examples/offline_profile.md b/docs/source/getting_started/examples/offline_profile.md deleted file mode 100644 index a048261d20223..0000000000000 --- a/docs/source/getting_started/examples/offline_profile.md +++ /dev/null @@ -1,8 +0,0 @@ -Offline Profile -=============== - -Source https://github.com/vllm-project/vllm/blob/main/examples/offline_profile.py. - -.. literalinclude:: ../../../../examples/offline_profile.py - :language: python - :linenos: diff --git a/docs/source/getting_started/examples/openai_chat_completion_client.md b/docs/source/getting_started/examples/openai_chat_completion_client.md deleted file mode 100644 index 07341b6ba3768..0000000000000 --- a/docs/source/getting_started/examples/openai_chat_completion_client.md +++ /dev/null @@ -1,8 +0,0 @@ -OpenAI Chat Completion Client -============================= - -Source https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_completion_client.py. - -.. literalinclude:: ../../../../examples/openai_chat_completion_client.py - :language: python - :linenos: diff --git a/docs/source/getting_started/examples/openai_chat_completion_client_for_multimodal.md b/docs/source/getting_started/examples/openai_chat_completion_client_for_multimodal.md deleted file mode 100644 index d9d8ef875edc2..0000000000000 --- a/docs/source/getting_started/examples/openai_chat_completion_client_for_multimodal.md +++ /dev/null @@ -1,8 +0,0 @@ -OpenAI Chat Completion Client For Multimodal -============================================ - -Source https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_completion_client_for_multimodal.py. - -.. 
literalinclude:: ../../../../examples/openai_chat_completion_client_for_multimodal.py - :language: python - :linenos: diff --git a/docs/source/getting_started/examples/openai_chat_completion_client_with_tools.md b/docs/source/getting_started/examples/openai_chat_completion_client_with_tools.md deleted file mode 100644 index dc9962c681bd2..0000000000000 --- a/docs/source/getting_started/examples/openai_chat_completion_client_with_tools.md +++ /dev/null @@ -1,8 +0,0 @@ -OpenAI Chat Completion Client With Tools -======================================== - -Source https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_completion_client_with_tools.py. - -.. literalinclude:: ../../../../examples/openai_chat_completion_client_with_tools.py - :language: python - :linenos: diff --git a/docs/source/getting_started/examples/openai_chat_embedding_client_for_multimodal.md b/docs/source/getting_started/examples/openai_chat_embedding_client_for_multimodal.md deleted file mode 100644 index c3acaf8c5f8bb..0000000000000 --- a/docs/source/getting_started/examples/openai_chat_embedding_client_for_multimodal.md +++ /dev/null @@ -1,8 +0,0 @@ -OpenAI Chat Embedding Client For Multimodal -=========================================== - -Source https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_embedding_client_for_multimodal.py. - -.. literalinclude:: ../../../../examples/openai_chat_embedding_client_for_multimodal.py - :language: python - :linenos: diff --git a/docs/source/getting_started/examples/openai_completion_client.md b/docs/source/getting_started/examples/openai_completion_client.md deleted file mode 100644 index 7962f3d1054c1..0000000000000 --- a/docs/source/getting_started/examples/openai_completion_client.md +++ /dev/null @@ -1,8 +0,0 @@ -OpenAI Completion Client -======================== - -Source https://github.com/vllm-project/vllm/blob/main/examples/openai_completion_client.py. - -.. 
literalinclude:: ../../../../examples/openai_completion_client.py - :language: python - :linenos: diff --git a/docs/source/getting_started/examples/openai_embedding_client.md b/docs/source/getting_started/examples/openai_embedding_client.md deleted file mode 100644 index 9024f84d01c97..0000000000000 --- a/docs/source/getting_started/examples/openai_embedding_client.md +++ /dev/null @@ -1,8 +0,0 @@ -OpenAI Embedding Client -======================= - -Source https://github.com/vllm-project/vllm/blob/main/examples/openai_embedding_client.py. - -.. literalinclude:: ../../../../examples/openai_embedding_client.py - :language: python - :linenos: diff --git a/docs/source/getting_started/examples/save_sharded_state.md b/docs/source/getting_started/examples/save_sharded_state.md deleted file mode 100644 index 4d99b7123f052..0000000000000 --- a/docs/source/getting_started/examples/save_sharded_state.md +++ /dev/null @@ -1,8 +0,0 @@ -Save Sharded State -================== - -Source https://github.com/vllm-project/vllm/blob/main/examples/save_sharded_state.py. - -.. literalinclude:: ../../../../examples/save_sharded_state.py - :language: python - :linenos: diff --git a/docs/source/getting_started/examples/tensorize_vllm_model.md b/docs/source/getting_started/examples/tensorize_vllm_model.md deleted file mode 100644 index cc9d87f8baa9b..0000000000000 --- a/docs/source/getting_started/examples/tensorize_vllm_model.md +++ /dev/null @@ -1,8 +0,0 @@ -Tensorize vLLM Model -==================== - -Source https://github.com/vllm-project/vllm/blob/main/examples/tensorize_vllm_model.py. - -.. 
literalinclude:: ../../../../examples/tensorize_vllm_model.py - :language: python - :linenos: diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 59b677c6a4edd..b8a1b880589a3 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -707,11 +707,13 @@ See {ref}`this page ` for more information on how to use gene - ``` +```{eval-rst} {sup}`E` Pre-computed embeddings can be inputted for this modality. {sup}`+` +``` Multiple items can be inputted per text prompt for this modality. diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst deleted file mode 100644 index b9957cf9563b1..0000000000000 --- a/docs/source/models/supported_models.rst +++ /dev/null @@ -1,812 +0,0 @@ -.. _supported_models: - -Supported Models -================ - -vLLM supports generative and pooling models across various tasks. -If a model supports more than one task, you can set the task via the :code:`--task` argument. - -For each task, we list the model architectures that have been implemented in vLLM. -Alongside each architecture, we include some popular models that use it. - -Loading a Model -^^^^^^^^^^^^^^^ - -HuggingFace Hub -+++++++++++++++ - -By default, vLLM loads models from `HuggingFace (HF) Hub `_. - -To determine whether a given model is supported, you can check the :code:`config.json` file inside the HF repository. -If the :code:`"architectures"` field contains a model architecture listed below, then it should be supported in theory. - -.. tip:: - The easiest way to check if your model is really supported at runtime is to run the program below: - - .. 
code-block:: python - - from vllm import LLM - - # For generative models (task=generate) only - llm = LLM(model=..., task="generate") # Name or path of your model - output = llm.generate("Hello, my name is") - print(output) - - # For pooling models (task={embed,classify,reward}) only - llm = LLM(model=..., task="embed") # Name or path of your model - output = llm.encode("Hello, my name is") - print(output) - - If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported. - -Otherwise, please refer to :ref:`Adding a New Model ` and :ref:`Enabling Multimodal Inputs ` -for instructions on how to implement your model in vLLM. -Alternatively, you can `open an issue on GitHub `_ to request vLLM support. - -ModelScope -++++++++++ - -To use models from `ModelScope `_ instead of HuggingFace Hub, set an environment variable: - -.. code-block:: shell - - $ export VLLM_USE_MODELSCOPE=True - -And use with :code:`trust_remote_code=True`. - -.. code-block:: python - - from vllm import LLM - - llm = LLM(model=..., revision=..., task=..., trust_remote_code=True) - - # For generative models (task=generate) only - output = llm.generate("Hello, my name is") - print(output) - - # For pooling models (task={embed,classify,reward}) only - output = llm.encode("Hello, my name is") - print(output) - -List of Text-only Language Models -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Generative Models -+++++++++++++++++ - -See :ref:`this page ` for more information on how to use generative models. - -Text Generation (``--task generate``) -------------------------------------- - -.. list-table:: - :widths: 25 25 50 5 5 - :header-rows: 1 - - * - Architecture - - Models - - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` - * - :code:`AquilaForCausalLM` - - Aquila, Aquila2 - - :code:`BAAI/Aquila-7B`, :code:`BAAI/AquilaChat-7B`, etc. 
- - ✅︎ - - ✅︎ - * - :code:`ArcticForCausalLM` - - Arctic - - :code:`Snowflake/snowflake-arctic-base`, :code:`Snowflake/snowflake-arctic-instruct`, etc. - - - - ✅︎ - * - :code:`BaiChuanForCausalLM` - - Baichuan2, Baichuan - - :code:`baichuan-inc/Baichuan2-13B-Chat`, :code:`baichuan-inc/Baichuan-7B`, etc. - - ✅︎ - - ✅︎ - * - :code:`BloomForCausalLM` - - BLOOM, BLOOMZ, BLOOMChat - - :code:`bigscience/bloom`, :code:`bigscience/bloomz`, etc. - - - - ✅︎ - * - :code:`BartForConditionalGeneration` - - BART - - :code:`facebook/bart-base`, :code:`facebook/bart-large-cnn`, etc. - - - - - * - :code:`ChatGLMModel` - - ChatGLM - - :code:`THUDM/chatglm2-6b`, :code:`THUDM/chatglm3-6b`, etc. - - ✅︎ - - ✅︎ - * - :code:`CohereForCausalLM` - - Command-R - - :code:`CohereForAI/c4ai-command-r-v01`, etc. - - ✅︎ - - ✅︎ - * - :code:`DbrxForCausalLM` - - DBRX - - :code:`databricks/dbrx-base`, :code:`databricks/dbrx-instruct`, etc. - - - - ✅︎ - * - :code:`DeciLMForCausalLM` - - DeciLM - - :code:`Deci/DeciLM-7B`, :code:`Deci/DeciLM-7B-instruct`, etc. - - - - ✅︎ - * - :code:`DeepseekForCausalLM` - - DeepSeek - - :code:`deepseek-ai/deepseek-llm-67b-base`, :code:`deepseek-ai/deepseek-llm-7b-chat` etc. - - - - ✅︎ - * - :code:`DeepseekV2ForCausalLM` - - DeepSeek-V2 - - :code:`deepseek-ai/DeepSeek-V2`, :code:`deepseek-ai/DeepSeek-V2-Chat` etc. - - - - ✅︎ - * - :code:`ExaoneForCausalLM` - - EXAONE-3 - - :code:`LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. - - ✅︎ - - ✅︎ - * - :code:`FalconForCausalLM` - - Falcon - - :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc. - - - - ✅︎ - * - :code:`FalconMambaForCausalLM` - - FalconMamba - - :code:`tiiuae/falcon-mamba-7b`, :code:`tiiuae/falcon-mamba-7b-instruct`, etc. - - ✅︎ - - ✅︎ - * - :code:`GemmaForCausalLM` - - Gemma - - :code:`google/gemma-2b`, :code:`google/gemma-7b`, etc. - - ✅︎ - - ✅︎ - * - :code:`Gemma2ForCausalLM` - - Gemma2 - - :code:`google/gemma-2-9b`, :code:`google/gemma-2-27b`, etc. 
- - ✅︎ - - ✅︎ - * - :code:`GlmForCausalLM` - - GLM-4 - - :code:`THUDM/glm-4-9b-chat-hf`, etc. - - ✅︎ - - ✅︎ - * - :code:`GPT2LMHeadModel` - - GPT-2 - - :code:`gpt2`, :code:`gpt2-xl`, etc. - - - - ✅︎ - * - :code:`GPTBigCodeForCausalLM` - - StarCoder, SantaCoder, WizardCoder - - :code:`bigcode/starcoder`, :code:`bigcode/gpt_bigcode-santacoder`, :code:`WizardLM/WizardCoder-15B-V1.0`, etc. - - ✅︎ - - ✅︎ - * - :code:`GPTJForCausalLM` - - GPT-J - - :code:`EleutherAI/gpt-j-6b`, :code:`nomic-ai/gpt4all-j`, etc. - - - - ✅︎ - * - :code:`GPTNeoXForCausalLM` - - GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM - - :code:`EleutherAI/gpt-neox-20b`, :code:`EleutherAI/pythia-12b`, :code:`OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, :code:`databricks/dolly-v2-12b`, :code:`stabilityai/stablelm-tuned-alpha-7b`, etc. - - - - ✅︎ - * - :code:`GraniteForCausalLM` - - Granite 3.0, PowerLM - - :code:`ibm-granite/granite-3.0-2b-base`, :code:`ibm-granite/granite-3.0-8b-instruct`, :code:`ibm/PowerLM-3b`, etc. - - ✅︎ - - ✅︎ - * - :code:`GraniteMoeForCausalLM` - - Granite 3.0 MoE, PowerMoE - - :code:`ibm-granite/granite-3.0-1b-a400m-base`, :code:`ibm-granite/granite-3.0-3b-a800m-instruct`, :code:`ibm/PowerMoE-3b`, etc. - - ✅︎ - - ✅︎ - * - :code:`InternLMForCausalLM` - - InternLM - - :code:`internlm/internlm-7b`, :code:`internlm/internlm-chat-7b`, etc. - - ✅︎ - - ✅︎ - * - :code:`InternLM2ForCausalLM` - - InternLM2 - - :code:`internlm/internlm2-7b`, :code:`internlm/internlm2-chat-7b`, etc. - - ✅︎ - - ✅︎ - * - :code:`JAISLMHeadModel` - - Jais - - :code:`inceptionai/jais-13b`, :code:`inceptionai/jais-13b-chat`, :code:`inceptionai/jais-30b-v3`, :code:`inceptionai/jais-30b-chat-v3`, etc. - - - - ✅︎ - * - :code:`JambaForCausalLM` - - Jamba - - :code:`ai21labs/AI21-Jamba-1.5-Large`, :code:`ai21labs/AI21-Jamba-1.5-Mini`, :code:`ai21labs/Jamba-v0.1`, etc. 
- - ✅︎ - - ✅︎ - * - :code:`LlamaForCausalLM` - - Llama 3.1, Llama 3, Llama 2, LLaMA, Yi - - :code:`meta-llama/Meta-Llama-3.1-405B-Instruct`, :code:`meta-llama/Meta-Llama-3.1-70B`, :code:`meta-llama/Meta-Llama-3-70B-Instruct`, :code:`meta-llama/Llama-2-70b-hf`, :code:`01-ai/Yi-34B`, etc. - - ✅︎ - - ✅︎ - * - :code:`MambaForCausalLM` - - Mamba - - :code:`state-spaces/mamba-130m-hf`, :code:`state-spaces/mamba-790m-hf`, :code:`state-spaces/mamba-2.8b-hf`, etc. - - - - ✅︎ - * - :code:`MiniCPMForCausalLM` - - MiniCPM - - :code:`openbmb/MiniCPM-2B-sft-bf16`, :code:`openbmb/MiniCPM-2B-dpo-bf16`, :code:`openbmb/MiniCPM-S-1B-sft`, etc. - - ✅︎ - - ✅︎ - * - :code:`MiniCPM3ForCausalLM` - - MiniCPM3 - - :code:`openbmb/MiniCPM3-4B`, etc. - - ✅︎ - - ✅︎ - * - :code:`MistralForCausalLM` - - Mistral, Mistral-Instruct - - :code:`mistralai/Mistral-7B-v0.1`, :code:`mistralai/Mistral-7B-Instruct-v0.1`, etc. - - ✅︎ - - ✅︎ - * - :code:`MixtralForCausalLM` - - Mixtral-8x7B, Mixtral-8x7B-Instruct - - :code:`mistralai/Mixtral-8x7B-v0.1`, :code:`mistralai/Mixtral-8x7B-Instruct-v0.1`, :code:`mistral-community/Mixtral-8x22B-v0.1`, etc. - - ✅︎ - - ✅︎ - * - :code:`MPTForCausalLM` - - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter - - :code:`mosaicml/mpt-7b`, :code:`mosaicml/mpt-7b-storywriter`, :code:`mosaicml/mpt-30b`, etc. - - - - ✅︎ - * - :code:`NemotronForCausalLM` - - Nemotron-3, Nemotron-4, Minitron - - :code:`nvidia/Minitron-8B-Base`, :code:`mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. - - ✅︎ - - ✅︎ - * - :code:`OLMoForCausalLM` - - OLMo - - :code:`allenai/OLMo-1B-hf`, :code:`allenai/OLMo-7B-hf`, etc. - - - - ✅︎ - * - :code:`OLMo2ForCausalLM` - - OLMo2 - - :code:`allenai/OLMo2-7B-1124`, etc. - - - - ✅︎ - * - :code:`OLMoEForCausalLM` - - OLMoE - - :code:`allenai/OLMoE-1B-7B-0924`, :code:`allenai/OLMoE-1B-7B-0924-Instruct`, etc. - - ✅︎ - - ✅︎ - * - :code:`OPTForCausalLM` - - OPT, OPT-IML - - :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc. 
- - - - ✅︎ - * - :code:`OrionForCausalLM` - - Orion - - :code:`OrionStarAI/Orion-14B-Base`, :code:`OrionStarAI/Orion-14B-Chat`, etc. - - - - ✅︎ - * - :code:`PhiForCausalLM` - - Phi - - :code:`microsoft/phi-1_5`, :code:`microsoft/phi-2`, etc. - - ✅︎ - - ✅︎ - * - :code:`Phi3ForCausalLM` - - Phi-3 - - :code:`microsoft/Phi-3-mini-4k-instruct`, :code:`microsoft/Phi-3-mini-128k-instruct`, :code:`microsoft/Phi-3-medium-128k-instruct`, etc. - - ✅︎ - - ✅︎ - * - :code:`Phi3SmallForCausalLM` - - Phi-3-Small - - :code:`microsoft/Phi-3-small-8k-instruct`, :code:`microsoft/Phi-3-small-128k-instruct`, etc. - - - - ✅︎ - * - :code:`PhiMoEForCausalLM` - - Phi-3.5-MoE - - :code:`microsoft/Phi-3.5-MoE-instruct`, etc. - - ✅︎ - - ✅︎ - * - :code:`PersimmonForCausalLM` - - Persimmon - - :code:`adept/persimmon-8b-base`, :code:`adept/persimmon-8b-chat`, etc. - - - - ✅︎ - * - :code:`QWenLMHeadModel` - - Qwen - - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc. - - ✅︎ - - ✅︎ - * - :code:`Qwen2ForCausalLM` - - Qwen2 - - :code:`Qwen/Qwen2-7B-Instruct`, :code:`Qwen/Qwen2-7B`, etc. - - ✅︎ - - ✅︎ - * - :code:`Qwen2MoeForCausalLM` - - Qwen2MoE - - :code:`Qwen/Qwen1.5-MoE-A2.7B`, :code:`Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. - - - - ✅︎ - * - :code:`StableLmForCausalLM` - - StableLM - - :code:`stabilityai/stablelm-3b-4e1t`, :code:`stabilityai/stablelm-base-alpha-7b-v2`, etc. - - - - ✅︎ - * - :code:`Starcoder2ForCausalLM` - - Starcoder2 - - :code:`bigcode/starcoder2-3b`, :code:`bigcode/starcoder2-7b`, :code:`bigcode/starcoder2-15b`, etc. - - - - ✅︎ - * - :code:`SolarForCausalLM` - - Solar Pro - - :code:`upstage/solar-pro-preview-instruct`, etc. - - ✅︎ - - ✅︎ - * - :code:`TeleChat2ForCausalLM` - - TeleChat2 - - :code:`TeleAI/TeleChat2-3B`, :code:`TeleAI/TeleChat2-7B`, :code:`TeleAI/TeleChat2-35B`, etc. - - ✅︎ - - ✅︎ - * - :code:`XverseForCausalLM` - - XVERSE - - :code:`xverse/XVERSE-7B-Chat`, :code:`xverse/XVERSE-13B-Chat`, :code:`xverse/XVERSE-65B-Chat`, etc. - - ✅︎ - - ✅︎ - -.. 
note:: - Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096. - -Pooling Models -++++++++++++++ - -See :ref:`this page ` for more information on how to use pooling models. - -.. important:: - Since some model architectures support both generative and pooling tasks, - you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode. - -Text Embedding (``--task embed``) ---------------------------------- - -Any text generation model can be converted into an embedding model by passing :code:`--task embed`. - -.. note:: - To get the best results, you should use pooling models that are specifically trained as such. - -The following table lists those that are tested in vLLM. - -.. list-table:: - :widths: 25 25 50 5 5 - :header-rows: 1 - - * - Architecture - - Models - - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` - * - :code:`BertModel` - - BERT-based - - :code:`BAAI/bge-base-en-v1.5`, etc. - - - - - * - :code:`Gemma2Model` - - Gemma2-based - - :code:`BAAI/bge-multilingual-gemma2`, etc. - - - - ✅︎ - * - :code:`LlamaModel`, :code:`LlamaForCausalLM`, :code:`MistralModel`, etc. - - Llama-based - - :code:`intfloat/e5-mistral-7b-instruct`, etc. - - ✅︎ - - ✅︎ - * - :code:`Qwen2Model`, :code:`Qwen2ForCausalLM` - - Qwen2-based - - :code:`ssmits/Qwen2-7B-Instruct-embed-base` (see note), :code:`Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. - - ✅︎ - - ✅︎ - * - :code:`RobertaModel`, :code:`RobertaForMaskedLM` - - RoBERTa-based - - :code:`sentence-transformers/all-roberta-large-v1`, :code:`sentence-transformers/all-roberta-large-v1`, etc. - - - - - * - :code:`XLMRobertaModel` - - XLM-RoBERTa-based - - :code:`intfloat/multilingual-e5-large`, etc. - - - - - -.. note:: - :code:`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config. 
- You should manually set mean pooling by passing :code:`--override-pooler-config '{"pooling_type": "MEAN"}'`. - -.. note:: - Unlike base Qwen2, :code:`Alibaba-NLP/gte-Qwen2-7B-instruct` uses bi-directional attention. - You can set :code:`--hf-overrides '{"is_causal": false}'` to change the attention mask accordingly. - - On the other hand, its 1.5B variant (:code:`Alibaba-NLP/gte-Qwen2-1.5B-instruct`) uses causal attention - despite being described otherwise on its model card. - -Reward Modeling (``--task reward``) ------------------------------------ - -.. list-table:: - :widths: 25 25 50 5 5 - :header-rows: 1 - - * - Architecture - - Models - - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` - * - :code:`LlamaForCausalLM` - - Llama-based - - :code:`peiyi9979/math-shepherd-mistral-7b-prm`, etc. - - ✅︎ - - ✅︎ - * - :code:`Qwen2ForRewardModel` - - Qwen2-based - - :code:`Qwen/Qwen2.5-Math-RM-72B`, etc. - - ✅︎ - - ✅︎ - -.. important:: - For process-supervised reward models such as :code:`peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, - e.g.: :code:`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`. - -Classification (``--task classify``) ------------------------------------- - -.. list-table:: - :widths: 25 25 50 5 5 - :header-rows: 1 - - * - Architecture - - Models - - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` - * - :code:`Qwen2ForSequenceClassification` - - Qwen2-based - - :code:`jason9693/Qwen2.5-1.5B-apeach`, etc. - - ✅︎ - - ✅︎ - -Sentence Pair Scoring (``--task score``) ----------------------------------------- - -.. list-table:: - :widths: 25 25 50 5 5 - :header-rows: 1 - - * - Architecture - - Models - - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` - * - :code:`BertForSequenceClassification` - - BERT-based - - :code:`cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. 
- - - - - * - :code:`RobertaForSequenceClassification` - - RoBERTa-based - - :code:`cross-encoder/quora-roberta-base`, etc. - - - - - * - :code:`XLMRobertaForSequenceClassification` - - XLM-RoBERTa-based - - :code:`BAAI/bge-reranker-v2-m3`, etc. - - - - - -.. _supported_mm_models: - -List of Multimodal Language Models -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The following modalities are supported depending on the model: - -- **T**\ ext -- **I**\ mage -- **V**\ ideo -- **A**\ udio - -Any combination of modalities joined by :code:`+` are supported. - -- e.g.: :code:`T + I` means that the model supports text-only, image-only, and text-with-image inputs. - -On the other hand, modalities separated by :code:`/` are mutually exclusive. - -- e.g.: :code:`T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs. - -See :ref:`this page ` on how to pass multi-modal inputs to the model. - -Generative Models -+++++++++++++++++ - -See :ref:`this page ` for more information on how to use generative models. - -Text Generation (``--task generate``) -------------------------------------- - -.. list-table:: - :widths: 25 25 15 20 5 5 5 - :header-rows: 1 - - * - Architecture - - Models - - Inputs - - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` - - V1 - * - :code:`AriaForConditionalGeneration` - - Aria - - T + I - - :code:`rhymes-ai/Aria` - - - - ✅︎ - - - * - :code:`Blip2ForConditionalGeneration` - - BLIP-2 - - T + I\ :sup:`E` - - :code:`Salesforce/blip2-opt-2.7b`, :code:`Salesforce/blip2-opt-6.7b`, etc. - - - - ✅︎ - - - * - :code:`ChameleonForConditionalGeneration` - - Chameleon - - T + I - - :code:`facebook/chameleon-7b` etc. - - - - ✅︎ - - - * - :code:`FuyuForCausalLM` - - Fuyu - - T + I - - :code:`adept/fuyu-8b` etc. - - - - ✅︎ - - - * - :code:`ChatGLMModel` - - GLM-4V - - T + I - - :code:`THUDM/glm-4v-9b` etc. 
- - ✅︎ - - ✅︎ - - - * - :code:`H2OVLChatModel` - - H2OVL - - T + I\ :sup:`E+` - - :code:`h2oai/h2ovl-mississippi-800m`, :code:`h2oai/h2ovl-mississippi-2b`, etc. - - - - ✅︎ - - - * - :code:`Idefics3ForConditionalGeneration` - - Idefics3 - - T + I - - :code:`HuggingFaceM4/Idefics3-8B-Llama3` etc. - - ✅︎ - - - - - * - :code:`InternVLChatModel` - - InternVL 2.5, Mono-InternVL, InternVL 2.0 - - T + I\ :sup:`E+` - - :code:`OpenGVLab/InternVL2_5-4B`, :code:`OpenGVLab/Mono-InternVL-2B`, :code:`OpenGVLab/InternVL2-4B`, etc. - - - - ✅︎ - - ✅︎ - * - :code:`LlavaForConditionalGeneration` - - LLaVA-1.5 - - T + I\ :sup:`E+` - - :code:`llava-hf/llava-1.5-7b-hf`, :code:`TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. - - - - ✅︎ - - ✅︎ - * - :code:`LlavaNextForConditionalGeneration` - - LLaVA-NeXT - - T + I\ :sup:`E+` - - :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc. - - - - ✅︎ - - - * - :code:`LlavaNextVideoForConditionalGeneration` - - LLaVA-NeXT-Video - - T + V - - :code:`llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. - - - - ✅︎ - - - * - :code:`LlavaOnevisionForConditionalGeneration` - - LLaVA-Onevision - - T + I\ :sup:`+` + V\ :sup:`+` - - :code:`llava-hf/llava-onevision-qwen2-7b-ov-hf`, :code:`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. - - - - ✅︎ - - - * - :code:`MiniCPMV` - - MiniCPM-V - - T + I\ :sup:`E+` - - :code:`openbmb/MiniCPM-V-2` (see note), :code:`openbmb/MiniCPM-Llama3-V-2_5`, :code:`openbmb/MiniCPM-V-2_6`, etc. - - ✅︎ - - ✅︎ - - - * - :code:`MllamaForConditionalGeneration` - - Llama 3.2 - - T + I\ :sup:`+` - - :code:`meta-llama/Llama-3.2-90B-Vision-Instruct`, :code:`meta-llama/Llama-3.2-11B-Vision`, etc. - - - - - - - * - :code:`MolmoForCausalLM` - - Molmo - - T + I - - :code:`allenai/Molmo-7B-D-0924`, :code:`allenai/Molmo-72B-0924`, etc. - - - - ✅︎ - - ✅︎ - * - :code:`NVLM_D_Model` - - NVLM-D 1.0 - - T + I\ :sup:`E+` - - :code:`nvidia/NVLM-D-72B`, etc. 
- - - - ✅︎ - - ✅︎ - * - :code:`PaliGemmaForConditionalGeneration` - - PaliGemma - - T + I\ :sup:`E` - - :code:`google/paligemma-3b-pt-224`, :code:`google/paligemma-3b-mix-224`, etc. - - - - ✅︎ - - - * - :code:`Phi3VForCausalLM` - - Phi-3-Vision, Phi-3.5-Vision - - T + I\ :sup:`E+` - - :code:`microsoft/Phi-3-vision-128k-instruct`, :code:`microsoft/Phi-3.5-vision-instruct` etc. - - - - ✅︎ - - ✅︎ - * - :code:`PixtralForConditionalGeneration` - - Pixtral - - T + I\ :sup:`+` - - :code:`mistralai/Pixtral-12B-2409`, :code:`mistral-community/pixtral-12b` etc. - - - - ✅︎ - - ✅︎ - * - :code:`QWenLMHeadModel` - - Qwen-VL - - T + I\ :sup:`E+` - - :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc. - - ✅︎ - - ✅︎ - - - * - :code:`Qwen2AudioForConditionalGeneration` - - Qwen2-Audio - - T + A\ :sup:`+` - - :code:`Qwen/Qwen2-Audio-7B-Instruct` - - - - ✅︎ - - - * - :code:`Qwen2VLForConditionalGeneration` - - Qwen2-VL - - T + I\ :sup:`E+` + V\ :sup:`E+` - - :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc. - - ✅︎ - - ✅︎ - - - * - :code:`UltravoxModel` - - Ultravox - - T + A\ :sup:`E+` - - :code:`fixie-ai/ultravox-v0_3` - - - - ✅︎ - - - -| :sup:`E` Pre-computed embeddings can be inputted for this modality. -| :sup:`+` Multiple items can be inputted per text prompt for this modality. - -.. important:: - To enable multiple multi-modal items per text prompt, you have to set :code:`limit_mm_per_prompt` (offline inference) - or :code:`--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt: - - .. code-block:: python - - llm = LLM( - model="Qwen/Qwen2-VL-7B-Instruct", - limit_mm_per_prompt={"image": 4}, - ) - - .. code-block:: bash - - vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4 - -.. note:: - vLLM currently only supports adding LoRA to the language backbone of multimodal models. - -.. 
note:: - To use :code:`TIGER-Lab/Mantis-8B-siglip-llama3`, you have to install their GitHub repo (:code:`pip install git+https://github.com/TIGER-AI-Lab/Mantis.git`) - and pass :code:`--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. - -.. note:: - The official :code:`openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now. - For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 - -Pooling Models -++++++++++++++ - -See :ref:`this page ` for more information on how to use pooling models. - -.. important:: - Since some model architectures support both generative and pooling tasks, - you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode. - -Text Embedding (``--task embed``) ---------------------------------- - -Any text generation model can be converted into an embedding model by passing :code:`--task embed`. - -.. note:: - To get the best results, you should use pooling models that are specifically trained as such. - -The following table lists those that are tested in vLLM. - -.. list-table:: - :widths: 25 25 15 25 5 5 - :header-rows: 1 - - * - Architecture - - Models - - Inputs - - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` - * - :code:`LlavaNextForConditionalGeneration` - - LLaVA-NeXT-based - - T / I - - :code:`royokong/e5-v` - - - - ✅︎ - * - :code:`Phi3VForCausalLM` - - Phi-3-Vision-based - - T + I - - :code:`TIGER-Lab/VLM2Vec-Full` - - 🚧 - - ✅︎ - * - :code:`Qwen2VLForConditionalGeneration` - - Qwen2-VL-based - - T + I - - :code:`MrLight/dse-qwen2-2b-mrl-v1` - - - - ✅︎ - ----- - -Model Support Policy -===================== - -At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. 
Here’s how we manage third-party model support: - -1. **Community-Driven Support**: We encourage community contributions for adding new models. When a user requests support for a new model, we welcome pull requests (PRs) from the community. These contributions are evaluated primarily on the sensibility of the output they generate, rather than strict consistency with existing implementations such as those in transformers. **Call for contribution:** PRs coming directly from model vendors are greatly appreciated! - -2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results. - -.. tip:: - When comparing the output of :code:`model.generate` from HuggingFace Transformers with the output of :code:`llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., `generation_config.json `__) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs. - -3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback. - -4. 
**Monitoring and Updates**: Users interested in specific models should monitor the commit history for those models (e.g., by tracking changes in the main/vllm/model_executor/models directory). This proactive approach helps users stay informed about updates and changes that may affect the models they use. - -5. **Selective Focus**: Our resources are primarily directed towards models with significant user interest and impact. Models that are less frequently used may receive less attention, and we rely on the community to play a more active role in their upkeep and improvement. - -Through this approach, vLLM fosters a collaborative environment where both the core development team and the broader community contribute to the robustness and diversity of the third-party models supported in our ecosystem. - -Note that, as an inference engine, vLLM does not introduce new models. Therefore, all models supported by vLLM are third-party models in this regard. - -We have the following levels of testing for models: - -1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to `models tests `_ for the models that have passed this test. -2. **Output Sensibility**: We check if the output of the model is sensible and coherent, by measuring the perplexity of the output and checking for any obvious errors. This is a less stringent test. -3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to `functionality tests `_ and `examples `_ for the models that have passed this test. -4. **Community Feedback**: We rely on the community to provide feedback on the models. If a model is broken or not working as expected, we encourage users to raise issues to report it or open pull requests to fix it. The rest of the models fall under this category. 
diff --git a/docs/source/serving/metrics.md b/docs/source/serving/metrics.md index 03b691c98b04e..3c4f724f28bbd 100644 --- a/docs/source/serving/metrics.md +++ b/docs/source/serving/metrics.md @@ -4,7 +4,7 @@ vLLM exposes a number of metrics that can be used to monitor the health of the system. These metrics are exposed via the `/metrics` endpoint on the vLLM OpenAI compatible API server. -You can start the server using Python, or using \[Docker\](deploying_with_docker.rst): +You can start the server using Python, or using [Docker](deploying_with_docker.md): ```console $ vllm serve unsloth/Llama-3.2-1B-Instruct From 690a1f6c4d4430bf1432d27f37f268e972eddf17 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Mon, 16 Dec 2024 16:50:04 -0500 Subject: [PATCH 06/27] Test cross ref Signed-off-by: Rafael Vasquez --- docs/source/design/arch_overview.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/design/arch_overview.md b/docs/source/design/arch_overview.md index a798ca3d2425c..511bee20a91f4 100644 --- a/docs/source/design/arch_overview.md +++ b/docs/source/design/arch_overview.md @@ -139,7 +139,7 @@ input tensors and capturing cudagraphs. ## Model Every model runner object has one model object, which is the actual -`torch.nn.Module` instance. See {ref}`huggingface_integration` for how various +`torch.nn.Module` instance. See [huggingface_integration](#huggingface-integration) for how various configurations affect the class we ultimately get. 
## Class Hierarchy From 7802a08a7c65573a7e84bd0ea35a358007c0816d Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Tue, 17 Dec 2024 17:06:36 -0500 Subject: [PATCH 07/27] Fixes many references for markdown Signed-off-by: Rafael Vasquez --- .../input_processing/model_inputs_index.md | 4 +- .../design/multimodal/multimodal_index.md | 6 +-- .../getting_started/amd-installation.md | 4 +- .../getting_started/arm-installation.md | 6 +-- .../getting_started/cpu-installation.md | 12 +++--- .../getting_started/neuron-installation.md | 10 ++--- .../getting_started/openvino-installation.md | 10 ++--- docs/source/getting_started/quickstart.md | 6 +-- .../getting_started/xpu-installation.md | 6 +-- docs/source/models/adding_model.md | 8 ++-- .../models/enabling_multimodal_inputs.md | 14 +++---- docs/source/models/generative_models.md | 2 +- docs/source/models/pooling_models.md | 8 ++-- docs/source/models/supported_models.md | 41 +++++++++---------- docs/source/performance/benchmarks.md | 4 +- docs/source/serving/deploying_with_nginx.md | 14 +++---- .../serving/openai_compatible_server.md | 2 +- docs/source/usage/compatibility_matrix.md | 24 +++++------ docs/source/usage/faq.md | 2 +- docs/source/usage/multimodal_inputs.md | 2 +- docs/source/usage/structured_outputs.md | 2 +- vllm/engine/llm_engine.py | 2 +- vllm/entrypoints/llm.py | 2 +- vllm/inputs/__init__.py | 2 +- vllm/inputs/registry.py | 6 +-- vllm/multimodal/__init__.py | 2 +- vllm/multimodal/base.py | 14 +++---- vllm/multimodal/inputs.py | 2 +- vllm/multimodal/processing.py | 2 +- vllm/multimodal/registry.py | 6 +-- vllm/scripts.py | 2 +- 31 files changed, 113 insertions(+), 114 deletions(-) diff --git a/docs/source/design/input_processing/model_inputs_index.md b/docs/source/design/input_processing/model_inputs_index.md index 3ef8d8878a47b..cb415366e5a66 100644 --- a/docs/source/design/input_processing/model_inputs_index.md +++ b/docs/source/design/input_processing/model_inputs_index.md @@ -6,10 +6,10 @@ .. 
currentmodule:: vllm.inputs ``` -Each model can override parts of vLLM's {ref}`input processing pipeline ` via +Each model can override parts of vLLM's [input processing pipeline](#input-processing-pipeline) via {data}`~vllm.inputs.INPUT_REGISTRY` and {data}`~vllm.multimodal.MULTIMODAL_REGISTRY`. -Currently, this mechanism is only utilized in {ref}`multi-modal ` models for preprocessing multi-modal input +Currently, this mechanism is only utilized in [multi-modal](#multi-modality) models for preprocessing multi-modal input data in addition to input prompt, but it can be extended to text-only language models when needed. ## Guides diff --git a/docs/source/design/multimodal/multimodal_index.md b/docs/source/design/multimodal/multimodal_index.md index a240a7446b953..88af07afc7018 100644 --- a/docs/source/design/multimodal/multimodal_index.md +++ b/docs/source/design/multimodal/multimodal_index.md @@ -8,13 +8,13 @@ vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package. -Multi-modal inputs can be passed alongside text and token prompts to {ref}`supported models ` +Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models) via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`. Currently, vLLM only has built-in support for image data. You can extend vLLM to process additional modalities -by following {ref}`this guide `. +by following [this guide](#adding-multimodal-plugin). -Looking to add your own multi-modal model? Please follow the instructions listed {ref}`here `. +Looking to add your own multi-modal model? Please follow the instructions listed [here](#enabling-multimodal-inputs). 
## Guides diff --git a/docs/source/getting_started/amd-installation.md b/docs/source/getting_started/amd-installation.md index 3a90760e0c882..056858b6f3aea 100644 --- a/docs/source/getting_started/amd-installation.md +++ b/docs/source/getting_started/amd-installation.md @@ -13,8 +13,8 @@ vLLM supports AMD GPUs with ROCm 6.2. Installation options: -1. {ref}`Build from source with docker ` -2. {ref}`Build from source ` +1. [Build from source with docker](#build-from-source-docker-rocm) +2. [Build from source](#build-from-source-rocm) (build-from-source-docker-rocm)= diff --git a/docs/source/getting_started/arm-installation.md b/docs/source/getting_started/arm-installation.md index 51d6b2215cecd..de807e198b4f6 100644 --- a/docs/source/getting_started/arm-installation.md +++ b/docs/source/getting_started/arm-installation.md @@ -11,9 +11,9 @@ vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CP ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes. Contents: -1. {ref}`Requirements ` -2. {ref}`Quick Start with Dockerfile ` -3. {ref}`Building from Source ` +1. [Requirements](#arm-backend-requirements) +2. [Quick Start with Dockerfile](#arm-backend-quick-start-dockerfile) +3. [Building from Source](#build-arm-backend-from-source) (arm-backend-requirements)= diff --git a/docs/source/getting_started/cpu-installation.md b/docs/source/getting_started/cpu-installation.md index 63be5275b6180..8a1ec7345befe 100644 --- a/docs/source/getting_started/cpu-installation.md +++ b/docs/source/getting_started/cpu-installation.md @@ -12,12 +12,12 @@ vLLM initially supports basic model inferencing and serving on x86 CPU platform, Table of contents: -1. {ref}`Requirements ` -2. {ref}`Quick start using Dockerfile ` -3. {ref}`Build from source ` -4. {ref}`Related runtime environment variables ` -5. {ref}`Intel Extension for PyTorch ` -6. {ref}`Performance tips ` +1. [Requirements](#cpu-backend-requirements) +2. 
[Quick start using Dockerfile](#cpu-backend-quick-start-dockerfile) +3. [Build from source](#build-cpu-backend-from-source) +4. [Related runtime environment variables](#env-intro) +5. [Intel Extension for PyTorch](#ipex-guidance) +6. [Performance tips](#cpu-backend-performance-tips) (cpu-backend-requirements)= diff --git a/docs/source/getting_started/neuron-installation.md b/docs/source/getting_started/neuron-installation.md index 1adcefcb6124a..d6de5760cc82c 100644 --- a/docs/source/getting_started/neuron-installation.md +++ b/docs/source/getting_started/neuron-installation.md @@ -16,12 +16,12 @@ Data types currently supported in Neuron SDK are FP16 and BF16. Installation steps: -- {ref}`Build from source ` +- [Build from source](#build-from-source-neuron) - - {ref}`Step 0. Launch Trn1/Inf2 instances ` - - {ref}`Step 1. Install drivers and tools ` - - {ref}`Step 2. Install transformers-neuronx and its dependencies ` - - {ref}`Step 3. Install vLLM from source ` + - [Step 0. Launch Trn1/Inf2 instances](#launch-instances) + - [Step 1. Install drivers and tools](#install-drivers) + - [Step 2. Install transformers-neuronx and its dependencies](#install-tnx) + - [Step 3. 
Install vLLM from source](#install-vllm) (build-from-source-neuron)= diff --git a/docs/source/getting_started/openvino-installation.md b/docs/source/getting_started/openvino-installation.md index 275ebd9b1892d..8b43c0a90447f 100644 --- a/docs/source/getting_started/openvino-installation.md +++ b/docs/source/getting_started/openvino-installation.md @@ -9,11 +9,11 @@ vLLM powered by OpenVINO supports all LLM models from {doc}`vLLM supported model **Table of contents**: -- {ref}`Requirements ` -- {ref}`Quick start using Dockerfile ` -- {ref}`Build from source ` -- {ref}`Performance tips ` -- {ref}`Limitations ` +- [Requirements](#openvino-backend-requirements) +- [Quick start using Dockerfile](#openvino-backend-quick-start-dockerfile) +- [Build from source](#install-openvino-backend-from-source) +- [Performance tips](#openvino-backend-performance-tips) +- [Limitations](#openvino-backend-limitations) (openvino-backend-requirements)= diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md index 5cb39791efcf3..e3508bce68c2d 100644 --- a/docs/source/getting_started/quickstart.md +++ b/docs/source/getting_started/quickstart.md @@ -4,8 +4,8 @@ This guide will help you quickly get started with vLLM to: -- {ref}`Run offline batched inference ` -- {ref}`Run OpenAI-compatible inference ` +- [Run offline batched inference](#offline-batched-inference) +- [Run OpenAI-compatible inference](#openai-compatible-server) ## Prerequisites @@ -52,7 +52,7 @@ prompts = [ sampling_params = SamplingParams(temperature=0.8, top_p=0.95) ``` -The {class}`~vllm.LLM` class initializes vLLM's engine and the [OPT-125M model](https://arxiv.org/abs/2205.01068) for offline inference. The list of supported models can be found {ref}`here `. +The {class}`~vllm.LLM` class initializes vLLM's engine and the [OPT-125M model](https://arxiv.org/abs/2205.01068) for offline inference. The list of supported models can be found [here](#supported-models). 
```python llm = LLM(model="facebook/opt-125m") diff --git a/docs/source/getting_started/xpu-installation.md b/docs/source/getting_started/xpu-installation.md index 9b40ab01d7654..5c57509aef2db 100644 --- a/docs/source/getting_started/xpu-installation.md +++ b/docs/source/getting_started/xpu-installation.md @@ -6,9 +6,9 @@ vLLM initially supports basic model inferencing and serving on Intel GPU platfor Table of contents: -1. {ref}`Requirements ` -2. {ref}`Quick start using Dockerfile ` -3. {ref}`Build from source ` +1. [Requirements](#xpu-backend-requirements) +2. [Quick start using Dockerfile](#xpu-backend-quick-start-dockerfile) +3. [Build from source](#build-xpu-backend-from-source) (xpu-backend-requirements)= diff --git a/docs/source/models/adding_model.md b/docs/source/models/adding_model.md index e28d6020c0a7e..3739873bb547b 100644 --- a/docs/source/models/adding_model.md +++ b/docs/source/models/adding_model.md @@ -12,7 +12,7 @@ However, for models that include new operators (e.g., a new attention mechanism) ```{note} By default, vLLM models do not support multi-modal inputs. To enable multi-modal support, -please follow {ref}`this guide ` after implementing the model here. +please follow [this guide](#enabling-multimodal-inputs) after implementing the model here. ``` ```{tip} @@ -22,7 +22,7 @@ We will be happy to help you out! ## 0. Fork the vLLM repository -Start by forking our [GitHub] repository and then {ref}`build it from source `. +Start by forking our [GitHub] repository and then [build it from source](#build-from-source). This gives you the ability to modify the codebase and test your model. ```{tip} @@ -127,7 +127,7 @@ Finally, register your {code}`*ForCausalLM` class to the {code}`_VLLM_MODELS` in ## 6. Out-of-Tree Model Integration -You can integrate a model without modifying the vLLM codebase. Steps 2, 3, and 4 are still required, but you can skip steps 1 and 5. Instead, write a plugin to register your model. 
For general introduction of the plugin system, see {ref}`plugin_system`. +You can integrate a model without modifying the vLLM codebase. Steps 2, 3, and 4 are still required, but you can skip steps 1 and 5. Instead, write a plugin to register your model. For general introduction of the plugin system, see [plugin-system](#plugin-system). To register the model, use the following code: @@ -147,7 +147,7 @@ ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCaus ```{important} If your model is a multimodal model, ensure the model class implements the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. -Read more about that {ref}`here `. +Read more about that [here](#enabling-multimodal-inputs). ``` ```{note} diff --git a/docs/source/models/enabling_multimodal_inputs.md b/docs/source/models/enabling_multimodal_inputs.md index bf37ebb6e87c6..17c7e7af89ee5 100644 --- a/docs/source/models/enabling_multimodal_inputs.md +++ b/docs/source/models/enabling_multimodal_inputs.md @@ -2,15 +2,15 @@ # Enabling Multimodal Inputs -This document walks you through the steps to extend a vLLM model so that it accepts {ref}`multi-modal inputs `. +This document walks you through the steps to extend a vLLM model so that it accepts [multi-modal inputs](#multimodal-inputs). ```{seealso} -{ref}`adding_a_new_model` +[adding-a-new-model](#adding-a-new-model) ``` ## 1. Update the base vLLM model -It is assumed that you have already implemented the model in vLLM according to {ref}`these steps `. +It is assumed that you have already implemented the model in vLLM according to [these steps](#adding-a-new-model). Further update the model as follows: - Implement the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. @@ -57,7 +57,7 @@ This decorator accepts a function that maps multi-modal inputs to the keyword ar A default mapper is available for each modality in the core vLLM library. 
This input mapper will be used if you do not provide your own function. ```{seealso} -{ref}`input_processing_pipeline` +[input_processing_pipeline](#input-processing-pipeline) ``` ## 3. Register maximum number of multi-modal tokens @@ -82,7 +82,7 @@ Here are some examples: - Image inputs (dynamic feature size): [LLaVA-NeXT Model](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava_next.py) ```{seealso} -{ref}`input_processing_pipeline` +[input_processing_pipeline](#input-processing-pipeline) ``` ## 4. (Optional) Register dummy data @@ -111,7 +111,7 @@ Here are some examples: - Image inputs (dynamic feature size): [LLaVA-NeXT Model](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava_next.py) ```{seealso} -{ref}`input_processing_pipeline` +[input_processing_pipeline](#input-processing-pipeline) ``` ## 5. (Optional) Register input processor @@ -139,5 +139,5 @@ Here are some examples: - Insert dynamic number of image tokens: [LLaVA-NeXT Model](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava_next.py) ```{seealso} -{ref}`input_processing_pipeline` +[input_processing_pipeline](#input-processing-pipeline) ``` diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md index 33ebc1244cd21..7582b21bdc2dc 100644 --- a/docs/source/models/generative_models.md +++ b/docs/source/models/generative_models.md @@ -132,7 +132,7 @@ A code example can be found in [examples/openai_completion_client.py](https://gi ### Chat API -Our Chat API is similar to `LLM.chat`, accepting both text and {ref}`multi-modal inputs `. +Our Chat API is similar to `LLM.chat`, accepting both text and [multi-modal inputs](#multimodal-inputs). It is compatible with [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat) so that you can use OpenAI client to interact with it. 
A code example can be found in [examples/openai_chat_completion_client.py](https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_completion_client.py). diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md index caab85f5be8c1..ad5b37aaf457c 100644 --- a/docs/source/models/pooling_models.md +++ b/docs/source/models/pooling_models.md @@ -10,7 +10,7 @@ before returning them. ```{note} We currently support pooling models primarily as a matter of convenience. -As shown in the {ref}`Compatibility Matrix `, most vLLM features are not applicable to +As shown in the [Compatibility Matrix](#compatibility-matrix), most vLLM features are not applicable to pooling models as they only work on the generation or decode stage, so performance may not improve as much. ``` @@ -76,7 +76,7 @@ Please click on the above link for more details on how to launch the server. ### Embeddings API -Our Embeddings API is similar to `LLM.encode`, accepting both text and {ref}`multi-modal inputs `. +Our Embeddings API is similar to `LLM.encode`, accepting both text and [multi-modal inputs](#multimodal-inputs). The text-only API is compatible with [OpenAI Embeddings API](https://platform.openai.com/docs/api-reference/embeddings) so that you can use OpenAI client to interact with it. @@ -84,9 +84,9 @@ A code example can be found in [examples/openai_embedding_client.py](https://git The multi-modal API is an extension of the [OpenAI Embeddings API](https://platform.openai.com/docs/api-reference/embeddings) that incorporates [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat), -so it is not part of the OpenAI standard. Please see {ref}`this page ` for more details on how to use it. +so it is not part of the OpenAI standard. Please see [this page](#multimodal-inputs) for more details on how to use it. ### Score API Our Score API is similar to `LLM.score`. 
-Please see [this page](../serving/openai_compatible_server.html#score-api-for-cross-encoder-models) for more details on how to use it. +Please see [this page](../serving/openai_compatible_server.md#score-api-for-cross-encoder-models) for more details on how to use it. diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index b8a1b880589a3..59bc7d2606626 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -37,8 +37,7 @@ print(output) If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported. ```` -Otherwise, please refer to {ref}`Adding a New Model ` and {ref}`Enabling Multimodal Inputs ` -for instructions on how to implement your model in vLLM. +Otherwise, please refer to [Adding a New Model](#adding-a-new-model) and [Enabling Multimodal Inputs](#enabling-multimodal-inputs) for instructions on how to implement your model in vLLM. Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support. ### ModelScope @@ -69,7 +68,7 @@ print(output) ### Generative Models -See {ref}`this page ` for more information on how to use generative models. +See [this page](#generative-models) for more information on how to use generative models. #### Text Generation (`--task generate`) @@ -81,8 +80,8 @@ See {ref}`this page ` for more information on how to use gene * - Architecture - Models - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` + - [LoRA](#lora) + - [PP](#distributed-serving) * - :code:`AquilaForCausalLM` - Aquila, Aquila2 - :code:`BAAI/Aquila-7B`, :code:`BAAI/AquilaChat-7B`, etc. @@ -356,7 +355,7 @@ Currently, the ROCm version of vLLM supports Mistral and Mixtral only for contex ### Pooling Models -See {ref}`this page ` for more information on how to use pooling models. 
+See [this page](#pooling-models) for more information on how to use pooling models. ```{important} Since some model architectures support both generative and pooling tasks, @@ -381,8 +380,8 @@ The following table lists those that are tested in vLLM. * - Architecture - Models - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` + - [LoRA](#lora) + - [PP](#distributed-serving) * - :code:`BertModel` - BERT-based - :code:`BAAI/bge-base-en-v1.5`, etc. @@ -438,8 +437,8 @@ despite being described otherwise on its model card. * - Architecture - Models - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` + - [LoRA](#lora) + - [PP](#distributed-serving) * - :code:`LlamaForCausalLM` - Llama-based - :code:`peiyi9979/math-shepherd-mistral-7b-prm`, etc. @@ -467,8 +466,8 @@ e.g.: {code}`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 1 * - Architecture - Models - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` + - [LoRA](#lora) + - [PP](#distributed-serving) * - :code:`Qwen2ForSequenceClassification` - Qwen2-based - :code:`jason9693/Qwen2.5-1.5B-apeach`, etc. @@ -486,8 +485,8 @@ e.g.: {code}`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 1 * - Architecture - Models - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` + - [LoRA](#lora) + - [PP](#distributed-serving) * - :code:`BertForSequenceClassification` - BERT-based - :code:`cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. @@ -524,11 +523,11 @@ On the other hand, modalities separated by {code}`/` are mutually exclusive. - e.g.: {code}`T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs. -See {ref}`this page ` on how to pass multi-modal inputs to the model. +See [this page](#multimodal-inputs) on how to pass multi-modal inputs to the model. ### Generative Models -See {ref}`this page ` for more information on how to use generative models. +See [this page](#generative-models) for more information on how to use generative models. 
#### Text Generation (`--task generate`) @@ -541,8 +540,8 @@ See {ref}`this page ` for more information on how to use gene - Models - Inputs - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` + - [LoRA](#lora) + - [PP](#distributed-serving) - V1 * - :code:`AriaForConditionalGeneration` - Aria @@ -749,7 +748,7 @@ For more details, please see: ` for more information on how to use pooling models. +See [this page](#pooling-models) for more information on how to use pooling models. ```{important} Since some model architectures support both generative and pooling tasks, @@ -775,8 +774,8 @@ The following table lists those that are tested in vLLM. - Models - Inputs - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` + - [LoRA](#lora) + - [PP](#distributed-serving) * - :code:`LlavaNextForConditionalGeneration` - LLaVA-NeXT-based - T / I diff --git a/docs/source/performance/benchmarks.md b/docs/source/performance/benchmarks.md index 7859c4ed2353d..50ef4a1f3b54d 100644 --- a/docs/source/performance/benchmarks.md +++ b/docs/source/performance/benchmarks.md @@ -4,8 +4,8 @@ vLLM contains two sets of benchmarks: -- {ref}`Performance benchmarks ` -- {ref}`Nightly benchmarks ` +- [Performance benchmarks](#performance-benchmarks) +- [Nightly benchmarks](#nightly-benchmarks) (performance-benchmarks)= diff --git a/docs/source/serving/deploying_with_nginx.md b/docs/source/serving/deploying_with_nginx.md index e06182bf4a32b..4d766a2bb5c8a 100644 --- a/docs/source/serving/deploying_with_nginx.md +++ b/docs/source/serving/deploying_with_nginx.md @@ -6,13 +6,13 @@ This document shows how to launch multiple vLLM serving containers and use Nginx Table of contents: -1. {ref}`Build Nginx Container ` -2. {ref}`Create Simple Nginx Config file ` -3. {ref}`Build vLLM Container ` -4. {ref}`Create Docker Network ` -5. {ref}`Launch vLLM Containers ` -6. {ref}`Launch Nginx ` -7. {ref}`Verify That vLLM Servers Are Ready ` +1. [Build Nginx Container](#nginxloadbalancer-nginx-build) +2. 
[Create Simple Nginx Config file](#nginxloadbalancer-nginx-conf) +3. [Build vLLM Container](#nginxloadbalancer-nginx-vllm-container) +4. [Create Docker Network](#nginxloadbalancer-nginx-docker-network) +5. [Launch vLLM Containers](#nginxloadbalancer-nginx-launch-container) +6. [Launch Nginx](#nginxloadbalancer-nginx-launch-nginx) +7. [Verify That vLLM Servers Are Ready](#nginxloadbalancer-nginx-verify-nginx) (nginxloadbalancer-nginx-build)= diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index f9271bdd2e183..b010b976fec44 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -342,7 +342,7 @@ to override which format to use. The `serve` module can also accept arguments from a config file in `yaml` format. The arguments in the yaml must be specified using the -long form of the argument outlined [here](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#command-line-arguments-for-the-server): +long form of the argument outlined [here](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#cli-reference): For example: diff --git a/docs/source/usage/compatibility_matrix.md b/docs/source/usage/compatibility_matrix.md index f339a84ed12e9..ff9b790c0d54c 100644 --- a/docs/source/usage/compatibility_matrix.md +++ b/docs/source/usage/compatibility_matrix.md @@ -31,11 +31,11 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar :widths: auto * - Feature - - :ref:`CP ` - - :ref:`APC ` - - :ref:`LoRA ` + - [CP](#chunked-prefill) + - [APC](#apc) + - [LoRA](#lora) - :abbr:`prmpt adptr (Prompt Adapter)` - - :ref:`SD ` + - [SD](#spec-decode) - CUDA graph - :abbr:`pooling (Pooling Models)` - :abbr:`enc-dec (Encoder-Decoder Models)` @@ -47,7 +47,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - best-of - beam-search - :abbr:`guided dec (Guided Decoding)` - * - 
:ref:`CP ` + * - [CP](#chunked-prefill) - - - @@ -64,7 +64,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - - - - * - :ref:`APC ` + * - [APC](#apc) - ✅ - - @@ -81,7 +81,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - - - - * - :ref:`LoRA ` + * - [LoRA](#lora) - `✗ `__ - ✅ - @@ -115,7 +115,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - - - - * - :ref:`SD ` + * - [SD](#spec-decode) - ✅ - ✅ - ✗ @@ -337,7 +337,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - Hopper - CPU - AMD - * - :ref:`CP ` + * - [CP](#chunked-prefill) - `✗ `__ - ✅ - ✅ @@ -345,7 +345,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✅ - ✅ - * - :ref:`APC ` + * - [APC](#apc) - `✗ `__ - ✅ - ✅ @@ -353,7 +353,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✅ - ✅ - * - :ref:`LoRA ` + * - [LoRA](#lora) - ✅ - ✅ - ✅ @@ -369,7 +369,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - `✗ `__ - ✅ - * - :ref:`SD ` + * - [SD](#spec-decode) - ✅ - ✅ - ✅ diff --git a/docs/source/usage/faq.md b/docs/source/usage/faq.md index c388939c8dd71..fde2954f10c59 100644 --- a/docs/source/usage/faq.md +++ b/docs/source/usage/faq.md @@ -11,7 +11,7 @@ ______________________________________________________________________ > Q: Which model to use for offline inference embedding? A: You can try [e5-mistral-7b-instruct](https://huggingface.co/intfloat/e5-mistral-7b-instruct) and [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5); -more are listed {ref}`here `. +more are listed [here](#supported-models). 
By extracting hidden states, vLLM can automatically convert text generation models like [Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B), [Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) into embedding models, diff --git a/docs/source/usage/multimodal_inputs.md b/docs/source/usage/multimodal_inputs.md index 86a003153b633..ba307f6684e08 100644 --- a/docs/source/usage/multimodal_inputs.md +++ b/docs/source/usage/multimodal_inputs.md @@ -2,7 +2,7 @@ # Multimodal Inputs -This page teaches you how to pass multi-modal inputs to {ref}`multi-modal models ` in vLLM. +This page teaches you how to pass multi-modal inputs to [multi-modal models](#supported-mm-models) in vLLM. ```{note} We are actively iterating on multi-modal support. See [this RFC](https://github.com/vllm-project/vllm/issues/4194) for upcoming changes, diff --git a/docs/source/usage/structured_outputs.md b/docs/source/usage/structured_outputs.md index 23104817a2711..e0a1ae8b521ab 100644 --- a/docs/source/usage/structured_outputs.md +++ b/docs/source/usage/structured_outputs.md @@ -18,7 +18,7 @@ The following parameters are supported, which must be added as extra parameters: - `guided_whitespace_pattern`: used to override the default whitespace pattern for guided json decoding. - `guided_decoding_backend`: used to select the guided decoding backend to use. -You can see the complete list of supported parameters on the [OpenAI Compatible Server](/../serving/openai_compatible_server.html) page. +You can see the complete list of supported parameters on the [OpenAI Compatible Server](../serving/openai_compatible_server.md) page. 
Now let´s see an example for each of the cases, starting with the `guided_choice`, as it´s the easiest one: diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 9be30c635cb2c..34b3101ec65e5 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -149,7 +149,7 @@ class LLMEngine: and the :class:`AsyncLLMEngine` class wraps this class for online serving. The config arguments are derived from :class:`~vllm.EngineArgs`. (See - :ref:`engine_args`) + [engine-args](#engine-args)) Args: model_config: The configuration related to the LLM model. diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 0bec978c4869c..1ec047afb83fe 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -113,7 +113,7 @@ class LLM: integer, it is used as the level of compilation optimization. If it is a dictionary, it can specify the full compilation configuration. **kwargs: Arguments for :class:`~vllm.EngineArgs`. (See - :ref:`engine_args`) + [engine-args](#engine-args)) Note: This class is intended to be used for offline inference. For online diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py index d4402e77a3886..4ebd8201b66bf 100644 --- a/vllm/inputs/__init__.py +++ b/vllm/inputs/__init__.py @@ -13,7 +13,7 @@ to dispatch data processing according to the target model. See also: - :ref:`input_processing_pipeline` + [input_processing_pipeline](#input-processing-pipeline) """ __all__ = [ diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 0dfed3b7e61bf..7ab00f52f4a01 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -224,7 +224,7 @@ def dummy_data_for_profiling( The model is identified by ``model_config``. 
See also: - :ref:`enabling_multimodal_inputs` + [enabling-multimodal-inputs](#enabling-multimodal-inputs) Note: This should be called after @@ -301,7 +301,7 @@ def register_input_processor(self, processor: InputProcessor): happens before :meth:`~vllm.multimodal.MultiModalRegistry.map_input`. See also: - :ref:`input_processing_pipeline` + [input_processing_pipeline](#input-processing-pipeline) """ def wrapper(model_cls: N) -> N: @@ -344,7 +344,7 @@ def process_input(self, model_config: "ModelConfig", The model is identified by ``model_config``. See also: - :ref:`input_processing_pipeline` + [input_processing_pipeline](#input-processing-pipeline) """ # Avoid circular import from vllm.model_executor.model_loader import get_model_architecture diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index 928c31a2f2843..fcf2c67929f9e 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -11,7 +11,7 @@ dispatch data processing according to its modality and the target model. See also: - :ref:`input_processing_pipeline` + [input_processing_pipeline](#input_processing_pipeline) """ __all__ = [ diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 7dba94b885b6d..7a30998543b22 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -50,7 +50,7 @@ class MultiModalPlugin(ABC): (i.e., the modality of the data). See also: - :ref:`adding_multimodal_plugin` + [adding_multimodal_plugin](#adding-multimodal-plugin) """ def __init__(self) -> None: @@ -94,8 +94,8 @@ def register_input_mapper( If `None` is provided, then the default input mapper is used instead. See also: - - :ref:`input_processing_pipeline` - - :ref:`enabling_multimodal_inputs` + - [input-processing-pipeline](#input-processing-pipeline) + - [enabling-multimodal-inputs](#enabling-multimodal-inputs) """ def wrapper(model_cls: N) -> N: @@ -130,8 +130,8 @@ def map_input( TypeError: If the data type is not supported. 
See also: - - :ref:`input_processing_pipeline` - - :ref:`enabling_multimodal_inputs` + - [input-processing-pipeline](#input-processing-pipeline) + - [enabling-multimodal-inputs](#enabling-multimodal-inputs) """ # Avoid circular import @@ -190,7 +190,7 @@ def register_max_multimodal_tokens( If `None` is provided, then the default calculation is used instead. See also: - :ref:`enabling_multimodal_inputs` + [enabling-multimodal-inputs](#enabling-multimodal-inputs) """ def wrapper(model_cls: N) -> N: @@ -222,7 +222,7 @@ def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int: The model is identified by ``model_config``. See also: - :ref:`enabling_multimodal_inputs` + [enabling-multimodal-inputs](#enabling-multimodal-inputs) """ # Avoid circular import from vllm.model_executor.model_loader import get_model_architecture diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 229a8fbdf5831..cbed77915008b 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -74,7 +74,7 @@ class MultiModalDataBuiltins(TypedDict, total=False): This dictionary also accepts modality keys defined outside :class:`MultiModalDataBuiltins` as long as a customized plugin is registered through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`. - Read more on that :ref:`here `. + Read more on that [here](#adding-multimodal-plugin). """ diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 922c83b6fd8a9..65f65bd3b5b3a 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -107,7 +107,7 @@ class MultiModalProcessingMetadataBuiltins(TypedDict, total=False): This dictionary also accepts modality keys defined outside :class:`MultiModalProcessingMetadataBuiltins` as long as a customized plugin is registered through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`. - Read more on that :ref:`here `. + Read more on that [here](#adding-multimodal-plugin). 
""" diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 6ab6c0fe2f12e..208f1a70e24ca 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -76,7 +76,7 @@ def register_plugin(self, plugin: MultiModalPlugin) -> None: Register a multi-modal plugin so it can be recognized by vLLM. See also: - :ref:`adding_multimodal_plugin` + [adding_multimodal_plugin](#adding-multimodal-plugin) """ data_type_key = plugin.get_data_key() @@ -294,8 +294,8 @@ def register_processor( invoked to transform the data into a dictionary of model inputs. See also: - - :ref:`input_processing_pipeline` - - :ref:`enabling_multimodal_inputs` + - [input-processing-pipeline](#input-processing-pipeline) + - [enabling-multimodal-inputs](#enabling-multimodal-inputs) """ def wrapper(model_cls: N) -> N: diff --git a/vllm/scripts.py b/vllm/scripts.py index a51c21cfa29e7..42e1c639eda10 100644 --- a/vllm/scripts.py +++ b/vllm/scripts.py @@ -165,7 +165,7 @@ def main(): required=False, help="Read CLI options from a config file." 
"Must be a YAML with the following options:" - "https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#command-line-arguments-for-the-server" + "https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#cli-reference" ) serve_parser = make_arg_parser(serve_parser) serve_parser.set_defaults(dispatch_function=serve) From d7db80bc071f85a57a7c8fb4c7d791e47f3c86a7 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Tue, 17 Dec 2024 17:29:53 -0500 Subject: [PATCH 08/27] Fixes more targets Signed-off-by: Rafael Vasquez --- docs/source/design/plugin_system.md | 2 +- docs/source/models/enabling_multimodal_inputs.md | 8 ++++---- docs/source/models/generative_models.md | 2 +- docs/source/models/pooling_models.md | 4 ++-- docs/source/serving/deploying_with_nginx.md | 12 ++++++------ docs/source/serving/openai_compatible_server.md | 2 ++ docs/source/usage/structured_outputs.md | 2 +- vllm/inputs/__init__.py | 2 +- vllm/inputs/registry.py | 4 ++-- vllm/multimodal/__init__.py | 2 +- 10 files changed, 21 insertions(+), 19 deletions(-) diff --git a/docs/source/design/plugin_system.md b/docs/source/design/plugin_system.md index 147b5cbd58bc3..79aff757518f2 100644 --- a/docs/source/design/plugin_system.md +++ b/docs/source/design/plugin_system.md @@ -6,7 +6,7 @@ The community frequently requests the ability to extend vLLM with custom feature ## How Plugins Work in vLLM -Plugins are user-registered code that vLLM executes. Given vLLM's architecture (see {ref}`arch_overview`), multiple processes may be involved, especially when using distributed inference with various parallelism techniques. To enable plugins successfully, every process created by vLLM needs to load the plugin. This is done by the [load_general_plugins](https://github.com/vllm-project/vllm/blob/c76ac49d266e27aa3fea84ef2df1f813d24c91c7/vllm/plugins/__init__.py#L16) function in the `vllm.plugins` module. This function is called for every process created by vLLM before it starts any work. 
+Plugins are user-registered code that vLLM executes. Given vLLM's architecture (see [](#arch-overview)), multiple processes may be involved, especially when using distributed inference with various parallelism techniques. To enable plugins successfully, every process created by vLLM needs to load the plugin. This is done by the [load_general_plugins](https://github.com/vllm-project/vllm/blob/c76ac49d266e27aa3fea84ef2df1f813d24c91c7/vllm/plugins/__init__.py#L16) function in the `vllm.plugins` module. This function is called for every process created by vLLM before it starts any work. ## How vLLM Discovers Plugins diff --git a/docs/source/models/enabling_multimodal_inputs.md b/docs/source/models/enabling_multimodal_inputs.md index 17c7e7af89ee5..c4a1402d7346d 100644 --- a/docs/source/models/enabling_multimodal_inputs.md +++ b/docs/source/models/enabling_multimodal_inputs.md @@ -57,7 +57,7 @@ This decorator accepts a function that maps multi-modal inputs to the keyword ar A default mapper is available for each modality in the core vLLM library. This input mapper will be used if you do not provide your own function. ```{seealso} -[input_processing_pipeline](#input-processing-pipeline) +[input-processing-pipeline](#input-processing-pipeline) ``` ## 3. Register maximum number of multi-modal tokens @@ -82,7 +82,7 @@ Here are some examples: - Image inputs (dynamic feature size): [LLaVA-NeXT Model](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava_next.py) ```{seealso} -[input_processing_pipeline](#input-processing-pipeline) +[input-processing-pipeline](#input-processing-pipeline) ``` ## 4. (Optional) Register dummy data @@ -111,7 +111,7 @@ Here are some examples: - Image inputs (dynamic feature size): [LLaVA-NeXT Model](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava_next.py) ```{seealso} -[input_processing_pipeline](#input-processing-pipeline) +[input-processing-pipeline](#input-processing-pipeline) ``` ## 5. 
(Optional) Register input processor @@ -139,5 +139,5 @@ Here are some examples: - Insert dynamic number of image tokens: [LLaVA-NeXT Model](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava_next.py) ```{seealso} -[input_processing_pipeline](#input-processing-pipeline) +[input-processing-pipeline](#input-processing-pipeline) ``` diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md index 7582b21bdc2dc..7aeaba855dcfb 100644 --- a/docs/source/models/generative_models.md +++ b/docs/source/models/generative_models.md @@ -11,7 +11,7 @@ which are then passed through {class}`~vllm.model_executor.layers.Sampler` to ob ## Offline Inference The {class}`~vllm.LLM` class provides various methods for offline inference. -See {ref}`Engine Arguments ` for a list of options when initializing the model. +See [Engine Arguments](#engine-args) for a list of options when initializing the model. For generative models, the only supported {code}`task` option is {code}`"generate"`. Usually, this is automatically inferred so you don't have to specify it. diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md index ad5b37aaf457c..f421707434280 100644 --- a/docs/source/models/pooling_models.md +++ b/docs/source/models/pooling_models.md @@ -17,7 +17,7 @@ pooling models as they only work on the generation or decode stage, so performan ## Offline Inference The {class}`~vllm.LLM` class provides various methods for offline inference. -See {ref}`Engine Arguments ` for a list of options when initializing the model. +See [Engine Arguments](#engine-args) for a list of options when initializing the model. For pooling models, we support the following {code}`task` options: @@ -89,4 +89,4 @@ so it is not part of the OpenAI standard. Please see [this page](#multimodal-inp ### Score API Our Score API is similar to `LLM.score`. 
-Please see [this page](../serving/openai_compatible_server#score-api-for-cross-encoder-models) for more details on how to use it. +Please see [this page](#score-api-for-cross-encoder-models) for more details on how to use it. diff --git a/docs/source/serving/deploying_with_nginx.md b/docs/source/serving/deploying_with_nginx.md index 4d766a2bb5c8a..a1f00d8536465 100644 --- a/docs/source/serving/deploying_with_nginx.md +++ b/docs/source/serving/deploying_with_nginx.md @@ -7,12 +7,12 @@ This document shows how to launch multiple vLLM serving containers and use Nginx Table of contents: 1. [Build Nginx Container](#nginxloadbalancer-nginx-build) -2. [Create Simple Nginx Config file](#nginxloadbalancer_nginx_conf) -3. [Build vLLM Container](#nginxloadbalancer_nginx_vllm_container) -4. [Create Docker Network](#nginxloadbalancer_nginx_docker_network) -5. [Launch vLLM Containers](#nginxloadbalancer_nginx_launch_container) -6. [Launch Nginx](#nginxloadbalancer_nginx_launch_nginx) -7. [Verify That vLLM Servers Are Ready](#nginxloadbalancer_nginx_verify_nginx) +2. [Create Simple Nginx Config file](#nginxloadbalancer-nginx-conf) +3. [Build vLLM Container](#nginxloadbalancer-nginx-vllm-container) +4. [Create Docker Network](#nginxloadbalancer-nginx-docker-network) +5. [Launch vLLM Containers](#nginxloadbalancer-nginx-launch-container) +6. [Launch Nginx](#nginxloadbalancer-nginx-launch-nginx) +7. [Verify That vLLM Servers Are Ready](#nginxloadbalancer-nginx-verify-nginx) (nginxloadbalancer-nginx-build)= diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index b010b976fec44..70c010dd2c76e 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -44,6 +44,8 @@ We currently support the following OpenAI APIs: - This enables multi-modal inputs to be passed to embedding models, see [this page](../usage/multimodal_inputs.md) for details. 
- *Note: You should run `vllm serve` with `--task embedding` to ensure that the model is being run in embedding mode.* +(score-api-for-cross-encoder-models)= + ## Score API for Cross Encoder Models vLLM supports *cross encoders models* at the **/v1/score** endpoint, which is not an OpenAI API standard endpoint. You can find the documentation for these kind of models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html). diff --git a/docs/source/usage/structured_outputs.md b/docs/source/usage/structured_outputs.md index e0a1ae8b521ab..14dd387743aac 100644 --- a/docs/source/usage/structured_outputs.md +++ b/docs/source/usage/structured_outputs.md @@ -18,7 +18,7 @@ The following parameters are supported, which must be added as extra parameters: - `guided_whitespace_pattern`: used to override the default whitespace pattern for guided json decoding. - `guided_decoding_backend`: used to select the guided decoding backend to use. -You can see the complete list of supported parameters on the [OpenAI Compatible Server](/..serving/openai_compatible_server.md) page. +You can see the complete list of supported parameters on the [OpenAI Compatible Server](../serving/openai_compatible_server.md) page. Now let´s see an example for each of the cases, starting with the `guided_choice`, as it´s the easiest one: diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py index 4ebd8201b66bf..dc61049fe6231 100644 --- a/vllm/inputs/__init__.py +++ b/vllm/inputs/__init__.py @@ -13,7 +13,7 @@ to dispatch data processing according to the target model. 
See also: - [input_processing_pipeline](#input-processing-pipeline) + [input-processing-pipeline](#input-processing-pipeline) """ __all__ = [ diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 7ab00f52f4a01..0bf1f87444860 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -301,7 +301,7 @@ def register_input_processor(self, processor: InputProcessor): happens before :meth:`~vllm.multimodal.MultiModalRegistry.map_input`. See also: - [input_processing_pipeline](#input-processing-pipeline) + [input-processing-pipeline](#input-processing-pipeline) """ def wrapper(model_cls: N) -> N: @@ -344,7 +344,7 @@ def process_input(self, model_config: "ModelConfig", The model is identified by ``model_config``. See also: - [input_processing_pipeline](#input-processing-pipeline) + [input-processing-pipeline](#input-processing-pipeline) """ # Avoid circular import from vllm.model_executor.model_loader import get_model_architecture diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index fcf2c67929f9e..fce71e25e6fe5 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -11,7 +11,7 @@ dispatch data processing according to its modality and the target model. 
See also: - [input_processing_pipeline](#input_processing_pipeline) + [input-processing-pipeline](#input-processing-pipeline) """ __all__ = [ From bdc99b1795d120e238e48ca3dbbded04f615603f Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Tue, 17 Dec 2024 17:42:24 -0500 Subject: [PATCH 09/27] Fixes relative link Signed-off-by: Rafael Vasquez --- docs/source/getting_started/cpu-installation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/getting_started/cpu-installation.md b/docs/source/getting_started/cpu-installation.md index 8a1ec7345befe..4ab5437f091d5 100644 --- a/docs/source/getting_started/cpu-installation.md +++ b/docs/source/getting_started/cpu-installation.md @@ -151,4 +151,4 @@ $ python examples/offline_inference.py $ VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp ``` - - Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](../serving/deploying_with_nginx.html) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md). + - Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](../serving/deploying_with_nginx) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md). 
From 1bfa5d5b1aba7a487a2df34ca65d6ddd99f89b56 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Tue, 17 Dec 2024 19:02:38 -0500 Subject: [PATCH 10/27] Fix compability matrix formatting Signed-off-by: Rafael Vasquez --- docs/source/usage/compatibility_matrix.md | 74 ++++++++++++----------- 1 file changed, 38 insertions(+), 36 deletions(-) diff --git a/docs/source/usage/compatibility_matrix.md b/docs/source/usage/compatibility_matrix.md index ff9b790c0d54c..469ff7f4764df 100644 --- a/docs/source/usage/compatibility_matrix.md +++ b/docs/source/usage/compatibility_matrix.md @@ -28,26 +28,27 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar ```{eval-rst} .. list-table:: :header-rows: 1 + :stub-columns: 1 :widths: auto * - Feature - - [CP](#chunked-prefill) - - [APC](#apc) - - [LoRA](#lora) - - :abbr:`prmpt adptr (Prompt Adapter)` - - [SD](#spec-decode) + - Chunked-Prefill + - APC + - LoRA + - Prompt Adapter + - Speculative Decoding - CUDA graph - - :abbr:`pooling (Pooling Models)` - - :abbr:`enc-dec (Encoder-Decoder Models)` - - :abbr:`logP (Logprobs)` - - :abbr:`prmpt logP (Prompt Logprobs)` - - :abbr:`async output (Async Output Processing)` + - Pooling Models + - Encoder-Decoder + - Logprobs + - Prompt Logprobs + - Async Output Processing - multi-step - - :abbr:`mm (Multimodal Inputs)` + - Multimodal Inputs - best-of - beam-search - - :abbr:`guided dec (Guided Decoding)` - * - [CP](#chunked-prefill) + - Guided Decoding + * - Chunked-Prefill - - - @@ -64,7 +65,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - - - - * - [APC](#apc) + * - APC - ✅ - - @@ -81,7 +82,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - - - - * - [LoRA](#lora) + * - LoRA - `✗ `__ - ✅ - @@ -98,7 +99,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - - - - * - :abbr:`prmpt adptr (Prompt Adapter)` + * - Prompt Adapter - ✅ - ✅ - ✅ @@ -115,7 +116,7 @@ Check the 
'✗' with links to see tracking issue for unsupported feature/hardwar - - - - * - [SD](#spec-decode) + * - Speculative Decoding - ✅ - ✅ - ✗ @@ -149,7 +150,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - - - - * - :abbr:`pooling (Pooling Models)` + * - Pooling Models - ✗ - ✗ - ✗ @@ -166,7 +167,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - - - - * - :abbr:`enc-dec (Encoder-Decoder Models)` + * - Encoder-Decoder - ✗ - `✗ `__ - ✗ @@ -183,7 +184,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - - - - * - :abbr:`logP (Logprobs)` + * - Logprobs - ✅ - ✅ - ✅ @@ -200,7 +201,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - - - - * - :abbr:`prmpt logP (Prompt Logprobs)` + * - Prompt Logprobs - ✅ - ✅ - ✅ @@ -217,7 +218,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - - - - * - :abbr:`async output (Async Output Processing)` + * - Async Output Processing - ✅ - ✅ - ✅ @@ -251,7 +252,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - - - - * - :abbr:`mm (Multimodal Inputs)` + * - Multimodal Inputs - ✅ - `✗ `__ - `✗ `__ @@ -302,7 +303,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - - - * - :abbr:`guided dec (Guided Decoding)` + * - Guided Decoding - ✅ - ✅ - ? @@ -327,6 +328,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar ```{eval-rst} .. 
list-table:: :header-rows: 1 + :stub-columns: 1 :widths: auto * - Feature @@ -337,7 +339,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - Hopper - CPU - AMD - * - [CP](#chunked-prefill) + * - Chunked-Prefill - `✗ `__ - ✅ - ✅ @@ -345,7 +347,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✅ - ✅ - * - [APC](#apc) + * - APC - `✗ `__ - ✅ - ✅ @@ -353,7 +355,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✅ - ✅ - * - [LoRA](#lora) + * - LoRA - ✅ - ✅ - ✅ @@ -361,7 +363,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - `✗ `__ - ✅ - * - :abbr:`prmpt adptr (Prompt Adapter)` + * - Prompt Adapter - ✅ - ✅ - ✅ @@ -369,7 +371,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - `✗ `__ - ✅ - * - [SD](#spec-decode) + * - Speculative Decoding - ✅ - ✅ - ✅ @@ -385,7 +387,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✗ - ✅ - * - :abbr:`pooling (Pooling Models)` + * - Pooling Models - ✅ - ✅ - ✅ @@ -393,7 +395,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✅ - ? 
- * - :abbr:`enc-dec (Encoder-Decoder Models)` + * - Encoder-Decoder - ✅ - ✅ - ✅ @@ -401,7 +403,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✅ - ✗ - * - :abbr:`mm (Multimodal Inputs)` + * - Multimodal Inputs - ✅ - ✅ - ✅ @@ -409,7 +411,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✅ - ✅ - * - :abbr:`logP (Logprobs)` + * - Logprobs - ✅ - ✅ - ✅ @@ -417,7 +419,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✅ - ✅ - * - :abbr:`prmpt logP (Prompt Logprobs)` + * - Prompt Logprobs - ✅ - ✅ - ✅ @@ -425,7 +427,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✅ - ✅ - * - :abbr:`async output (Async Output Processing)` + * - Async Output Processing - ✅ - ✅ - ✅ @@ -457,7 +459,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✅ - ✅ - * - :abbr:`guided dec (Guided Decoding)` + * - Guided Decoding - ✅ - ✅ - ✅ From 238067039f3dd8d63b709b900d366585daaf128d Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Wed, 18 Dec 2024 13:05:21 -0500 Subject: [PATCH 11/27] Fix refs Signed-off-by: Rafael Vasquez --- docs/source/models/pooling_models.md | 10 +++++----- .../serving/openai_compatible_server.md | 20 +++++++++---------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md index 41a042da5510f..3e103b99e9b2a 100644 --- a/docs/source/models/pooling_models.md +++ b/docs/source/models/pooling_models.md @@ -10,14 +10,14 @@ before returning them. ```{note} We currently support pooling models primarily as a matter of convenience. 
-As shown in the {ref}`Compatibility Matrix `, most vLLM features are not applicable to +As shown in the [Compatibility Matrix](#compatibility-matrix), most vLLM features are not applicable to pooling models as they only work on the generation or decode stage, so performance may not improve as much. ``` ## Offline Inference The {class}`~vllm.LLM` class provides various methods for offline inference. -See {ref}`Engine Arguments ` for a list of options when initializing the model. +See [Engine Arguments](#engine-args) for a list of options when initializing the model. For pooling models, we support the following {code}`task` options: @@ -106,12 +106,12 @@ A code example can be found in [examples/offline_inference_scoring.py](https://g ## Online Inference -Our [OpenAI Compatible Server](../serving/openai_compatible_server) can be used for online inference. +Our [OpenAI Compatible Server](../serving/openai_compatible_server.md) can be used for online inference. Please click on the above link for more details on how to launch the server. ### Embeddings API -Our Embeddings API is similar to `LLM.embed`, accepting both text and {ref}`multi-modal inputs `. +Our Embeddings API is similar to `LLM.embed`, accepting both text and [multi-modal inputs](#multimodal-inputs). The text-only API is compatible with [OpenAI Embeddings API](https://platform.openai.com/docs/api-reference/embeddings) so that you can use OpenAI client to interact with it. @@ -119,7 +119,7 @@ A code example can be found in [examples/openai_embedding_client.py](https://git The multi-modal API is an extension of the [OpenAI Embeddings API](https://platform.openai.com/docs/api-reference/embeddings) that incorporates [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat), -so it is not part of the OpenAI standard. Please see {ref}`this page ` for more details on how to use it. +so it is not part of the OpenAI standard. 
Please see [](#multimodal-inputs) for more details on how to use it. ### Score API diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 8712d82ca075e..e1c3a6cc6cce9 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -30,20 +30,20 @@ print(completion.choices[0].message) We currently support the following OpenAI APIs: - [Completions API](#completions-api) (`/v1/completions`) - - Only applicable to [text generation models](../models/generative_models.rst) (`--task generate`). + - Only applicable to [text generation models](../models/generative_models.md) (`--task generate`). - *Note: `suffix` parameter is not supported.* - [Chat Completions API](#chat-api) (`/v1/chat/completions`) - - Only applicable to [text generation models](../models/generative_models.rst) (`--task generate`) with a [chat template](#chat-template). + - Only applicable to [text generation models](../models/generative_models.md) (`--task generate`) with a [chat template](#chat-template). - *Note: `parallel_tool_calls` and `user` parameters are ignored.* - [Embeddings API](#embeddings-api) (`/v1/embeddings`) - - Only applicable to [embedding models](../models/pooling_models.rst) (`--task embed`). + - Only applicable to [embedding models](../models/pooling_models.md) (`--task embed`). In addition, we have the following custom APIs: - [Tokenizer API](#tokenizer-api) (`/tokenize`, `/detokenize`) - Applicable to any model with a tokenizer. - [Score API](#score-api) (`/score`) - - Only applicable to [cross-encoder models](../models/pooling_models.rst) (`--task score`). + - Only applicable to [cross-encoder models](../models/pooling_models.md) (`--task score`). 
(chat-template)= ## Chat Template @@ -183,7 +183,7 @@ Refer to [OpenAI's API reference](https://platform.openai.com/docs/api-reference #### Extra parameters -The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. +The following [sampling parameters (click through to see documentation)](../dev/sampling_params.md) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python @@ -206,12 +206,12 @@ Refer to [OpenAI's API reference](https://platform.openai.com/docs/api-reference We support both [Vision](https://platform.openai.com/docs/guides/vision)- and [Audio](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in)-related parameters; -see our [Multimodal Inputs](../usage/multimodal_inputs.rst) guide for more information. +see our [Multimodal Inputs](../usage/multimodal_inputs.md) guide for more information. - *Note: `image_url.detail` parameter is not supported.* #### Extra parameters -The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. +The following [sampling parameters (click through to see documentation)](../dev/sampling_params.md) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python @@ -236,12 +236,12 @@ If the model has a [chat template](#chat-template), you can replace `inputs` wit which will be treated as a single prompt to the model. ```{tip} -This enables multi-modal inputs to be passed to embedding models, see [this page](../usage/multimodal_inputs.rst) for details. +This enables multi-modal inputs to be passed to embedding models, see [this page](../usage/multimodal_inputs.md) for details. ``` #### Extra parameters -The following [pooling parameters (click through to see documentation)](../dev/pooling_params.rst) are supported. 
+The following [pooling parameters (click through to see documentation)](../dev/pooling_params.md) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python @@ -418,7 +418,7 @@ Response: #### Extra parameters -The following [pooling parameters (click through to see documentation)](../dev/pooling_params.rst) are supported. +The following [pooling parameters (click through to see documentation)](../dev/pooling_params.md) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python From 826b3540f025338d15ec6a61de6c776f82fe1eb3 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Wed, 18 Dec 2024 13:19:51 -0500 Subject: [PATCH 12/27] Fix link Signed-off-by: Rafael Vasquez --- docs/source/models/pooling_models.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md index 3e103b99e9b2a..9629430439aa4 100644 --- a/docs/source/models/pooling_models.md +++ b/docs/source/models/pooling_models.md @@ -124,4 +124,4 @@ so it is not part of the OpenAI standard. Please see [](#multimodal-inputs) for ### Score API Our Score API is similar to `LLM.score`. -Please see [this page](../serving/openai_compatible_server.html#score-api-for-cross-encoder-models) for more details on how to use it. +Please see [this page](../serving/openai_compatible_server#score-api) for more details on how to use it. 
From c824a4b909200dc35bee91272c29e802c7933b87 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Wed, 18 Dec 2024 13:32:47 -0500 Subject: [PATCH 13/27] Fix link Signed-off-by: Rafael Vasquez --- docs/source/models/pooling_models.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md index 9629430439aa4..20a7b8f33947d 100644 --- a/docs/source/models/pooling_models.md +++ b/docs/source/models/pooling_models.md @@ -124,4 +124,4 @@ so it is not part of the OpenAI standard. Please see [](#multimodal-inputs) for ### Score API Our Score API is similar to `LLM.score`. -Please see [this page](../serving/openai_compatible_server#score-api) for more details on how to use it. +Please see [this page](#score-api) for more details on how to use it. From 67420a9071b459e15a1ac40a10361878010c8a63 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Wed, 18 Dec 2024 13:55:20 -0500 Subject: [PATCH 14/27] Merge updated openai server doc Signed-off-by: Rafael Vasquez --- .../serving/openai_compatible_server.md | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index e1c3a6cc6cce9..1bc8d32d2d161 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -2,7 +2,7 @@ vLLM provides an HTTP server that implements OpenAI's [Completions](https://platform.openai.com/docs/api-reference/completions) and [Chat](https://platform.openai.com/docs/api-reference/chat) API, and more! 
-You can start the server using Python, or using [Docker](deploying_with_docker.md): +You can start the server via the [`vllm serve`](#vllm-serve) command, or through [Docker](deploying_with_docker.rst): ```bash vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123 ``` @@ -30,20 +30,20 @@ print(completion.choices[0].message) We currently support the following OpenAI APIs: - [Completions API](#completions-api) (`/v1/completions`) - - Only applicable to [text generation models](../models/generative_models.md) (`--task generate`). + - Only applicable to [text generation models](../models/generative_models.rst) (`--task generate`). - *Note: `suffix` parameter is not supported.* - [Chat Completions API](#chat-api) (`/v1/chat/completions`) - - Only applicable to [text generation models](../models/generative_models.md) (`--task generate`) with a [chat template](#chat-template). + - Only applicable to [text generation models](../models/generative_models.rst) (`--task generate`) with a [chat template](#chat-template). - *Note: `parallel_tool_calls` and `user` parameters are ignored.* - [Embeddings API](#embeddings-api) (`/v1/embeddings`) - - Only applicable to [embedding models](../models/pooling_models.md) (`--task embed`). + - Only applicable to [embedding models](../models/pooling_models.rst) (`--task embed`). In addition, we have the following custom APIs: - [Tokenizer API](#tokenizer-api) (`/tokenize`, `/detokenize`) - Applicable to any model with a tokenizer. - [Score API](#score-api) (`/score`) - - Only applicable to [cross-encoder models](../models/pooling_models.md) (`--task score`). + - Only applicable to [cross-encoder models](../models/pooling_models.rst) (`--task score`). 
(chat-template)= ## Chat Template @@ -183,7 +183,7 @@ Refer to [OpenAI's API reference](https://platform.openai.com/docs/api-reference #### Extra parameters -The following [sampling parameters (click through to see documentation)](../dev/sampling_params.md) are supported. +The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python @@ -206,12 +206,12 @@ Refer to [OpenAI's API reference](https://platform.openai.com/docs/api-reference We support both [Vision](https://platform.openai.com/docs/guides/vision)- and [Audio](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in)-related parameters; -see our [Multimodal Inputs](../usage/multimodal_inputs.md) guide for more information. +see our [Multimodal Inputs](../usage/multimodal_inputs.rst) guide for more information. - *Note: `image_url.detail` parameter is not supported.* #### Extra parameters -The following [sampling parameters (click through to see documentation)](../dev/sampling_params.md) are supported. +The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python @@ -236,12 +236,12 @@ If the model has a [chat template](#chat-template), you can replace `inputs` wit which will be treated as a single prompt to the model. ```{tip} -This enables multi-modal inputs to be passed to embedding models, see [this page](../usage/multimodal_inputs.md) for details. +This enables multi-modal inputs to be passed to embedding models, see [this page](../usage/multimodal_inputs.rst) for details. ``` #### Extra parameters -The following [pooling parameters (click through to see documentation)](../dev/pooling_params.md) are supported. 
+The following [pooling parameters (click through to see documentation)](../dev/pooling_params.rst) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python @@ -418,7 +418,7 @@ Response: #### Extra parameters -The following [pooling parameters (click through to see documentation)](../dev/pooling_params.md) are supported. +The following [pooling parameters (click through to see documentation)](../dev/pooling_params.rst) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python From a44d8c76b4b47cfd57b0ed168abdf175af4c15ff Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Wed, 18 Dec 2024 14:10:41 -0500 Subject: [PATCH 15/27] Update .rst to .md Signed-off-by: Rafael Vasquez --- .../serving/openai_compatible_server.md | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 1bc8d32d2d161..934a7cea7b9cb 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -2,7 +2,7 @@ vLLM provides an HTTP server that implements OpenAI's [Completions](https://platform.openai.com/docs/api-reference/completions) and [Chat](https://platform.openai.com/docs/api-reference/chat) API, and more! -You can start the server via the [`vllm serve`](#vllm-serve) command, or through [Docker](deploying_with_docker.rst): +You can start the server via the [`vllm serve`](#vllm-serve) command, or through [Docker](deploying_with_docker.md): ```bash vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123 ``` @@ -30,20 +30,20 @@ print(completion.choices[0].message) We currently support the following OpenAI APIs: - [Completions API](#completions-api) (`/v1/completions`) - - Only applicable to [text generation models](../models/generative_models.rst) (`--task generate`). 
+ - Only applicable to [text generation models](../models/generative_models.md) (`--task generate`). - *Note: `suffix` parameter is not supported.* - [Chat Completions API](#chat-api) (`/v1/chat/completions`) - - Only applicable to [text generation models](../models/generative_models.rst) (`--task generate`) with a [chat template](#chat-template). + - Only applicable to [text generation models](../models/generative_models.md) (`--task generate`) with a [chat template](#chat-template). - *Note: `parallel_tool_calls` and `user` parameters are ignored.* - [Embeddings API](#embeddings-api) (`/v1/embeddings`) - - Only applicable to [embedding models](../models/pooling_models.rst) (`--task embed`). + - Only applicable to [embedding models](../models/pooling_models.md) (`--task embed`). In addition, we have the following custom APIs: - [Tokenizer API](#tokenizer-api) (`/tokenize`, `/detokenize`) - Applicable to any model with a tokenizer. - [Score API](#score-api) (`/score`) - - Only applicable to [cross-encoder models](../models/pooling_models.rst) (`--task score`). + - Only applicable to [cross-encoder models](../models/pooling_models.md) (`--task score`). (chat-template)= ## Chat Template @@ -183,7 +183,7 @@ Refer to [OpenAI's API reference](https://platform.openai.com/docs/api-reference #### Extra parameters -The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. +The following [sampling parameters (click through to see documentation)](../dev/sampling_params.md) are supported. 
```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python @@ -206,12 +206,12 @@ Refer to [OpenAI's API reference](https://platform.openai.com/docs/api-reference We support both [Vision](https://platform.openai.com/docs/guides/vision)- and [Audio](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in)-related parameters; -see our [Multimodal Inputs](../usage/multimodal_inputs.rst) guide for more information. +see our [Multimodal Inputs](../usage/multimodal_inputs.md) guide for more information. - *Note: `image_url.detail` parameter is not supported.* #### Extra parameters -The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. +The following [sampling parameters (click through to see documentation)](../dev/sampling_params.md) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python @@ -236,12 +236,12 @@ If the model has a [chat template](#chat-template), you can replace `inputs` wit which will be treated as a single prompt to the model. ```{tip} -This enables multi-modal inputs to be passed to embedding models, see [this page](../usage/multimodal_inputs.rst) for details. +This enables multi-modal inputs to be passed to embedding models, see [this page](../usage/multimodal_inputs.md) for details. ``` #### Extra parameters -The following [pooling parameters (click through to see documentation)](../dev/pooling_params.rst) are supported. +The following [pooling parameters (click through to see documentation)](../dev/pooling_params.md) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python @@ -418,7 +418,7 @@ Response: #### Extra parameters -The following [pooling parameters (click through to see documentation)](../dev/pooling_params.rst) are supported. +The following [pooling parameters (click through to see documentation)](../dev/pooling_params.md) are supported. 
```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python From ef080bf446513b275c1749d12934799ae4e24aef Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Thu, 19 Dec 2024 14:50:00 -0500 Subject: [PATCH 16/27] Revert docstring refs to rst style Signed-off-by: Rafael Vasquez --- vllm/engine/llm_engine.py | 2 +- vllm/entrypoints/llm.py | 2 +- vllm/inputs/__init__.py | 2 +- vllm/inputs/registry.py | 6 +++--- vllm/multimodal/__init__.py | 2 +- vllm/multimodal/base.py | 14 +++++++------- vllm/multimodal/inputs.py | 2 +- vllm/multimodal/registry.py | 6 +++--- 8 files changed, 18 insertions(+), 18 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 95e3d53d43b8d..dc2d77d6927cd 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -148,7 +148,7 @@ class LLMEngine: and the :class:`AsyncLLMEngine` class wraps this class for online serving. The config arguments are derived from :class:`~vllm.EngineArgs`. (See - [engine-args](#engine-args)) + :ref:`engine_args`) Args: model_config: The configuration related to the LLM model. diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 2042a0ca9f38b..58ab892676b9a 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -115,7 +115,7 @@ class LLM: integer, it is used as the level of compilation optimization. If it is a dictionary, it can specify the full compilation configuration. **kwargs: Arguments for :class:`~vllm.EngineArgs`. (See - [engine-args](#engine-args)) + :ref:`engine_args`) Note: This class is intended to be used for offline inference. For online diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py index dc61049fe6231..d4402e77a3886 100644 --- a/vllm/inputs/__init__.py +++ b/vllm/inputs/__init__.py @@ -13,7 +13,7 @@ to dispatch data processing according to the target model. 
See also: - [input-processing-pipeline](#input-processing-pipeline) + :ref:`input_processing_pipeline` """ __all__ = [ diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 82cdfea32a1c3..0b85484c48714 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -274,7 +274,7 @@ def dummy_data_for_profiling( The model is identified by ``model_config``. See also: - [enabling-multimodal-inputs](#enabling-multimodal-inputs) + :ref:`enabling_multimodal_inputs` Note: This should be called after @@ -351,7 +351,7 @@ def register_input_processor(self, processor: InputProcessor): happens before :meth:`~vllm.multimodal.MultiModalRegistry.map_input`. See also: - [input-processing-pipeline](#input-processing-pipeline) + :ref:`input_processing_pipeline` """ def wrapper(model_cls: N) -> N: @@ -395,7 +395,7 @@ def process_input(self, model_config: "ModelConfig", The model is identified by ``model_config``. See also: - [input-processing-pipeline](#input-processing-pipeline) + :ref:`input_processing_pipeline` """ # Avoid circular import from vllm.model_executor.model_loader import get_model_architecture diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index fce71e25e6fe5..928c31a2f2843 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -11,7 +11,7 @@ dispatch data processing according to its modality and the target model. See also: - [input-processing-pipeline](#input-processing-pipeline) + :ref:`input_processing_pipeline` """ __all__ = [ diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 03f972dfa1dd7..fe77a4635f7d8 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -50,7 +50,7 @@ class MultiModalPlugin(ABC): (i.e., the modality of the data). 
See also: - [adding_multimodal_plugin](#adding-multimodal-plugin) + :ref:`adding_multimodal_plugin` """ def __init__(self) -> None: @@ -94,8 +94,8 @@ def register_input_mapper( If `None` is provided, then the default input mapper is used instead. See also: - - [input-processing-pipeline](#input-processing-pipeline) - - [enabling-multimodal-inputs](#enabling-multimodal-inputs) + - :ref:`input_processing_pipeline` + - :ref:`enabling_multimodal_inputs` """ def wrapper(model_cls: N) -> N: @@ -130,8 +130,8 @@ def map_input( TypeError: If the data type is not supported. See also: - - [input-processing-pipeline](#input-processing-pipeline) - - [enabling-multimodal-inputs](#enabling-multimodal-inputs) + - :ref:`input_processing_pipeline` + - :ref:`enabling_multimodal_inputs` """ # Avoid circular import @@ -190,7 +190,7 @@ def register_max_multimodal_tokens( If `None` is provided, then the default calculation is used instead. See also: - [enabling-multimodal-inputs](#enabling-multimodal-inputs) + :ref:`enabling_multimodal_inputs` """ def wrapper(model_cls: N) -> N: @@ -222,7 +222,7 @@ def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int: The model is identified by ``model_config``. See also: - [enabling-multimodal-inputs](#enabling-multimodal-inputs) + :ref:`enabling_multimodal_inputs` """ # Avoid circular import from vllm.model_executor.model_loader import get_model_architecture diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 6f4438f92328f..c00943a5f26d9 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -74,7 +74,7 @@ class MultiModalDataBuiltins(TypedDict, total=False): This dictionary also accepts modality keys defined outside :class:`MultiModalDataBuiltins` as long as a customized plugin is registered through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`. - Read more on that [here](#adding-multimodal-plugin). + Read more on that :ref:`here `. 
""" diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 64d3fb4e0e469..6cd79d414c978 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -76,7 +76,7 @@ def register_plugin(self, plugin: MultiModalPlugin) -> None: Register a multi-modal plugin so it can be recognized by vLLM. See also: - [adding_multimodal_plugin](#adding-multimodal-plugin) + :ref:`adding_multimodal_plugin` """ data_type_key = plugin.get_data_key() @@ -311,8 +311,8 @@ def register_processor( invoked to transform the data into a dictionary of model inputs. See also: - - [input-processing-pipeline](#input-processing-pipeline) - - [enabling-multimodal-inputs](#enabling-multimodal-inputs) + - :ref:`input_processing_pipeline` + - :ref:`enabling_multimodal_inputs` """ def wrapper(model_cls: N) -> N: From 5bdcbdb985bca9d726f6293de550a93045eb3e52 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Thu, 19 Dec 2024 15:14:41 -0500 Subject: [PATCH 17/27] Update rst refs Signed-off-by: Rafael Vasquez --- vllm/engine/llm_engine.py | 2 +- vllm/entrypoints/llm.py | 2 +- vllm/inputs/__init__.py | 2 +- vllm/inputs/registry.py | 6 +++--- vllm/multimodal/__init__.py | 2 +- vllm/multimodal/base.py | 14 +++++++------- vllm/multimodal/inputs.py | 2 +- vllm/multimodal/registry.py | 6 +++--- 8 files changed, 18 insertions(+), 18 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index dc2d77d6927cd..3bcc0fc2604f3 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -148,7 +148,7 @@ class LLMEngine: and the :class:`AsyncLLMEngine` class wraps this class for online serving. The config arguments are derived from :class:`~vllm.EngineArgs`. (See - :ref:`engine_args`) + :ref:`engine-args`) Args: model_config: The configuration related to the LLM model. 
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 58ab892676b9a..cf0d04dc3ddb3 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -115,7 +115,7 @@ class LLM: integer, it is used as the level of compilation optimization. If it is a dictionary, it can specify the full compilation configuration. **kwargs: Arguments for :class:`~vllm.EngineArgs`. (See - :ref:`engine_args`) + :ref:`engine-args`) Note: This class is intended to be used for offline inference. For online diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py index d4402e77a3886..aaeecab7ffde1 100644 --- a/vllm/inputs/__init__.py +++ b/vllm/inputs/__init__.py @@ -13,7 +13,7 @@ to dispatch data processing according to the target model. See also: - :ref:`input_processing_pipeline` + :ref:`input-processing-pipeline` """ __all__ = [ diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 0b85484c48714..8f18d7e8bfd6d 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -274,7 +274,7 @@ def dummy_data_for_profiling( The model is identified by ``model_config``. See also: - :ref:`enabling_multimodal_inputs` + :ref:`enabling-multimodal-inputs` Note: This should be called after @@ -351,7 +351,7 @@ def register_input_processor(self, processor: InputProcessor): happens before :meth:`~vllm.multimodal.MultiModalRegistry.map_input`. See also: - :ref:`input_processing_pipeline` + :ref:`input-processing-pipeline` """ def wrapper(model_cls: N) -> N: @@ -395,7 +395,7 @@ def process_input(self, model_config: "ModelConfig", The model is identified by ``model_config``. 
See also: - :ref:`input_processing_pipeline` + :ref:`input-processing-pipeline` """ # Avoid circular import from vllm.model_executor.model_loader import get_model_architecture diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index 928c31a2f2843..9255e062e4870 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -11,7 +11,7 @@ dispatch data processing according to its modality and the target model. See also: - :ref:`input_processing_pipeline` + :ref:`input-processing-pipeline` """ __all__ = [ diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index fe77a4635f7d8..1e5a46946c6c0 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -50,7 +50,7 @@ class MultiModalPlugin(ABC): (i.e., the modality of the data). See also: - :ref:`adding_multimodal_plugin` + :ref:`adding-multimodal-plugin` """ def __init__(self) -> None: @@ -94,8 +94,8 @@ def register_input_mapper( If `None` is provided, then the default input mapper is used instead. See also: - - :ref:`input_processing_pipeline` - - :ref:`enabling_multimodal_inputs` + - :ref:`input-processing-pipeline` + - :ref:`enabling-multimodal-inputs` """ def wrapper(model_cls: N) -> N: @@ -130,8 +130,8 @@ def map_input( TypeError: If the data type is not supported. See also: - - :ref:`input_processing_pipeline` - - :ref:`enabling_multimodal_inputs` + - :ref:`input-processing-pipeline` + - :ref:`enabling-multimodal-inputs` """ # Avoid circular import @@ -190,7 +190,7 @@ def register_max_multimodal_tokens( If `None` is provided, then the default calculation is used instead. See also: - :ref:`enabling_multimodal_inputs` + :ref:`enabling-multimodal-inputs` """ def wrapper(model_cls: N) -> N: @@ -222,7 +222,7 @@ def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int: The model is identified by ``model_config``. 
See also: - :ref:`enabling_multimodal_inputs` + :ref:`enabling-multimodal-inputs` """ # Avoid circular import from vllm.model_executor.model_loader import get_model_architecture diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index c00943a5f26d9..cad21eeff3c34 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -74,7 +74,7 @@ class MultiModalDataBuiltins(TypedDict, total=False): This dictionary also accepts modality keys defined outside :class:`MultiModalDataBuiltins` as long as a customized plugin is registered through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`. - Read more on that :ref:`here `. + Read more on that :ref:`here `. """ diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 6cd79d414c978..ded45a7184b5d 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -76,7 +76,7 @@ def register_plugin(self, plugin: MultiModalPlugin) -> None: Register a multi-modal plugin so it can be recognized by vLLM. See also: - :ref:`adding_multimodal_plugin` + :ref:`adding-multimodal-plugin` """ data_type_key = plugin.get_data_key() @@ -311,8 +311,8 @@ def register_processor( invoked to transform the data into a dictionary of model inputs. 
See also: - - :ref:`input_processing_pipeline` - - :ref:`enabling_multimodal_inputs` + - :ref:`input-processing-pipeline` + - :ref:`enabling-multimodal-inputs` """ def wrapper(model_cls: N) -> N: From 9ae5d000060cd366ab4baf9c562dec1aedc35764 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Thu, 19 Dec 2024 16:44:48 -0500 Subject: [PATCH 18/27] Fix supported models page Signed-off-by: Rafael Vasquez --- docs/source/models/supported_models.md | 34 ++++++++++++-------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index f04f420928b54..b0529e247ac66 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -80,8 +80,8 @@ See [this page](#generative-models) for more information on how to use generativ * - Architecture - Models - Example HF Models - - [LoRA](#lora) - - [PP](#distributed-serving) + - :ref:`LoRA ` + - :ref:`PP ` * - :code:`AquilaForCausalLM` - Aquila, Aquila2 - :code:`BAAI/Aquila-7B`, :code:`BAAI/AquilaChat-7B`, etc. @@ -385,8 +385,8 @@ The following table lists those that are tested in vLLM. * - Architecture - Models - Example HF Models - - [LoRA](#lora) - - [PP](#distributed-serving) + - :ref:`LoRA ` + - :ref:`PP ` * - :code:`BertModel` - BERT-based - :code:`BAAI/bge-base-en-v1.5`, etc. @@ -447,8 +447,8 @@ despite being described otherwise on its model card. * - Architecture - Models - Example HF Models - - [LoRA](#lora) - - [PP](#distributed-serving) + - :ref:`LoRA ` + - :ref:`PP ` * - :code:`LlamaForCausalLM` - Llama-based - :code:`peiyi9979/math-shepherd-mistral-7b-prm`, etc. @@ -477,7 +477,7 @@ e.g.: {code}`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 1 - Models - Example HF Models - :ref:`LoRA ` - - :ref:`PP ` + - :ref:`PP ` * - :code:`JambaForSequenceClassification` - Jamba - :code:`ai21labs/Jamba-tiny-reward-dev`, etc. 
@@ -500,8 +500,8 @@ e.g.: {code}`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 1 * - Architecture - Models - Example HF Models - - [LoRA](#lora) - - [PP](#distributed-serving) + - :ref:`LoRA ` + - :ref:`PP ` * - :code:`BertForSequenceClassification` - BERT-based - :code:`cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. @@ -555,8 +555,8 @@ See [this page](#generative-models) for more information on how to use generativ - Models - Inputs - Example HF Models - - [LoRA](#lora) - - [PP](#distributed-serving) + - :ref:`LoRA ` + - :ref:`PP ` - V1 * - :code:`AriaForConditionalGeneration` - Aria @@ -722,15 +722,11 @@ See [this page](#generative-models) for more information on how to use generativ ``` ```{eval-rst} -{sup}`E` - - Pre-computed embeddings can be inputted for this modality. +:sup:`E` Pre-computed embeddings can be inputted for this modality. -{sup}`+` +:sup:`+` Multiple items can be inputted per text prompt for this modality. ``` - Multiple items can be inputted per text prompt for this modality. - ````{important} To enable multiple multi-modal items per text prompt, you have to set {code}`limit_mm_per_prompt` (offline inference) or {code}`--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt: @@ -789,8 +785,8 @@ The following table lists those that are tested in vLLM. 
- Models - Inputs - Example HF Models - - [LoRA](#lora) - - [PP](#distributed-serving) + - :ref:`LoRA ` + - :ref:`PP ` * - :code:`LlavaNextForConditionalGeneration` - LLaVA-NeXT-based - T / I From 5cb56bd900829839228e2f26d43e2013005b449b Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Fri, 20 Dec 2024 11:58:05 -0500 Subject: [PATCH 19/27] Fix code blocks Signed-off-by: Rafael Vasquez --- docs/source/serving/runai_model_streamer.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/source/serving/runai_model_streamer.md b/docs/source/serving/runai_model_streamer.md index 7f5b312e45f13..1b5756a95075a 100644 --- a/docs/source/serving/runai_model_streamer.md +++ b/docs/source/serving/runai_model_streamer.md @@ -37,16 +37,16 @@ You can tune parameters using `--model-loader-extra-config`: You can tune `concurrency` that controls the level of concurrency and number of OS threads reading tensors from the file to the CPU buffer. For reading from S3, it will be the number of client instances the host is opening to the S3 server. -> ```console -> $ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"concurrency":16}' -> ``` +```console +$ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"concurrency":16}' +``` You can controls the size of the CPU Memory buffer to which tensors are read from the file, and limit this size. You can read further about CPU buffer memory limiting [here](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md#runai_streamer_memory_limit). 
-> ```console -> $ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"memory_limit":5368709120}' -> ``` +```console +$ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"memory_limit":5368709120}' +``` ```{note} For further instructions about tunable parameters and additional parameters configurable through environment variables, read the [Environment Variables Documentation](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md). From c79201a78bb0007afe88aa0bedc9651557bdf76e Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Sun, 22 Dec 2024 19:57:49 -0500 Subject: [PATCH 20/27] Fix pip editable flag link Signed-off-by: Rafael Vasquez --- docs/source/getting_started/installation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/getting_started/installation.md b/docs/source/getting_started/installation.md index 0029f68ef8441..8ca634f966a06 100644 --- a/docs/source/getting_started/installation.md +++ b/docs/source/getting_started/installation.md @@ -81,7 +81,7 @@ The latest code can contain bugs and may not be stable. Please use it with cauti ### Python-only build (without compilation) -If you only need to change Python code, you can build and install vLLM without compilation. Using `` pip's ` ``--editable\`\` flag \<>\`\_, changes you make to the code will be reflected when you run vLLM: +If you only need to change Python code, you can build and install vLLM without compilation. 
Using `pip`'s [`--editable` flag](https://pip.pypa.io/en/stable/topics/local-project-installs/#editable-installs), changes you make to the code will be reflected when you run vLLM: ```console $ git clone https://github.com/vllm-project/vllm.git From 07d544ba90fc3b66611a789c2ac0d29b00a76d0a Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Sun, 22 Dec 2024 20:04:43 -0500 Subject: [PATCH 21/27] Fixes source links in examples Signed-off-by: Rafael Vasquez --- docs/source/generate_examples.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py index 90694833e6357..e0bc42851a51e 100644 --- a/docs/source/generate_examples.py +++ b/docs/source/generate_examples.py @@ -40,7 +40,7 @@ def generate_examples(): # Make script_path relative to doc_path and call it include_path include_path = '../../../..' / script_path.relative_to(root_dir) content = (f"{generate_title(doc_path.stem)}\n\n" - f"Source {script_url}.\n\n" + f"Source: [{script_path.name}]({script_url}).\n\n" f"```{{literalinclude}} {include_path}\n" ":language: python\n" ":linenos:\n```") From d109106480bee250e4f74e59cf22b28435cb4f05 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Mon, 23 Dec 2024 09:28:20 -0500 Subject: [PATCH 22/27] Fix link and names Signed-off-by: Rafael Vasquez --- docs/source/models/enabling_multimodal_inputs.md | 10 +++++----- docs/source/serving/metrics.md | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/source/models/enabling_multimodal_inputs.md b/docs/source/models/enabling_multimodal_inputs.md index c4a1402d7346d..2f93eb826fb1e 100644 --- a/docs/source/models/enabling_multimodal_inputs.md +++ b/docs/source/models/enabling_multimodal_inputs.md @@ -5,7 +5,7 @@ This document walks you through the steps to extend a vLLM model so that it accepts [multi-modal inputs](#multimodal-inputs). 
```{seealso} -[adding-a-new-model](adding-a-new-model) +[Adding a New Model](adding-a-new-model) ``` ## 1. Update the base vLLM model @@ -57,7 +57,7 @@ This decorator accepts a function that maps multi-modal inputs to the keyword ar A default mapper is available for each modality in the core vLLM library. This input mapper will be used if you do not provide your own function. ```{seealso} -[input-processing-pipeline](#input-processing-pipeline) +[Input Processing Pipeline](#input-processing-pipeline) ``` ## 3. Register maximum number of multi-modal tokens @@ -82,7 +82,7 @@ Here are some examples: - Image inputs (dynamic feature size): [LLaVA-NeXT Model](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava_next.py) ```{seealso} -[input-processing-pipeline](#input-processing-pipeline) +[Input Processing Pipeline](#input-processing-pipeline) ``` ## 4. (Optional) Register dummy data @@ -111,7 +111,7 @@ Here are some examples: - Image inputs (dynamic feature size): [LLaVA-NeXT Model](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava_next.py) ```{seealso} -[input-processing-pipeline](#input-processing-pipeline) +[Input Processing Pipeline](#input-processing-pipeline) ``` ## 5. (Optional) Register input processor @@ -139,5 +139,5 @@ Here are some examples: - Insert dynamic number of image tokens: [LLaVA-NeXT Model](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava_next.py) ```{seealso} -[input-processing-pipeline](#input-processing-pipeline) +[Input Processing Pipeline](#input-processing-pipeline) ``` diff --git a/docs/source/serving/metrics.md b/docs/source/serving/metrics.md index 3c4f724f28bbd..2dc78643f6d8f 100644 --- a/docs/source/serving/metrics.md +++ b/docs/source/serving/metrics.md @@ -4,7 +4,7 @@ vLLM exposes a number of metrics that can be used to monitor the health of the system. These metrics are exposed via the `/metrics` endpoint on the vLLM OpenAI compatible API server. 
-You can start the server using Python, or using \[Docker\](deploying_with_docker.md): +You can start the server using Python, or using [Docker](deploying_with_docker.md): ```console $ vllm serve unsloth/Llama-3.2-1B-Instruct From ee833d15e1cf7ac5432c0cfc05c98c0ab5af05ce Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Mon, 23 Dec 2024 09:34:48 -0500 Subject: [PATCH 23/27] Resolve github links in example pages Signed-off-by: Rafael Vasquez --- docs/source/generate_examples.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py index e0bc42851a51e..4c5a9d9c1da38 100644 --- a/docs/source/generate_examples.py +++ b/docs/source/generate_examples.py @@ -40,7 +40,7 @@ def generate_examples(): # Make script_path relative to doc_path and call it include_path include_path = '../../../..' / script_path.relative_to(root_dir) content = (f"{generate_title(doc_path.stem)}\n\n" - f"Source: [{script_path.name}]({script_url}).\n\n" + f"Source: <{script_url}>.\n\n" f"```{{literalinclude}} {include_path}\n" ":language: python\n" ":linenos:\n```") From a19792109fbe3807bd59f74feb62545bb9c87ff6 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Mon, 23 Dec 2024 12:46:56 -0500 Subject: [PATCH 24/27] Convert compatibility matrix to markdown Signed-off-by: Rafael Vasquez --- docs/source/usage/compatibility_matrix.md | 112 +++++++++++----------- 1 file changed, 55 insertions(+), 57 deletions(-) diff --git a/docs/source/usage/compatibility_matrix.md b/docs/source/usage/compatibility_matrix.md index 469ff7f4764df..3b7dfa540cd72 100644 --- a/docs/source/usage/compatibility_matrix.md +++ b/docs/source/usage/compatibility_matrix.md @@ -25,30 +25,29 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar ``` -```{eval-rst} -.. 
list-table:: +```{list-table} :header-rows: 1 :stub-columns: 1 :widths: auto * - Feature - - Chunked-Prefill - - APC - - LoRA - - Prompt Adapter - - Speculative Decoding + - [CP](#chunked-prefill) + - [APC](#apc) + - [LoRA](#lora) + - prmpt adptr + - [SD](#spec_decode) - CUDA graph - - Pooling Models - - Encoder-Decoder - - Logprobs - - Prompt Logprobs - - Async Output Processing + - pooling + - enc-dec + - logP + - prmpt logP + - async output - multi-step - - Multimodal Inputs + - mm - best-of - beam-search - - Guided Decoding - * - Chunked-Prefill + - guided dec + * - [CP](#chunked-prefill) - - - @@ -65,7 +64,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - - - - * - APC + * - [APC](#apc) - ✅ - - @@ -82,8 +81,8 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - - - - * - LoRA - - `✗ `__ + * - [LoRA](#lora) + - [✗](https://github.com/vllm-project/vllm/pull/9057) - ✅ - - @@ -99,7 +98,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - - - - * - Prompt Adapter + * - prmpt adptr - ✅ - ✅ - ✅ @@ -116,7 +115,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - - - - * - Speculative Decoding + * - [SD](#spec_decode) - ✅ - ✅ - ✗ @@ -150,7 +149,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - - - - * - Pooling Models + * - pooling - ✗ - ✗ - ✗ @@ -167,12 +166,12 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - - - - * - Encoder-Decoder + * - enc-dec - ✗ - - `✗ `__ + - [✗](https://github.com/vllm-project/vllm/issues/7366) - ✗ - ✗ - - `✗ `__ + - [✗](https://github.com/vllm-project/vllm/issues/7366) - ✅ - ✅ - @@ -184,7 +183,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - - - - * - Logprobs + * - logP - ✅ - ✅ - ✅ @@ -201,12 +200,12 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - - - - * - Prompt 
Logprobs + * - prmpt logP - ✅ - ✅ - ✅ - ✅ - - `✗ `__ + - [✗](https://github.com/vllm-project/vllm/pull/8199) - ✅ - ✗ - ✅ @@ -218,7 +217,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - - - - * - Async Output Processing + * - async output - ✅ - ✅ - ✅ @@ -245,17 +244,17 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✗ - ✗ - ✅ - - `✗ `__ + - [✗](ttps://github.com/vllm-project/vllm/issues/8198) - ✅ - - - - - - * - Multimodal Inputs + * - mm - ✅ - - `✗ `__ - - `✗ `__ + - [✗](https://github.com/vllm-project/vllm/pull/8348) + - [✗](https://github.com/vllm-project/vllm/pull/7199) - ? - ? - ✅ @@ -274,14 +273,14 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✅ - ✅ - - `✗ `__ + - [✗](https://github.com/vllm-project/vllm/issues/6137) - ✅ - ✗ - ✅ - ✅ - ✅ - ? - - `✗ `__ + - [✗](https://github.com/vllm-project/vllm/issues/7968) - ✅ - - @@ -291,19 +290,19 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✅ - ✅ - - `✗ `__ + - [✗](https://github.com/vllm-project/vllm/issues/6137) - ✅ - ✗ - ✅ - ✅ - ✅ - ? - - `✗ `__ + - [✗](https://github.com/vllm-project/vllm/issues/7968) - ? - ✅ - - - * - Guided Decoding + * - guided dec - ✅ - ✅ - ? @@ -315,7 +314,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✅ - ✅ - - `✗ `__ + - [✗](https://github.com/vllm-project/vllm/issues/9893) - ? - ✅ - ✅ @@ -325,8 +324,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar ### Feature x Hardware -```{eval-rst} -.. 
list-table:: +```{list-table} :header-rows: 1 :stub-columns: 1 :widths: auto @@ -339,39 +337,39 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - Hopper - CPU - AMD - * - Chunked-Prefill - - `✗ `__ + * - [CP](#chunked-prefill) + - [✗](https://github.com/vllm-project/vllm/issues/2729) - ✅ - ✅ - ✅ - ✅ - ✅ - ✅ - * - APC - - `✗ `__ + * - [APC](#apc) + - [✗](https://github.com/vllm-project/vllm/issues/3687) - ✅ - ✅ - ✅ - ✅ - ✅ - ✅ - * - LoRA + * - [LoRA](#lora) - ✅ - ✅ - ✅ - ✅ - ✅ - - `✗ `__ + - [✗](https://github.com/vllm-project/vllm/pull/4830) - ✅ - * - Prompt Adapter + * - prmpt adptr - ✅ - ✅ - ✅ - ✅ - ✅ - - `✗ `__ + - [✗](https://github.com/vllm-project/vllm/issues/8475) - ✅ - * - Speculative Decoding + * - [SD](#spec_decode) - ✅ - ✅ - ✅ @@ -387,7 +385,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✗ - ✅ - * - Pooling Models + * - pooling - ✅ - ✅ - ✅ @@ -395,7 +393,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✅ - ? 
- * - Encoder-Decoder + * - enc-dec - ✅ - ✅ - ✅ @@ -403,7 +401,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✅ - ✗ - * - Multimodal Inputs + * - mm - ✅ - ✅ - ✅ @@ -411,7 +409,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✅ - ✅ - * - Logprobs + * - logP - ✅ - ✅ - ✅ @@ -419,7 +417,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✅ - ✅ - * - Prompt Logprobs + * - prmpt logP - ✅ - ✅ - ✅ @@ -427,7 +425,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✅ - ✅ - * - Async Output Processing + * - async output - ✅ - ✅ - ✅ @@ -441,7 +439,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✅ - ✅ - - `✗ `__ + - [✗](https://github.com/vllm-project/vllm/issues/8477) - ✅ * - best-of - ✅ @@ -459,7 +457,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✅ - ✅ - * - Guided Decoding + * - guided dec - ✅ - ✅ - ✅ From d40c5591ace718f0568d8f219ec4582b581b6ccb Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Mon, 23 Dec 2024 12:50:21 -0500 Subject: [PATCH 25/27] Fix warning output code block Signed-off-by: Rafael Vasquez --- docs/source/usage/performance.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/usage/performance.md b/docs/source/usage/performance.md index 4a4b0d52f9eb1..f028e28627a9f 100644 --- a/docs/source/usage/performance.md +++ b/docs/source/usage/performance.md @@ -8,9 +8,9 @@ Due to the auto-regressive nature of transformer architecture, there are times w The vLLM can preempt requests to free up KV cache space for other requests. Preempted requests are recomputed when sufficient KV cache space becomes available again. 
When this occurs, the following warning is printed: -`` ` -WARNING 05-09 00:49:33 scheduler.py:1057] Sequence group 0 is preempted by PreemptionMode.SWAP mode because there is not enough KV cache space. This can affect the end-to-end performance. Increase gpu_memory_utilization or tensor_parallel_size to provide more KV cache memory. total_cumulative_preemption_cnt=1 -` `` +``` +WARNING 05-09 00:49:33 scheduler.py:1057] Sequence group 0 is preempted by PreemptionMode.SWAP mode because there is not enough KV cache space. This can affect the end-to-end performance. Increase gpu_memory_utilization or tensor_parallel_size to provide more KV cache memory. total_cumulative_preemption_cnt=1 +``` While this mechanism ensures system robustness, preemption and recomputation can adversely affect end-to-end latency. If you frequently encounter preemptions from the vLLM engine, consider the following actions: From e126c480ccb8338dba58e7d37e2a6af8eb461a36 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Mon, 23 Dec 2024 12:51:25 -0500 Subject: [PATCH 26/27] Add inline code formatting Signed-off-by: Rafael Vasquez --- docs/source/usage/usage_stats.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/usage/usage_stats.md b/docs/source/usage/usage_stats.md index a1e4b1c38acae..a7eb6144571a4 100644 --- a/docs/source/usage/usage_stats.md +++ b/docs/source/usage/usage_stats.md @@ -47,7 +47,7 @@ tail ~/.config/vllm/usage_stats.json ## Opt-out of Usage Stats Collection -You can opt-out of usage stats collection by setting the VLLM_NO_USAGE_STATS or DO_NOT_TRACK environment variable, or by creating a ~/.config/vllm/do_not_track file: +You can opt-out of usage stats collection by setting the `VLLM_NO_USAGE_STATS` or `DO_NOT_TRACK` environment variable, or by creating a `~/.config/vllm/do_not_track` file: ```bash # Any of the following methods can disable usage stats collection From 5010a0870255ae598bf4742592f1f14053f9c579 Mon Sep 17 00:00:00 2001 From: Rafael 
Vasquez Date: Mon, 23 Dec 2024 13:43:32 -0500 Subject: [PATCH 27/27] Update lora-adapter ref and fix link Signed-off-by: Rafael Vasquez --- docs/source/models/supported_models.md | 14 +++++++------- docs/source/usage/compatibility_matrix.md | 8 ++++---- docs/source/usage/lora.md | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index b0529e247ac66..650293d864011 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -80,7 +80,7 @@ See [this page](#generative-models) for more information on how to use generativ * - Architecture - Models - Example HF Models - - :ref:`LoRA ` + - :ref:`LoRA ` - :ref:`PP ` * - :code:`AquilaForCausalLM` - Aquila, Aquila2 @@ -385,7 +385,7 @@ The following table lists those that are tested in vLLM. * - Architecture - Models - Example HF Models - - :ref:`LoRA ` + - :ref:`LoRA ` - :ref:`PP ` * - :code:`BertModel` - BERT-based @@ -447,7 +447,7 @@ despite being described otherwise on its model card. 
* - Architecture - Models - Example HF Models - - :ref:`LoRA ` + - :ref:`LoRA ` - :ref:`PP ` * - :code:`LlamaForCausalLM` - Llama-based @@ -476,7 +476,7 @@ e.g.: {code}`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 1 * - Architecture - Models - Example HF Models - - :ref:`LoRA ` + - :ref:`LoRA ` - :ref:`PP ` * - :code:`JambaForSequenceClassification` - Jamba @@ -500,7 +500,7 @@ e.g.: {code}`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 1 * - Architecture - Models - Example HF Models - - :ref:`LoRA ` + - :ref:`LoRA ` - :ref:`PP ` * - :code:`BertForSequenceClassification` - BERT-based @@ -555,7 +555,7 @@ See [this page](#generative-models) for more information on how to use generativ - Models - Inputs - Example HF Models - - :ref:`LoRA ` + - :ref:`LoRA ` - :ref:`PP ` - V1 * - :code:`AriaForConditionalGeneration` @@ -785,7 +785,7 @@ The following table lists those that are tested in vLLM. - Models - Inputs - Example HF Models - - :ref:`LoRA ` + - :ref:`LoRA ` - :ref:`PP ` * - :code:`LlavaNextForConditionalGeneration` - LLaVA-NeXT-based diff --git a/docs/source/usage/compatibility_matrix.md b/docs/source/usage/compatibility_matrix.md index 3b7dfa540cd72..763b49dac4f8a 100644 --- a/docs/source/usage/compatibility_matrix.md +++ b/docs/source/usage/compatibility_matrix.md @@ -33,7 +33,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar * - Feature - [CP](#chunked-prefill) - [APC](#apc) - - [LoRA](#lora) + - [LoRA](#lora-adapter) - prmpt adptr - [SD](#spec_decode) - CUDA graph @@ -81,7 +81,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - - - - * - [LoRA](#lora) + * - [LoRA](#lora-adapter) - [✗](https://github.com/vllm-project/vllm/pull/9057) - ✅ - @@ -244,7 +244,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✗ - ✗ - ✅ - - [✗](ttps://github.com/vllm-project/vllm/issues/8198) + - 
[✗](https://github.com/vllm-project/vllm/issues/8198) - ✅ - - @@ -353,7 +353,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✅ - ✅ - * - [LoRA](#lora) + * - [LoRA](#lora-adapter) - ✅ - ✅ - ✅ diff --git a/docs/source/usage/lora.md b/docs/source/usage/lora.md index a7bb881951abf..e2ddde74aaa45 100644 --- a/docs/source/usage/lora.md +++ b/docs/source/usage/lora.md @@ -1,4 +1,4 @@ -(lora)= +(lora-adapter)= # LoRA Adapters