From 9cdba9669cb32191aa0ae6782c0648be3e0e44ed Mon Sep 17 00:00:00 2001
From: Russell Bryant
Date: Mon, 11 Nov 2024 20:55:09 -0500
Subject: [PATCH] [Doc] Update help text for `--distributed-executor-backend`
 (#10231)

Signed-off-by: Russell Bryant
---
 vllm/config.py           |  9 ++++++---
 vllm/engine/arg_utils.py | 11 ++++++++---
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index dc9c06d7fb16e..bb9fee30c8445 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -951,9 +951,12 @@ class ParallelConfig:
             https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler.
         placement_group: ray distributed model workers placement group.
         distributed_executor_backend: Backend to use for distributed model
-            workers, either "ray" or "mp" (multiprocessing). If either
-            pipeline_parallel_size or tensor_parallel_size is greater than 1,
-            will default to "ray" if Ray is installed or "mp" otherwise.
+            workers, either "ray" or "mp" (multiprocessing). If the product
+            of pipeline_parallel_size and tensor_parallel_size is less than
+            or equal to the number of GPUs available, "mp" will be used to
+            keep processing on a single host. Otherwise, this will default
+            to "ray" if Ray is installed and fail otherwise. Note that tpu
+            and hpu only support Ray for distributed inference.
     """
 
     def __init__(
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 02e67f89e5a8d..1591059a89f92 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -369,9 +369,14 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             '--distributed-executor-backend',
             choices=['ray', 'mp'],
             default=EngineArgs.distributed_executor_backend,
-            help='Backend to use for distributed serving. When more than 1 GPU '
-            'is used, will be automatically set to "ray" if installed '
-            'or "mp" (multiprocessing) otherwise.')
+            help='Backend to use for distributed model '
+            'workers, either "ray" or "mp" (multiprocessing). If the product '
+            'of pipeline_parallel_size and tensor_parallel_size is less than '
+            'or equal to the number of GPUs available, "mp" will be used to '
+            'keep processing on a single host. Otherwise, this will default '
+            'to "ray" if Ray is installed and fail otherwise. Note that tpu '
+            'and hpu only support Ray for distributed inference.')
+
         parser.add_argument(
             '--worker-use-ray',
             action='store_true',