Fix the spawn vs. fork multiprocessing start-method issue for worker processes, using the approach from #8823

Signed-off-by: Tyler Michael Smith <[email protected]>
vllm-project · Nov 26, 2024 · bedd593
1 parent c4fcfce
commit bedd593
Show file tree

Hide file tree

Showing 2 changed files with 11 additions and 10 deletions.
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
@@ -9,8 +9,8 @@
 from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
 from vllm.logger import init_logger
 from vllm.triton_utils import maybe_set_triton_cache_manager
-from vllm.utils import (get_distributed_init_method, get_open_port,
-                        get_vllm_instance_id)
+from vllm.utils import (cuda_is_initialized, get_distributed_init_method,
+                        get_open_port, get_vllm_instance_id)
 from vllm.v1.core.scheduler_output import ExecutorMsg, ExecutorMsgType
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.worker.gpu_worker import WorkerProc, WorkerProcHandle
@@ -43,6 +43,13 @@ def __init__(self, vllm_config: VllmConfig) -> None:
             f"tensor_parallel_size ({tensor_parallel_size}) -- pipeline "
             f"parallelism is not yet implemented in v1")
 
+        if (cuda_is_initialized()
+                and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"):
+            logger.warning("CUDA was previously initialized. We must use "
+                           "the `spawn` multiprocessing start method. Setting "
+                           "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'.")
+            os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
         # Ensure that VLLM_INSTANCE_ID is set, to be inherited by workers
         os.environ["VLLM_INSTANCE_ID"] = get_vllm_instance_id()
 

diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
@@ -1,6 +1,5 @@
 """A GPU worker class."""
 import gc
-import multiprocessing
 import os
 import pickle
 from dataclasses import dataclass
@@ -19,6 +18,7 @@
                               set_custom_all_reduce)
 from vllm.distributed.device_communicators.shm_broadcast import (Handle,
                                                                  MessageQueue)
+from vllm.executor.multiproc_worker_utils import get_mp_context
 from vllm.logger import init_logger
 from vllm.model_executor import set_random_seed
 from vllm.platforms import current_platform
@@ -343,13 +343,7 @@ def make_worker_process(
             distributed_init_method: str,
             input_shm_handle,  # Receive SchedulerOutput
     ) -> WorkerProcHandle:
-        # The current process might have CUDA context,
-        # so we need to spawn a new process.
-        # NOTE(rob): this is a problem for using EngineCoreProc w/
-        # LLM, since we need a if __name__ == "__main__" guard.
-
-        # TODO(tms): fix before landing
-        context = multiprocessing.get_context("fork")
+        context = get_mp_context()
 
         # ZMQ paths to send back and forth to worker process
         # Used for initialization.