diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py index 7f57d5cf9b182..a89fa445bf96a 100644 --- a/tests/async_engine/test_api_server.py +++ b/tests/async_engine/test_api_server.py @@ -1,3 +1,4 @@ +import os import subprocess import sys import time @@ -35,11 +36,17 @@ def api_server(tokenizer_pool_size: int, engine_use_ray: bool, "127.0.0.1", "--tokenizer-pool-size", str(tokenizer_pool_size) ] + + # Copy the environment variables and append `VLLM_ALLOW_ENGINE_USE_RAY=1` + # to prevent `--engine-use-ray` from raising an exception due to its deprecation + env_vars = os.environ.copy() + env_vars["VLLM_ALLOW_ENGINE_USE_RAY"] = "1" + if engine_use_ray: commands.append("--engine-use-ray") if worker_use_ray: commands.append("--worker-use-ray") - uvicorn_process = subprocess.Popen(commands) + uvicorn_process = subprocess.Popen(commands, env=env_vars) yield uvicorn_process.terminate() diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py index aa2b6e22208f3..d763f2c2e07b1 100644 --- a/tests/async_engine/test_async_llm_engine.py +++ b/tests/async_engine/test_async_llm_engine.py @@ -1,4 +1,5 @@ import asyncio +import os from dataclasses import dataclass import pytest @@ -106,11 +107,16 @@ async def test_new_requests_event(): assert engine.engine.add_request_calls == 3 assert engine.engine.step_calls == old_step_calls + 1 + # Allow the deprecated engine_use_ray to not raise an exception + os.environ["VLLM_ALLOW_ENGINE_USE_RAY"] = "1" + engine = MockAsyncLLMEngine(worker_use_ray=True, engine_use_ray=True) assert engine.get_model_config() is not None assert engine.get_tokenizer() is not None assert engine.get_decoding_config() is not None + os.environ.pop("VLLM_ALLOW_ENGINE_USE_RAY") + def test_asyncio_run(): wait_for_gpu_memory_to_clear( diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py index 0d53b39e7ce1c..d5c88708d047b 100644 ---
a/tests/async_engine/test_openapi_server_ray.py +++ b/tests/async_engine/test_openapi_server_ray.py @@ -23,7 +23,11 @@ def server(): str(chatml_jinja_path), ] - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + # Allow `--engine-use-ray`, otherwise launching the server throws + # an error due to trying to use a deprecated feature + env_dict = {"VLLM_ALLOW_ENGINE_USE_RAY": "1"} + with RemoteOpenAIServer(MODEL_NAME, args, + env_dict=env_dict) as remote_server: yield remote_server diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py index f9f246436c0f7..d0f91a63b2d6a 100644 --- a/tests/spec_decode/e2e/conftest.py +++ b/tests/spec_decode/e2e/conftest.py @@ -1,4 +1,5 @@ import asyncio +import os from itertools import cycle from typing import Dict, List, Optional, Sequence, Tuple, Union @@ -56,6 +57,11 @@ def __init__( ) -> None: if "disable_log_stats" not in kwargs: kwargs["disable_log_stats"] = True + + # Needed so that engine_use_ray works as a deprecated feature, + # otherwise the following constructor will raise an exception + os.environ["VLLM_ALLOW_ENGINE_USE_RAY"] = "1" + engine_args = AsyncEngineArgs( model=model, tokenizer=tokenizer, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 8d5690acfab73..8911d91420d70 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -923,7 +923,13 @@ def add_cli_args(parser: FlexibleArgumentParser, parser.add_argument('--engine-use-ray', action='store_true', help='Use Ray to start the LLM engine in a ' - 'separate process as the server process.') + 'separate process as the server process.' + '(DEPRECATED. This argument is deprecated ' + 'and will be removed in a future update. ' + 'Set `VLLM_ALLOW_ENGINE_USE_RAY=1` to force ' + 'use it. See ' + 'https://github.com/vllm-project/vllm/issues/7045.'
+ ')') parser.add_argument('--disable-log-requests', action='store_true', help='Disable logging requests.') diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index ff5019f2ef67b..a28b20fcbbcd8 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -29,6 +29,7 @@ from vllm.sampling_params import SamplingParams from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.usage.usage_lib import UsageContext +from vllm.utils import print_warning_once logger = init_logger(__name__) ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S @@ -510,6 +511,20 @@ def __init__(self, self.log_requests = log_requests self.engine = self._init_engine(*args, **kwargs) + if self.engine_use_ray: + print_warning_once( + "DEPRECATED. `--engine-use-ray` is deprecated and will " + "be removed in a future update. " + "See https://github.com/vllm-project/vllm/issues/7045.") + + if envs.VLLM_ALLOW_ENGINE_USE_RAY: + print_warning_once( + "VLLM_ALLOW_ENGINE_USE_RAY is set, force engine use Ray") + else: + raise ValueError("`--engine-use-ray` is deprecated. " + "Set `VLLM_ALLOW_ENGINE_USE_RAY=1` to " + "force use it") + self.background_loop: Optional[asyncio.Future] = None # We need to keep a reference to unshielded # task as well to prevent it from being garbage diff --git a/vllm/envs.py b/vllm/envs.py index 22b2aa37a925e..5518cd9ced9be 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -55,6 +55,7 @@ VERBOSE: bool = False VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False VLLM_TEST_FORCE_FP8_MARLIN: bool = False + VLLM_ALLOW_ENGINE_USE_RAY: bool = False VLLM_PLUGINS: Optional[List[str]] = None @@ -364,6 +365,14 @@ def get_default_config_root(): (os.environ.get("VLLM_TEST_FORCE_FP8_MARLIN", "0").strip().lower() in ("1", "true")), + # If set, allow running the engine as a separate ray actor, + # which is a deprecated feature soon to be removed. 
+ # See https://github.com/vllm-project/vllm/issues/7045 + "VLLM_ALLOW_ENGINE_USE_RAY": + lambda: + (os.environ.get("VLLM_ALLOW_ENGINE_USE_RAY", "0").strip().lower() in + ("1", "true")), + # a list of plugin names to load, separated by commas. # if this is not set, it means all plugins will be loaded # if this is set to an empty string, no plugins will be loaded