From ee2c7f56fce31bbb7bfcb9b25f1ea712bafe72e5 Mon Sep 17 00:00:00 2001 From: Ricky Xu Date: Mon, 25 Nov 2024 21:09:43 -0800 Subject: [PATCH] [v1] EngineArgs for better config handling for v1 (#10382) Signed-off-by: rickyx Signed-off-by: Andrew Feldman --- .buildkite/test-pipeline.yaml | 2 +- tests/v1/engine/test_async_llm.py | 3 ++ tests/v1/engine/test_engine_args.py | 42 +++++++++++++++++ tests/v1/engine/test_engine_core.py | 3 +- tests/v1/engine/test_engine_core_client.py | 6 ++- vllm/engine/arg_utils.py | 53 ++++++++++++++++++++-- vllm/engine/async_llm_engine.py | 2 +- vllm/engine/llm_engine.py | 2 +- vllm/engine/multiprocessing/engine.py | 2 +- vllm/entrypoints/openai/api_server.py | 4 +- vllm/v1/engine/async_llm.py | 2 +- vllm/v1/engine/core.py | 13 ------ vllm/v1/engine/llm_engine.py | 2 +- 13 files changed, 109 insertions(+), 27 deletions(-) create mode 100644 tests/v1/engine/test_engine_args.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index bff33d35b423e..fc23c9cff0d87 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -172,7 +172,7 @@ steps: - vllm/ - tests/v1 commands: - - pytest -v -s v1 + - VLLM_USE_V1=1 pytest -v -s v1 - label: Examples Test # 15min working_dir: "/vllm-workspace/examples" diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 1f26fe0fc892f..fffb5b8100ec7 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -32,6 +32,9 @@ async def generate(engine: AsyncLLM, request_id: str, @pytest.mark.asyncio async def test_load(monkeypatch): + # TODO(rickyx): Remove monkeypatch once we have a better way to test V1 + # so that in the future when we switch, we don't have to change all the + # tests. with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") diff --git a/tests/v1/engine/test_engine_args.py b/tests/v1/engine/test_engine_args.py new file mode 100644 index 0000000000000..69cfdf5a395c1 --- /dev/null +++ b/tests/v1/engine/test_engine_args.py @@ -0,0 +1,42 @@ +import pytest + +from vllm import envs +from vllm.config import VllmConfig +from vllm.engine.arg_utils import EngineArgs +from vllm.usage.usage_lib import UsageContext + +if not envs.VLLM_USE_V1: + pytest.skip( + "Skipping V1 tests. Rerun with `VLLM_USE_V1=1` to test.", + allow_module_level=True, + ) + + +def test_defaults(): + engine_args = EngineArgs(model="facebook/opt-125m") + + # Assert V1 defaults + assert (engine_args.enable_prefix_caching + ), "V1 turns on prefix caching by default" + + +def test_defaults_with_usage_context(): + engine_args = EngineArgs(model="facebook/opt-125m") + vllm_config: VllmConfig = engine_args.create_engine_config( + UsageContext.LLM_CLASS) + + assert vllm_config.scheduler_config.max_num_seqs == 1024 + assert vllm_config.scheduler_config.max_num_batched_tokens == 8192 + + engine_args = EngineArgs(model="facebook/opt-125m") + vllm_config = engine_args.create_engine_config( + UsageContext.OPENAI_API_SERVER) + assert vllm_config.scheduler_config.max_num_seqs == 1024 + assert vllm_config.scheduler_config.max_num_batched_tokens == 2048 + + +def test_prefix_cache_disabled_with_multimodel(): + engine_args = EngineArgs(model="llava-hf/llava-1.5-7b-hf") + + vllm_config = engine_args.create_engine_config(UsageContext.LLM_CLASS) + assert not vllm_config.cache_config.enable_prefix_caching diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index b3692b594326a..bd11ff1877064 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -43,7 +43,8 @@ def test_engine_core(monkeypatch): m.setenv("VLLM_USE_V1", "1") """Setup the EngineCore.""" engine_args = EngineArgs(model=MODEL_NAME) - vllm_config = engine_args.create_engine_config() + vllm_config = engine_args.create_engine_config( + usage_context=UsageContext.UNKNOWN_CONTEXT) executor_class = AsyncLLM._get_executor_cls(vllm_config) engine_core = EngineCore(vllm_config=vllm_config, diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index e248e35ae4069..582192196aaf9 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -82,7 +82,8 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool): m.setenv("VLLM_USE_V1", "1") engine_args = EngineArgs(model=MODEL_NAME, compilation_config=3) - vllm_config = engine_args.create_engine_config() + vllm_config = engine_args.create_engine_config( + UsageContext.UNKNOWN_CONTEXT) executor_class = AsyncLLM._get_executor_cls(vllm_config) client = EngineCoreClient.make_client( vllm_config, @@ -153,7 +154,8 @@ async def test_engine_core_client_asyncio(monkeypatch): m.setenv("VLLM_USE_V1", "1") engine_args = EngineArgs(model=MODEL_NAME) - vllm_config = engine_args.create_engine_config() + vllm_config = engine_args.create_engine_config( + usage_context=UsageContext.UNKNOWN_CONTEXT) executor_class = AsyncLLM._get_executor_cls(vllm_config) client = EngineCoreClient.make_client( vllm_config, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ca68c1d57151c..60ad5ee54a2f2 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -20,6 +20,7 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.platforms import current_platform from vllm.transformers_utils.utils import check_gguf_file +from vllm.usage.usage_lib import UsageContext from vllm.utils import FlexibleArgumentParser, StoreBoolean if TYPE_CHECKING: @@ -113,7 +114,7 @@ class EngineArgs: # NOTE(kzawora): default block size for Gaudi should be 128 # smaller sizes still work, but very inefficiently block_size: int = 16 if not current_platform.is_hpu() else 128 - enable_prefix_caching: bool = False + enable_prefix_caching: Optional[bool] = None disable_sliding_window: bool = False use_v2_block_manager: bool = True swap_space: float = 4 # GiB @@ -197,6 +198,11 @@ def __post_init__(self): if not self.tokenizer: self.tokenizer = self.model + # Override the default value of enable_prefix_caching if it's not set + # by user. + if self.enable_prefix_caching is None: + self.enable_prefix_caching = bool(envs.VLLM_USE_V1) + # support `EngineArgs(compilation_config={...})` # without having to manually construct a # CompilationConfig object @@ -953,7 +959,12 @@ def create_load_config(self) -> LoadConfig: ignore_patterns=self.ignore_patterns, ) - def create_engine_config(self) -> VllmConfig: + def create_engine_config(self, + usage_context: Optional[UsageContext] = None + ) -> VllmConfig: + if envs.VLLM_USE_V1: + self._override_v1_engine_args(usage_context) + # gguf file needs a specific model loader and doesn't use hf_repo if check_gguf_file(self.model): self.quantization = self.load_format = "gguf" @@ -1170,7 +1181,7 @@ def create_engine_config(self) -> VllmConfig: or "all" in detailed_trace_modules, ) - return VllmConfig( + config = VllmConfig( model_config=model_config, cache_config=cache_config, parallel_config=parallel_config, @@ -1185,6 +1196,42 @@ def create_engine_config(self) -> VllmConfig: compilation_config=self.compilation_config, ) + if envs.VLLM_USE_V1: + self._override_v1_engine_config(config) + return config + + def _override_v1_engine_args(self, usage_context: UsageContext) -> None: + """ + Override the EngineArgs's args based on the usage context for V1. + """ + assert envs.VLLM_USE_V1, "V1 is not enabled" + + if self.max_num_batched_tokens is None: + # When no user override, set the default values based on the + # usage context. + if usage_context == UsageContext.LLM_CLASS: + logger.warning("Setting max_num_batched_tokens to 8192 " + "for LLM_CLASS usage context.") + self.max_num_seqs = 1024 + self.max_num_batched_tokens = 8192 + elif usage_context == UsageContext.OPENAI_API_SERVER: + logger.warning("Setting max_num_batched_tokens to 2048 " + "for OPENAI_API_SERVER usage context.") + self.max_num_seqs = 1024 + self.max_num_batched_tokens = 2048 + + def _override_v1_engine_config(self, engine_config: VllmConfig) -> None: + """ + Override the EngineConfig's configs based on the usage context for V1. + """ + assert envs.VLLM_USE_V1, "V1 is not enabled" + # TODO (ywang96): Enable APC by default when VLM supports it. + if engine_config.model_config.is_multimodal_model: + logger.warning( + "Prefix caching is currently not supported for multimodal " + "models and has been disabled.") + engine_config.cache_config.enable_prefix_caching = False + @dataclass class AsyncEngineArgs(EngineArgs): diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 5a5388708b1c6..3224577c567f8 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -680,7 +680,7 @@ def from_engine_args( """Creates an async LLM engine from the engine arguments.""" # Create the engine configs. if engine_config is None: - engine_config = engine_args.create_engine_config() + engine_config = engine_args.create_engine_config(usage_context) executor_class = cls._get_executor_cls(engine_config) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index fb21b2dedeb74..a4975cece9a81 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -568,7 +568,7 @@ def from_engine_args( ) -> "LLMEngine": """Creates an LLM engine from the engine arguments.""" # Create the engine configs. - engine_config = engine_args.create_engine_config() + engine_config = engine_args.create_engine_config(usage_context) executor_class = cls._get_executor_cls(engine_config) # Create the LLM engine. engine = cls( diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index 7de23643a2e1c..49a90b321dac4 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -111,7 +111,7 @@ def from_engine_args(cls, engine_args: AsyncEngineArgs, from vllm.plugins import load_general_plugins load_general_plugins() - engine_config = engine_args.create_engine_config() + engine_config = engine_args.create_engine_config(usage_context) executor_class = LLMEngine._get_executor_cls(engine_config) use_async_sockets = engine_config.model_config.use_async_output_proc diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index bc018be982bff..6bc31ef83ded4 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -135,8 +135,8 @@ async def build_async_engine_client_from_engine_args( # TODO: fill out feature matrix. if (MQLLMEngineClient.is_unsupported_config(engine_args) or envs.VLLM_USE_V1 or disable_frontend_multiprocessing): - - engine_config = engine_args.create_engine_config() + engine_config = engine_args.create_engine_config( + UsageContext.OPENAI_API_SERVER) uses_ray = getattr(AsyncLLMEngine._get_executor_cls(engine_config), "uses_ray", False) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index c44ebb2a85ba0..a17c8eac4b77c 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -94,7 +94,7 @@ def from_engine_args( # Create the engine configs. if engine_config is None: - vllm_config = engine_args.create_engine_config() + vllm_config = engine_args.create_engine_config(usage_context) else: vllm_config = engine_config diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 1a978fbe7355f..34f99dd30ef2e 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -41,19 +41,6 @@ def __init__( executor_class: Type[GPUExecutor], usage_context: UsageContext, ): - # Override the configs for V1. - # FIXME - if usage_context == UsageContext.LLM_CLASS: - vllm_config.scheduler_config.max_num_seqs = 1024 - vllm_config.scheduler_config.max_num_batched_tokens = 8192 - elif usage_context == UsageContext.OPENAI_API_SERVER: - vllm_config.scheduler_config.max_num_seqs = 1024 - vllm_config.scheduler_config.max_num_batched_tokens = 2048 - - # TODO (ywang96): Enable APC by default when VLM supports it. - if not vllm_config.model_config.is_multimodal_model: - vllm_config.cache_config.enable_prefix_caching = True - assert vllm_config.model_config.task != "embedding" logger.info("Initializing an LLM engine (v%s) with config: %s", diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 75a77be750acd..7a5482f03b6fa 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -82,7 +82,7 @@ def from_engine_args( """Creates an LLM engine from the engine arguments.""" # Create the engine configs. - vllm_config = engine_args.create_engine_config() + vllm_config = engine_args.create_engine_config(usage_context) executor_class = cls._get_executor_cls(vllm_config) if VLLM_ENABLE_V1_MULTIPROCESSING: