diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 0efb0cbbf8bec..208766a18e99c 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -2,8 +2,8 @@
 import dataclasses
 import json
 from dataclasses import dataclass
-from typing import (TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple,
-                    Type, Union)
+from typing import (TYPE_CHECKING, Any, Dict, List, Literal, Mapping, Optional,
+                    Tuple, Type, Union)
 
 import torch
 
@@ -177,6 +177,7 @@ class EngineArgs:
     disable_async_output_proc: bool = False
     override_neuron_config: Optional[Dict[str, Any]] = None
     mm_processor_kwargs: Optional[Dict[str, Any]] = None
+    scheduling_policy: Literal["fcfs", "priority"] = "fcfs"
 
     def __post_init__(self):
         if self.tokenizer is None:
@@ -797,6 +798,16 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             default=None,
             help="override or set neuron device configuration.")
 
+        parser.add_argument(
+            '--scheduling-policy',
+            choices=['fcfs', 'priority'],
+            default="fcfs",
+            help='The scheduling policy to use. "fcfs" (first come first served'
+            ', i.e. requests are handled in order of arrival; default) '
+            'or "priority" (requests are handled based on given '
+            'priority (lower value means earlier handling) and time of '
+            'arrival deciding any ties).')
+
         return parser
 
     @classmethod
@@ -1011,6 +1022,7 @@ def create_engine_config(self) -> EngineConfig:
             multi_step_stream_outputs=self.multi_step_stream_outputs,
             send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER
                              and parallel_config.use_ray),
+            policy=self.scheduling_policy,
         )
         lora_config = LoRAConfig(
             max_lora_rank=self.max_lora_rank,
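
For context, a minimal usage sketch (not part of the patch) of how the new option might be exercised through the EngineArgs / create_engine_config entry points touched in the hunks above. The model name and the attribute access on the returned config are illustrative assumptions, not something this diff guarantees.

# Programmatic equivalent of passing `--scheduling-policy priority` on the CLI.
from vllm.engine.arg_utils import EngineArgs

engine_args = EngineArgs(
    model="facebook/opt-125m",        # any model; chosen only for illustration
    scheduling_policy="priority",     # new field introduced by this diff
)

# create_engine_config() forwards the value into the scheduler config as
# `policy=` (see the last hunk), so the scheduler can order requests by
# priority instead of strict arrival order.
engine_config = engine_args.create_engine_config()
print(engine_config.scheduler_config.policy)  # expected: "priority"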