Updated vLLM to 0.5.3.post1 (#13)
Small changes due to vLLM release 0.5.3.post1:

max_log_len is now in application_settings instead of model_settings
prompt_adapters has been added to application_settings
updated the versions of the various libraries
small changes to OpenAIServingCompletion and OpenAIServingChat, reflecting those made in vLLM
added the new arguments introduced by vLLM (see the usage sketch below)
deleted the arguments pertaining to vision models, mirroring vLLM
gsolard authored Jul 26, 2024
1 parent 30e0be8 commit a470160
Showing 6 changed files with 73 additions and 36 deletions.
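A quick sketch of the two new application-level flags in use; the adapter name and path below are placeholders, and it assumes happy_vllm with vLLM 0.5.3.post1 is importable:

from happy_vllm.utils_args import get_parser

parser = get_parser()
args = parser.parse_args([
    "--max-log-len", "100",                              # caps prompt characters/token ids printed in logs
    "--prompt-adapters", "my_adapter=/path/to/adapter",  # placeholder name=path pair
])
print(args.max_log_len)      # 100
print(args.prompt_adapters)  # list built by vLLM's PromptAdapterParserAction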
2 changes: 2 additions & 0 deletions .env.example
@@ -26,6 +26,8 @@
# CHAT_TEMPLATE=None
# RESPONSE_ROLE="assistant"
# WITH_LAUNCH_ARGUMENTS=false
# MAX_LOG_LEN=None
# PROMPT_ADAPTERS=None

### Model settings ###

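For context, the commented-out entries above are read by pydantic-settings once uncommented. A minimal sketch of how the two new variables surface as fields; it mirrors the ApplicationSettings fields added further down in this commit, and the class name here exists only for the sketch:

from typing import Optional
from pydantic_settings import BaseSettings, SettingsConfigDict

class AppSettingsSketch(BaseSettings):
    max_log_len: Optional[int] = None      # filled from MAX_LOG_LEN
    prompt_adapters: Optional[str] = None  # filled from PROMPT_ADAPTERS

    model_config = SettingsConfigDict(env_file=".env", extra="ignore")

# As in get_parser(), passing _env_parse_none_str turns the literal string "None"
# from the .env file back into Python None:
settings = AppSettingsSketch(_env_parse_none_str="None")
print(settings.max_log_len, settings.prompt_adapters)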
10 changes: 5 additions & 5 deletions pyproject.toml
@@ -13,13 +13,13 @@ license = {file="LICENSE"}
readme = "README.md"
requires-python = ">=3.10,<4.0"
dependencies = [
"vllm>=0.5.0.post1,<1.0",
"fastapi>=0.111.0,<1.0",
"vllm>=0.5.3.post1,<1.0",
"fastapi>=0.111.1,<1.0",
"pydantic_settings>=2.3.4,<3.0",
"uvicorn[standard]>=0.30.1,<1.0",
"uvicorn[standard]>=0.30.3,<1.0",
"prometheus_client>=0.20.0,<1.0",
"numpy>=1.26.4",
"jsonschema>=4.22.0,<5.0"
"numpy>=1.26.4,<2.0",
"jsonschema>=4.23.0,<5.0"
]
classifiers = [
"Programming Language :: Python :: 3",
8 changes: 4 additions & 4 deletions requirements.txt
@@ -1,7 +1,7 @@
vllm==0.5.0.post1
fastapi==0.111.0
vllm==0.5.3.post1
fastapi==0.111.1
pydantic-settings==2.3.4
uvicorn[standard]==0.30.1
uvicorn[standard]==0.30.3
prometheus_client==0.20.0
numpy==1.26.4
jsonschema==4.22.0
jsonschema==4.23.0
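An optional sanity check (a sketch, not part of the project) that an environment matches the bumped pins; importlib.metadata is in the standard library for the Python >=3.10 this project requires:

from importlib.metadata import version

for package, expected in [("vllm", "0.5.3.post1"), ("fastapi", "0.111.1"),
                          ("uvicorn", "0.30.3"), ("jsonschema", "4.23.0")]:
    print(f"{package}: installed {version(package)}, pinned {expected}")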
16 changes: 13 additions & 3 deletions src/happy_vllm/model/model_base.py
@@ -33,6 +33,7 @@
from lmformatenforcer.integrations.transformers import build_token_enforcer_tokenizer_data

from happy_vllm import utils
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat

logger = logging.getLogger(__name__)
@@ -85,11 +86,20 @@ async def _load_model(self, args: Namespace, **kwargs) -> None:
self.max_model_len = self._model.engine.model_config.max_model_len # type: ignore
self.original_truncation_side = self._tokenizer.truncation_side
model_config = await self._model.get_model_config()
if args.disable_log_requests:
request_logger = None
else:
request_logger = RequestLogger(max_log_len=args.max_log_len)
self.openai_serving_chat = OpenAIServingChat(self._model, model_config, [args.model_name],
args.response_role,
args.lora_modules,
args.chat_template)
self.openai_serving_completion = OpenAIServingCompletion(self._model, model_config, [args.model_name], args.lora_modules)
lora_modules=args.lora_modules,
prompt_adapters=args.prompt_adapters,
request_logger=request_logger,
chat_template=args.chat_template,)
self.openai_serving_completion = OpenAIServingCompletion(self._model, model_config, [args.model_name],
lora_modules=args.lora_modules,
prompt_adapters=args.prompt_adapters,
request_logger=request_logger,)
# For test purpose
else:
self.max_model_len = 2048
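The core of the new wiring, isolated as a sketch: RequestLogger and its max_log_len argument are the ones visible in the diff above, while build_request_logger is a hypothetical helper name used only here:

from vllm.entrypoints.logger import RequestLogger

def build_request_logger(disable_log_requests: bool, max_log_len):
    # The vLLM serving classes accept request_logger=None to turn request logging off;
    # otherwise max_log_len caps how much of each prompt is printed in the logs.
    if disable_log_requests:
        return None
    return RequestLogger(max_log_len=max_log_len)

Note that lora_modules, prompt_adapters, request_logger and chat_template are now passed as keyword arguments, matching the vLLM 0.5.3.post1 constructors.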
2 changes: 1 addition & 1 deletion src/happy_vllm/routers/technical.py
@@ -95,7 +95,7 @@ async def get_live_metrics() -> JSONResponse:
@router.get("/v1/models", response_model=technical_schema.HappyvllmModelList)
async def show_available_models():
model: Model = RESOURCES.get(RESOURCE_MODEL)
models = await model.openai_serving_chat.show_available_models()
models = await model.openai_serving_completion.show_available_models()
return JSONResponse(content=models.model_dump())


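The route itself is unchanged; only the serving object answering it differs. A hypothetical client-side check (host and port are placeholders, and httpx is not a project dependency; any HTTP client works):

import httpx

resp = httpx.get("http://localhost:5000/v1/models")  # placeholder host/port
resp.raise_for_status()
print(resp.json())  # the HappyvllmModelList payload serialized by model_dump()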
71 changes: 48 additions & 23 deletions src/happy_vllm/utils_args.py
@@ -18,12 +18,15 @@
import ssl
import json

from typing import Optional, Tuple
from typing import Optional, Tuple, Union, List
from pydantic_settings import BaseSettings, SettingsConfigDict
from argparse import Namespace, ArgumentParser, BooleanOptionalAction
from argparse import Namespace, BooleanOptionalAction

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.cli_args import LoRAParserAction
from vllm.utils import FlexibleArgumentParser
from vllm.executor.executor_base import ExecutorBase
from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str
from vllm.entrypoints.openai.cli_args import LoRAParserAction, PromptAdapterParserAction
from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import BaseTokenizerGroup


DEFAULT_MODEL_NAME = '?'
@@ -47,6 +50,8 @@
DEFAULT_CHAT_TEMPLATE = None
DEFAULT_RESPONSE_ROLE = "assistant"
DEFAULT_WITH_LAUNCH_ARGUMENTS = False
DEFAULT_MAX_LOG_LEN = None
DEFAULT_PROMPT_ADAPTERS = None


class ApplicationSettings(BaseSettings):
@@ -79,17 +84,19 @@ class ApplicationSettings(BaseSettings):
chat_template : Optional[str] = DEFAULT_CHAT_TEMPLATE
response_role: str = DEFAULT_RESPONSE_ROLE
with_launch_arguments: bool = DEFAULT_WITH_LAUNCH_ARGUMENTS
max_log_len: Optional[int] = DEFAULT_MAX_LOG_LEN
prompt_adapters: Optional[str] = DEFAULT_PROMPT_ADAPTERS

model_config = SettingsConfigDict(env_file=".env", extra='ignore', protected_namespaces=('settings', ))


def get_model_settings(parser: ArgumentParser) -> BaseSettings:
def get_model_settings(parser: FlexibleArgumentParser) -> BaseSettings:
"""Gets the model settings. It corresponds to the variables added via AsyncEngineArgs.add_cli_args plus model-name.
First we use the parser to get the default values of vLLM for these variables. We instantiate a BaseSettings model
with these values as default. They are possibly overwritten by environment variables or those of a .env
Args:
parser (ArgumentParser) : The parser containing all the model variables with their default values from vLLM
parser (FlexibleArgumentParser) : The parser containing all the model variables with their default values from vLLM
"""

default_args = parser.parse_args([])
@@ -111,14 +118,15 @@ class ModelSettings(BaseSettings):
seed: int = default_args.seed
max_model_len: Optional[int] = default_args.max_model_len
worker_use_ray: bool = False
distributed_executor_backend: Optional[str] = default_args.distributed_executor_backend
distributed_executor_backend: Optional[Union[str, ExecutorBase]] = default_args.distributed_executor_backend
pipeline_parallel_size: int = default_args.pipeline_parallel_size
tensor_parallel_size: int = default_args.tensor_parallel_size
max_parallel_loading_workers: Optional[int] = default_args.max_parallel_loading_workers
block_size: int = default_args.block_size
enable_prefix_caching: bool = False
disable_sliding_window: bool = False
swap_space: int = default_args.swap_space
swap_space: int = default_args.swap_space # GiB
cpu_offload_gb: int = default_args.cpu_offload_gb # GiB
gpu_memory_utilization: float = default_args.gpu_memory_utilization
max_num_batched_tokens: Optional[int] = default_args.max_num_batched_tokens
max_num_seqs: int = default_args.max_num_seqs
@@ -136,6 +144,9 @@ class ModelSettings(BaseSettings):
enable_lora: bool = False
max_loras: int = default_args.max_loras
max_lora_rank: int = default_args.max_lora_rank
enable_prompt_adapter: bool = False
max_prompt_adapters: int = default_args.max_prompt_adapters
max_prompt_adapter_token: int = default_args.max_prompt_adapter_token
fully_sharded_loras: bool = False
lora_extra_vocab_size: int = default_args.lora_extra_vocab_size
long_lora_scaling_factors: Optional[Tuple[float]] = default_args.long_lora_scaling_factors
@@ -146,33 +157,33 @@ class ModelSettings(BaseSettings):
num_gpu_blocks_override: Optional[int] = default_args.num_gpu_blocks_override
num_lookahead_slots: int = default_args.num_lookahead_slots
model_loader_extra_config: Optional[dict] = default_args.model_loader_extra_config
preemption_mode: Optional[str] = None
max_log_len: Optional[int] = default_args.max_log_len
ignore_patterns: Optional[Union[str, List[str]]] = default_args.ignore_patterns
preemption_mode: Optional[str] = default_args.preemption_mode
disable_log_requests: bool = False
engine_use_ray: bool = False
use_v2_block_manager: bool = False
max_logprobs: int = default_args.max_logprobs
tokenizer_pool_size: int = default_args.tokenizer_pool_size
tokenizer_pool_type: str = default_args.tokenizer_pool_type
tokenizer_pool_type: Union[str, BaseTokenizerGroup] = default_args.tokenizer_pool_type
tokenizer_pool_extra_config: Optional[str] = default_args.tokenizer_pool_extra_config
image_input_type: Optional[str] = default_args.image_input_type
image_token_id: Optional[int] = None
image_input_shape: Optional[str] = default_args.image_input_shape
image_feature_size: Optional[int] = default_args.image_feature_size
image_processor: Optional[str] = None
image_processor_revision: Optional[str] = None
disable_image_processor: bool = False
scheduler_delay_factor: float = default_args.scheduler_delay_factor
enable_chunked_prefill: bool = False
enable_chunked_prefill: Optional[bool] = default_args.enable_chunked_prefill
guided_decoding_backend: str = default_args.guided_decoding_backend
# Speculative decoding configuration.
speculative_model: Optional[str] = default_args.speculative_model
speculative_draft_tensor_parallel_size: Optional[int] = default_args.speculative_draft_tensor_parallel_size
num_speculative_tokens: Optional[int] = default_args.num_speculative_tokens
speculative_max_model_len: Optional[int] = default_args.speculative_max_model_len
speculative_disable_by_batch_size: Optional[int] = default_args.speculative_disable_by_batch_size
ngram_prompt_lookup_max: Optional[int] = default_args.ngram_prompt_lookup_max
ngram_prompt_lookup_min: Optional[int] = default_args.ngram_prompt_lookup_min
qlora_adapter_name_or_path: Optional[str] = None
spec_decoding_acceptance_method: str = default_args.spec_decoding_acceptance_method
typical_acceptance_sampler_posterior_threshold: Optional[float] = default_args.typical_acceptance_sampler_posterior_threshold
typical_acceptance_sampler_posterior_alpha: Optional[float] = default_args.typical_acceptance_sampler_posterior_alpha
qlora_adapter_name_or_path: Optional[str] = default_args.qlora_adapter_name_or_path
disable_logprobs_during_spec_decoding: Optional[bool] = default_args.disable_logprobs_during_spec_decoding

otlp_traces_endpoint: Optional[str] = default_args.otlp_traces_endpoint

model_config = SettingsConfigDict(env_file=".env", extra='ignore', protected_namespaces=('settings', ))

@@ -181,15 +192,15 @@ class ModelSettings(BaseSettings):
return model_settings


def get_parser() -> ArgumentParser:
def get_parser() -> FlexibleArgumentParser:
"""Gets the parser. The default values of all application variables (see ApplicationSettings) are properly
set to the BaseSetting value defined via pydantic. The default values of all model variables (ie those added
via AsyncEngineArgs.add_cli_args plus model-name) are not properly set via pydantic at this point.
Returns:
ArgumentParser : The argparse parser
FlexibleArgumentParser : The argparse parser
"""
parser = ArgumentParser(description="REST API server for vLLM, production ready")
parser = FlexibleArgumentParser(description="REST API server for vLLM, production ready")

application_settings = ApplicationSettings(_env_parse_none_str='None') # type: ignore

@@ -280,6 +291,20 @@ def get_parser() -> ArgumentParser:
type=bool,
default=application_settings.with_launch_arguments,
help="Whether the route launch_arguments should display the launch arguments")
parser.add_argument('--max-log-len',
type=int,
default=application_settings.max_log_len,
help='Max number of prompt characters or prompt '
'ID numbers being printed in log.'
'\n\nDefault: Unlimited')
parser.add_argument(
"--prompt-adapters",
type=nullable_str,
default=application_settings.prompt_adapters,
nargs='+',
action=PromptAdapterParserAction,
help="Prompt adapter configurations in the format name=path. "
"Multiple adapters can be specified.")
parser = AsyncEngineArgs.add_cli_args(parser)
return parser

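As the docstrings above describe, application settings follow a three-level precedence: hard-coded defaults, then environment variables or a .env file read by pydantic, then explicit CLI flags. A sketch of that chain using one of the new settings (assumes happy_vllm and vLLM are importable):

import os
from happy_vllm.utils_args import get_parser

os.environ["MAX_LOG_LEN"] = "256"  # environment overrides the hard-coded default (None)
parser = get_parser()              # ApplicationSettings is instantiated inside get_parser()
args = parser.parse_args([])       # no flag given: the environment value becomes the default
print(args.max_log_len)            # expected: 256

args = parser.parse_args(["--max-log-len", "64"])  # an explicit CLI flag wins
print(args.max_log_len)            # expected: 64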
