diff --git a/docs/endpoints/endpoints.md b/docs/endpoints/endpoints.md
index 74c365c..3667449 100644
--- a/docs/endpoints/endpoints.md
+++ b/docs/endpoints/endpoints.md
@@ -68,4 +68,14 @@ Used to know which part of a prompt will be truncated (more details [here](data_manipulation.md))
 
 ### /v1/split_text (POST)
 
-Splits a text on some separators, for example to prepare for some RAG (more details [here](data_manipulation.md))
\ No newline at end of file
+Splits a text on a set of separators, for example to prepare chunks for RAG (more details [here](data_manipulation.md))
+
+## LoRA endpoints
+
+### /v1/load_lora_adapter (POST)
+
+Loads a specific LoRA adapter at runtime (more details in the [vLLM documentation](https://docs.vllm.ai/en/latest/models/lora.html))
+
+### /v1/unload_lora_adapter (POST)
+
+Unloads a previously loaded LoRA adapter (more details in the [vLLM documentation](https://docs.vllm.ai/en/latest/models/lora.html))
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 1e3bb1d..df2bfb1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,9 +13,9 @@ license = {file="LICENSE"}
 readme = "README.md"
 requires-python = ">=3.10,<4.0"
 dependencies = [
-    "vllm>=0.6.0,<1.0",
-    "fastapi>=0.112.2,<1.0",
-    "pydantic_settings>=2.4.0,<3.0",
+    "vllm>=0.6.1.post1,<1.0",
+    "fastapi>=0.114.1,<1.0",
+    "pydantic_settings>=2.5.2,<3.0",
     "uvicorn[standard]>=0.30.6,<1.0",
     "prometheus_client>=0.20.0,<1.0",
     "numpy>=1.26.4,<2.0",
diff --git a/requirements.txt b/requirements.txt
index c124428..7d505b5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
-vllm==0.6.0
-fastapi==0.112.2
-pydantic-settings==2.4.0
+vllm==0.6.1.post1
+fastapi==0.114.1
+pydantic-settings==2.5.2
 uvicorn[standard]==0.30.6
 prometheus_client==0.20.0
 numpy==1.26.4
diff --git a/src/happy_vllm/model/model_base.py b/src/happy_vllm/model/model_base.py
index 7233a73..ae5c9d8 100644
--- a/src/happy_vllm/model/model_base.py
+++ b/src/happy_vllm/model/model_base.py
@@ -91,7 +91,11 @@ async def _load_model(self, async_engine_client: AsyncEngineRPCClient, args: Nam
    self._tokenizer = self._model.tokenizer # type: ignore
    self._tokenizer_lmformatenforcer = build_token_enforcer_tokenizer_data(self._tokenizer)
    self.max_model_len = self._model.model_config.max_model_len # type: ignore
-    self.original_truncation_side = self._tokenizer.truncation_side # type: ignore
+    # Mistral tokenizers do not expose truncation_side, so fall back to "left"
+    try:
+        self.original_truncation_side = self._tokenizer.truncation_side # type: ignore
+    except AttributeError:
+        self.original_truncation_side = "left"
    model_config = await self._model._get_model_config_rpc()
    if args.disable_log_requests:
        request_logger = None
diff --git a/src/happy_vllm/routers/functional.py b/src/happy_vllm/routers/functional.py
index 5c3b4fd..7290330 100644
--- a/src/happy_vllm/routers/functional.py
+++ b/src/happy_vllm/routers/functional.py
@@ -16,6 +16,7 @@
 
 import os
 import json
+import vllm.envs as envs
 from vllm.utils import random_uuid
 from fastapi import APIRouter, Body
 from pydantic import BaseModel, Field
@@ -417,4 +418,37 @@ async def create_completion(request: Annotated[vllm_protocol.CompletionRequest,
 @router.post("/v1/abort_request")
 async def abort_request(request: functional_schema.RequestAbortRequest):
     model: Model = RESOURCES.get(RESOURCE_MODEL)
-    model._model.engine.abort_request(request.request_id)
\ No newline at end of file
+    model._model.engine.abort_request(request.request_id)
+
+
+if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
+
+    @router.post("/v1/load_lora_adapter")
+    async def load_lora_adapter(request: vllm_protocol.LoadLoraAdapterRequest):
+        model: Model = RESOURCES.get(RESOURCE_MODEL)
+        response = await model.openai_serving_chat.load_lora_adapter(request)
+        if isinstance(response, vllm_protocol.ErrorResponse):
+            return JSONResponse(content=response.model_dump(),
+                                status_code=response.code)
+
+        response = await model.openai_serving_completion.load_lora_adapter(request)
+        if isinstance(response, vllm_protocol.ErrorResponse):
+            return JSONResponse(content=response.model_dump(),
+                                status_code=response.code)
+
+        return Response(status_code=200, content=response)
+
+    @router.post("/v1/unload_lora_adapter")
+    async def unload_lora_adapter(request: vllm_protocol.UnloadLoraAdapterRequest):
+        model: Model = RESOURCES.get(RESOURCE_MODEL)
+        response = await model.openai_serving_chat.unload_lora_adapter(request)
+        if isinstance(response, vllm_protocol.ErrorResponse):
+            return JSONResponse(content=response.model_dump(),
+                                status_code=response.code)
+
+        response = await model.openai_serving_completion.unload_lora_adapter(request)
+        if isinstance(response, vllm_protocol.ErrorResponse):
+            return JSONResponse(content=response.model_dump(),
+                                status_code=response.code)
+
+        return Response(status_code=200, content=response)
\ No newline at end of file
diff --git a/src/happy_vllm/utils_args.py b/src/happy_vllm/utils_args.py
index dd416b8..edb3bf0 100644
--- a/src/happy_vllm/utils_args.py
+++ b/src/happy_vllm/utils_args.py
@@ -123,6 +123,7 @@ class ModelSettings(BaseSettings):
     trust_remote_code: bool = False
     download_dir: Optional[str] = default_args.download_dir
     load_format: str = default_args.load_format
+    config_format: str = default_args.config_format
     dtype: str = default_args.dtype
     kv_cache_dtype: str = default_args.kv_cache_dtype
     quantization_param_path: Optional[str] = default_args.quantization_param_path
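
Note on the model_base.py change: the try/except guards against tokenizers that do not expose truncation_side (Mistral tokenizers). The same fallback can be expressed with getattr; a minimal sketch, where the helper name is ours and not part of the codebase:

def get_truncation_side(tokenizer, default: str = "left") -> str:
    # Hugging Face tokenizers expose `truncation_side`; Mistral tokenizers
    # do not, so fall back to a default, mirroring the try/except above.
    return getattr(tokenizer, "truncation_side", default)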
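
To exercise the new runtime LoRA routes, here is a minimal sketch using requests. It assumes the server listens on localhost:8000 and was started with VLLM_ALLOW_RUNTIME_LORA_UPDATING=True (otherwise the routes are not registered); the adapter name and path are placeholders, and the payload fields follow vLLM's LoadLoraAdapterRequest and UnloadLoraAdapterRequest schemas.

import requests

BASE_URL = "http://localhost:8000"  # assumed host and port

# LoadLoraAdapterRequest takes the adapter name and its on-disk path.
resp = requests.post(f"{BASE_URL}/v1/load_lora_adapter",
                     json={"lora_name": "my_adapter",          # placeholder name
                           "lora_path": "/loras/my_adapter"})  # placeholder path
print(resp.status_code, resp.text)

# UnloadLoraAdapterRequest only needs the adapter name.
resp = requests.post(f"{BASE_URL}/v1/unload_lora_adapter",
                     json={"lora_name": "my_adapter"})
print(resp.status_code, resp.text)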
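
The new config_format setting in utils_args.py mirrors vLLM's engine argument of the same name, added alongside the Mistral config/tokenizer support ("auto", "hf" or "mistral" in vLLM 0.6.1). A minimal sketch of the underlying engine argument, assuming AsyncEngineArgs accepts the field like its other mirrored arguments; the model name is a placeholder:

from vllm.engine.arg_utils import AsyncEngineArgs

# `config_format` selects how the model's config file is interpreted,
# e.g. "mistral" for a consolidated params.json-style config.
engine_args = AsyncEngineArgs(
    model="mistralai/Mistral-7B-Instruct-v0.3",
    config_format="mistral",
)
print(engine_args.config_format)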