Updated vllm to 0.6.1.post1 (#36)
* Updated vllm to 0.6.1

* Forgot some changes

* Changes to take into account mistral tokenizers

* Updated to 0.6.1.post1
gsolard authored Sep 13, 2024
1 parent 412a14a commit ef5c06e
Showing 6 changed files with 58 additions and 9 deletions.
12 changes: 11 additions & 1 deletion docs/endpoints/endpoints.md
```diff
@@ -68,4 +68,14 @@ Used to know which part of a prompt will be truncated (more details [here](data_
 
 ### /v1/split_text (POST)
 
-Splits a text on some separators, for example to prepare for some RAG (more details [here](data_manipulation.md))
+Splits a text on some separators, for example to prepare for some RAG (more details [here](data_manipulation.md))
+
+## Lora endpoints
+
+### /v1/load_lora_adapter (POST)
+
+Load a specific Lora adapter (more details in [vLLM documentation](https://docs.vllm.ai/en/latest/models/lora.html))
+
+### /v1/unload_lora_adapter (POST)
+
+Unload a Lora adapter (more details in [vLLM documentation](https://docs.vllm.ai/en/latest/models/lora.html))
```
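For context, a minimal client-side sketch of calling the two new endpoints, assuming the request bodies that vLLM documents for dynamic LoRA adapters (`lora_name`, `lora_path`) and a placeholder server address; the routes are only exposed when `VLLM_ALLOW_RUNTIME_LORA_UPDATING` is enabled (see the router change further down).

```python
import requests

# Placeholder values: adjust the base URL, adapter name and adapter path to your deployment.
BASE_URL = "http://localhost:5000"
ADAPTER = {"lora_name": "my_adapter", "lora_path": "/path/to/my_adapter"}

# Register the adapter at runtime; the server returns an error payload if loading fails.
resp = requests.post(f"{BASE_URL}/v1/load_lora_adapter", json=ADAPTER)
print(resp.status_code, resp.text)

# Once loaded, the adapter can be targeted by its name in chat/completion requests;
# unload it when it is no longer needed.
resp = requests.post(f"{BASE_URL}/v1/unload_lora_adapter", json={"lora_name": "my_adapter"})
print(resp.status_code, resp.text)
```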
6 changes: 3 additions & 3 deletions pyproject.toml
```diff
@@ -13,9 +13,9 @@ license = {file="LICENSE"}
 readme = "README.md"
 requires-python = ">=3.10,<4.0"
 dependencies = [
-    "vllm>=0.6.0,<1.0",
-    "fastapi>=0.112.2,<1.0",
-    "pydantic_settings>=2.4.0,<3.0",
+    "vllm>=0.6.1.post1,<1.0",
+    "fastapi>=0.114.1,<1.0",
+    "pydantic_settings>=2.5.2,<3.0",
     "uvicorn[standard]>=0.30.6,<1.0",
     "prometheus_client>=0.20.0,<1.0",
     "numpy>=1.26.4,<2.0",
```
6 changes: 3 additions & 3 deletions requirements.txt
```diff
@@ -1,6 +1,6 @@
-vllm==0.6.0
-fastapi==0.112.2
-pydantic-settings==2.4.0
+vllm==0.6.1.post1
+fastapi==0.114.1
+pydantic-settings==2.5.2
 uvicorn[standard]==0.30.6
 prometheus_client==0.20.0
 numpy==1.26.4
```
6 changes: 5 additions & 1 deletion src/happy_vllm/model/model_base.py
```diff
@@ -91,7 +91,11 @@ async def _load_model(self, async_engine_client: AsyncEngineRPCClient, args: Nam
         self._tokenizer = self._model.tokenizer # type: ignore
         self._tokenizer_lmformatenforcer = build_token_enforcer_tokenizer_data(self._tokenizer)
         self.max_model_len = self._model.model_config.max_model_len # type: ignore
-        self.original_truncation_side = self._tokenizer.truncation_side # type: ignore
+        # To take into account Mistral tokenizers
+        try:
+            self.original_truncation_side = self._tokenizer.truncation_side # type: ignore
+        except:
+            self.original_truncation_side = "left"
         model_config = await self._model._get_model_config_rpc()
         if args.disable_log_requests:
             request_logger = None
```
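Per the commit message and the added comment, the `truncation_side` lookup can fail for Mistral tokenizers, so the code now falls back to `"left"`. As a side note, an equivalent sketch (not the committed code) that handles only the missing attribute explicitly:

```python
def safe_truncation_side(tokenizer, default: str = "left") -> str:
    """Sketch of the same fallback: return the tokenizer's truncation_side,
    or the default for tokenizers (e.g. Mistral tokenizers) that do not expose
    the attribute. The committed code uses a bare `except`, which also swallows
    errors other than a missing attribute."""
    return getattr(tokenizer, "truncation_side", default)
```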
36 changes: 35 additions & 1 deletion src/happy_vllm/routers/functional.py
```diff
@@ -16,6 +16,7 @@
 
 import os
 import json
+import vllm.envs as envs
 from vllm.utils import random_uuid
 from fastapi import APIRouter, Body
 from pydantic import BaseModel, Field
@@ -417,4 +418,37 @@ async def create_completion(request: Annotated[vllm_protocol.CompletionRequest,
 @router.post("/v1/abort_request")
 async def abort_request(request: functional_schema.RequestAbortRequest):
     model: Model = RESOURCES.get(RESOURCE_MODEL)
-    model._model.engine.abort_request(request.request_id)
+    model._model.engine.abort_request(request.request_id)
+
+
+if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
+
+    @router.post("/v1/load_lora_adapter")
+    async def load_lora_adapter(request: vllm_protocol.LoadLoraAdapterRequest):
+        model: Model = RESOURCES.get(RESOURCE_MODEL)
+        response = await model.openai_serving_chat.load_lora_adapter(request)
+        if isinstance(response, vllm_protocol.ErrorResponse):
+            return JSONResponse(content=response.model_dump(),
+                                status_code=response.code)
+
+        response = await model.openai_serving_completion.load_lora_adapter(request)
+        if isinstance(response, vllm_protocol.ErrorResponse):
+            return JSONResponse(content=response.model_dump(),
+                                status_code=response.code)
+
+        return Response(status_code=200, content=response)
+
+    @router.post("/v1/unload_lora_adapter")
+    async def unload_lora_adapter(request: vllm_protocol.UnloadLoraAdapterRequest):
+        model: Model = RESOURCES.get(RESOURCE_MODEL)
+        response = await model.openai_serving_chat.unload_lora_adapter(request)
+        if isinstance(response, vllm_protocol.ErrorResponse):
+            return JSONResponse(content=response.model_dump(),
+                                status_code=response.code)
+
+        response = await model.openai_serving_completion.unload_lora_adapter(request)
+        if isinstance(response, vllm_protocol.ErrorResponse):
+            return JSONResponse(content=response.model_dump(),
+                                status_code=response.code)
+
+        return Response(status_code=200, content=response)
```
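Note that the two routes above are registered at import time and only when vLLM's `VLLM_ALLOW_RUNTIME_LORA_UPDATING` environment variable is enabled (vLLM's documentation suggests `export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True`). A minimal sketch of what that implies for the server process, with the actual launch left as a deployment-specific placeholder:

```python
import os

# The flag must be present in the environment before this router module is
# imported, since the `if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:` guard runs
# at import time. How the server is actually launched is deployment-specific.
os.environ["VLLM_ALLOW_RUNTIME_LORA_UPDATING"] = "True"

import vllm.envs as envs

# vllm.envs reads the environment when the attribute is accessed.
assert envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING
```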
1 change: 1 addition & 0 deletions src/happy_vllm/utils_args.py
```diff
@@ -123,6 +123,7 @@ class ModelSettings(BaseSettings):
     trust_remote_code: bool = False
     download_dir: Optional[str] = default_args.download_dir
     load_format: str = default_args.load_format
+    config_format: str = default_args.config_format
     dtype: str = default_args.dtype
     kv_cache_dtype: str = default_args.kv_cache_dtype
     quantization_param_path: Optional[str] = default_args.quantization_param_path
```
