Update 055 #28

Merged
merged 15 commits on Aug 29, 2024
10 changes: 10 additions & 0 deletions docs/endpoints/endpoints.md
@@ -45,10 +45,20 @@ DEPRECATED : These two routes take a prompt and complete it (more details [here
## Tokenizer endpoints

### /v1/tokenizer (POST)
**[!IMPORTANT] Deprecated**

Used to tokenize a text (more details [here](tokenizer.md))

### /v2/tokenizer (POST)

Used to tokenize a text (more details [here](tokenizer.md))

### /v1/decode (POST)
**[!IMPORTANT] Deprecated**

Used to decode a list of token ids (more details [here](tokenizer.md))

### /v2/decode (POST)

Used to decode a list of token ids (more details [here](tokenizer.md))

91 changes: 89 additions & 2 deletions docs/endpoints/tokenizer.md
@@ -2,10 +2,96 @@

## Tokenizer endpoints

The tokenizer endpoints allow you to use the tokenizer underlying the model. These endpoints are [`/v1/tokenizer`](#v1tokenizer-post) and [`/v1/decode`](#v1decode-post) and you can find more details on each below.
The tokenizer endpoints allow you to use the tokenizer underlying the model. These endpoints are [`/v2/tokenizer`](#v2tokenizer-post) and [`/v2/decode`](#v2decode-post) and you can find more details on each below.

### /v1/tokenizer (POST)
**[!IMPORTANT]** The endpoints [`/v1/tokenizer`](#v1tokenizer-post) and [`/v1/decode`](#v1decode-post) are deprecated.

### /v2/tokenizer (POST)
Tokenizes the given text. The format of the input depends on the method used:

#### Completions

```
{
"model": "my_model",
"prompt": "This is a text example",
"add_special_tokens": true
}
```

- `model`: ID of the model to use
- `prompt`: The text to tokenize
- `add_special_tokens`: Add special tokens at the beginning of the prompt (optional, default value: `true`)

#### Chat/Completions

```
{
"model": "my_model",
"messages": [
{
"role": "system",
"content": "This is an example"
},
{
"role": "user",
"content": "This is an example"
}
],
"add_special_tokens": true,
"add_generation_prompt": true
}
```

- `model`: ID of the model to use
- `messages`: The messages to tokenize
- `add_special_tokens`: Add special tokens at the beginning (optional, default value: `false`)
- `add_generation_prompt`: Append the model's generation prompt (from its chat template) before tokenizing (optional, default value: `true`)

The format of the output is as follows:

```
{
"count": [
23
],
"max_model_len": 8192,
"tokens": [128000, 2028, 374, 264, 1495, 318]
}
```

- `count`: The number of tokens in the input
- `max_model_len`: The maximum model length from the model's configuration
- `tokens`: The list of token ids given by the tokenizer (one extra special token is included only if `add_special_tokens` was set to `true` in the request)
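
As a concrete illustration, here is a minimal sketch of calling `/v2/tokenizer` with Python's `requests` library. The base URL and the model name `my_model` are placeholders to adapt to your deployment; the chat form works the same way, with `messages` instead of `prompt`.

```
import requests

BASE_URL = "http://localhost:8000"  # placeholder: point this at your happy_vllm instance

payload = {
    "model": "my_model",            # placeholder model ID
    "prompt": "This is a text example",
    "add_special_tokens": True,
}

response = requests.post(f"{BASE_URL}/v2/tokenizer", json=payload)
response.raise_for_status()
result = response.json()

print(result["count"])   # number of tokens in the input
print(result["tokens"])  # token ids produced by the tokenizer
```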


### /v2/decode (POST)

Decodes the given token ids. The format of the input is as follows:

```
{
"tokens": [128000, 2028, 374, 264, 1495, 318],
"model": "my_model"
}
```

- `tokens`: The ids of the tokens we want to decode
- `model`: ID of the model to use

The format of the output is as follows:

```
{
"prompt": "<s> Hey, how are you ?"
}
```

- `prompt`: The decoded string corresponding to the token ids
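
And a matching minimal sketch for `/v2/decode`, under the same assumptions (placeholder base URL and model name):

```
import requests

BASE_URL = "http://localhost:8000"  # placeholder: point this at your happy_vllm instance

payload = {
    "model": "my_model",            # placeholder model ID
    "tokens": [128000, 2028, 374, 264, 1495, 318],
}

response = requests.post(f"{BASE_URL}/v2/decode", json=payload)
response.raise_for_status()
print(response.json()["prompt"])    # decoded string corresponding to the token ids
```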

### /v1/tokenizer (POST)
**[!IMPORTANT] Deprecated**

Tokenizes the given text. The format of the input is as follows:

```
@@ -51,6 +137,7 @@ The format of the output is as follows :
- `tokens_str`: The string representation of each token (given only if `with_tokens_str` was set to `true` in the request)

### /v1/decode (POST)
**[!IMPORTANT] Deprecated**

Decodes the given token ids. The format of the input is as follows:

6 changes: 3 additions & 3 deletions pyproject.toml
@@ -13,10 +13,10 @@ license = {file="LICENSE"}
readme = "README.md"
requires-python = ">=3.10,<4.0"
dependencies = [
"vllm>=0.5.4,<1.0",
"fastapi>=0.112.0,<1.0",
"vllm>=0.5.5,<1.0",
"fastapi>=0.112.2,<1.0",
"pydantic_settings>=2.4.0,<3.0",
"uvicorn[standard]>=0.30.5,<1.0",
"uvicorn[standard]>=0.30.6,<1.0",
"prometheus_client>=0.20.0,<1.0",
"numpy>=1.26.4,<2.0",
"jsonschema>=4.23.0,<5.0"
6 changes: 3 additions & 3 deletions requirements.txt
@@ -1,7 +1,7 @@
vllm==0.5.4
fastapi==0.112.0
vllm==0.5.5
fastapi==0.112.2
pydantic-settings==2.4.0
uvicorn[standard]==0.30.5
uvicorn[standard]==0.30.6
prometheus_client==0.20.0
numpy==1.26.4
jsonschema==4.23.0
4 changes: 2 additions & 2 deletions src/happy_vllm/application.py
@@ -21,6 +21,7 @@
from .core.resources import get_lifespan
from prometheus_client import make_asgi_app
from fastapi.middleware.cors import CORSMiddleware
from vllm.entrypoints.openai.api_server import mount_metrics
from vllm.entrypoints.openai.rpc.client import AsyncEngineRPCClient

from happy_vllm import utils
@@ -41,8 +42,7 @@ async def declare_application(async_engine_client: AsyncEngineRPCClient, args: N
)

# Add prometheus asgi middleware to route /metrics requests
metrics_app = make_asgi_app()
app.mount("/metrics", metrics_app)
mount_metrics(app)

# CORS middleware that allows all origins to avoid CORS problems
# see https://fastapi.tiangolo.com/tutorial/cors/#use-corsmiddleware
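For context, a minimal sketch (a hypothetical standalone app, not the project's actual application factory) of what this change does: instead of mounting a bare `prometheus_client` ASGI app on `/metrics`, the application now relies on vLLM's `mount_metrics` helper so that the engine's own metrics are exposed on the same route.

```
from fastapi import FastAPI
from prometheus_client import make_asgi_app

app = FastAPI()

# Previous approach: mount a plain prometheus_client ASGI app; it only serves
# metrics registered in this process's default registry.
app.mount("/metrics", make_asgi_app())

# New approach (as in the diff above; exact behaviour depends on the installed vLLM version):
# from vllm.entrypoints.openai.api_server import mount_metrics
# mount_metrics(app)
```
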
3 changes: 3 additions & 0 deletions src/happy_vllm/core/resources.py
@@ -50,6 +50,9 @@ async def lifespan(app: FastAPI):

RESOURCES[RESOURCE_MODEL] = model

# Force log once to have information in /metrics before requests
await model._model.do_log_stats()

yield

# Clean up the ML models and release the resources
1 change: 1 addition & 0 deletions src/happy_vllm/launch.py
@@ -44,6 +44,7 @@ async def launch_app(args, **uvicorn_kwargs):
async with happy_vllm_build_async_engine_client(args) as async_engine_client:
app = await declare_application(async_engine_client, args=args)
shutdown_task = await serve_http(app,
engine=async_engine_client,
host=args.host,
port=args.port,
log_level=args.uvicorn_log_level,
3 changes: 3 additions & 0 deletions src/happy_vllm/model/model_base.py
@@ -270,6 +270,9 @@ async def async_iter(self, my_list):
for element in my_list:
yield element

async def do_log_stats(self):
pass


class MockGenerateResponse():

8 changes: 5 additions & 3 deletions src/happy_vllm/routers/functional.py
@@ -251,15 +251,17 @@ async def tokenizer_v2(request: Annotated[vllm_protocol.TokenizeRequest,
"""Tokenizes a text

The request should be a JSON object with the following fields:

Completions :
- model : ID of the model to use
- prompt : The text to tokenize
- add_special_tokens : Add a special tokens to the begin
- add_special_tokens : Add special tokens at the beginning (optional, default value : `true`)

Chat Completions:
- model : ID of the model to use
- messages: The texts to tokenize
- add_special_tokens : Add a special tokens to the begin
- add_generation_prompt : TODO: Useless parameters, no change True or False
- add_special_tokens : Add special tokens at the beginning (optional, default value : `false`)
- add_generation_prompt : Append the model's generation prompt (from its chat template) before tokenizing (optional, default value : `true`)
"""
model: Model = RESOURCES.get(RESOURCE_MODEL)
generator = await model.openai_serving_tokenization.create_tokenize(request)
10 changes: 4 additions & 6 deletions src/happy_vllm/rpc/server.py
@@ -1,17 +1,15 @@
import asyncio
import cloudpickle
import uvloop

from prometheus_client import Gauge
from typing_extensions import Never
from vllm.usage.usage_lib import UsageContext
from vllm import AsyncEngineArgs
from vllm.entrypoints.openai.rpc.server import AsyncEngineRPCServer, run_server


def run_rpc_server(async_engine_args: AsyncEngineArgs,
usage_context: UsageContext, port: int):
server = AsyncEngineRPCServer(async_engine_args=async_engine_args, usage_context=usage_context, port=port)
usage_context: UsageContext, rpc_path: str):
server = AsyncEngineRPCServer(async_engine_args=async_engine_args, usage_context=usage_context, rpc_path=rpc_path)
model_consumed_memory = Gauge("model_memory_usage", "Model Consumed GPU Memory in GB ")
model_consumed_memory.set(round(server.engine.engine.model_executor.driver_worker.model_runner.model_memory_usage/float(2**30),2)) # type: ignore
asyncio.run(run_server(server))
uvloop.run(run_server(server))

16 changes: 11 additions & 5 deletions src/happy_vllm/utils_args.py
@@ -17,8 +17,9 @@
import sys
import ssl
import json
import torch

from typing import Optional, Tuple, Union, List
from typing import Optional, Tuple, Union, List, Mapping
from pydantic_settings import BaseSettings, SettingsConfigDict
from argparse import Namespace, BooleanOptionalAction

@@ -111,6 +112,7 @@ def get_model_settings(parser: FlexibleArgumentParser) -> BaseSettings:
class ModelSettings(BaseSettings):
model: str = default_args.model
model_name: str = default_args.model_name
served_model_name: Optional[Union[str, List[str]]] = None
tokenizer: Optional[str] = default_args.tokenizer
skip_tokenizer_init: bool = False
tokenizer_mode: str = default_args.tokenizer_mode
@@ -130,8 +132,8 @@ class ModelSettings(BaseSettings):
block_size: int = default_args.block_size
enable_prefix_caching: bool = False
disable_sliding_window: bool = False
swap_space: int = default_args.swap_space # GiB
cpu_offload_gb: int = default_args.cpu_offload_gb # GiB
swap_space: float = default_args.swap_space # GiB
cpu_offload_gb: float = default_args.cpu_offload_gb # GiB
gpu_memory_utilization: float = default_args.gpu_memory_utilization
max_num_batched_tokens: Optional[int] = default_args.max_num_batched_tokens
max_num_seqs: int = default_args.max_num_seqs
@@ -142,7 +144,7 @@ class ModelSettings(BaseSettings):
rope_theta: Optional[float] = None
tokenizer_revision: Optional[str] = default_args.tokenizer_revision
quantization: Optional[str] = default_args.quantization
enforce_eager: bool = False
enforce_eager: Optional[bool] = default_args.enforce_eager
max_context_len_to_capture: Optional[int] = default_args.max_context_len_to_capture
max_seq_len_to_capture: int = default_args.max_seq_len_to_capture
disable_custom_all_reduce: bool = False
@@ -155,9 +157,10 @@ class ModelSettings(BaseSettings):
fully_sharded_loras: bool = False
lora_extra_vocab_size: int = default_args.lora_extra_vocab_size
long_lora_scaling_factors: Optional[Tuple[float]] = default_args.long_lora_scaling_factors
lora_dtype: str = default_args.lora_dtype
lora_dtype: Optional[Union[str, torch.dtype]] = default_args.lora_dtype
max_cpu_loras: Optional[int] = default_args.max_cpu_loras
device: str = default_args.device
num_scheduler_steps: int = default_args.num_scheduler_steps
ray_workers_use_nsight: bool = False
num_gpu_blocks_override: Optional[int] = default_args.num_gpu_blocks_override
num_lookahead_slots: int = default_args.num_lookahead_slots
@@ -171,11 +174,13 @@ class ModelSettings(BaseSettings):
tokenizer_pool_size: int = default_args.tokenizer_pool_size
tokenizer_pool_type: Union[str, BaseTokenizerGroup] = default_args.tokenizer_pool_type
tokenizer_pool_extra_config: Optional[str] = default_args.tokenizer_pool_extra_config
limit_mm_per_prompt: Optional[Mapping[str, int]] = default_args.limit_mm_per_prompt
scheduler_delay_factor: float = default_args.scheduler_delay_factor
enable_chunked_prefill: Optional[bool] = default_args.enable_chunked_prefill
guided_decoding_backend: str = default_args.guided_decoding_backend
# Speculative decoding configuration.
speculative_model: Optional[str] = default_args.speculative_model
speculative_model_quantization: Optional[str] = default_args.speculative_model_quantization
speculative_draft_tensor_parallel_size: Optional[int] = default_args.speculative_draft_tensor_parallel_size
num_speculative_tokens: Optional[int] = default_args.num_speculative_tokens
speculative_max_model_len: Optional[int] = default_args.speculative_max_model_len
@@ -189,6 +194,7 @@ class ModelSettings(BaseSettings):
disable_logprobs_during_spec_decoding: Optional[bool] = default_args.disable_logprobs_during_spec_decoding

otlp_traces_endpoint: Optional[str] = default_args.otlp_traces_endpoint
collect_detailed_traces: Optional[str] = default_args.collect_detailed_traces

model_config = SettingsConfigDict(env_file=".env", extra='ignore', protected_namespaces=('settings', ))

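For readers unfamiliar with how `ModelSettings` fields get their values, here is a minimal, hypothetical sketch of the pydantic-settings pattern used above: fields fall back to class defaults but can be overridden by environment variables or a `.env` file. The field names and defaults below are illustrative only, not the project's full set.

```
from typing import Optional

from pydantic_settings import BaseSettings, SettingsConfigDict


class DemoModelSettings(BaseSettings):
    # Illustrative subset of fields; the defaults here stand in for the vLLM parser defaults.
    model: str = "my_model"          # placeholder default
    swap_space: float = 4.0          # GiB
    enforce_eager: Optional[bool] = None

    model_config = SettingsConfigDict(env_file=".env", extra="ignore",
                                      protected_namespaces=("settings",))


# Environment variables (e.g. MODEL=other_model, SWAP_SPACE=2) or a .env file
# override the class defaults when the settings object is created.
settings = DemoModelSettings()
print(settings.model, settings.swap_space, settings.enforce_eager)
```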