[Frontend] Use request id from header (#10968)
Signed-off-by: Joe Runde <[email protected]>
joerunde authored Dec 10, 2024
1 parent 391d7b2 commit 980ad39
Showing 8 changed files with 27 additions and 13 deletions.
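
In short, the OpenAI-compatible endpoints touched here (chat, completions, embeddings, score, tokenize, detokenize) now derive their request id from an X-Request-Id header when the caller sends one, falling back to the request body's request_id (chat only) and otherwise to a random uuid as before. A minimal client-side sketch of the intended usage, assuming a vLLM server on http://localhost:8000 serving a model named "my-model" (both placeholders, not part of this commit):

import requests

# Assumes a running vLLM OpenAI-compatible server; URL and model name are illustrative.
resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    # Caller-chosen id; useful for correlating client, gateway, and vLLM logs.
    headers={"X-Request-Id": "trace-abc123"},
    json={
        "model": "my-model",
        "messages": [{"role": "user", "content": "Hello"}],
    },
)
# With this change, the returned completion id should echo the supplied value,
# e.g. "chatcmpl-trace-abc123"; without the header it stays a random uuid.
print(resp.json()["id"])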
1 change: 1 addition & 0 deletions docs/requirements-docs.txt
@@ -16,5 +16,6 @@ mistral_common >= 1.5.0
 aiohttp
 starlette
 openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
+fastapi # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
 partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
 requests
4 changes: 2 additions & 2 deletions vllm/entrypoints/openai/api_server.py
@@ -305,7 +305,7 @@ async def health(raw_request: Request) -> Response:
 async def tokenize(request: TokenizeRequest, raw_request: Request):
     handler = tokenization(raw_request)

-    generator = await handler.create_tokenize(request)
+    generator = await handler.create_tokenize(request, raw_request)
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
                             status_code=generator.code)
@@ -319,7 +319,7 @@ async def tokenize(request: TokenizeRequest, raw_request: Request):
 async def detokenize(request: DetokenizeRequest, raw_request: Request):
     handler = tokenization(raw_request)

-    generator = await handler.create_detokenize(request)
+    generator = await handler.create_detokenize(request, raw_request)
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
                             status_code=generator.code)
3 changes: 2 additions & 1 deletion vllm/entrypoints/openai/serving_chat.py
@@ -176,7 +176,8 @@ async def create_chat_completion(
             logger.exception("Error in preprocessing prompt inputs")
             return self.create_error_response(str(e))

-        request_id = f"chatcmpl-{request.request_id}"
+        request_id = "chatcmpl-" \
+            f"{self._base_request_id(raw_request, request.request_id)}"

         request_metadata = RequestResponseMetadata(request_id=request_id)
         if raw_request:
4 changes: 2 additions & 2 deletions vllm/entrypoints/openai/serving_completion.py
@@ -30,7 +30,7 @@
 from vllm.sampling_params import BeamSearchParams, SamplingParams
 from vllm.sequence import Logprob
 from vllm.transformers_utils.tokenizer import AnyTokenizer
-from vllm.utils import merge_async_iterators, random_uuid
+from vllm.utils import merge_async_iterators

 logger = init_logger(__name__)

@@ -86,7 +86,7 @@ async def create_completion(
                 "suffix is not currently supported")

         model_name = self.base_model_paths[0].name
-        request_id = f"cmpl-{random_uuid()}"
+        request_id = f"cmpl-{self._base_request_id(raw_request)}"
         created_time = int(time.time())

         request_metadata = RequestResponseMetadata(request_id=request_id)
4 changes: 2 additions & 2 deletions vllm/entrypoints/openai/serving_embedding.py
@@ -19,7 +19,7 @@
 from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing
 from vllm.logger import init_logger
 from vllm.outputs import PoolingOutput, PoolingRequestOutput
-from vllm.utils import merge_async_iterators, random_uuid
+from vllm.utils import merge_async_iterators

 logger = init_logger(__name__)

@@ -110,7 +110,7 @@ async def create_embedding(
                 "dimensions is currently not supported")

         model_name = request.model
-        request_id = f"embd-{random_uuid()}"
+        request_id = f"embd-{self._base_request_id(raw_request)}"
         created_time = int(time.monotonic())

         truncate_prompt_tokens = None
11 changes: 10 additions & 1 deletion vllm/entrypoints/openai/serving_engine.py
@@ -6,6 +6,7 @@
 from typing import (Any, Callable, Dict, Iterable, Iterator, List, Mapping,
                     Optional, Sequence, Tuple, TypedDict, Union)

+from fastapi import Request
 from pydantic import Field
 from starlette.datastructures import Headers
 from typing_extensions import Annotated
@@ -47,7 +48,7 @@
 from vllm.tracing import (contains_trace_headers, extract_trace_headers,
                           log_tracing_disabled_warning)
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
-from vllm.utils import AtomicCounter, is_list_of, make_async
+from vllm.utils import AtomicCounter, is_list_of, make_async, random_uuid

 logger = init_logger(__name__)

@@ -565,6 +566,14 @@ async def _get_trace_headers(

         return None

+    @staticmethod
+    def _base_request_id(raw_request: Request,
+                         default: Optional[str] = None) -> Optional[str]:
+        """Pulls the request id to use from a header, if provided"""
+        default = default or random_uuid()
+        return raw_request.headers.get(
+            "X-Request-Id", default) if raw_request is not None else default
+
     @staticmethod
     def _get_decoded_token(logprob: Logprob,
                            token_id: int,
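
Since all of the per-endpoint changes funnel through this one helper, a hedged, standalone check of its behavior is sketched below, assuming vLLM (with this commit) and fastapi are importable; make_request is a local test helper, not part of the patch.

from fastapi import Request

from vllm.entrypoints.openai.serving_engine import OpenAIServing


def make_request(raw_headers):
    # Minimal ASGI scope: just enough for the helper to read request headers.
    return Request({"type": "http", "headers": raw_headers})


# Header present: it is used verbatim (Starlette header lookup is case-insensitive).
assert OpenAIServing._base_request_id(
    make_request([(b"x-request-id", b"abc")])) == "abc"

# No header: the caller-supplied default wins (e.g. request.request_id for chat)...
assert OpenAIServing._base_request_id(make_request([]), "fallback") == "fallback"

# ...and with neither header nor default, a fresh random uuid is generated.
assert OpenAIServing._base_request_id(make_request([])) is not None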
4 changes: 2 additions & 2 deletions vllm/entrypoints/openai/serving_score.py
@@ -15,7 +15,7 @@
 from vllm.logger import init_logger
 from vllm.outputs import PoolingRequestOutput
 from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
-from vllm.utils import make_async, merge_async_iterators, random_uuid
+from vllm.utils import make_async, merge_async_iterators

 logger = init_logger(__name__)

@@ -102,7 +102,7 @@ async def create_score(
             return error_check_ret

         model_name = request.model
-        request_id = f"score-{random_uuid()}"
+        request_id = f"score-{self._base_request_id(raw_request)}"
         created_time = int(time.monotonic())
         truncate_prompt_tokens = request.truncate_prompt_tokens

9 changes: 6 additions & 3 deletions vllm/entrypoints/openai/serving_tokenization.py
@@ -1,5 +1,7 @@
 from typing import Final, List, Optional, Union

+from fastapi import Request
+
 from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
@@ -17,7 +19,6 @@
                                                     LoRAModulePath,
                                                     OpenAIServing)
 from vllm.logger import init_logger
-from vllm.utils import random_uuid

 logger = init_logger(__name__)

@@ -48,12 +49,13 @@ def __init__(
     async def create_tokenize(
         self,
         request: TokenizeRequest,
+        raw_request: Request,
     ) -> Union[TokenizeResponse, ErrorResponse]:
         error_check_ret = await self._check_model(request)
         if error_check_ret is not None:
             return error_check_ret

-        request_id = f"tokn-{random_uuid()}"
+        request_id = f"tokn-{self._base_request_id(raw_request)}"

         try:
             (
@@ -112,12 +114,13 @@ async def create_tokenize(
     async def create_detokenize(
         self,
         request: DetokenizeRequest,
+        raw_request: Request,
     ) -> Union[DetokenizeResponse, ErrorResponse]:
         error_check_ret = await self._check_model(request)
         if error_check_ret is not None:
             return error_check_ret

-        request_id = f"tokn-{random_uuid()}"
+        request_id = f"tokn-{self._base_request_id(raw_request)}"

         (
             lora_request,
