From 7a3d2a5b957063e911c25401c593dbc7798a5536 Mon Sep 17 00:00:00 2001 From: sasha0552 Date: Tue, 16 Jul 2024 12:18:09 +0000 Subject: [PATCH 01/16] [Frontend] Support for chat completions input in the tokenize endpoint (#5923) --- tests/async_engine/test_chat_template.py | 14 +- tests/entrypoints/openai/test_completion.py | 49 ------ tests/entrypoints/openai/test_tokenization.py | 128 ++++++++++++++ vllm/entrypoints/openai/api_server.py | 10 +- vllm/entrypoints/openai/chat_utils.py | 156 +++++++++++++++++ vllm/entrypoints/openai/protocol.py | 8 +- vllm/entrypoints/openai/serving_chat.py | 161 ++---------------- vllm/entrypoints/openai/serving_completion.py | 31 +--- .../openai/serving_tokenization.py | 73 ++++++++ 9 files changed, 386 insertions(+), 244 deletions(-) create mode 100644 tests/entrypoints/openai/test_tokenization.py create mode 100644 vllm/entrypoints/openai/chat_utils.py create mode 100644 vllm/entrypoints/openai/serving_tokenization.py diff --git a/tests/async_engine/test_chat_template.py b/tests/async_engine/test_chat_template.py index 55b730812ea94..536a7c96a1e9e 100644 --- a/tests/async_engine/test_chat_template.py +++ b/tests/async_engine/test_chat_template.py @@ -4,8 +4,8 @@ import pytest +from vllm.entrypoints.openai.chat_utils import load_chat_template from vllm.entrypoints.openai.protocol import ChatCompletionRequest -from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.transformers_utils.tokenizer import get_tokenizer chatml_jinja_path = pathlib.Path(os.path.dirname(os.path.abspath( @@ -64,8 +64,7 @@ def test_load_chat_template(): # Testing chatml template tokenizer = MockTokenizer() mock_serving_chat = MockServingChat(tokenizer) - OpenAIServingChat._load_chat_template(mock_serving_chat, - chat_template=chatml_jinja_path) + load_chat_template(mock_serving_chat, chat_template=chatml_jinja_path) template_content = tokenizer.chat_template @@ -84,8 +83,7 @@ def test_no_load_chat_template_filelike(): mock_serving_chat = MockServingChat(tokenizer) with pytest.raises(ValueError, match="looks like a file path"): - OpenAIServingChat._load_chat_template(mock_serving_chat, - chat_template=template) + load_chat_template(mock_serving_chat, chat_template=template) def test_no_load_chat_template_literallike(): @@ -94,8 +92,7 @@ def test_no_load_chat_template_literallike(): tokenizer = MockTokenizer() mock_serving_chat = MockServingChat(tokenizer) - OpenAIServingChat._load_chat_template(mock_serving_chat, - chat_template=template) + load_chat_template(mock_serving_chat, chat_template=template) template_content = tokenizer.chat_template assert template_content == template @@ -109,8 +106,7 @@ def test_get_gen_prompt(model, template, add_generation_prompt, # Initialize the tokenizer tokenizer = get_tokenizer(tokenizer_name=model) mock_serving_chat = MockServingChat(tokenizer) - OpenAIServingChat._load_chat_template(mock_serving_chat, - chat_template=template) + load_chat_template(mock_serving_chat, chat_template=template) # Create a mock request object using keyword arguments mock_request = ChatCompletionRequest( diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py index f9dbf69c2eaab..35af0b02747e9 100644 --- a/tests/entrypoints/openai/test_completion.py +++ b/tests/entrypoints/openai/test_completion.py @@ -6,7 +6,6 @@ import jsonschema import openai # use the official client for correctness check import pytest -import requests # downloading lora to test lora requests from huggingface_hub import 
snapshot_download from openai import BadRequestError @@ -636,51 +635,3 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI, prompt="Give an example string that fits this regex", extra_body=dict(guided_regex=sample_regex, guided_json=sample_json_schema)) - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME], -) -async def test_tokenize(client: openai.AsyncOpenAI, model_name: str): - base_url = str(client.base_url)[:-3].strip("/") - tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast") - - for add_special in [False, True]: - prompt = "This is a test prompt." - tokens = tokenizer.encode(prompt, add_special_tokens=add_special) - - response = requests.post(base_url + "/tokenize", - json={ - "add_special_tokens": add_special, - "model": model_name, - "prompt": prompt - }) - response.raise_for_status() - assert response.json() == { - "tokens": tokens, - "count": len(tokens), - "max_model_len": 8192 - } - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME], -) -async def test_detokenize(client: openai.AsyncOpenAI, model_name: str): - base_url = str(client.base_url)[:-3] - tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast") - - prompt = "This is a test prompt." - tokens = tokenizer.encode(prompt, add_special_tokens=False) - - response = requests.post(base_url + "detokenize", - json={ - "model": model_name, - "tokens": tokens - }) - response.raise_for_status() - assert response.json() == {"prompt": prompt} diff --git a/tests/entrypoints/openai/test_tokenization.py b/tests/entrypoints/openai/test_tokenization.py new file mode 100644 index 0000000000000..d33fd222ee150 --- /dev/null +++ b/tests/entrypoints/openai/test_tokenization.py @@ -0,0 +1,128 @@ +import openai # use the official client for correctness check +import pytest +import requests + +from vllm.transformers_utils.tokenizer import get_tokenizer + +from ...utils import RemoteOpenAIServer + +# any model with a chat template should work here +MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" + + +@pytest.fixture(scope="module") +def server(): + with RemoteOpenAIServer([ + "--model", + MODEL_NAME, + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--enforce-eager", + "--max-num-seqs", + "128", + ]) as remote_server: + yield remote_server + + +@pytest.fixture(scope="module") +def client(server): + return server.get_async_client() + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME], +) +async def test_tokenize_completions(client: openai.AsyncOpenAI, + model_name: str): + base_url = str(client.base_url)[:-3].strip("/") + tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast") + + for add_special in [False, True]: + prompt = "This is a test prompt." 
+ tokens = tokenizer.encode(prompt, add_special_tokens=add_special) + + response = requests.post(base_url + "/tokenize", + json={ + "add_special_tokens": add_special, + "model": model_name, + "prompt": prompt + }) + response.raise_for_status() + + assert response.json() == { + "tokens": tokens, + "count": len(tokens), + "max_model_len": 8192 + } + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME], +) +async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str): + base_url = str(client.base_url)[:-3].strip("/") + tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast") + + for add_generation in [False, True]: + for add_special in [False, True]: + conversation = [{ + "role": "user", + "content": "Hi there!" + }, { + "role": "assistant", + "content": "Nice to meet you!" + }, { + "role": "user", + "content": "Can I ask a question?" + }] + + prompt = tokenizer.apply_chat_template( + add_generation_prompt=add_generation, + conversation=conversation, + tokenize=False) + tokens = tokenizer.encode(prompt, add_special_tokens=add_special) + + response = requests.post(base_url + "/tokenize", + json={ + "add_generation_prompt": + add_generation, + "add_special_tokens": add_special, + "messages": conversation, + "model": model_name + }) + response.raise_for_status() + + assert response.json() == { + "tokens": tokens, + "count": len(tokens), + "max_model_len": 8192 + } + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME], +) +async def test_detokenize(client: openai.AsyncOpenAI, model_name: str): + base_url = str(client.base_url)[:-3].strip("/") + tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast") + + prompt = "This is a test prompt." + tokens = tokenizer.encode(prompt, add_special_tokens=False) + + response = requests.post(base_url + "/detokenize", + json={ + "model": model_name, + "tokens": tokens + }) + response.raise_for_status() + + assert response.json() == {"prompt": prompt} diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 45c634b4a2991..a35dcbbd6545e 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -33,6 +33,8 @@ from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding +from vllm.entrypoints.openai.serving_tokenization import ( + OpenAIServingTokenization) from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext from vllm.utils import FlexibleArgumentParser @@ -46,6 +48,7 @@ openai_serving_chat: OpenAIServingChat openai_serving_completion: OpenAIServingCompletion openai_serving_embedding: OpenAIServingEmbedding +openai_serving_tokenization: OpenAIServingTokenization logger = init_logger('vllm.entrypoints.openai.api_server') @@ -86,7 +89,7 @@ async def health() -> Response: @router.post("/tokenize") async def tokenize(request: TokenizeRequest): - generator = await openai_serving_completion.create_tokenize(request) + generator = await openai_serving_tokenization.create_tokenize(request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.code) @@ -97,7 +100,7 @@ async def tokenize(request: TokenizeRequest): @router.post("/detokenize") async def detokenize(request: DetokenizeRequest): - generator = await 
openai_serving_completion.create_detokenize(request) + generator = await openai_serving_tokenization.create_detokenize(request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.code) @@ -241,6 +244,7 @@ def run_server(args, llm_engine=None): global openai_serving_chat global openai_serving_completion global openai_serving_embedding + global openai_serving_tokenization openai_serving_chat = OpenAIServingChat(engine, model_config, served_model_names, @@ -252,6 +256,8 @@ def run_server(args, llm_engine=None): args.prompt_adapters) openai_serving_embedding = OpenAIServingEmbedding(engine, model_config, served_model_names) + openai_serving_tokenization = OpenAIServingTokenization( + engine, model_config, served_model_names, args.chat_template) app.root_path = args.root_path logger.info("Available routes are:") diff --git a/vllm/entrypoints/openai/chat_utils.py b/vllm/entrypoints/openai/chat_utils.py new file mode 100644 index 0000000000000..27115391d5b27 --- /dev/null +++ b/vllm/entrypoints/openai/chat_utils.py @@ -0,0 +1,156 @@ +import codecs +from dataclasses import dataclass, field +from functools import lru_cache +from typing import Awaitable, Iterable, List, Optional, TypedDict, cast, final + +from openai.types.chat import (ChatCompletionContentPartImageParam, + ChatCompletionContentPartTextParam) + +from vllm.entrypoints.openai.protocol import (ChatCompletionContentPartParam, + ChatCompletionMessageParam) +from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.logger import init_logger +from vllm.multimodal import MultiModalDataDict +from vllm.multimodal.utils import async_get_and_parse_image + +logger = init_logger(__name__) + + +@final # So that it should be compatible with Dict[str, str] +class ConversationMessage(TypedDict): + role: str + content: str + + +@dataclass(frozen=True) +class ChatMessageParseResult: + messages: List[ConversationMessage] + mm_futures: List[Awaitable[MultiModalDataDict]] = field( + default_factory=list) + + +def load_chat_template(engine: OpenAIServing, chat_template: Optional[str]): + tokenizer = engine.tokenizer + + if chat_template is not None: + try: + with open(chat_template, "r") as f: + tokenizer.chat_template = f.read() + except OSError as e: + JINJA_CHARS = "{}\n" + if not any(c in chat_template for c in JINJA_CHARS): + msg = (f"The supplied chat template ({chat_template}) " + f"looks like a file path, but it failed to be " + f"opened. Reason: {e}") + raise ValueError(msg) from e + + # If opening a file fails, set chat template to be args to + # ensure we decode so our escape are interpreted correctly + tokenizer.chat_template = codecs.decode(chat_template, + "unicode_escape") + + logger.info("Using supplied chat template:\n%s", + tokenizer.chat_template) + elif tokenizer.chat_template is not None: + logger.info("Using default chat template:\n%s", + tokenizer.chat_template) + else: + logger.warning("No chat template provided. 
Chat API will not work.") + + +@lru_cache(maxsize=None) +def _image_token_str(engine: OpenAIServing) -> Optional[str]: + # TODO: Let user specify how to insert image tokens into prompt + # (similar to chat template) + model_type = engine.model_config.hf_config.model_type + if model_type == "phi3_v": + # Workaround since this token is not defined in the tokenizer + return "<|image_1|>" + if model_type in ("blip-2", "chatglm", "fuyu", "minicpmv", "paligemma"): + # These models do not use image tokens in the prompt + return None + if model_type.startswith("llava"): + return engine.tokenizer.decode( + engine.model_config.hf_config.image_token_index) + + else: + raise TypeError("Unknown model type: {model_type}") + + +# TODO: Let user specify how to insert image tokens into prompt +# (similar to chat template) +def _get_full_image_text_prompt(engine: OpenAIServing, image_token_str: str, + text_prompt: str) -> str: + """Combine image and text prompts for vision language model""" + + # NOTE: For now we assume all model architectures use the same + # image + text prompt format. This may change in the future. + return f"{image_token_str}\n{text_prompt}" + + +def _parse_chat_message_content_parts( + engine: OpenAIServing, + role: str, + parts: Iterable[ChatCompletionContentPartParam], +) -> ChatMessageParseResult: + texts: List[str] = [] + mm_futures: List[Awaitable[MultiModalDataDict]] = [] + + for part in parts: + part_type = part["type"] + if part_type == "text": + text = cast(ChatCompletionContentPartTextParam, part)["text"] + texts.append(text) + elif part_type == "image_url": + if len(mm_futures) > 0: + raise NotImplementedError( + "Multiple 'image_url' input is currently not supported.") + + image_url = cast(ChatCompletionContentPartImageParam, + part)["image_url"] + + if image_url.get("detail", "auto") != "auto": + logger.warning( + "'image_url.detail' is currently not supported and " + "will be ignored.") + + image_future = async_get_and_parse_image(image_url["url"]) + mm_futures.append(image_future) + else: + raise NotImplementedError(f"Unknown part type: {part_type}") + + text_prompt = "\n".join(texts) + + if mm_futures: + image_token_str = _image_token_str(engine) + if image_token_str is not None: + if image_token_str in text_prompt: + logger.warning( + "Detected image token string in the text prompt. 
" + "Skipping prompt formatting.") + else: + text_prompt = _get_full_image_text_prompt( + engine, + image_token_str=image_token_str, + text_prompt=text_prompt, + ) + + messages = [ConversationMessage(role=role, content=text_prompt)] + + return ChatMessageParseResult(messages=messages, mm_futures=mm_futures) + + +def parse_chat_message_content( + engine: OpenAIServing, + message: ChatCompletionMessageParam, +) -> ChatMessageParseResult: + role = message["role"] + content = message.get("content") + + if content is None: + return ChatMessageParseResult(messages=[], mm_futures=[]) + if isinstance(content, str): + messages = [ConversationMessage(role=role, content=content)] + return ChatMessageParseResult(messages=messages, mm_futures=[]) + + return _parse_chat_message_content_parts(engine, role, content) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index b3f0aae6d002d..2faf061192307 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -738,15 +738,17 @@ class BatchRequestOutput(OpenAIBaseModel): class TokenizeRequest(OpenAIBaseModel): + add_generation_prompt: bool = Field(default=True) + add_special_tokens: bool = Field(default=False) + prompt: Optional[str] = Field(default=None) + messages: Optional[List[ChatCompletionMessageParam]] = Field(default=None) model: str - prompt: str - add_special_tokens: bool = Field(default=True) class TokenizeResponse(OpenAIBaseModel): - tokens: List[int] count: int max_model_len: int + tokens: List[int] class DetokenizeRequest(OpenAIBaseModel): diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 010d6f2ebb909..dbd4521073da9 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -1,22 +1,19 @@ -import codecs import time -from dataclasses import dataclass, field -from functools import cached_property -from typing import (AsyncGenerator, AsyncIterator, Awaitable, Dict, Iterable, - List, Optional) +from typing import (AsyncGenerator, AsyncIterator, Awaitable, Dict, List, + Optional) from typing import Sequence as GenericSequence -from typing import TypedDict, Union, cast, final +from typing import Union from fastapi import Request -from openai.types.chat import (ChatCompletionContentPartImageParam, - ChatCompletionContentPartTextParam) from vllm.config import ModelConfig from vllm.engine.async_llm_engine import AsyncLLMEngine +from vllm.entrypoints.openai.chat_utils import (ConversationMessage, + load_chat_template, + parse_chat_message_content) from vllm.entrypoints.openai.protocol import ( - ChatCompletionContentPartParam, ChatCompletionLogProb, - ChatCompletionLogProbs, ChatCompletionLogProbsContent, - ChatCompletionMessageParam, ChatCompletionNamedToolChoiceParam, + ChatCompletionLogProb, ChatCompletionLogProbs, + ChatCompletionLogProbsContent, ChatCompletionNamedToolChoiceParam, ChatCompletionRequest, ChatCompletionResponse, ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice, ChatCompletionStreamResponse, ChatMessage, DeltaMessage, ErrorResponse, @@ -28,7 +25,6 @@ from vllm.model_executor.guided_decoding import ( get_guided_decoding_logits_processor) from vllm.multimodal import MultiModalDataDict -from vllm.multimodal.utils import async_get_and_parse_image from vllm.outputs import RequestOutput from vllm.sequence import Logprob from vllm.tracing import (contains_trace_headers, extract_trace_headers, @@ -38,19 +34,6 @@ logger = init_logger(__name__) -@final # So that it 
should be compatible with Dict[str, str] -class ConversationMessage(TypedDict): - role: str - content: str - - -@dataclass(frozen=True) -class ChatMessageParseResult: - messages: List[ConversationMessage] - mm_futures: List[Awaitable[MultiModalDataDict]] = field( - default_factory=list) - - class OpenAIServingChat(OpenAIServing): def __init__(self, @@ -66,131 +49,7 @@ def __init__(self, lora_modules=lora_modules) self.response_role = response_role - self._load_chat_template(chat_template) - - def _load_chat_template(self, chat_template: Optional[str]): - tokenizer = self.tokenizer - - if chat_template is not None: - try: - with open(chat_template, "r") as f: - tokenizer.chat_template = f.read() - except OSError as e: - JINJA_CHARS = "{}\n" - if not any(c in chat_template for c in JINJA_CHARS): - msg = (f"The supplied chat template ({chat_template}) " - f"looks like a file path, but it failed to be " - f"opened. Reason: {e}") - raise ValueError(msg) from e - - # If opening a file fails, set chat template to be args to - # ensure we decode so our escape are interpreted correctly - tokenizer.chat_template = codecs.decode( - chat_template, "unicode_escape") - - logger.info("Using supplied chat template:\n%s", - tokenizer.chat_template) - elif tokenizer.chat_template is not None: - logger.info("Using default chat template:\n%s", - tokenizer.chat_template) - else: - logger.warning( - "No chat template provided. Chat API will not work.") - - @cached_property - def image_token_str(self) -> Optional[str]: - # TODO: Let user specify how to insert image tokens into prompt - # (similar to chat template) - model_type = self.model_config.hf_config.model_type - if model_type == "phi3_v": - # Workaround since this token is not defined in the tokenizer - return "<|image_1|>" - if model_type in ("blip-2", "chatglm", "fuyu", "minicpmv", - "paligemma"): - # These models do not use image tokens in the prompt - return None - if model_type.startswith("llava"): - return self.tokenizer.decode( - self.model_config.hf_config.image_token_index) - - else: - raise TypeError("Unknown model type: {model_type}") - - # TODO: Let user specify how to insert image tokens into prompt - # (similar to chat template) - def _get_full_image_text_prompt(self, image_token_str: str, - text_prompt: str) -> str: - """Combine image and text prompts for vision language model""" - - # NOTE: For now we assume all model architectures use the same - # image + text prompt format. This may change in the future. - return f"{image_token_str}\n{text_prompt}" - - def _parse_chat_message_content_parts( - self, - role: str, - parts: Iterable[ChatCompletionContentPartParam], - ) -> ChatMessageParseResult: - texts: List[str] = [] - mm_futures: List[Awaitable[MultiModalDataDict]] = [] - - for part in parts: - part_type = part["type"] - if part_type == "text": - text = cast(ChatCompletionContentPartTextParam, part)["text"] - texts.append(text) - elif part_type == "image_url": - if len(mm_futures) > 0: - raise NotImplementedError( - "Multiple 'image_url' input is currently not supported." 
- ) - - image_url = cast(ChatCompletionContentPartImageParam, - part)["image_url"] - - if image_url.get("detail", "auto") != "auto": - logger.warning( - "'image_url.detail' is currently not supported and " - "will be ignored.") - - image_future = async_get_and_parse_image(image_url["url"]) - mm_futures.append(image_future) - else: - raise NotImplementedError(f"Unknown part type: {part_type}") - - text_prompt = "\n".join(texts) - - if mm_futures: - image_token_str = self.image_token_str - if image_token_str is not None: - if image_token_str in text_prompt: - logger.warning( - "Detected image token string in the text prompt. " - "Skipping prompt formatting.") - else: - text_prompt = self._get_full_image_text_prompt( - image_token_str=image_token_str, - text_prompt=text_prompt, - ) - - messages = [ConversationMessage(role=role, content=text_prompt)] - - return ChatMessageParseResult(messages=messages, mm_futures=mm_futures) - - def _parse_chat_message_content( - self, - message: ChatCompletionMessageParam, - ) -> ChatMessageParseResult: - role = message["role"] - content = message.get("content") - - if content is None: - return ChatMessageParseResult(messages=[], mm_futures=[]) - if isinstance(content, str): - messages = [ConversationMessage(role=role, content=content)] - return ChatMessageParseResult(messages=messages, mm_futures=[]) - - return self._parse_chat_message_content_parts(role, content) + load_chat_template(self, chat_template) async def create_chat_completion( self, @@ -216,7 +75,7 @@ async def create_chat_completion( mm_futures: List[Awaitable[MultiModalDataDict]] = [] for msg in request.messages: - chat_parsed_result = self._parse_chat_message_content(msg) + chat_parsed_result = parse_chat_message_content(self, msg) conversation.extend(chat_parsed_result.messages) mm_futures.extend(chat_parsed_result.mm_futures) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index b53b058b52af3..647fc31410647 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -16,10 +16,7 @@ CompletionResponseChoice, CompletionResponseStreamChoice, CompletionStreamResponse, - DetokenizeRequest, - DetokenizeResponse, - TokenizeRequest, - TokenizeResponse, UsageInfo) + UsageInfo) # yapf: enable from vllm.entrypoints.openai.serving_engine import (LoRAModulePath, OpenAIServing, @@ -457,29 +454,3 @@ def _create_completion_logprobs( tokens=out_tokens, top_logprobs=out_top_logprobs, ) - - async def create_tokenize(self, - request: TokenizeRequest) -> TokenizeResponse: - error_check_ret = await self._check_model(request) - if error_check_ret is not None: - return error_check_ret - - (input_ids, input_text) = self._validate_prompt_and_tokenize( - request, - prompt=request.prompt, - add_special_tokens=request.add_special_tokens) - - return TokenizeResponse(tokens=input_ids, - count=len(input_ids), - max_model_len=self.max_model_len) - - async def create_detokenize( - self, request: DetokenizeRequest) -> DetokenizeResponse: - error_check_ret = await self._check_model(request) - if error_check_ret is not None: - return error_check_ret - - (input_ids, input_text) = self._validate_prompt_and_tokenize( - request, prompt_ids=request.tokens) - - return DetokenizeResponse(prompt=input_text) diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py new file mode 100644 index 0000000000000..f441e940c5e5f --- /dev/null +++ 
b/vllm/entrypoints/openai/serving_tokenization.py @@ -0,0 +1,73 @@ +from typing import List, Optional + +from vllm.config import ModelConfig +from vllm.engine.async_llm_engine import AsyncLLMEngine +from vllm.entrypoints.openai.chat_utils import (ConversationMessage, + load_chat_template, + parse_chat_message_content) +from vllm.entrypoints.openai.protocol import (DetokenizeRequest, + DetokenizeResponse, + TokenizeRequest, + TokenizeResponse) +from vllm.entrypoints.openai.serving_engine import OpenAIServing + + +class OpenAIServingTokenization(OpenAIServing): + + def __init__(self, + engine: AsyncLLMEngine, + model_config: ModelConfig, + served_model_names: List[str], + chat_template: Optional[str] = None): + super().__init__(engine=engine, + model_config=model_config, + served_model_names=served_model_names, + lora_modules=None) + + load_chat_template(self, chat_template) + + async def create_tokenize(self, + request: TokenizeRequest) -> TokenizeResponse: + error_check_ret = await self._check_model(request) + if error_check_ret is not None: + return error_check_ret + + if not (request.prompt or request.messages): + return self.create_error_response( + "Either `prompt` or `messages` should be provided.") + + if (request.prompt and request.messages): + return self.create_error_response( + "Only one of `prompt` or `messages` should be provided.") + + if request.messages: + conversation: List[ConversationMessage] = [] + + for message in request.messages: + conversation.extend( + parse_chat_message_content(self, message).messages) + + request.prompt = self.tokenizer.apply_chat_template( + add_generation_prompt=request.add_generation_prompt, + conversation=conversation, + tokenize=False) + + (input_ids, input_text) = self._validate_prompt_and_tokenize( + request, + prompt=request.prompt, + add_special_tokens=request.add_special_tokens) + + return TokenizeResponse(tokens=input_ids, + count=len(input_ids), + max_model_len=self.max_model_len) + + async def create_detokenize( + self, request: DetokenizeRequest) -> DetokenizeResponse: + error_check_ret = await self._check_model(request) + if error_check_ret is not None: + return error_check_ret + + (input_ids, input_text) = self._validate_prompt_and_tokenize( + request, prompt_ids=request.tokens) + + return DetokenizeResponse(prompt=input_text) From 7508a3dc34c2b7a5c5c971b13f15208d7dade442 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Tue, 16 Jul 2024 15:55:15 +0200 Subject: [PATCH 02/16] [Misc] Fix typos in spec. decode metrics logging. 
(#6470) Signed-off-by: Thomas Parnell --- vllm/engine/metrics.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 77de42bc0ed5d..48aec84298d86 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -325,8 +325,8 @@ def _format_spec_decode_metrics_str( f"System efficiency: {metrics.system_efficiency:.3f}, " f"Number of speculative tokens: {metrics.num_spec_tokens}, " f"Number of accepted tokens: {metrics.accepted_tokens}, " - f"Number of draft tokens tokens: {metrics.draft_tokens}, " - f"Number of emitted tokens tokens: {metrics.emitted_tokens}.") + f"Number of draft tokens: {metrics.draft_tokens}, " + f"Number of emitted tokens: {metrics.emitted_tokens}.") class PrometheusStatLogger(StatLoggerBase): @@ -457,4 +457,4 @@ def log(self, stats: Stats): class RayPrometheusStatLogger(PrometheusStatLogger): """RayPrometheusStatLogger uses Ray metrics instead.""" - _metrics_cls = RayMetrics \ No newline at end of file + _metrics_cls = RayMetrics From 2bb0489cb3367e46e201e84ab629df535544495b Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Tue, 16 Jul 2024 23:13:25 +0800 Subject: [PATCH 03/16] [Core] Use numpy to speed up padded token processing (#6442) --- vllm/model_executor/sampling_metadata.py | 38 +++++++++++------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index ad5fb13176edc..c346cd0562867 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -2,6 +2,7 @@ from dataclasses import dataclass from typing import Dict, List, Optional, Tuple +import numpy as np import torch from vllm.model_executor.layers.ops.sample import get_num_triton_sampler_splits @@ -457,16 +458,20 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float], if do_penalties: prompt_max_len = max([len(tokens) for tokens in prompt_tokens], default=0) - prompt_padded_tokens = [ - tokens + [vocab_size] * (prompt_max_len - len(tokens)) - for tokens in prompt_tokens - ] + prompt_padded_tokens = np.full( + (len(prompt_tokens), prompt_max_len), + vocab_size, + dtype=np.int64) + for i, tokens in enumerate(prompt_tokens): + prompt_padded_tokens[i, :len(tokens)] = tokens output_max_len = max([len(tokens) for tokens in output_tokens], default=0) - output_padded_tokens = [ - tokens + [vocab_size] * (output_max_len - len(tokens)) - for tokens in output_tokens - ] + output_padded_tokens = np.full( + (len(output_tokens), output_max_len), + vocab_size, + dtype=np.int64) + for i, tokens in enumerate(output_tokens): + output_padded_tokens[i, :len(tokens)] = tokens temperatures_t = torch.tensor( temperatures, @@ -517,18 +522,11 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float], pin_memory=pin_memory, ) if do_penalties: - prompt_tensor = torch.tensor( - prompt_padded_tokens, - device="cpu", - dtype=torch.long, - pin_memory=pin_memory, - ) - output_tensor = torch.tensor( - output_padded_tokens, - device="cpu", - dtype=torch.long, - pin_memory=pin_memory, - ) + prompt_tensor = torch.from_numpy(prompt_padded_tokens) + output_tensor = torch.from_numpy(output_padded_tokens) + if pin_memory: + prompt_tensor = prompt_tensor.pin_memory() + output_tensor = output_tensor.pin_memory() else: prompt_tensor = None output_tensor = None From 38ef94888afc0c2bccc2f18422d2b525d7649ac3 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 16 Jul 2024 23:59:36 +0800 Subject: [PATCH 04/16] 
[CI/Build] Remove "boardwalk" image asset (#6460) --- tests/conftest.py | 7 +------ tests/models/test_fuyu.py | 7 ++++--- tests/models/test_llava.py | 2 -- tests/models/test_llava_next.py | 2 -- tests/models/test_paligemma.py | 7 ++++--- tests/models/test_phi3v.py | 2 -- vllm/assets/image.py | 13 +++---------- 7 files changed, 12 insertions(+), 28 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 17f75d948c543..08b8814d983d3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -39,7 +39,6 @@ def _read_prompts(filename: str) -> List[str]: class _ImageAssetPrompts(TypedDict): stop_sign: str cherry_blossom: str - boardwalk: str if sys.version_info < (3, 9): @@ -58,7 +57,6 @@ def __init__(self) -> None: super().__init__([ ImageAsset("stop_sign"), ImageAsset("cherry_blossom"), - ImageAsset("boardwalk") ]) def prompts(self, prompts: _ImageAssetPrompts) -> List[str]: @@ -68,10 +66,7 @@ def prompts(self, prompts: _ImageAssetPrompts) -> List[str]: The order of the returned prompts matches the order of the assets when iterating through this object. """ - return [ - prompts["stop_sign"], prompts["cherry_blossom"], - prompts["boardwalk"] - ] + return [prompts["stop_sign"], prompts["cherry_blossom"]] IMAGE_ASSETS = _ImageAssets() diff --git a/tests/models/test_fuyu.py b/tests/models/test_fuyu.py index 672470acb77e6..25f63a3d64d0e 100644 --- a/tests/models/test_fuyu.py +++ b/tests/models/test_fuyu.py @@ -12,9 +12,10 @@ pytestmark = pytest.mark.vlm HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": "What's the content of the image?\n", # noqa: E501 - "cherry_blossom": "What is the season?\n", - "boardwalk": "What's in this image?\n", + "stop_sign": + "What's the content of the image?\n", + "cherry_blossom": + "What is the season?\n", }) models = ["adept/fuyu-8b"] diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py index 2c0a8d4ffdf5c..79ab58c364f64 100644 --- a/tests/models/test_llava.py +++ b/tests/models/test_llava.py @@ -16,8 +16,6 @@ "USER: \nWhat's the content of the image?\nASSISTANT:", "cherry_blossom": "USER: \nWhat is the season?\nASSISTANT:", - "boardwalk": - "USER: \nWhat's in this image?\nASSISTANT:", }) IMAGE_TOKEN_ID = 32000 diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py index 163741a5719c2..2f200c13ea001 100644 --- a/tests/models/test_llava_next.py +++ b/tests/models/test_llava_next.py @@ -23,8 +23,6 @@ f"{_PREFACE} USER: \nWhat's the content of the image? ASSISTANT:", "cherry_blossom": f"{_PREFACE} USER: \nWhat is the season? ASSISTANT:", - "boardwalk": - f"{_PREFACE} USER: \nWhat's in this image? 
ASSISTANT:", }) IMAGE_TOKEN_ID = 32000 diff --git a/tests/models/test_paligemma.py b/tests/models/test_paligemma.py index b0e7264e89118..81afd11a6e697 100644 --- a/tests/models/test_paligemma.py +++ b/tests/models/test_paligemma.py @@ -12,9 +12,10 @@ pytestmark = pytest.mark.vlm HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": "caption es", - "cherry_blossom": "What is in the picture?", - "boardwalk": "What is in the picture?", + "stop_sign": + "caption es", + "cherry_blossom": + "What is in the picture?", }) IMAGE_TOKEN_ID = 257152 diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py index faadab22429ba..636a9d3f1a65e 100644 --- a/tests/models/test_phi3v.py +++ b/tests/models/test_phi3v.py @@ -18,8 +18,6 @@ "<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501 "cherry_blossom": "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n", - "boardwalk": - "<|user|>\n<|image_1|>\nWhat's in this image?<|end|>\n<|assistant|>\n", }) models = ["microsoft/Phi-3-vision-128k-instruct"] diff --git a/vllm/assets/image.py b/vllm/assets/image.py index a526db735ffc7..ca6c3ac9e3a38 100644 --- a/vllm/assets/image.py +++ b/vllm/assets/image.py @@ -1,13 +1,11 @@ import shutil from dataclasses import dataclass -from functools import cached_property, lru_cache +from functools import lru_cache from typing import Literal import requests from PIL import Image -from vllm.multimodal.utils import fetch_image - from .base import get_cache_dir @@ -35,13 +33,8 @@ def get_air_example_data_2_asset(filename: str) -> Image.Image: @dataclass(frozen=True) class ImageAsset: - name: Literal["stop_sign", "cherry_blossom", "boardwalk"] + name: Literal["stop_sign", "cherry_blossom"] - @cached_property + @property def pil_image(self) -> Image.Image: - if self.name == "boardwalk": - return fetch_image( - "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" - ) - return get_air_example_data_2_asset(f"{self.name}.jpg") From 9f4ccec76135083c96d15fbbade5eda7a2321bf1 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 16 Jul 2024 09:45:30 -0700 Subject: [PATCH 05/16] [doc][misc] remind to cancel debugging environment variables (#6481) [doc][misc] remind users to cancel debugging environment variables after debugging (#6481) --- docs/source/getting_started/debugging.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst index 0d03fe93adc61..2aa52e79888a3 100644 --- a/docs/source/getting_started/debugging.rst +++ b/docs/source/getting_started/debugging.rst @@ -19,9 +19,6 @@ If you have already taken care of the above issues, but the vLLM instance still - Set the environment variable ``export NCCL_DEBUG=TRACE`` to turn on more logging for NCCL. - Set the environment variable ``export VLLM_TRACE_FUNCTION=1``. All the function calls in vLLM will be recorded. Inspect these log files, and tell which function crashes or hangs. - .. warning:: - vLLM function tracing will generate a lot of logs and slow down the system. Only use it for debugging purposes. - With more logging, hopefully you can find the root cause of the issue. If it crashes, and the error trace shows somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a cuda error inside cudagraph. 
To know the particular cuda operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the ``LLM`` class, to disable the cudagraph optimization. This way, you can locate the exact cuda operation that causes the error. @@ -67,3 +64,7 @@ Here are some common issues that can cause hangs: If the script runs successfully, you should see the message ``sanity check is successful!``. If the problem persists, feel free to `open an issue on GitHub `_, with a detailed description of the issue, your environment, and the logs. + +.. warning:: + + After you find the root cause and solve the issue, remember to turn off all the debugging environment variables defined above, or simply start a new shell to avoid being affected by the debugging settings. If you don't do this, the system might be slow because many debugging functionalities are turned on. From c467dff24f5dfa0b8e4045c3d265c6e606e35f99 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 16 Jul 2024 09:56:28 -0700 Subject: [PATCH 06/16] [Hardware][TPU] Support MoE with Pallas GMM kernel (#6457) --- Dockerfile.tpu | 4 +- .../getting_started/tpu-installation.rst | 4 +- vllm/model_executor/layers/fused_moe/layer.py | 18 ++++++ .../layers/fused_moe/moe_pallas.py | 62 +++++++++++++++++++ vllm/worker/tpu_model_runner.py | 9 ++- 5 files changed, 89 insertions(+), 8 deletions(-) create mode 100644 vllm/model_executor/layers/fused_moe/moe_pallas.py diff --git a/Dockerfile.tpu b/Dockerfile.tpu index 6ad8e8ccfac78..be7dbe63cb237 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -1,4 +1,4 @@ -ARG NIGHTLY_DATE="20240601" +ARG NIGHTLY_DATE="20240713" ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE" FROM $BASE_IMAGE @@ -6,6 +6,8 @@ WORKDIR /workspace # Install aiohttp separately to avoid build errors. RUN pip install aiohttp +# Install NumPy 1 instead of NumPy 2. +RUN pip install "numpy<2" # Install the TPU and Pallas dependencies. RUN pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html RUN pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html diff --git a/docs/source/getting_started/tpu-installation.rst b/docs/source/getting_started/tpu-installation.rst index e96aabbb63279..5e2f514a4a509 100644 --- a/docs/source/getting_started/tpu-installation.rst +++ b/docs/source/getting_started/tpu-installation.rst @@ -56,7 +56,7 @@ First, install the dependencies: $ pip uninstall torch torch-xla -y $ # Install PyTorch and PyTorch XLA. - $ export DATE="+20240601" + $ export DATE="+20240713" $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-nightly${DATE}-cp310-cp310-linux_x86_64.whl $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-nightly${DATE}-cp310-cp310-linux_x86_64.whl @@ -85,7 +85,7 @@ Next, build vLLM from source. This will only take a few seconds: ImportError: libopenblas.so.0: cannot open shared object file: No such file or directory - You can install OpenBLAS with the following command: + Please install OpenBLAS with the following command: .. 
code-block:: console diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 7f0668601fac3..bb2be3f3eb56f 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -104,6 +104,24 @@ def forward_cpu(self, *args, **kwargs): raise NotImplementedError( "The CPU backend currently does not support MoE.") + def forward_tpu( + self, + x: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool, + num_expert_group: Optional[int], + topk_group: Optional[int], + ) -> torch.Tensor: + from vllm.model_executor.layers.fused_moe.moe_pallas import fused_moe + assert not use_grouped_topk + assert num_expert_group is None + assert topk_group is None + return fused_moe(x, w1, w2, router_logits, top_k, renormalize) + class FusedMoE(torch.nn.Module): """FusedMoE layer for MoE models. diff --git a/vllm/model_executor/layers/fused_moe/moe_pallas.py b/vllm/model_executor/layers/fused_moe/moe_pallas.py new file mode 100644 index 0000000000000..563ee18c64304 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/moe_pallas.py @@ -0,0 +1,62 @@ +import torch +import torch.nn.functional as F +from torch_xla.experimental.custom_kernel import _histogram + + +def fused_moe( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + gating_output: torch.Tensor, + topk: int, + renormalize: bool, +) -> torch.Tensor: + """ + Args: + hidden_states: [*, hidden_size] + w1: [num_experts, intermediate_size * 2, hidden_size] + w2: [num_experts, hidden_size, intermediate_size] + gating_output: [*, num_experts] + """ + orig_shape = hidden_states.shape + hidden_size = hidden_states.shape[-1] + num_tokens = hidden_states.shape[:-1].numel() + num_experts = w1.shape[0] + intermediate_size = w2.shape[-1] + device = hidden_states.device + dtype = hidden_states.dtype + assert (num_tokens * topk) % 16 == 0, ( + "The Pallas GMM kernel requires num_tokens * topk to be a multiple of " + f"16 but got {num_tokens * topk}") + + hidden_states = hidden_states.view(num_tokens, hidden_size) + gating_output = gating_output.view(num_tokens, num_experts) + topk_weights = gating_output.softmax(dim=-1, dtype=torch.float) + topk_weights, topk_indices = topk_weights.topk(topk, dim=-1) + if renormalize: + topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) + topk_weights = topk_weights.to(dtype) + + topk_indices = topk_indices.flatten() + topk_argsort_indices = topk_indices.argsort() + topk_argsort_revert_indices = topk_argsort_indices.argsort() + token_indices = torch.arange(num_tokens, + device=device).repeat_interleave(topk) + token_indices = token_indices[topk_argsort_indices] + group_sizes = _histogram(topk_indices.to(torch.int32), 0, num_experts - 1) + + # NOTE(woosuk): The GMM Pallas kernel requires a different weight layout + # from HF Transformers. 
+ w1 = w1.transpose(1, 2) + w2 = w2.transpose(1, 2) + + x = hidden_states[token_indices] + x = torch.ops.xla.gmm(x, w1, group_sizes) + x = F.silu(x[..., :intermediate_size]) * x[..., intermediate_size:] + x = torch.ops.xla.gmm(x, w2, group_sizes) + x = x[topk_argsort_revert_indices].reshape(-1, topk, hidden_size) + + x = x * topk_weights.unsqueeze_(dim=-1) + x = x.sum(dim=-2) + x = x.reshape(orig_shape) + return x diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 9b00a60ac0e3b..6c1149ee9dfca 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -598,11 +598,10 @@ def _get_padded_prefill_len(x: int) -> int: def _get_padded_batch_size(batch_size: int) -> int: - if batch_size <= 2: - return batch_size - elif batch_size <= 4: - return 4 - elif batch_size <= 8: + # The GMM Pallas kernel requires num_tokens * topk to be a multiple of 16. + # To meet this requirement in the simplest way, we set the minimal batch + # size to 8. + if batch_size <= 8: return 8 else: return ((batch_size + 15) // 16) * 16 From 94162beb9f454403d68ec009bb5572ee560d7603 Mon Sep 17 00:00:00 2001 From: Jiaxin Shan Date: Tue, 16 Jul 2024 10:11:04 -0700 Subject: [PATCH 07/16] [Doc] Fix the lora adapter path in server startup script (#6230) --- docs/source/models/lora.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/source/models/lora.rst b/docs/source/models/lora.rst index 934887a607a6a..5cc3076073fbd 100644 --- a/docs/source/models/lora.rst +++ b/docs/source/models/lora.rst @@ -64,7 +64,10 @@ LoRA adapted models can also be served with the Open-AI compatible vLLM server. python -m vllm.entrypoints.openai.api_server \ --model meta-llama/Llama-2-7b-hf \ --enable-lora \ - --lora-modules sql-lora=~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/ + --lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/ + +.. note:: + The commit ID `0dfa347e8877a4d4ed19ee56c140fa518470028c` may change over time. Please check the latest commit ID in your environment to ensure you are using the correct one. The server entrypoint accepts all other LoRA configuration parameters (``max_loras``, ``max_lora_rank``, ``max_cpu_loras``, etc.), which will apply to all forthcoming requests. 
Upon querying the ``/models`` endpoint, we should see our LoRA along From 160e1d8c99567872ef3e683e87c09f96a3a2e7c4 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Tue, 16 Jul 2024 13:37:10 -0700 Subject: [PATCH 08/16] [Misc] Log spec decode metrics (#6454) --- tests/metrics/test_metrics.py | 49 +++++++++++++++++++ tests/spec_decode/e2e/conftest.py | 44 ++++++++++++++--- .../e2e/test_multistep_correctness.py | 18 ++++--- vllm/engine/metrics.py | 40 +++++++++++++++ 4 files changed, 137 insertions(+), 14 deletions(-) diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index 0191d85194e33..5694061e17e02 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -168,6 +168,55 @@ def test_engine_log_metrics_regression( assert_metrics(engine, disable_log_stats, len(example_prompts)) +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [10]) +def test_metric_spec_decode( + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, +) -> None: + k = 5 + + with vllm_runner(model, + dtype=dtype, + disable_log_stats=False, + gpu_memory_utilization=0.4, + speculative_model=model, + num_speculative_tokens=k, + use_v2_block_manager=True) as vllm_model: + + # Force log interval to be 0 to catch all metrics. + stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] + stat_logger.local_interval = 0 + + # Note that the purpose of this test is to verify spec decode + # metrics instead of functional correctness, so the expected values + # are intended to be loose. + metric_name_to_expected_fn = { + "gauge_spec_decode_draft_acceptance_rate": lambda v: 0 <= v <= 1, + "gauge_spec_decode_efficiency": lambda v: 0 <= v <= 1, + "counter_spec_decode_num_accepted_tokens": lambda v: 0 <= v <= k, + "counter_spec_decode_num_draft_tokens": lambda v: v == k, + "counter_spec_decode_num_emitted_tokens": + lambda v: 0 <= v <= k + 1, + } + + # Use one request to better inspect the metrics. + prompts = example_prompts[:1] + + _ = vllm_model.generate_greedy(prompts, max_tokens) + for metric_name, is_expected in metric_name_to_expected_fn.items(): + metric_val = getattr( + stat_logger.metrics, + metric_name).labels(**stat_logger.labels)._value.get() + assert is_expected(metric_val), ( + f"the value of metric {metric_name} ({metric_val}) " + "does not meet expectation") + + def assert_metrics(engine: LLMEngine, disable_log_stats: bool, num_requests: int) -> None: if disable_log_stats: diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py index fb3415b5db153..34a6c9a393a58 100644 --- a/tests/spec_decode/e2e/conftest.py +++ b/tests/spec_decode/e2e/conftest.py @@ -162,6 +162,11 @@ def create_llm_generator(baseline_or_test, request, common_llm_kwargs, } test_name = request.node.name + model = kwargs["model"] + draft_model = kwargs.get("speculative_model", None) + same_draft_target_model = (draft_model is not None + and draft_model == model) + def generator_inner(): wait_for_gpu_memory_to_clear( @@ -177,6 +182,13 @@ def generator_inner(): print(f'Creating {baseline_or_test=} LLM for {test_name=}. {kwargs=}') llm = AsyncLLM(**kwargs) if use_async else LLM(**kwargs) + + # Override logging interval to 0 for spec decode test run to + # log all metrics in time. 
+ if (baseline_or_test == "test" and not use_async + and llm.llm_engine.log_stats): + for sate_logger in llm.llm_engine.stat_loggers.values(): + sate_logger.local_interval = 0 set_random_seed(seed) yield llm @@ -188,6 +200,9 @@ def generator_outer(): yield llm del llm + # Set an attribute to the generator_outer function to allow us to + # determine whether to further check the acceptance rate in tests. + generator_outer.same_draft_target_model = same_draft_target_model # type: ignore return generator_outer @@ -204,18 +219,26 @@ def maybe_assert_ngram_worker(llm): def get_output_from_llm_generator( llm_generator, prompts, - sampling_params) -> Tuple[List[str], List[List[int]]]: + sampling_params) -> Tuple[List[str], List[List[int]], float]: tokens: List[str] = [] token_ids: List[List[int]] = [] + acceptance_rate: float = -1.0 for llm in llm_generator(): maybe_assert_ngram_worker(llm) outputs = llm.generate(prompts, sampling_params, use_tqdm=True) token_ids = [output.outputs[0].token_ids for output in outputs] tokens = [output.outputs[0].text for output in outputs] + + # Fetch acceptance rate if logging is enabled. + if stat_loggers := getattr(llm.llm_engine, "stat_loggers", None): + stat_logger = stat_loggers["prometheus"] + acceptance_rate = (stat_logger.metrics. + gauge_spec_decode_draft_acceptance_rate.labels( + **stat_logger.labels)._value.get()) del llm - return tokens, token_ids + return tokens, token_ids, acceptance_rate def get_logprobs_from_llm_generator( @@ -237,7 +260,8 @@ def run_greedy_equality_correctness_test(baseline_llm_generator, batch_size, max_output_len, force_output_len: bool, - print_tokens: bool = False): + print_tokens: bool = False, + ensure_all_accepted: bool = False): """Helper method that compares the outputs of both the baseline LLM and the test LLM. It asserts greedy equality, e.g. that the outputs are exactly the same when temperature is zero. @@ -267,12 +291,13 @@ def run_greedy_equality_correctness_test(baseline_llm_generator, temperature=temperature, ) - spec_batch_tokens, spec_batch_token_ids = get_output_from_llm_generator( - test_llm_generator, prompts, sampling_params) + (spec_batch_tokens, spec_batch_token_ids, + acceptance_rate) = get_output_from_llm_generator(test_llm_generator, + prompts, sampling_params) - (baseline_batch_tokens, - baseline_batch_token_ids) = get_output_from_llm_generator( - baseline_llm_generator, prompts, sampling_params) + (baseline_batch_tokens, baseline_batch_token_ids, + _) = get_output_from_llm_generator(baseline_llm_generator, prompts, + sampling_params) assert len(baseline_batch_token_ids) == len(prompts) assert len(spec_batch_token_ids) == len(prompts) @@ -287,3 +312,6 @@ def run_greedy_equality_correctness_test(baseline_llm_generator, print(f'{i=} {baseline_token_ids=}') print(f'{i=} {spec_token_ids=}') assert baseline_token_ids == spec_token_ids + + if ensure_all_accepted: + assert acceptance_rate == 1.0 diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py index 94cc36f22875a..86cab7aba2380 100644 --- a/tests/spec_decode/e2e/test_multistep_correctness.py +++ b/tests/spec_decode/e2e/test_multistep_correctness.py @@ -97,7 +97,7 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator, temperature=temperature, ) - batch_tokens, batch_token_ids = get_output_from_llm_generator( + batch_tokens, batch_token_ids, _ = get_output_from_llm_generator( test_llm_generator, prompts, sampling_params) # Expect a generation for each prompt in the batch. 
@@ -200,12 +200,18 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1( Since this test is cheaper than other e2e correctness tests, we generate with a higher output_len. + + When the draft model is the same as the target model, we further check + whether all speculative tokens are accepted. """ - run_greedy_equality_correctness_test(baseline_llm_generator, - test_llm_generator, - batch_size, - max_output_len=output_len, - force_output_len=True) + ensure_all_accepted = test_llm_generator.same_draft_target_model + run_greedy_equality_correctness_test( + baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True, + ensure_all_accepted=ensure_all_accepted) @pytest.mark.parametrize( diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 48aec84298d86..9e187b2fbc33b 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -133,6 +133,30 @@ def __init__(self, labelnames: List[str], max_model_len: int): documentation="Count of successfully processed requests.", labelnames=labelnames + [Metrics.labelname_finish_reason]) + # Speculatie decoding stats + self.gauge_spec_decode_draft_acceptance_rate = self._base_library.Gauge( + name="vllm:spec_decode_draft_acceptance_rate", + documentation="Speulative token acceptance rate.", + labelnames=labelnames) + self.gauge_spec_decode_efficiency = self._base_library.Gauge( + name="vllm:spec_decode_efficiency", + documentation="Speculative decoding system efficiency.", + labelnames=labelnames) + self.counter_spec_decode_num_accepted_tokens = ( + self._base_library.Counter( + name="vllm:spec_decode_num_accepted_tokens_total", + documentation="Number of accepted tokens.", + labelnames=labelnames)) + self.counter_spec_decode_num_draft_tokens = self._base_library.Counter( + name="vllm:spec_decode_num_draft_tokens_total", + documentation="Number of draft tokens.", + labelnames=labelnames) + self.counter_spec_decode_num_emitted_tokens = ( + self._base_library.Counter( + name="vllm:spec_decode_num_emitted_tokens_total", + documentation="Number of emitted tokens.", + labelnames=labelnames)) + # Deprecated in favor of vllm:prompt_tokens_total self.gauge_avg_prompt_throughput = self._base_library.Gauge( name="vllm:avg_prompt_throughput_toks_per_s", @@ -454,6 +478,22 @@ def log(self, stats: Stats): self.num_generation_tokens = [] self.last_local_log = stats.now + if stats.spec_decode_metrics is not None: + self._log_gauge( + self.metrics.gauge_spec_decode_draft_acceptance_rate, + stats.spec_decode_metrics.draft_acceptance_rate) + self._log_gauge(self.metrics.gauge_spec_decode_efficiency, + stats.spec_decode_metrics.system_efficiency) + self._log_counter( + self.metrics.counter_spec_decode_num_accepted_tokens, + stats.spec_decode_metrics.accepted_tokens) + self._log_counter( + self.metrics.counter_spec_decode_num_draft_tokens, + stats.spec_decode_metrics.draft_tokens) + self._log_counter( + self.metrics.counter_spec_decode_num_emitted_tokens, + stats.spec_decode_metrics.emitted_tokens) + class RayPrometheusStatLogger(PrometheusStatLogger): """RayPrometheusStatLogger uses Ray metrics instead.""" From 978aed53004b82877bd2af0f10afff1826d7194d Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 16 Jul 2024 18:31:32 -0400 Subject: [PATCH 09/16] [Kernel][Attention] Separate `Attention.kv_scale` into `k_scale` and `v_scale` (#6081) --- .../kernels/benchmark_paged_attention.py | 8 ++- csrc/attention/attention_kernels.cu | 53 ++++++++--------- csrc/cache.h | 4 +- csrc/cache_kernels.cu | 21 
+++---- csrc/cpu/attention.cpp | 12 ++-- csrc/cpu/cache.cpp | 5 +- csrc/cpu/torch_bindings.cpp | 10 ++-- csrc/ops.h | 8 +-- csrc/torch_bindings.cpp | 10 ++-- tests/kernels/test_attention.py | 8 ++- tests/kernels/test_blocksparse_attention.py | 8 ++- tests/kernels/test_cache.py | 4 +- tests/quantization/test_fp8.py | 40 +++++++++++-- vllm/_custom_ops.py | 18 +++--- vllm/_ipex_ops.py | 9 ++- vllm/attention/backends/abstract.py | 3 +- vllm/attention/backends/blocksparse_attn.py | 9 ++- vllm/attention/backends/flash_attn.py | 6 +- vllm/attention/backends/flashinfer.py | 6 +- vllm/attention/backends/ipex_attn.py | 14 +++-- vllm/attention/backends/pallas.py | 5 +- vllm/attention/backends/rocm_flash_attn.py | 9 ++- vllm/attention/backends/torch_sdpa.py | 11 ++-- vllm/attention/backends/xformers.py | 8 ++- vllm/attention/layer.py | 14 +++-- vllm/attention/ops/ipex_attn.py | 6 +- vllm/attention/ops/paged_attn.py | 15 +++-- vllm/model_executor/layers/linear.py | 9 +++ .../model_executor/layers/quantization/fp8.py | 51 +++++++++++----- .../model_loader/weight_utils.py | 58 +++++++++++++++++-- vllm/model_executor/models/llama.py | 19 ++---- vllm/model_executor/models/mixtral.py | 21 ++----- vllm/model_executor/models/qwen2.py | 20 ++----- 33 files changed, 317 insertions(+), 185 deletions(-) diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 16de60477c305..78cac8a555d1b 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -100,7 +100,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: start_time = time.perf_counter() # Using default kv_scale - kv_scale = 1.0 + k_scale = v_scale = 1.0 for _ in range(num_iters): if version == "v1": @@ -117,7 +117,8 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: max_seq_len, alibi_slopes, kv_cache_dtype, - kv_scale, + k_scale, + v_scale, ) elif version == "v2": ops.paged_attention_v2( @@ -136,7 +137,8 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: max_seq_len, alibi_slopes, kv_cache_dtype, - kv_scale, + k_scale, + v_scale, ) else: raise ValueError(f"Invalid version: {version}") diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index 91083481705cb..350dbce1d7ba9 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -105,9 +105,9 @@ __device__ void paged_attention_kernel( const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] const int q_stride, const int kv_block_stride, const int kv_head_stride, - const float kv_scale, const int tp_rank, const int blocksparse_local_blocks, - const int blocksparse_vert_stride, const int blocksparse_block_size, - const int blocksparse_head_sliding_step) { + const float k_scale, const float v_scale, const int tp_rank, + const int blocksparse_local_blocks, const int blocksparse_vert_stride, + const int blocksparse_block_size, const int blocksparse_head_sliding_step) { const int seq_idx = blockIdx.y; const int partition_idx = blockIdx.z; const int max_num_partitions = gridDim.z; @@ -285,7 +285,7 @@ __device__ void paged_attention_kernel( Quant_vec k_vec_quant = *reinterpret_cast( k_ptr + offset1 * BLOCK_SIZE * x + offset2); k_vecs[j] = fp8::scaled_convert( - k_vec_quant, kv_scale); + k_vec_quant, k_scale); } } @@ -415,7 +415,7 @@ __device__ void paged_attention_kernel( *reinterpret_cast(v_ptr + offset); // Vector conversion 
from V_quant_vec to V_vec. v_vec = fp8::scaled_convert(v_quant_vec, - kv_scale); + v_scale); } if (block_idx == num_seq_blocks - 1) { // NOTE(woosuk): When v_vec contains the tokens that are out of the @@ -513,15 +513,15 @@ __global__ void paged_attention_v1_kernel( const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] const int q_stride, const int kv_block_stride, const int kv_head_stride, - const float kv_scale, const int tp_rank, const int blocksparse_local_blocks, - const int blocksparse_vert_stride, const int blocksparse_block_size, - const int blocksparse_head_sliding_step) { + const float k_scale, const float v_scale, const int tp_rank, + const int blocksparse_local_blocks, const int blocksparse_vert_stride, + const int blocksparse_block_size, const int blocksparse_head_sliding_step) { paged_attention_kernel( /* exp_sums */ nullptr, /* max_logits */ nullptr, out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, seq_lens, max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride, - kv_head_stride, kv_scale, tp_rank, blocksparse_local_blocks, + kv_head_stride, k_scale, v_scale, tp_rank, blocksparse_local_blocks, blocksparse_vert_stride, blocksparse_block_size, blocksparse_head_sliding_step); } @@ -549,14 +549,14 @@ __global__ void paged_attention_v2_kernel( const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] const int q_stride, const int kv_block_stride, const int kv_head_stride, - const float kv_scale, const int tp_rank, const int blocksparse_local_blocks, - const int blocksparse_vert_stride, const int blocksparse_block_size, - const int blocksparse_head_sliding_step) { + const float k_scale, const float v_scale, const int tp_rank, + const int blocksparse_local_blocks, const int blocksparse_vert_stride, + const int blocksparse_block_size, const int blocksparse_head_sliding_step) { paged_attention_kernel( exp_sums, max_logits, tmp_out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, seq_lens, max_num_blocks_per_seq, alibi_slopes, q_stride, - kv_block_stride, kv_head_stride, kv_scale, tp_rank, + kv_block_stride, kv_head_stride, k_scale, v_scale, tp_rank, blocksparse_local_blocks, blocksparse_vert_stride, blocksparse_block_size, blocksparse_head_sliding_step); } @@ -682,7 +682,7 @@ __global__ void paged_attention_v2_reduce_kernel( out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, \ scale, block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \ alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \ - kv_scale, tp_rank, blocksparse_local_blocks, \ + k_scale, v_scale, tp_rank, blocksparse_local_blocks, \ blocksparse_vert_stride, blocksparse_block_size, \ blocksparse_head_sliding_step); @@ -694,8 +694,8 @@ void paged_attention_v1_launcher( torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int num_kv_heads, float scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, - const c10::optional& alibi_slopes, float kv_scale, - const int tp_rank, const int blocksparse_local_blocks, + const c10::optional& alibi_slopes, float k_scale, + float v_scale, const int tp_rank, const int blocksparse_local_blocks, const int blocksparse_vert_stride, const int blocksparse_block_size, const int blocksparse_head_sliding_step) { int num_seqs = query.size(0); @@ -770,7 +770,7 @@ void paged_attention_v1_launcher( paged_attention_v1_launcher( \ out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \ - seq_lens, 
max_seq_len, alibi_slopes, kv_scale, tp_rank, \ + seq_lens, max_seq_len, alibi_slopes, k_scale, v_scale, tp_rank, \ blocksparse_local_blocks, blocksparse_vert_stride, \ blocksparse_block_size, blocksparse_head_sliding_step); @@ -815,8 +815,8 @@ void paged_attention_v1( torch::Tensor& seq_lens, // [num_seqs] int64_t block_size, int64_t max_seq_len, const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank, - const int64_t blocksparse_local_blocks, + const std::string& kv_cache_dtype, double k_scale, double v_scale, + const int64_t tp_rank, const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, const int64_t blocksparse_head_sliding_step) { const bool is_block_sparse = (blocksparse_vert_stride > 1); @@ -833,7 +833,7 @@ void paged_attention_v1( exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, key_cache_ptr, \ value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \ seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \ - kv_block_stride, kv_head_stride, kv_scale, tp_rank, \ + kv_block_stride, kv_head_stride, k_scale, v_scale, tp_rank, \ blocksparse_local_blocks, blocksparse_vert_stride, \ blocksparse_block_size, blocksparse_head_sliding_step); \ vllm::paged_attention_v2_reduce_kernel& alibi_slopes, float kv_scale, - const int tp_rank, const int blocksparse_local_blocks, + const c10::optional& alibi_slopes, float k_scale, + float v_scale, const int tp_rank, const int blocksparse_local_blocks, const int blocksparse_vert_stride, const int blocksparse_block_size, const int blocksparse_head_sliding_step) { int num_seqs = query.size(0); @@ -932,8 +932,9 @@ void paged_attention_v2_launcher( IS_BLOCK_SPARSE>( \ out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ num_kv_heads, scale, block_tables, seq_lens, max_seq_len, alibi_slopes, \ - kv_scale, tp_rank, blocksparse_local_blocks, blocksparse_vert_stride, \ - blocksparse_block_size, blocksparse_head_sliding_step); + k_scale, v_scale, tp_rank, blocksparse_local_blocks, \ + blocksparse_vert_stride, blocksparse_block_size, \ + blocksparse_head_sliding_step); #define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ switch (is_block_sparse) { \ @@ -980,8 +981,8 @@ void paged_attention_v2( torch::Tensor& seq_lens, // [num_seqs] int64_t block_size, int64_t max_seq_len, const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank, - const int64_t blocksparse_local_blocks, + const std::string& kv_cache_dtype, double k_scale, double v_scale, + const int64_t tp_rank, const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, const int64_t blocksparse_head_sliding_step) { const bool is_block_sparse = (blocksparse_vert_stride > 1); diff --git a/csrc/cache.h b/csrc/cache.h index 86caa9345361d..52177e8901a89 100644 --- a/csrc/cache.h +++ b/csrc/cache.h @@ -18,8 +18,8 @@ void copy_blocks(std::vector const& key_caches, void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, torch::Tensor& key_cache, torch::Tensor& value_cache, torch::Tensor& slot_mapping, - const std::string& kv_cache_dtype, - const double kv_scale); + const std::string& kv_cache_dtype, const double k_scale, + const double v_scale); void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value, torch::Tensor& key_cache, diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 
72041076ae009..caef7f5e18630 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -159,8 +159,8 @@ __global__ void reshape_and_cache_kernel( // block_size] const int64_t* __restrict__ slot_mapping, // [num_tokens] const int key_stride, const int value_stride, const int num_heads, - const int head_size, const int block_size, const int x, - const float kv_scale) { + const int head_size, const int block_size, const int x, const float k_scale, + const float v_scale) { const int64_t token_idx = blockIdx.x; const int64_t slot_idx = slot_mapping[token_idx]; if (slot_idx < 0) { @@ -196,9 +196,9 @@ __global__ void reshape_and_cache_kernel( value_cache[tgt_value_idx] = tgt_value; } else { key_cache[tgt_key_idx] = - fp8::scaled_convert(tgt_key, kv_scale); + fp8::scaled_convert(tgt_key, k_scale); value_cache[tgt_value_idx] = - fp8::scaled_convert(tgt_value, kv_scale); + fp8::scaled_convert(tgt_value, v_scale); } } } @@ -248,7 +248,7 @@ __global__ void reshape_and_cache_flash_kernel( reinterpret_cast(key_cache.data_ptr()), \ reinterpret_cast(value_cache.data_ptr()), \ slot_mapping.data_ptr(), key_stride, value_stride, \ - num_heads, head_size, block_size, x, kv_scale); + num_heads, head_size, block_size, x, k_scale, v_scale); void reshape_and_cache( torch::Tensor& key, // [num_tokens, num_heads, head_size] @@ -258,7 +258,8 @@ void reshape_and_cache( torch::Tensor& value_cache, // [num_blocks, num_heads, head_size, block_size] torch::Tensor& slot_mapping, // [num_tokens] - const std::string& kv_cache_dtype, const double kv_scale) { + const std::string& kv_cache_dtype, const double k_scale, + const double v_scale) { int num_tokens = key.size(0); int num_heads = key.size(1); int head_size = key.size(2); @@ -318,13 +319,13 @@ namespace vllm { template __global__ void convert_fp8_kernel(const Tin* __restrict__ src_cache, Tout* __restrict__ dst_cache, - const float kv_scale, + const float scale, const int64_t block_stride) { const int64_t block_idx = blockIdx.x; for (int i = threadIdx.x; i < block_stride; i += blockDim.x) { int64_t idx = block_idx * block_stride + i; dst_cache[idx] = - fp8::scaled_convert(src_cache[idx], kv_scale); + fp8::scaled_convert(src_cache[idx], scale); } } @@ -333,11 +334,11 @@ __global__ void convert_fp8_kernel(const Tin* __restrict__ src_cache, #define CALL_CONVERT_FP8(Tout, Tin, KV_DTYPE) \ vllm::convert_fp8_kernel<<>>( \ reinterpret_cast(src_cache.data_ptr()), \ - reinterpret_cast(dst_cache.data_ptr()), kv_scale, block_stride); + reinterpret_cast(dst_cache.data_ptr()), scale, block_stride); // Only for testing. 
void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache, - const double kv_scale, const std::string& kv_cache_dtype) { + const double scale, const std::string& kv_cache_dtype) { torch::Device src_device = src_cache.device(); torch::Device dst_device = dst_cache.device(); TORCH_CHECK(src_device.is_cuda(), "src must be on a GPU") diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp index 8367093325314..abb4e3bea14bb 100644 --- a/csrc/cpu/attention.cpp +++ b/csrc/cpu/attention.cpp @@ -423,11 +423,11 @@ void paged_attention_v1( torch::Tensor& value_cache, int64_t num_kv_heads, double scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, int64_t max_seq_len, const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank, - const int64_t blocksparse_local_blocks, + const std::string& kv_cache_dtype, double k_scale, double v_scale, + const int64_t tp_rank, const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, const int64_t blocksparse_head_sliding_step) { - TORCH_CHECK(kv_scale == 1.0f); + TORCH_CHECK(k_scale == 1.0f && v_scale == 1.0f); TORCH_CHECK(blocksparse_vert_stride <= 1, "CPU backend does not support blocksparse attention yet."); VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v1_impl", @@ -742,11 +742,11 @@ void paged_attention_v2( torch::Tensor& value_cache, int64_t num_kv_heads, double scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, int64_t max_seq_len, const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank, - const int64_t blocksparse_local_blocks, + const std::string& kv_cache_dtype, double k_scale, double v_scale, + const int64_t tp_rank, const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, const int64_t blocksparse_head_sliding_step) { - TORCH_CHECK(kv_scale == 1.0f); + TORCH_CHECK(k_scale == 1.0f && v_scale == 1.0f); TORCH_CHECK(blocksparse_vert_stride <= 1, "CPU backend does not support blocksparse attention yet."); VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v2_impl", diff --git a/csrc/cpu/cache.cpp b/csrc/cpu/cache.cpp index 2b5c3bd6ee70b..31d454328b2c1 100644 --- a/csrc/cpu/cache.cpp +++ b/csrc/cpu/cache.cpp @@ -107,8 +107,9 @@ void copy_blocks(std::vector const& key_caches, void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, torch::Tensor& key_cache, torch::Tensor& value_cache, torch::Tensor& slot_mapping, - const std::string& kv_cache_dtype, double kv_scale) { - TORCH_CHECK(kv_scale == 1.0f); + const std::string& kv_cache_dtype, double k_scale, + double v_scale) { + TORCH_CHECK(k_scale == 1.0f && v_scale == 1.0f); int num_tokens = key.size(0); int num_heads = key.size(1); diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index 39e8cf3ed3c10..5be0e9810b5b9 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -16,8 +16,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor value_cache, int num_kv_heads, float scale," " Tensor block_tables, Tensor seq_lens, int block_size," " int max_seq_len, Tensor? 
alibi_slopes," - " str kv_cache_dtype, float kv_scale, int tp_rank," - " int blocksparse_local_blocks," + " str kv_cache_dtype, float k_scale, float v_scale," + " int tp_rank, int blocksparse_local_blocks," " int blocksparse_vert_stride, int blocksparse_block_size," " int blocksparse_head_sliding_step) -> ()"); ops.impl("paged_attention_v1", torch::kCPU, &paged_attention_v1); @@ -30,8 +30,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor value_cache, int num_kv_heads, float scale," " Tensor block_tables, Tensor seq_lens, int block_size," " int max_seq_len, Tensor? alibi_slopes," - " str kv_cache_dtype, float kv_scale, int tp_rank," - " int blocksparse_local_blocks," + " str kv_cache_dtype, float k_scale, float v_scale," + " int tp_rank, int blocksparse_local_blocks," " int blocksparse_vert_stride, int blocksparse_block_size," " int blocksparse_head_sliding_step) -> ()"); ops.impl("paged_attention_v2", torch::kCPU, &paged_attention_v2); @@ -103,7 +103,7 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { " Tensor! key_cache, Tensor! value_cache," " Tensor slot_mapping," " str kv_cache_dtype," - " float kv_scale) -> ()"); + " float k_scale, float v_scale) -> ()"); cache_ops.impl("reshape_and_cache", torch::kCPU, &reshape_and_cache); } diff --git a/csrc/ops.h b/csrc/ops.h index fb1099e4fe0c2..f9feb3deff5e4 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -8,8 +8,8 @@ void paged_attention_v1( torch::Tensor& value_cache, int64_t num_kv_heads, double scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, int64_t max_seq_len, const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank, - const int64_t blocksparse_local_blocks, + const std::string& kv_cache_dtype, double k_scale, double v_scale, + const int64_t tp_rank, const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, const int64_t blocksparse_head_sliding_step); @@ -19,8 +19,8 @@ void paged_attention_v2( torch::Tensor& value_cache, int64_t num_kv_heads, double scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, int64_t max_seq_len, const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank, - const int64_t blocksparse_local_blocks, + const std::string& kv_cache_dtype, double k_scale, double v_scale, + const int64_t tp_rank, const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, const int64_t blocksparse_head_sliding_step); diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 18331a674eeba..9dc7cefc404ca 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -27,8 +27,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor value_cache, int num_kv_heads, float scale," " Tensor block_tables, Tensor seq_lens, int block_size," " int max_seq_len, Tensor? 
alibi_slopes," - " str kv_cache_dtype, float kv_scale, int tp_rank," - " int blocksparse_local_blocks," + " str kv_cache_dtype, float k_scale, float v_scale," + " int tp_rank, int blocksparse_local_blocks," " int blocksparse_vert_stride, int blocksparse_block_size," " int blocksparse_head_sliding_step) -> ()"); ops.impl("paged_attention_v1", torch::kCUDA, &paged_attention_v1); @@ -41,8 +41,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor value_cache, int num_kv_heads, float scale," " Tensor block_tables, Tensor seq_lens, int block_size," " int max_seq_len, Tensor? alibi_slopes," - " str kv_cache_dtype, float kv_scale, int tp_rank," - " int blocksparse_local_blocks," + " str kv_cache_dtype, float k_scale, float v_scale," + " int tp_rank, int blocksparse_local_blocks," " int blocksparse_vert_stride, int blocksparse_block_size," " int blocksparse_head_sliding_step) -> ()"); ops.impl("paged_attention_v2", torch::kCUDA, &paged_attention_v2); @@ -223,7 +223,7 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { " Tensor! key_cache, Tensor! value_cache," " Tensor slot_mapping," " str kv_cache_dtype," - " float kv_scale) -> ()"); + " float k_scale, float v_scale) -> ()"); cache_ops.impl("reshape_and_cache", torch::kCUDA, &reshape_and_cache); // Reshape the key and value tensors and cache them. diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index f848ad51c7014..2e6412c28958e 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -175,7 +175,7 @@ def test_paged_attention( key_cache, value_cache = key_caches[0], value_caches[0] # Using default kv_scale - kv_scale = 1.0 + k_scale = v_scale = 1.0 # Call the paged attention kernel. output = torch.empty_like(query) @@ -193,7 +193,8 @@ def test_paged_attention( max_seq_len, alibi_slopes, kv_cache_dtype, - kv_scale, + k_scale, + v_scale, ) elif version == "v2": num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE) @@ -224,7 +225,8 @@ def test_paged_attention( max_seq_len, alibi_slopes, kv_cache_dtype, - kv_scale, + k_scale, + v_scale, ) else: raise AssertionError(f"Unknown version: {version}") diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/test_blocksparse_attention.py index 402545d1980d6..b3adb152949a2 100644 --- a/tests/kernels/test_blocksparse_attention.py +++ b/tests/kernels/test_blocksparse_attention.py @@ -212,7 +212,7 @@ def test_paged_attention( key_cache, value_cache = key_caches[0], value_caches[0] # Using default kv_scale - kv_scale = 1.0 + k_scale = v_scale = 1.0 tp_rank = 0 # Call the paged attention kernel. @@ -231,7 +231,8 @@ def test_paged_attention( max_seq_len, alibi_slopes, kv_cache_dtype, - kv_scale, + k_scale, + v_scale, tp_rank=tp_rank, blocksparse_local_blocks=blocksparse_local_blocks, blocksparse_vert_stride=blocksparse_vert_stride, @@ -267,7 +268,8 @@ def test_paged_attention( max_seq_len, alibi_slopes, kv_cache_dtype, - kv_scale, + k_scale, + v_scale, tp_rank=tp_rank, blocksparse_local_blocks=blocksparse_local_blocks, blocksparse_vert_stride=blocksparse_vert_stride, diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 23b6baa60c05b..70ae3d0c6e0c3 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -155,11 +155,11 @@ def test_reshape_and_cache( cloned_value_cache = value_cache.clone() # Using default kv_scale - kv_scale = 1.0 + k_scale = v_scale = 1.0 # Call the reshape_and_cache kernel. 
ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, - kv_cache_dtype, kv_scale) + kv_cache_dtype, k_scale, v_scale) if kv_cache_dtype == "fp8": result_key_cache = torch.empty_like(key_cache, dtype=torch.float16) diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index 0ed91cbb447fd..82dc775f8d812 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -7,19 +7,49 @@ from tests.quantization.utils import is_quant_method_supported from vllm import _custom_ops as ops -from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod +from vllm.model_executor.layers.quantization.fp8 import (Fp8KVCacheMethod, + Fp8LinearMethod) MODELS = [ - "neuralmagic/Meta-Llama-3-8B-Instruct-FP8", + "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV", "nm-testing/Phi-3-mini-128k-instruct-FP8", ] @pytest.mark.skipif(not is_quant_method_supported("fp8"), reason="FP8 is not supported on this GPU type.") -@pytest.mark.parametrize("model", MODELS) -def test_model_load_and_run(vllm_runner, model: str): - with vllm_runner(model) as llm: +@pytest.mark.parametrize("model_id", MODELS) +def test_model_load_and_run(vllm_runner, model_id: str): + with vllm_runner(model_id) as llm: + # note: this does not test accuracy, just that we can run through + # see lm-eval tests for accuracy + outputs = llm.generate_greedy(prompts=["Hello my name is"], + max_tokens=10) + print(outputs[0][1]) + + +KV_CACHE_MODELS = [ + # Deprecated AutoFP8 format using .kv_scale + "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV", + # AutoFP8 format using separate .k_scale and .v_scale + "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V", +] + + +@pytest.mark.skipif(not is_quant_method_supported("fp8"), + reason="FP8 is not supported on this GPU type.") +@pytest.mark.parametrize("model_id", KV_CACHE_MODELS) +def test_kv_cache_model_load_and_run(vllm_runner, model_id: str): + with vllm_runner(model_id, kv_cache_dtype="fp8") as llm: + + model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 + attn = model.model.layers[0].self_attn.attn + assert isinstance(attn.quant_method, Fp8KVCacheMethod) + # NOTE: it is valid for scales to be 1.0 (default value), but we know + # these checkpoints have scales < 1.0 + assert 0.0 < attn._k_scale < 1.0 + assert 0.0 < attn._v_scale < 1.0 + # note: this does not test accuracy, just that we can run through # see lm-eval tests for accuracy outputs = llm.generate_greedy(prompts=["Hello my name is"], diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 03308d04012aa..4ca67224a91b8 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -84,7 +84,8 @@ def paged_attention_v1( max_seq_len: int, alibi_slopes: Optional[torch.Tensor], kv_cache_dtype: str, - kv_scale: float, + k_scale: float, + v_scale: float, tp_rank: int = 0, blocksparse_local_blocks: int = 0, blocksparse_vert_stride: int = 0, @@ -94,8 +95,9 @@ def paged_attention_v1( torch.ops._C.paged_attention_v1( out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, seq_lens, block_size, max_seq_len, alibi_slopes, kv_cache_dtype, - kv_scale, tp_rank, blocksparse_local_blocks, blocksparse_vert_stride, - blocksparse_block_size, blocksparse_head_sliding_step) + k_scale, v_scale, tp_rank, blocksparse_local_blocks, + blocksparse_vert_stride, blocksparse_block_size, + blocksparse_head_sliding_step) def paged_attention_v2( @@ -114,7 +116,8 @@ def paged_attention_v2( max_seq_len: int, alibi_slopes: Optional[torch.Tensor], kv_cache_dtype: str, - kv_scale: 
float, + k_scale: float, + v_scale: float, tp_rank: int = 0, blocksparse_local_blocks: int = 0, blocksparse_vert_stride: int = 0, @@ -124,7 +127,7 @@ def paged_attention_v2( torch.ops._C.paged_attention_v2( out, exp_sum, max_logits, tmp_out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, seq_lens, block_size, max_seq_len, - alibi_slopes, kv_cache_dtype, kv_scale, tp_rank, + alibi_slopes, kv_cache_dtype, k_scale, v_scale, tp_rank, blocksparse_local_blocks, blocksparse_vert_stride, blocksparse_block_size, blocksparse_head_sliding_step) @@ -374,11 +377,12 @@ def reshape_and_cache( value_cache: torch.Tensor, slot_mapping: torch.Tensor, kv_cache_dtype: str, - kv_scale: float, + k_scale: float, + v_scale: float, ) -> None: torch.ops._C_cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, - kv_cache_dtype, kv_scale) + kv_cache_dtype, k_scale, v_scale) def reshape_and_cache_flash( diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index 99a875c9b3fb7..b4721b4e1aedd 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -59,7 +59,8 @@ def paged_attention_v1( max_context_len: int, alibi_slopes: Optional[torch.Tensor], kv_cache_dtype: str, - kv_scale: float, + k_scale: float, + v_scale: float, tp_rank: int = 0, blocksparse_local_blocks: int = 0, blocksparse_vert_stride: int = 0, @@ -99,7 +100,8 @@ def paged_attention_v2( max_context_len: int, alibi_slopes: Optional[torch.Tensor], kv_cache_dtype: str, - kv_scale: float, + k_scale: float, + v_scale: float, tp_rank: int = 0, blocksparse_local_blocks: int = 0, blocksparse_vert_stride: int = 0, @@ -227,7 +229,8 @@ def reshape_and_cache( value_cache: torch.Tensor, slot_mapping: torch.Tensor, kv_cache_dtype: str, - kv_scale: float, + k_scale: float, + v_scale: float, ) -> None: assert kv_cache_dtype == "auto" ipex.llm.modules.PagedAttention.reshape_and_cache( diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index adb8325168cdf..1310bb1679e15 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -134,7 +134,8 @@ def forward( value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: T, - kv_scale: float = 1.0, + k_scale: float = 1.0, + v_scale: float = 1.0, attn_type: AttentionType = AttentionType.DECODER, ) -> torch.Tensor: raise NotImplementedError diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py index fe4c4a45dca0d..6308cf07ce41e 100644 --- a/vllm/attention/backends/blocksparse_attn.py +++ b/vllm/attention/backends/blocksparse_attn.py @@ -327,7 +327,8 @@ def forward( value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: BlocksparseFlashAttentionMetadata, - kv_scale: float = 1.0, + k_scale: float = 1.0, + v_scale: float = 1.0, attn_type: AttentionType = AttentionType.DECODER, ) -> torch.Tensor: """Forward pass with FlashAttention and PagedAttention. 
@@ -368,7 +369,8 @@ def forward( value_cache, attn_metadata.slot_mapping, self.kv_cache_dtype, - kv_scale, + k_scale, + v_scale, ) if prefill_meta := attn_metadata.prefill_metadata: @@ -405,7 +407,8 @@ def forward( self.num_kv_heads, self.scale, self.alibi_slopes, - kv_scale, + k_scale, + v_scale, tp_rank=self.tp_rank, blocksparse_local_blocks=self.local_blocks, blocksparse_vert_stride=self.vert_stride, diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 048abed48d2e9..0b6bd21279393 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -256,7 +256,8 @@ def forward( value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: FlashAttentionMetadata, - kv_scale: float = 1.0, + k_scale: float = 1.0, + v_scale: float = 1.0, attn_type: AttentionType = AttentionType.DECODER, ) -> torch.Tensor: """Forward pass with FlashAttention. @@ -277,7 +278,8 @@ def forward( "FlashAttentionImpl") # NOTE(woosuk): FlashAttention does not support FP8 KV cache. - assert kv_scale == 1.0, "kv_scale is not supported in FlashAttention." + assert k_scale == 1.0 and v_scale == 1.0, ( + "key/v_scale is not supported in FlashAttention.") num_tokens, hidden_size = query.shape # Reshape the query, key, and value tensors. diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index b27e3e40f566d..a4b01c6d3b508 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -223,10 +223,12 @@ def forward( value: torch.Tensor, kv_cache: Optional[torch.Tensor], attn_metadata: FlashInferMetadata, - kv_scale: float = 1.0, + k_scale: float = 1.0, + v_scale: float = 1.0, attn_type: AttentionType = AttentionType.DECODER, ) -> torch.Tensor: - assert kv_scale == 1.0 + assert k_scale == 1.0 and v_scale == 1.0, ( + "key/v_scale is not supported in FlashInfer.") if attn_type != AttentionType.DECODER: raise NotImplementedError("Encoder self-attention and " "encoder/decoder cross-attention " diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py index 6a1295b1000bc..4559dd15f600c 100644 --- a/vllm/attention/backends/ipex_attn.py +++ b/vllm/attention/backends/ipex_attn.py @@ -156,7 +156,8 @@ def forward( value: torch.Tensor, kv_cache: Optional[torch.Tensor], attn_metadata: IpexAttnMetadata, # type: ignore - kv_scale: float = 1.0, + k_scale: float = 1.0, + v_scale: float = 1.0, attn_type: AttentionType = AttentionType.DECODER, ) -> torch.Tensor: """Forward pass with IPEX varlen_attention and PagedAttention. @@ -170,7 +171,7 @@ def forward( Returns: shape = [num_tokens, num_heads * head_size] """ - assert kv_scale == 1.0 + assert k_scale == 1.0 and v_scale == 1.0 if attn_type != AttentionType.DECODER: raise NotImplementedError("Encoder self-attention and " "encoder/decoder cross-attention " @@ -192,7 +193,8 @@ def forward( value_cache, attn_metadata.slot_mapping.flatten(), self.kv_cache_dtype, - kv_scale, + k_scale, + v_scale, ) if attn_metadata.is_prompt: @@ -273,7 +275,8 @@ def forward( max_seq_len, self.alibi_slopes, self.kv_cache_dtype, - kv_scale, + k_scale, + v_scale, ) else: # Run PagedAttention V2. @@ -305,7 +308,8 @@ def forward( max_seq_len, self.alibi_slopes, self.kv_cache_dtype, - kv_scale, + k_scale, + v_scale, ) # Reshape the output tensor. 
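For external callers that still carry a single KV-cache scale, the updated wrappers in vllm._custom_ops can be bridged with a thin shim. A minimal sketch, assuming the caller only wants to preserve the old single-scale behavior; the helper name below is hypothetical and not part of this patch, while ops.reshape_and_cache and its parameters are taken from the wrapper shown above:

from vllm import _custom_ops as ops

def reshape_and_cache_legacy(key, value, key_cache, value_cache,
                             slot_mapping, kv_cache_dtype,
                             kv_scale: float = 1.0) -> None:
    # Hypothetical shim: pass the old single kv_scale as both k_scale
    # and v_scale, which reproduces the pre-split behavior.
    ops.reshape_and_cache(key, value, key_cache, value_cache,
                          slot_mapping, kv_cache_dtype,
                          k_scale=kv_scale, v_scale=kv_scale)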
diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index c45f7b28b2afb..b83a83bb177d4 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -131,7 +131,8 @@ def forward( value: torch.Tensor, kv_cache: Tuple[Optional[torch.Tensor], Optional[torch.Tensor]], attn_metadata: PallasMetadata, - kv_scale: float = 1.0, + k_scale: float = 1.0, + v_scale: float = 1.0, attn_type: AttentionType = AttentionType.DECODER, ) -> torch.Tensor: """Forward pass with Pallas attention. @@ -146,7 +147,7 @@ def forward( Returns: shape = [batch_size, seq_len, num_heads * head_size] """ - assert kv_scale == 1.0 + assert k_scale == 1.0 and v_scale == 1.0 if attn_type != AttentionType.DECODER: raise NotImplementedError("Encoder self-attention and " "encoder/decoder cross-attention " diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 81b546c65c819..f6ecea30da492 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -296,7 +296,8 @@ def forward( value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: ROCmFlashAttentionMetadata, - kv_scale: float = 1.0, + k_scale: float = 1.0, + v_scale: float = 1.0, attn_type: AttentionType = AttentionType.DECODER, ) -> torch.Tensor: """Forward pass with FlashAttention and PagedAttention. @@ -336,7 +337,8 @@ def forward( value_cache, attn_metadata.slot_mapping, self.kv_cache_dtype, - kv_scale, + k_scale, + v_scale, ) num_prefill_tokens = attn_metadata.num_prefill_tokens @@ -456,7 +458,8 @@ def forward( self.num_kv_heads, self.scale, self.alibi_slopes, - kv_scale, + k_scale, + v_scale, ) # Reshape the output tensor. diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index 48418f24870f9..fe6a56123ce72 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -144,7 +144,8 @@ def forward( value: torch.Tensor, kv_cache: Optional[torch.Tensor], attn_metadata: TorchSDPAMetadata, # type: ignore - kv_scale: float = 1.0, + k_scale: float = 1.0, + v_scale: float = 1.0, attn_type: AttentionType = AttentionType.DECODER, ) -> torch.Tensor: """Forward pass with torch SDPA and PagedAttention. @@ -158,7 +159,7 @@ def forward( Returns: shape = [num_tokens, num_heads * head_size] """ - assert kv_scale == 1.0 + assert k_scale == 1.0 and v_scale == 1.0 if attn_type != AttentionType.DECODER: raise NotImplementedError("Encoder self-attention and " "encoder/decoder cross-attention " @@ -176,7 +177,8 @@ def forward( PagedAttention.write_to_paged_cache(key, value, key_cache, value_cache, attn_metadata.slot_mapping, - self.kv_cache_dtype, kv_scale) + self.kv_cache_dtype, k_scale, + v_scale) if attn_metadata.is_prompt: assert attn_metadata.seq_lens is not None @@ -239,7 +241,8 @@ def forward( self.num_kv_heads, self.scale, self.alibi_slopes, - kv_scale, + k_scale, + v_scale, ) # Reshape the output tensor. diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 6cc5f1d1477ae..3dd60ed5be528 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -427,7 +427,8 @@ def forward( value: Optional[torch.Tensor], kv_cache: Optional[torch.Tensor], attn_metadata: "XFormersMetadata", - kv_scale: float = 1.0, + k_scale: float = 1.0, + v_scale: float = 1.0, attn_type: AttentionType = AttentionType.DECODER, ) -> torch.Tensor: """Forward pass with xFormers and PagedAttention. 
@@ -531,7 +532,7 @@ def forward( value_cache, updated_slot_mapping, self.kv_cache_dtype, - kv_scale) + k_scale, v_scale) if attn_type != AttentionType.ENCODER: # Decoder self-attention supports chunked prefill. @@ -620,7 +621,8 @@ def forward( self.num_kv_heads, self.scale, self.alibi_slopes, - kv_scale, + k_scale, + v_scale, ) # Reshape the output tensor. diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index b8cc87be8c748..0619bda90a2a7 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -47,13 +47,14 @@ def __init__( if num_kv_heads is None: num_kv_heads = num_heads - # The default kv_scale is set to 1.0. This is ignored + # The default k/v_scale is set to 1.0. This is ignored # when kv-cache is not fp8, and should be used with # kv-cache in fp8_e5m2. For kv-cache in fp8_e4m3, we - # expect the pre-quantized kv_scale to be loaded along + # expect the pre-quantized k/v_scale to be loaded along # with the model weights. self.kv_cache_dtype = kv_cache_dtype - self._kv_scale = 1.0 + self._k_scale = 1.0 + self._v_scale = 1.0 quant_method = quant_config.get_quant_method( self) if quant_config else None if quant_method is not None: @@ -66,8 +67,8 @@ def __init__( "fp8 checkpoints.") # When FP8 quantization is enabled, we make a parameter # "kv_scale" so that it can be loaded from FP8 checkpoint. - # The kv_scale will then be converted back to self._kv_scale - # in a native float32 value after weight loading. + # The k/v_scale will then be converted back to + # self._kv_scale in a native float32 value after weight loading self.quant_method = quant_method self.quant_method.create_weights(self) @@ -98,7 +99,8 @@ def forward( value, kv_cache, attn_metadata, - self._kv_scale, + self._k_scale, + self._v_scale, attn_type=attn_type) def extra_repr(self) -> str: diff --git a/vllm/attention/ops/ipex_attn.py b/vllm/attention/ops/ipex_attn.py index 5a5317b65004e..81d308c4d4e22 100644 --- a/vllm/attention/ops/ipex_attn.py +++ b/vllm/attention/ops/ipex_attn.py @@ -45,7 +45,8 @@ def write_to_paged_cache( value_cache: torch.Tensor, slot_mapping: torch.Tensor, kv_cache_dtype: str, - kv_scale: float, + k_scale: float, + v_scale: float, *args, ) -> None: ipex_modules.PagedAttention.reshape_and_cache( @@ -64,7 +65,8 @@ def forward_decode( num_kv_heads: int, scale: float, alibi_slopes: Optional[torch.Tensor], - kv_scale: float, + k_scale: float, + v_scale: float, *args, ) -> torch.Tensor: output = torch.empty_like(query) diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py index a214f40d16514..ce7b4d129779c 100644 --- a/vllm/attention/ops/paged_attn.py +++ b/vllm/attention/ops/paged_attn.py @@ -66,7 +66,8 @@ def write_to_paged_cache( value_cache: torch.Tensor, slot_mapping: torch.Tensor, kv_cache_dtype: str, - kv_scale: float, + k_scale: float, + v_scale: float, ) -> None: ops.reshape_and_cache( key, @@ -75,7 +76,8 @@ def write_to_paged_cache( value_cache, slot_mapping.flatten(), kv_cache_dtype, - kv_scale, + k_scale, + v_scale, ) @staticmethod @@ -90,7 +92,8 @@ def forward_decode( num_kv_heads: int, scale: float, alibi_slopes: Optional[torch.Tensor], - kv_scale: float, + k_scale: float, + v_scale: float, tp_rank: int = 0, blocksparse_local_blocks: int = 0, blocksparse_vert_stride: int = 0, @@ -135,7 +138,8 @@ def forward_decode( max_seq_len, alibi_slopes, kv_cache_dtype, - kv_scale, + k_scale, + v_scale, tp_rank, blocksparse_local_blocks, blocksparse_vert_stride, @@ -172,7 +176,8 @@ def forward_decode( max_seq_len, alibi_slopes, kv_cache_dtype, - 
kv_scale, + k_scale, + v_scale, tp_rank, blocksparse_local_blocks, blocksparse_vert_stride, diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index bc07d2b831862..684e1abf7bcf7 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -196,6 +196,15 @@ def __init__(self, else: self.register_parameter("bias", None) + def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): + # If the weight on disk does not have a shape, give it one + # (such scales for AutoFp8). + if len(loaded_weight.shape) == 0: + loaded_weight = loaded_weight.reshape(1) + + assert param.size() == loaded_weight.size() + param.data.copy_(loaded_weight) + def forward(self, x: torch.Tensor) -> torch.Tensor: bias = self.bias if not self.skip_bias_add else None assert self.quant_method is not None diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 5c916c9b4d7e4..cfef914ed6cf7 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -407,31 +407,56 @@ def __init__(self, quant_config: Fp8Config): self.quant_config = quant_config def create_weights(self, layer: torch.nn.Module): - """Create "weight" (aka kv_scale) for an attention layer. + """Create "weight" (aka k_scale and v_scale) for an attention layer. Args: layer: The layer that is using the QuantizeMethodBase factory. """ - # Initialize the KV cache scale to 1.0 as the default value. - # If the kv_scale appears in the checkpoint, it will be + # Initialize the KV cache scales to -1.0, which is an invalid value. + # If the k/v_scale appears in the checkpoint, it will be # overwritten when loading weights. - layer.kv_scale = Parameter(torch.tensor(1.0), requires_grad=False) + layer.k_scale = Parameter(torch.tensor(-1.0), requires_grad=False) + layer.v_scale = Parameter(torch.tensor(-1.0), requires_grad=False) def apply(self, layer: torch.nn.Module) -> torch.Tensor: raise RuntimeError("Fp8KVCacheMethod.apply should not be called.") def process_weights_after_loading(self, layer: Module) -> None: - # If the kv-cache dtype is auto, we enforce the kv-scale to be 1.0 + # If the kv-cache dtype is auto, we enforce the k/v_scale to be 1.0 # regardless whether the kv-scale is available in the checkpoint. 
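# A pure-Python sketch of the checkpoint-scale resolution performed below,
# assuming the -1.0 sentinels initialized in create_weights above
# (illustrative only; the real code operates on torch Parameters):
def resolve_kv_scales(k_scale: float, v_scale: float) -> tuple:
    if k_scale > 0.0 and v_scale > 0.0:
        # Separate k_scale / v_scale were found in the checkpoint.
        return k_scale, v_scale
    if k_scale < 0.0 and v_scale < 0.0:
        # No scales were loaded at all: fall back to the 1.0 default.
        return 1.0, 1.0
    # A legacy single kv_scale is remapped to k_scale during weight
    # loading, so the one positive value is duplicated to v_scale.
    duplicated = max(k_scale, v_scale)
    return duplicated, duplicated

assert resolve_kv_scales(-1.0, -1.0) == (1.0, 1.0)
assert resolve_kv_scales(0.02, -1.0) == (0.02, 0.02)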
if layer.kv_cache_dtype != "auto": - kv_scale = layer.kv_scale.to("cpu").tolist() - if not isinstance(kv_scale, float): + if layer.k_scale > 0.0 and layer.v_scale > 0.0: + # We prefer to use separate k_scale and v_scale if present + k_scale = layer.k_scale.to("cpu").tolist() + v_scale = layer.v_scale.to("cpu").tolist() + elif layer.k_scale < 0.0 and layer.v_scale < 0.0: + # If no scales were loaded (both scales are invalid negative + # values), use the default value of 1.0 + k_scale = Parameter(torch.tensor(1.0), requires_grad=False) + v_scale = Parameter(torch.tensor(1.0), requires_grad=False) + else: + # If we find a single kv_scale in the checkpoint, we remap + # kv_scale to k_scale during weight loading, and duplicate + # k_scale to v_scale here + assert layer.k_scale > 0.0 + scale_to_duplicate = max(layer.k_scale, layer.v_scale) + k_scale = scale_to_duplicate.to("cpu").tolist() + v_scale = scale_to_duplicate.to("cpu").tolist() + + if not isinstance(k_scale, float) or not isinstance( + v_scale, float): raise ValueError("Only support per-tensor scaling factor " "for fp8 KV cache") - layer._kv_scale = kv_scale - if layer._kv_scale == 1.0 and "e5m2" not in layer.kv_cache_dtype: + + # These are used in the final Attention.forward() + layer._k_scale = k_scale + layer._v_scale = v_scale + if (layer._k_scale == 1.0 and layer._v_scale == 1.0 + and "e5m2" not in layer.kv_cache_dtype): print_warning_once( - "Using KV cache scaling factor 1.0 for fp8_e4m3. This may " - "cause accuracy issues. Please make sure kv-cache scaling " - "factor is available in the fp8 checkpoint.") - del layer.kv_scale + "Using KV cache scaling factor 1.0 for fp8_e4m3. This " + "may cause accuracy issues. Please make sure k/v_scale " + "scaling factors are available in the fp8 checkpoint.") + + del layer.k_scale + del layer.v_scale diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index c8568b3dc6690..cb83f43a2a4e2 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -22,6 +22,7 @@ from vllm.model_executor.layers.quantization import (QuantizationConfig, get_quantization_config) from vllm.model_executor.layers.quantization.schema import QuantParamSchema +from vllm.utils import print_warning_once logger = init_logger(__name__) @@ -431,11 +432,6 @@ def convert_pyslice_to_tensor(x: Any) -> torch.Tensor: def default_weight_loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None: """Default weight loader.""" - # If the weight on disk does not have a shape, give it one - # (such scales for AutoFp8). - if len(loaded_weight.shape) == 0: - loaded_weight = loaded_weight.reshape(1) - assert param.size() == loaded_weight.size() param.data.copy_(loaded_weight) @@ -462,3 +458,55 @@ def initialize_dummy_weights( param.data.copy_(tmp_param) else: param.uniform_(low, high) + + +def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]: + """Remap the name of FP8 k/v_scale parameters. + + This function handles the remapping of FP8 k/v_scale parameter names. + It detects if the given name ends with a suffix and attempts to remap + it to the expected name format in the model. If the remapped name is not + found in the params_dict, a warning is printed and None is returned. + + Args: + name (str): The original loaded checkpoint parameter name. + params_dict (dict): Dictionary containing the model's named parameters. 
+ + Returns: + str: The remapped parameter name if successful, or the original name + if no remapping is needed. + None: If the remapped name is not found in params_dict. + """ + if name.endswith(".kv_scale"): + print_warning_once( + "DEPRECATED. Found kv_scale in the checkpoint. " + "This format is deprecated in favor of separate k_scale and " + "v_scale tensors and will be removed in a future release. " + "Functionally, we will remap kv_scale to k_scale and duplicate " + "k_scale to v_scale") + # NOTE: we remap the deprecated kv_scale to k_scale + remapped_name = name.replace(".kv_scale", ".attn.k_scale") + if remapped_name not in params_dict: + print_warning_once( + f"Found kv_scale in the checkpoint (e.g. {name}), " + "but not found the expected name in the model " + f"(e.g. {remapped_name}). kv_scale is " + "not loaded.") + return None + return remapped_name + + possible_scale_names = [".k_scale", ".v_scale"] + for scale_name in possible_scale_names: + if name.endswith(scale_name): + remapped_name = name.replace(scale_name, f".attn{scale_name}") + if remapped_name not in params_dict: + print_warning_once( + f"Found {scale_name} in the checkpoint (e.g. {name}), " + "but not found the expected name in the model " + f"(e.g. {remapped_name}). {scale_name} is " + "not loaded.") + return None + return remapped_name + + # If there were no matches, return the untouched param name + return name diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index a777d1fbfa802..f03e34b9e7c92 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -44,10 +44,10 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, kv_cache_scales_loader) + default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors, SamplerOutput -from vllm.utils import is_hip, print_warning_once +from vllm.utils import is_hip from .interfaces import SupportsLoRA from .utils import is_pp_missing_parameter, make_layers @@ -460,18 +460,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if name.endswith(".bias") and name not in params_dict: continue # Remapping the name of FP8 kv-scale. - if name.endswith("kv_scale"): - remapped_kv_scale_name = name.replace( - ".kv_scale", ".attn.kv_scale") - if remapped_kv_scale_name not in params_dict: - print_warning_once( - f"Found kv scale in the checkpoint (e.g. {name}), " - "but not found the expected name in the model " - f"(e.g. {remapped_kv_scale_name}). 
kv-scale is " - "not loaded.") - continue - else: - name = remapped_kv_scale_name + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue if is_pp_missing_parameter(name, self): continue diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 0c456ada61230..e739df87cf96a 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -42,10 +42,10 @@ from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors, SamplerOutput -from vllm.utils import print_warning_once from .interfaces import SupportsLoRA @@ -415,19 +415,10 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if name.endswith(".bias") and name not in params_dict: continue # Remapping the name of FP8 kv-scale. - if name.endswith("kv_scale"): - remapped_kv_scale_name = name.replace( - ".kv_scale", ".attn.kv_scale") - if remapped_kv_scale_name not in params_dict: - print_warning_once( - "Found kv scale in the checkpoint " - f"(e.g. {name}), but not found the expected " - f"name in the model " - f"(e.g. {remapped_kv_scale_name}). " - "kv-scale is not loaded.") - continue - else: - name = remapped_kv_scale_name + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index e9ae2192f280d..e9aa4416eded4 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -43,10 +43,10 @@ from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors, SamplerOutput -from vllm.utils import print_warning_once from .interfaces import SupportsLoRA @@ -382,18 +382,10 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if name.endswith(".bias") and name not in params_dict: continue # Remapping the name of FP8 kv-scale. - if name.endswith("kv_scale"): - remapped_kv_scale_name = name.replace( - ".kv_scale", ".attn.kv_scale") - if remapped_kv_scale_name not in params_dict: - print_warning_once( - f"Found kv scale in the checkpoint (e.g. {name}), " - "but not found the expected name in the model " - f"(e.g. {remapped_kv_scale_name}). 
kv-scale is " - "not loaded.") - continue - else: - name = remapped_kv_scale_name + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) From 09c2eb85ddd3b2585979f4cd9cc97168d86718b6 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 16 Jul 2024 15:44:22 -0700 Subject: [PATCH 10/16] [ci][distributed] add pipeline parallel correctness test (#6410) --- .buildkite/test-pipeline.yaml | 9 +- tests/distributed/test_pipeline_parallel.py | 233 ++++++++++---------- vllm/executor/multiproc_gpu_executor.py | 15 ++ 3 files changed, 129 insertions(+), 128 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index cd3a5e80d7bd0..445d74d6d9bbe 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -72,7 +72,7 @@ steps: commands: - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py - - TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py + - pytest -v -s distributed/test_pipeline_parallel.py - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py @@ -115,12 +115,7 @@ steps: working_dir: "/vllm-workspace/tests" num_gpus: 4 commands: - - TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py - - TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py - - TP_SIZE=1 PP_SIZE=3 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py - - PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py - - PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py - + - pytest -v -s distributed/test_pipeline_parallel.py - label: Engine Test mirror_hardwares: [amd] diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 2d9f63795189d..5e824b0f5a65a 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -1,25 +1,18 @@ -import os - -import openai # use the official client for correctness check import pytest from ..utils import RemoteOpenAIServer -# downloading lora to test lora requests - -# any model with a chat template should work here -MODEL_NAME = "meta-llama/Meta-Llama-3-8B" -EAGER_MODE = bool(int(os.getenv("EAGER_MODE", 0))) -CHUNKED_PREFILL = bool(int(os.getenv("CHUNKED_PREFILL", 0))) -TP_SIZE = int(os.getenv("TP_SIZE", 1)) -PP_SIZE = int(os.getenv("PP_SIZE", 1)) - -pytestmark = pytest.mark.asyncio - -@pytest.fixture(scope="module") -def server(): - args = [ +@pytest.mark.parametrize( + "TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME", [ + (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B"), + (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B"), + (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B"), + (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B"), + (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B"), + ]) +def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME): + pp_args = [ "--model", MODEL_NAME, # use half 
precision for speed and memory savings in CI environment @@ -32,109 +25,107 @@ def server(): "--distributed-executor-backend", "ray", ] + + # compare without pipeline parallelism + # NOTE: use mp backend for TP + # PP tests might involve multiple nodes, and ray might + # schedule all workers in a node other than the head node, + # which can cause the test to fail. + tp_args = [ + "--model", + MODEL_NAME, + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--tensor-parallel-size", + str(max(TP_SIZE, 2)), # use at least TP_SIZE=2 to hold the model + "--distributed-executor-backend", + "mp", + ] if CHUNKED_PREFILL: - args += [ - "--enable-chunked-prefill", - ] + pp_args.append("--enable-chunked-prefill") + tp_args.append("--enable-chunked-prefill") if EAGER_MODE: - args += [ - "--enforce-eager", - ] - with RemoteOpenAIServer(args) as remote_server: - yield remote_server - - -@pytest.fixture(scope="module") -def client(server): - return server.get_async_client() - - -async def test_check_models(server, client: openai.AsyncOpenAI): - models = await client.models.list() - models = models.data - served_model = models[0] - assert served_model.id == MODEL_NAME - assert all(model.root == MODEL_NAME for model in models) - - -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME], -) -async def test_single_completion(server, client: openai.AsyncOpenAI, - model_name: str): - completion = await client.completions.create(model=model_name, - prompt="Hello, my name is", - max_tokens=5, - temperature=0.0) - - assert completion.id is not None - assert completion.choices is not None and len(completion.choices) == 1 - assert completion.choices[0].text is not None and len( - completion.choices[0].text) >= 5 - assert completion.choices[0].finish_reason == "length" - assert completion.usage == openai.types.CompletionUsage( - completion_tokens=5, prompt_tokens=6, total_tokens=11) - - # test using token IDs - completion = await client.completions.create( - model=MODEL_NAME, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - ) - assert completion.choices[0].text is not None and len( - completion.choices[0].text) >= 5 - - -@pytest.mark.parametrize( - # just test 1 lora hereafter - "model_name", - [MODEL_NAME], -) -async def test_batch_completions(server, client: openai.AsyncOpenAI, - model_name: str): - # test simple list - batch = await client.completions.create( - model=model_name, - prompt=["Hello, my name is", "Hello, my name is"], - max_tokens=5, - temperature=0.0, - ) - assert len(batch.choices) == 2 - assert batch.choices[0].text == batch.choices[1].text - - # test n = 2 - batch = await client.completions.create( - model=model_name, - prompt=["Hello, my name is", "Hello, my name is"], - n=2, - max_tokens=5, - temperature=0.0, - extra_body=dict( - # NOTE: this has to be true for n > 1 in vLLM, but not necessary - # for official client. 
- use_beam_search=True), - ) - assert len(batch.choices) == 4 - assert batch.choices[0].text != batch.choices[ - 1].text, "beam search should be different" - assert batch.choices[0].text == batch.choices[ - 2].text, "two copies of the same prompt should be the same" - assert batch.choices[1].text == batch.choices[ - 3].text, "two copies of the same prompt should be the same" - - # test streaming - batch = await client.completions.create( - model=model_name, - prompt=["Hello, my name is", "Hello, my name is"], - max_tokens=5, - temperature=0.0, - stream=True, - ) - texts = [""] * 2 - async for chunk in batch: - assert len(chunk.choices) == 1 - choice = chunk.choices[0] - texts[choice.index] += choice.text - assert texts[0] == texts[1] + pp_args.append("--enforce-eager") + tp_args.append("--enforce-eager") + + results = [] + for args in [pp_args, tp_args]: + with RemoteOpenAIServer(args) as server: + client = server.get_client() + + # test models list + models = client.models.list() + models = models.data + served_model = models[0] + results.append({ + "test": "models_list", + "id": served_model.id, + "root": served_model.root, + }) + + # test with text prompt + completion = client.completions.create(model=MODEL_NAME, + prompt="Hello, my name is", + max_tokens=5, + temperature=0.0) + + results.append({ + "test": "single_completion", + "text": completion.choices[0].text, + "finish_reason": completion.choices[0].finish_reason, + "usage": completion.usage, + }) + + # test using token IDs + completion = client.completions.create( + model=MODEL_NAME, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + ) + + results.append({ + "test": "token_ids", + "text": completion.choices[0].text, + "finish_reason": completion.choices[0].finish_reason, + "usage": completion.usage, + }) + + # test simple list + batch = client.completions.create( + model=MODEL_NAME, + prompt=["Hello, my name is", "Hello, my name is"], + max_tokens=5, + temperature=0.0, + ) + + results.append({ + "test": "simple_list", + "text0": batch.choices[0].text, + "text1": batch.choices[1].text, + }) + + # test streaming + batch = client.completions.create( + model=MODEL_NAME, + prompt=["Hello, my name is", "Hello, my name is"], + max_tokens=5, + temperature=0.0, + stream=True, + ) + texts = [""] * 2 + for chunk in batch: + assert len(chunk.choices) == 1 + choice = chunk.choices[0] + texts[choice.index] += choice.text + results.append({ + "test": "streaming", + "texts": texts, + }) + + n = len(results) // 2 + pp_results = results[:n] + tp_results = results[n:] + for pp, tp in zip(pp_results, tp_results): + assert pp == tp diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py index a0e248b2e1992..01ed9d1219e7f 100644 --- a/vllm/executor/multiproc_gpu_executor.py +++ b/vllm/executor/multiproc_gpu_executor.py @@ -1,5 +1,7 @@ import asyncio import os +import signal +import weakref from functools import partial from typing import Any, List, Optional @@ -78,6 +80,19 @@ def _init_executor(self) -> None: result_handler.start() self.worker_monitor.start() + # Set up signal handlers to shutdown the executor cleanly + # sometimes gc does not work well + + # Use weakref to avoid holding a reference to self + ref = weakref.ref(self) + + def shutdown(signum, frame): + if executor := ref(): + executor.shutdown() + + signal.signal(signal.SIGINT, shutdown) + signal.signal(signal.SIGTERM, shutdown) + self.driver_worker = self._create_worker( distributed_init_method=distributed_init_method) 
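# A tiny, runnable illustration (not part of this patch) of why the shutdown
# handler above closes over a weakref: a handler that captured the executor
# directly would hold a strong reference and keep it alive for the lifetime
# of the process, so garbage-collection based cleanup could never run.
import gc
import signal
import weakref


class _ToyExecutor:

    def shutdown(self):
        print("toy executor shut down")


_toy = _ToyExecutor()
_toy_ref = weakref.ref(_toy)


def _toy_handler(signum, frame):
    # Only the weak reference is captured; calling it may return None.
    if (toy := _toy_ref()) is not None:
        toy.shutdown()


signal.signal(signal.SIGTERM, _toy_handler)

del _toy
gc.collect()
assert _toy_ref() is None  # the handler did not keep the executor alive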
self._run_workers("init_device") From 7f62077af5159c625fe3ad1c812e6c1a2b93ba3b Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 16 Jul 2024 17:35:52 -0700 Subject: [PATCH 11/16] [misc][distributed] improve tests (#6488) --- tests/distributed/test_pipeline_parallel.py | 8 +++++--- vllm/distributed/device_communicators/shm_broadcast.py | 4 +++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 5e824b0f5a65a..7b7475a77167c 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -4,12 +4,14 @@ @pytest.mark.parametrize( - "TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME", [ + "TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME", + [ (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B"), (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B"), (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B"), - (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B"), - (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B"), + # TODO: figure out why PP=4 tests are flaky + # (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B"), + # (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B"), ]) def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME): pp_args = [ diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index db0064951cd1b..151b08c1b996c 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -170,7 +170,7 @@ def __init__( self.n_remote_reader = n_remote_reader if connect_ip is None: - connect_ip = get_ip() + connect_ip = get_ip() if n_remote_reader > 0 else "127.0.0.1" context = Context() @@ -230,6 +230,8 @@ def __init__( remote_sync_port=remote_sync_port, ) + logger.info("vLLM message queue communication handle: %s", self.handle) + def export_handle(self) -> Handle: return self.handle From ce37be7ba05f21f2a8f3b792034195305ebe1b27 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 16 Jul 2024 19:16:34 -0700 Subject: [PATCH 12/16] [misc][distributed] add seed to dummy weights (#6491) --- vllm/model_executor/model_loader/weight_utils.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index cb83f43a2a4e2..698c59d49fe06 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -440,6 +440,7 @@ def initialize_dummy_weights( model: torch.nn.Module, low: float = -1e-3, high: float = 1e-3, + seed: int = 1234, ) -> None: """Initialize model weights with random values. @@ -447,17 +448,25 @@ def initialize_dummy_weights( measurements. Additionally, the model weights should not cause NaNs in the forward pass. We empirically found that initializing the weights with values between -1e-3 and 1e-3 works well for most models. + + We use per-parameter random seed, so that dummy weights are consistent, + even if the model is partitioned across multiple devices. When the seed + is fixed, the random values generated by this function only depends on + the parameter's number of elements and its data type. 
""" for param in model.state_dict().values(): if torch.is_floating_point(param): + generator = torch.Generator(device=param.data.device) + generator.manual_seed(seed) if torch.finfo(param.data.dtype).bits < 16: # uniform_ doesn't support < 16-bit datatypes (FP8) dtype = param.data.dtype tmp_param = param.data.to(torch.float16) - tmp_param = tmp_param.uniform_(low, high).to(dtype) + tmp_param = tmp_param.uniform_(low, high, + generator=generator).to(dtype) param.data.copy_(tmp_param) else: - param.uniform_(low, high) + param.uniform_(low, high, generator=generator) def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]: From 1d094fd7c0d72e7eaf3a2b65fff9f7de8d812a49 Mon Sep 17 00:00:00 2001 From: Wushi Dong <33078715+wushidonguc@users.noreply.github.com> Date: Tue, 16 Jul 2024 19:20:26 -0700 Subject: [PATCH 13/16] [Distributed][PP] only create embedding & lm head when necessary (#6455) original title: [Distributed][Model] Rank-based Component Creation for Pipeline Parallelism Memory Optimization --- vllm/model_executor/models/llama.py | 65 +++++++++++++++++------------ 1 file changed, 38 insertions(+), 27 deletions(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index f03e34b9e7c92..4c434e54cf743 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -50,7 +50,7 @@ from vllm.utils import is_hip from .interfaces import SupportsLoRA -from .utils import is_pp_missing_parameter, make_layers +from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers class LlamaMLP(nn.Module): @@ -257,17 +257,24 @@ def __init__( (lora_config.max_loras or 1)) if lora_config else 0 self.vocab_size = config.vocab_size + lora_vocab self.org_vocab_size = config.vocab_size - self.embed_tokens = VocabParallelEmbedding( - self.vocab_size, - config.hidden_size, - org_num_embeddings=config.vocab_size, - ) + if get_pp_group().is_first_rank or (config.tie_word_embeddings + and get_pp_group().is_last_rank): + self.embed_tokens = VocabParallelEmbedding( + self.vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + ) + else: + self.embed_tokens = PPMissingLayer() self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda: LlamaDecoderLayer(config=config, cache_config=cache_config, quant_config=quant_config)) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + if get_pp_group().is_last_rank: + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + else: + self.norm = PPMissingLayer() def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.embed_tokens(input_ids) @@ -360,26 +367,30 @@ def __init__( cache_config, quant_config, lora_config=lora_config) - self.unpadded_vocab_size = config.vocab_size - if lora_config: - self.unpadded_vocab_size += lora_config.lora_extra_vocab_size - self.lm_head = ParallelLMHead( - self.unpadded_vocab_size, - config.hidden_size, - org_num_embeddings=config.vocab_size, - padding_size=DEFAULT_VOCAB_PADDING_SIZE - # We need bigger padding if using lora for kernel - # compatibility - if not lora_config else lora_config.lora_vocab_padding_size, - quant_config=quant_config, - ) - if config.tie_word_embeddings: - self.lm_head.weight = self.model.embed_tokens.weight - - logit_scale = getattr(config, "logit_scale", 1.0) - self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, - config.vocab_size, logit_scale) - self.sampler = Sampler() + if get_pp_group().is_last_rank: + 
self.unpadded_vocab_size = config.vocab_size + if lora_config: + self.unpadded_vocab_size += lora_config.lora_extra_vocab_size + self.lm_head = ParallelLMHead( + self.unpadded_vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + padding_size=DEFAULT_VOCAB_PADDING_SIZE + # We need bigger padding if using lora for kernel + # compatibility + if not lora_config else lora_config.lora_vocab_padding_size, + quant_config=quant_config, + ) + if config.tie_word_embeddings: + self.lm_head.weight = self.model.embed_tokens.weight + + logit_scale = getattr(config, "logit_scale", 1.0) + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + config.vocab_size, + logit_scale) + self.sampler = Sampler() + else: + self.lm_head = PPMissingLayer() def forward( self, From 10383887e03412196a2689b9398290719c4797bf Mon Sep 17 00:00:00 2001 From: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Date: Wed, 17 Jul 2024 01:47:02 -0400 Subject: [PATCH 14/16] [ROCm] Cleanup Dockerfile and remove outdated patch (#6482) --- Dockerfile.rocm | 27 +--------- .../getting_started/amd-installation.rst | 49 +++++-------------- rocm_patch/rocm_bf16.patch | 15 ------ 3 files changed, 14 insertions(+), 77 deletions(-) delete mode 100644 rocm_patch/rocm_bf16.patch diff --git a/Dockerfile.rocm b/Dockerfile.rocm index befb0499f2e68..85dfda8dbb532 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -1,11 +1,6 @@ # Default ROCm 6.1 base image ARG BASE_IMAGE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging" -# Tested and supported base rocm/pytorch images -ARG ROCm_5_7_BASE="rocm/pytorch:rocm5.7_ubuntu20.04_py3.9_pytorch_2.0.1" \ - ROCm_6_0_BASE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" \ - ROCM_6_1_BASE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging" - # Default ROCm ARCHes to build vLLM for. 
ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100" @@ -54,18 +49,6 @@ RUN pip install --upgrade pip RUN apt-get purge -y sccache; pip uninstall -y sccache; rm -f "$(which sccache)" # Install torch == 2.5.0 on ROCm RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ - *"rocm-5.7"*) \ - pip uninstall -y torch torchaudio torchvision \ - && pip install --no-cache-dir --pre \ - torch==2.5.0.dev20240710 torchaudio==2.4.0.dev20240710 \ - torchvision==0.20.0.dev20240710 \ - --index-url https://download.pytorch.org/whl/nightly/rocm5.7;; \ - *"rocm-6.0"*) \ - pip uninstall -y torch torchaudio torchvision \ - && pip install --no-cache-dir --pre \ - torch==2.5.0.dev20240710 torchaudio==2.4.0.dev20240710 \ - torchvision==0.20.0.dev20240710 \ - --index-url https://download.pytorch.org/whl/nightly/rocm6.0;; \ *"rocm-6.1"*) \ pip uninstall -y torch torchaudio torchvision \ && pip install --no-cache-dir --pre \ @@ -104,11 +87,6 @@ RUN --mount=type=cache,target=${CCACHE_DIR} \ && cd flash-attention \ && git checkout "${FA_BRANCH}" \ && git submodule update --init \ - && case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ - *"rocm-5.7"*) \ - export VLLM_TORCH_PATH="$(python3 -c 'import torch; print(torch.__path__[0])')" \ - && patch "${VLLM_TORCH_PATH}"/utils/hipify/hipify_python.py hipify_patch.patch;; \ - *) ;; esac \ && GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \ # Create an empty directory otherwise as later build stages expect one else mkdir -p /install; \ @@ -161,12 +139,9 @@ RUN --mount=type=cache,target=${CCACHE_DIR} \ --mount=type=cache,target=/root/.cache/pip \ pip install -U -r requirements-rocm.txt \ && case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ - *"rocm-6.0"*) \ - patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h rocm_patch/rocm_bf16.patch;; \ *"rocm-6.1"*) \ # Bring in upgrades to HIP graph earlier than ROCm 6.2 for vLLM - wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P rocm_patch \ - && cp rocm_patch/libamdhip64.so.6 /opt/rocm/lib/libamdhip64.so.6 \ + wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib \ # Prevent interference if torch bundles its own HIP runtime && rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* || true;; \ *) ;; esac \ diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst index cc41d47296f8d..1f9e4fabc4fc9 100644 --- a/docs/source/getting_started/amd-installation.rst +++ b/docs/source/getting_started/amd-installation.rst @@ -3,7 +3,7 @@ Installation with ROCm ====================== -vLLM supports AMD GPUs with ROCm 5.7 and 6.0. +vLLM supports AMD GPUs with ROCm 6.1. Requirements ------------ @@ -11,7 +11,7 @@ Requirements * OS: Linux * Python: 3.8 -- 3.11 * GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100) -* ROCm 6.0 and ROCm 5.7 +* ROCm 6.1 Installation options: @@ -27,10 +27,10 @@ You can build and install vLLM from source. First, build a docker image from `Dockerfile.rocm `_ and launch a docker container from the image. -`Dockerfile.rocm `_ uses ROCm 6.0 by default, but also supports ROCm 5.7. +`Dockerfile.rocm `_ uses ROCm 6.1 by default, but also supports ROCm 5.7 and 6.0 in older vLLM branches. It provides flexibility to customize the build of docker image using the following arguments: -* `BASE_IMAGE`: specifies the base image used when running ``docker build``, specifically the PyTorch on ROCm base image. 
We have tested ROCm 5.7 and ROCm 6.0. The default is `rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1`
+* `BASE_IMAGE`: specifies the base image used when running ``docker build``, specifically the PyTorch on ROCm base image.
 * `BUILD_FA`: specifies whether to build CK flash-attention. The default is 1. For `Radeon RX 7900 series (gfx1100) `_, this should be set to 0 before flash-attention supports this target.
 * `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build CK flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942`
 * `FA_BRANCH`: specifies the branch used to build the CK flash-attention in `ROCm's flash-attention repo `_. The default is `ae7928c`
@@ -39,24 +39,17 @@ It provides flexibility to customize the build of docker image using the followi
 Their values can be passed in when running ``docker build`` with ``--build-arg`` options.

-To build vllm on ROCm 6.0 for MI200 and MI300 series, you can use the default:
+To build vllm on ROCm 6.1 for MI200 and MI300 series, you can use the default:

 .. code-block:: console

-    $ docker build -f Dockerfile.rocm -t vllm-rocm .
+    $ DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm .

-To build vllm on ROCm 6.0 for Radeon RX7900 series (gfx1100), you should specify ``BUILD_FA`` as below:
+To build vllm on ROCm 6.1 for Radeon RX7900 series (gfx1100), you should specify ``BUILD_FA`` as below:

 .. code-block:: console

-    $ docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm .
-
-To build docker image for vllm on ROCm 5.7, you can specify ``BASE_IMAGE`` as below:
-
-.. code-block:: console
-
-    $ docker build --build-arg BASE_IMAGE="rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" \
-        -f Dockerfile.rocm -t vllm-rocm .
+    $ DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm .

 To run the above docker image ``vllm-rocm``, use the below command:

@@ -85,25 +78,12 @@ Option 2: Build from source
 0. Install prerequisites (skip if you are already in an environment/docker with the following installed):

 - `ROCm `_
-- `Pytorch `_
+- `PyTorch `_
 - `hipBLAS `_

-For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging`, `rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1`, `rocm/pytorch-nightly`.
-
-Alternatively, you can install pytorch using pytorch wheels. You can check Pytorch installation guild in Pytorch `Getting Started `_
-
-For rocm6.0:
-
-.. code-block:: console
-
-    $ pip3 install torch --index-url https://download.pytorch.org/whl/rocm6.0
-
-
-For rocm5.7:
-
-.. code-block:: console
+For installing PyTorch, you can start from a fresh docker image, e.g., `rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging`, `rocm/pytorch-nightly`.

-    $ pip install torch --index-url https://download.pytorch.org/whl/rocm5.7
+Alternatively, you can install PyTorch using PyTorch wheels. You can check the PyTorch installation guide in PyTorch `Getting Started `_

 1. Install `Triton flash attention for ROCm `_

@@ -115,8 +95,6 @@ Install ROCm's Triton flash attention (the default triton-mlir branch) following

 Install ROCm's flash attention (v2.0.4) following the instructions from `ROCm/flash-attention `_

 .. note::
- - If you fail to install `ROCm/flash-attention`, try cloning from the commit `6fd2f8e572805681cd67ef8596c7e2ce521ed3c6`. - ROCm's Flash-attention-2 (v2.0.4) does not support sliding windows attention. - You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) @@ -131,7 +109,6 @@ Install ROCm's flash attention (v2.0.4) following the instructions from `ROCm/fl .. tip:: - - You may need to turn on the ``--enforce-eager`` flag if you experience process hang when running the `benchmark_thoughput.py` script to test your installation. - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers. - - To use CK flash-attention, please use this flag ``export VLLM_USE_TRITON_FLASH_ATTN=0`` to turn off triton flash attention. - - The ROCm version of pytorch, ideally, should match the ROCm driver version. + - To use CK flash-attention or PyTorch naive attention, please use this flag ``export VLLM_USE_TRITON_FLASH_ATTN=0`` to turn off triton flash attention. + - The ROCm version of PyTorch, ideally, should match the ROCm driver version. diff --git a/rocm_patch/rocm_bf16.patch b/rocm_patch/rocm_bf16.patch deleted file mode 100644 index a0f07da2a3e2b..0000000000000 --- a/rocm_patch/rocm_bf16.patch +++ /dev/null @@ -1,15 +0,0 @@ ---- amd_hip_bf16.h 2024-02-06 18:28:58.268699142 +0000 -+++ amd_hip_bf16.h.new 2024-02-06 18:28:31.988647133 +0000 -@@ -90,10 +90,10 @@ - #include "math_fwd.h" // ocml device functions - - #if defined(__HIPCC_RTC__) --#define __HOST_DEVICE__ __device__ -+#define __HOST_DEVICE__ __device__ static - #else - #include --#define __HOST_DEVICE__ __host__ __device__ -+#define __HOST_DEVICE__ __host__ __device__ static inline - #endif - - // Since we are using unsigned short to represent data in bfloat16, it can be of different sizes on From a19e8d372651abad75dc6a3939c18f23a1ae8d40 Mon Sep 17 00:00:00 2001 From: shangmingc Date: Wed, 17 Jul 2024 15:17:07 +0800 Subject: [PATCH 15/16] [Misc][Speculative decoding] Typos and typing fixes (#6467) Co-authored-by: caishangming.csm --- vllm/spec_decode/multi_step_worker.py | 2 +- vllm/spec_decode/ngram_worker.py | 4 ++-- vllm/spec_decode/proposer_worker_base.py | 2 +- vllm/spec_decode/spec_decode_worker.py | 2 +- vllm/spec_decode/top1_proposer.py | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 09a77f9e870fb..11e99882e3f0b 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -43,7 +43,7 @@ def init_device(self) -> None: ) def set_include_gpu_probs_tensor(self) -> None: - # Need include_gpu_probs_tensor for multi_step_worker + # Need include_gpu_probs_tensor for MultiStepWorker self.model_runner.model.sampler.include_gpu_probs_tensor = True @torch.inference_mode() diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py index 07991df52e655..a21222fec269b 100644 --- a/vllm/spec_decode/ngram_worker.py +++ b/vllm/spec_decode/ngram_worker.py @@ -13,7 +13,7 @@ class NGramWorker(NonLLMProposerWorkerBase, LoraNotSupportedWorkerBase): """NGramWorker provides a light drafter without need for model. 
- Current NGramWorker only implement prompt lookup decoding, + Current NGramWorker only implements prompt lookup decoding, and in future we may also do RAG type drafter and other scenarios which don't rely on LLM model to give proposals. """ @@ -37,7 +37,7 @@ def init_device(self): self.device = torch.device(f"cuda:{self.local_rank}") self.load_model = lambda *args, **kwargs: None - # Current only support Top1Proposer + # Current NGramWorker only supports Top1Proposer self._proposer = Top1Proposer( weakref.proxy(self), # type: ignore[arg-type] device=self.device, diff --git a/vllm/spec_decode/proposer_worker_base.py b/vllm/spec_decode/proposer_worker_base.py index fffa557121e17..51cefc0cbca8b 100644 --- a/vllm/spec_decode/proposer_worker_base.py +++ b/vllm/spec_decode/proposer_worker_base.py @@ -24,7 +24,7 @@ def sampler_output( ) -> Tuple[Optional[List[SamplerOutput]], bool]: raise NotImplementedError - def set_include_gpu_probs_tensor(self): + def set_include_gpu_probs_tensor(self) -> None: """Implementation optional""" pass diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 3c8e3dee46831..903264aad7a15 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -206,7 +206,7 @@ def __init__( self.probs_dtype = self.spec_decode_sampler.probs_dtype self.token_id_dtype = self.spec_decode_sampler.token_id_dtype - # Lazy initiazliation. + # Lazy initialization. self.scorer: SpeculativeScorer # Hidden states from target model to pass to proposer diff --git a/vllm/spec_decode/top1_proposer.py b/vllm/spec_decode/top1_proposer.py index 7b34b5d34208b..ade293c2c0757 100644 --- a/vllm/spec_decode/top1_proposer.py +++ b/vllm/spec_decode/top1_proposer.py @@ -138,7 +138,7 @@ def _split_by_proposal_len( # Currently only proposal lens of 0 or the global batch proposal len # are supported. - # If max_proposal_len is defined, then we shall no exccess this + # If max_proposal_len is defined, then we shall no exceed this # quota for nonzero_proposal new_k = 0 if (self.max_proposal_len is None @@ -219,7 +219,7 @@ def _merge_outputs( proposal_lens: List[int], nonzero_proposal_len_indices: List[int], sampler_transposed: bool, - ) -> Tuple[torch.Tensor, torch.tensor, torch.Tensor]: + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """After speculations are produced, merge the speculation results with the skipped sequences. 
""" From 5bf35a91e465278b51b0c98fa1b13beb8b04d431 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 17 Jul 2024 15:43:21 +0800 Subject: [PATCH 16/16] [Doc][CI/Build] Update docs and tests to use `vllm serve` (#6431) --- docs/source/getting_started/quickstart.rst | 7 +-- docs/source/models/adding_model.rst | 4 +- docs/source/models/engine_args.rst | 4 +- docs/source/models/lora.rst | 3 +- docs/source/models/vlm.rst | 4 +- docs/source/serving/deploying_with_dstack.rst | 2 +- docs/source/serving/distributed_serving.rst | 6 +- .../serving/openai_compatible_server.md | 8 +-- examples/api_client.py | 5 +- examples/logging_configuration.md | 12 +--- examples/openai_vision_api_client.py | 4 +- examples/production_monitoring/Otel.md | 6 +- examples/production_monitoring/README.md | 3 +- tests/async_engine/test_openapi_server_ray.py | 22 +++---- tests/distributed/test_pipeline_parallel.py | 6 +- tests/entrypoints/openai/test_chat.py | 42 ++++++------- tests/entrypoints/openai/test_completion.py | 60 +++++++++---------- tests/entrypoints/openai/test_embedding.py | 22 +++---- tests/entrypoints/openai/test_models.py | 42 ++++++------- tests/entrypoints/openai/test_tokenization.py | 24 ++++---- tests/entrypoints/openai/test_vision.py | 22 +++---- tests/tensorizer_loader/test_tensorizer.py | 4 +- tests/utils.py | 18 +++--- 23 files changed, 155 insertions(+), 175 deletions(-) diff --git a/docs/source/getting_started/quickstart.rst b/docs/source/getting_started/quickstart.rst index 7c44a96865a50..89bdc247c5e8e 100644 --- a/docs/source/getting_started/quickstart.rst +++ b/docs/source/getting_started/quickstart.rst @@ -73,16 +73,13 @@ Start the server: .. code-block:: console - $ python -m vllm.entrypoints.openai.api_server \ - $ --model facebook/opt-125m + $ vllm serve facebook/opt-125m By default, the server uses a predefined chat template stored in the tokenizer. You can override this template by using the ``--chat-template`` argument: .. code-block:: console - $ python -m vllm.entrypoints.openai.api_server \ - $ --model facebook/opt-125m \ - $ --chat-template ./examples/template_chatml.jinja + $ vllm serve facebook/opt-125m --chat-template ./examples/template_chatml.jinja This server can be queried in the same format as OpenAI API. For example, list the models: diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst index 53c19e5829218..5cffb58cafd96 100644 --- a/docs/source/models/adding_model.rst +++ b/docs/source/models/adding_model.rst @@ -114,7 +114,7 @@ Just add the following lines in your code: from your_code import YourModelForCausalLM ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) -If you are running api server with `python -m vllm.entrypoints.openai.api_server args`, you can wrap the entrypoint with the following code: +If you are running api server with :code:`vllm serve `, you can wrap the entrypoint with the following code: .. code-block:: python @@ -124,4 +124,4 @@ If you are running api server with `python -m vllm.entrypoints.openai.api_server import runpy runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__') -Save the above code in a file and run it with `python your_file.py args`. +Save the above code in a file and run it with :code:`python your_file.py `. 
diff --git a/docs/source/models/engine_args.rst b/docs/source/models/engine_args.rst index bdf566d3ebbd1..e7ce8cdcabe88 100644 --- a/docs/source/models/engine_args.rst +++ b/docs/source/models/engine_args.rst @@ -8,7 +8,7 @@ Below, you can find an explanation of every engine argument for vLLM: .. argparse:: :module: vllm.engine.arg_utils :func: _engine_args_parser - :prog: -m vllm.entrypoints.openai.api_server + :prog: vllm serve :nodefaultconst: Async Engine Arguments @@ -19,5 +19,5 @@ Below are the additional arguments related to the asynchronous engine: .. argparse:: :module: vllm.engine.arg_utils :func: _async_engine_args_parser - :prog: -m vllm.entrypoints.openai.api_server + :prog: vllm serve :nodefaultconst: \ No newline at end of file diff --git a/docs/source/models/lora.rst b/docs/source/models/lora.rst index 5cc3076073fbd..f08773fe59d92 100644 --- a/docs/source/models/lora.rst +++ b/docs/source/models/lora.rst @@ -61,8 +61,7 @@ LoRA adapted models can also be served with the Open-AI compatible vLLM server. .. code-block:: bash - python -m vllm.entrypoints.openai.api_server \ - --model meta-llama/Llama-2-7b-hf \ + vllm serve meta-llama/Llama-2-7b-hf \ --enable-lora \ --lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/ diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index d488b0fefdf06..92aca168dadf2 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -94,9 +94,7 @@ Below is an example on how to launch the same ``llava-hf/llava-1.5-7b-hf`` with .. code-block:: bash - python -m vllm.entrypoints.openai.api_server \ - --model llava-hf/llava-1.5-7b-hf \ - --chat-template template_llava.jinja + vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja .. important:: We have removed all vision language related CLI args in the ``0.5.1`` release. **This is a breaking change**, so please update your code to follow diff --git a/docs/source/serving/deploying_with_dstack.rst b/docs/source/serving/deploying_with_dstack.rst index baf87314ca8e4..e1eb45b225d9c 100644 --- a/docs/source/serving/deploying_with_dstack.rst +++ b/docs/source/serving/deploying_with_dstack.rst @@ -40,7 +40,7 @@ Next, to provision a VM instance with LLM of your choice(`NousResearch/Llama-2-7 gpu: 24GB commands: - pip install vllm - - python -m vllm.entrypoints.openai.api_server --model $MODEL --port 8000 + - vllm serve $MODEL --port 8000 model: format: openai type: chat diff --git a/docs/source/serving/distributed_serving.rst b/docs/source/serving/distributed_serving.rst index 2dfb83f168b5d..fa1b04dc3dce5 100644 --- a/docs/source/serving/distributed_serving.rst +++ b/docs/source/serving/distributed_serving.rst @@ -35,16 +35,14 @@ To run multi-GPU serving, pass in the :code:`--tensor-parallel-size` argument wh .. code-block:: console - $ python -m vllm.entrypoints.openai.api_server \ - $ --model facebook/opt-13b \ + $ vllm serve facebook/opt-13b \ $ --tensor-parallel-size 4 You can also additionally specify :code:`--pipeline-parallel-size` to enable pipeline parallelism. For example, to run API server on 8 GPUs with pipeline parallelism and tensor parallelism: .. 
code-block:: console - $ python -m vllm.entrypoints.openai.api_server \ - $ --model gpt2 \ + $ vllm serve gpt2 \ $ --tensor-parallel-size 4 \ $ --pipeline-parallel-size 2 \ $ --distributed-executor-backend ray diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 092c3c6cb9a3d..a06c30d9c48c6 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -4,7 +4,7 @@ vLLM provides an HTTP server that implements OpenAI's [Completions](https://plat You can start the server using Python, or using [Docker](deploying_with_docker.rst): ```bash -python -m vllm.entrypoints.openai.api_server --model NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123 +vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123 ``` To call the server, you can use the official OpenAI Python client library, or any other HTTP client. @@ -97,9 +97,7 @@ template, or the template in string form. Without a chat template, the server wi and all chat requests will error. ```bash -python -m vllm.entrypoints.openai.api_server \ - --model ... \ - --chat-template ./path-to-chat-template.jinja +vllm serve --chat-template ./path-to-chat-template.jinja ``` vLLM community provides a set of chat templates for popular models. You can find them in the examples @@ -110,7 +108,7 @@ directory [here](https://github.com/vllm-project/vllm/tree/main/examples/) ```{argparse} :module: vllm.entrypoints.openai.cli_args :func: create_parser_for_docs -:prog: -m vllm.entrypoints.openai.api_server +:prog: vllm serve ``` ## Tool calling in the chat completion API diff --git a/examples/api_client.py b/examples/api_client.py index 5f7daa14d5044..27a2a08b7b0c3 100644 --- a/examples/api_client.py +++ b/examples/api_client.py @@ -1,8 +1,7 @@ -"""Example Python client for vllm.entrypoints.api_server +"""Example Python client for `vllm.entrypoints.api_server` NOTE: The API server is used only for demonstration and simple performance benchmarks. It is not intended for production use. -For production use, we recommend vllm.entrypoints.openai.api_server -and the OpenAI client API +For production use, we recommend `vllm serve` and the OpenAI client API. """ import argparse diff --git a/examples/logging_configuration.md b/examples/logging_configuration.md index 75b4b31a80462..0d278b0392403 100644 --- a/examples/logging_configuration.md +++ b/examples/logging_configuration.md @@ -95,9 +95,7 @@ to the path of the custom logging configuration JSON file: ```bash VLLM_LOGGING_CONFIG_PATH=/path/to/logging_config.json \ - python3 -m vllm.entrypoints.openai.api_server \ - --max-model-len 2048 \ - --model mistralai/Mistral-7B-v0.1 + vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048 ``` @@ -152,9 +150,7 @@ to the path of the custom logging configuration JSON file: ```bash VLLM_LOGGING_CONFIG_PATH=/path/to/logging_config.json \ - python3 -m vllm.entrypoints.openai.api_server \ - --max-model-len 2048 \ - --model mistralai/Mistral-7B-v0.1 + vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048 ``` @@ -167,9 +163,7 @@ loggers. 
```bash VLLM_CONFIGURE_LOGGING=0 \ - python3 -m vllm.entrypoints.openai.api_server \ - --max-model-len 2048 \ - --model mistralai/Mistral-7B-v0.1 + vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048 ``` diff --git a/examples/openai_vision_api_client.py b/examples/openai_vision_api_client.py index d4d9738a1f7bc..2082c378e267c 100644 --- a/examples/openai_vision_api_client.py +++ b/examples/openai_vision_api_client.py @@ -1,9 +1,7 @@ """An example showing how to use vLLM to serve VLMs. Launch the vLLM server with the following command: -python -m vllm.entrypoints.openai.api_server \ - --model llava-hf/llava-1.5-7b-hf \ - --chat-template template_llava.jinja +vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja """ import base64 diff --git a/examples/production_monitoring/Otel.md b/examples/production_monitoring/Otel.md index 1449442273c7a..2c7a7caa1bd7c 100644 --- a/examples/production_monitoring/Otel.md +++ b/examples/production_monitoring/Otel.md @@ -36,7 +36,7 @@ ``` export OTEL_SERVICE_NAME="vllm-server" export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true - python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m" --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT" + vllm serve facebook/opt-125m --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT" ``` 1. In a new shell, send requests with trace context from a dummy client @@ -62,7 +62,7 @@ By default, `grpc` is used. To set `http/protobuf` as the protocol, configure th ``` export OTEL_EXPORTER_OTLP_TRACES_PROTOCOL=http/protobuf export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://$JAEGER_IP:4318/v1/traces -python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m" --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT" +vllm serve facebook/opt-125m --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT" ``` ## Instrumentation of FastAPI @@ -74,7 +74,7 @@ OpenTelemetry allows automatic instrumentation of FastAPI. 1. Run vLLM with `opentelemetry-instrument` ``` - opentelemetry-instrument python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m" + opentelemetry-instrument vllm serve facebook/opt-125m ``` 1. Send a request to vLLM and find its trace in Jaeger. It should contain spans from FastAPI. diff --git a/examples/production_monitoring/README.md b/examples/production_monitoring/README.md index 268f2e771018f..807c0470e7b30 100644 --- a/examples/production_monitoring/README.md +++ b/examples/production_monitoring/README.md @@ -10,8 +10,7 @@ Install: Prometheus metric logging is enabled by default in the OpenAI-compatible server. 
Launch via the entrypoint: ```bash -python3 -m vllm.entrypoints.openai.api_server \ - --model mistralai/Mistral-7B-v0.1 \ +vllm serve mistralai/Mistral-7B-v0.1 \ --max-model-len 2048 \ --disable-log-requests ``` diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py index 575f8f19b8ebe..5ecd770ede836 100644 --- a/tests/async_engine/test_openapi_server_ray.py +++ b/tests/async_engine/test_openapi_server_ray.py @@ -9,17 +9,17 @@ @pytest.fixture(scope="module") def server(): - with RemoteOpenAIServer([ - "--model", - MODEL_NAME, - # use half precision for speed and memory savings in CI environment - "--dtype", - "float16", - "--max-model-len", - "2048", - "--enforce-eager", - "--engine-use-ray" - ]) as remote_server: + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "float16", + "--max-model-len", + "2048", + "--enforce-eager", + "--engine-use-ray" + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 7b7475a77167c..52074a93329ea 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -15,8 +15,6 @@ ]) def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME): pp_args = [ - "--model", - MODEL_NAME, # use half precision for speed and memory savings in CI environment "--dtype", "bfloat16", @@ -34,8 +32,6 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME): # schedule all workers in a node other than the head node, # which can cause the test to fail. tp_args = [ - "--model", - MODEL_NAME, # use half precision for speed and memory savings in CI environment "--dtype", "bfloat16", @@ -53,7 +49,7 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME): results = [] for args in [pp_args, tp_args]: - with RemoteOpenAIServer(args) as server: + with RemoteOpenAIServer(MODEL_NAME, args) as server: client = server.get_client() # test models list diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index d370c63c0c7ba..8f67dd54edff0 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -27,27 +27,27 @@ def zephyr_lora_files(): @pytest.fixture(scope="module") def server(zephyr_lora_files): - with RemoteOpenAIServer([ - "--model", - MODEL_NAME, - # use half precision for speed and memory savings in CI environment - "--dtype", - "bfloat16", - "--max-model-len", - "8192", - "--enforce-eager", - # lora config below - "--enable-lora", - "--lora-modules", - f"zephyr-lora={zephyr_lora_files}", - f"zephyr-lora2={zephyr_lora_files}", - "--max-lora-rank", - "64", - "--max-cpu-loras", - "2", - "--max-num-seqs", - "128", - ]) as remote_server: + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--enforce-eager", + # lora config below + "--enable-lora", + "--lora-modules", + f"zephyr-lora={zephyr_lora_files}", + f"zephyr-lora2={zephyr_lora_files}", + "--max-lora-rank", + "64", + "--max-cpu-loras", + "2", + "--max-num-seqs", + "128", + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py index 35af0b02747e9..59151b9c4e99e 100644 --- 
a/tests/entrypoints/openai/test_completion.py +++ b/tests/entrypoints/openai/test_completion.py @@ -37,36 +37,36 @@ def zephyr_pa_files(): @pytest.fixture(scope="module") def server(zephyr_lora_files, zephyr_pa_files): - with RemoteOpenAIServer([ - "--model", - MODEL_NAME, - # use half precision for speed and memory savings in CI environment - "--dtype", - "bfloat16", - "--max-model-len", - "8192", - "--max-num-seqs", - "128", - "--enforce-eager", - # lora config - "--enable-lora", - "--lora-modules", - f"zephyr-lora={zephyr_lora_files}", - f"zephyr-lora2={zephyr_lora_files}", - "--max-lora-rank", - "64", - "--max-cpu-loras", - "2", - # pa config - "--enable-prompt-adapter", - "--prompt-adapters", - f"zephyr-pa={zephyr_pa_files}", - f"zephyr-pa2={zephyr_pa_files}", - "--max-prompt-adapters", - "2", - "--max-prompt-adapter-token", - "128", - ]) as remote_server: + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--max-num-seqs", + "128", + "--enforce-eager", + # lora config + "--enable-lora", + "--lora-modules", + f"zephyr-lora={zephyr_lora_files}", + f"zephyr-lora2={zephyr_lora_files}", + "--max-lora-rank", + "64", + "--max-cpu-loras", + "2", + # pa config + "--enable-prompt-adapter", + "--prompt-adapters", + f"zephyr-pa={zephyr_pa_files}", + f"zephyr-pa2={zephyr_pa_files}", + "--max-prompt-adapters", + "2", + "--max-prompt-adapter-token", + "128", + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index 4a32aadc8c3ae..2ca0c0d63c25c 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -11,17 +11,17 @@ @pytest.fixture(scope="module") def embedding_server(): - with RemoteOpenAIServer([ - "--model", - EMBEDDING_MODEL_NAME, - # use half precision for speed and memory savings in CI environment - "--dtype", - "bfloat16", - "--enforce-eager", - "--max-model-len", - "8192", - "--enforce-eager", - ]) as remote_server: + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--enforce-eager", + "--max-model-len", + "8192", + "--enforce-eager", + ] + + with RemoteOpenAIServer(EMBEDDING_MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/entrypoints/openai/test_models.py b/tests/entrypoints/openai/test_models.py index bf63f9a813f2c..c2cfff228c546 100644 --- a/tests/entrypoints/openai/test_models.py +++ b/tests/entrypoints/openai/test_models.py @@ -19,27 +19,27 @@ def zephyr_lora_files(): @pytest.fixture(scope="module") def server(zephyr_lora_files): - with RemoteOpenAIServer([ - "--model", - MODEL_NAME, - # use half precision for speed and memory savings in CI environment - "--dtype", - "bfloat16", - "--max-model-len", - "8192", - "--enforce-eager", - # lora config below - "--enable-lora", - "--lora-modules", - f"zephyr-lora={zephyr_lora_files}", - f"zephyr-lora2={zephyr_lora_files}", - "--max-lora-rank", - "64", - "--max-cpu-loras", - "2", - "--max-num-seqs", - "128", - ]) as remote_server: + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--enforce-eager", + # lora config below + "--enable-lora", + "--lora-modules", + f"zephyr-lora={zephyr_lora_files}", + f"zephyr-lora2={zephyr_lora_files}", + "--max-lora-rank", + "64", + "--max-cpu-loras", + "2", + 
"--max-num-seqs", + "128", + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/entrypoints/openai/test_tokenization.py b/tests/entrypoints/openai/test_tokenization.py index d33fd222ee150..f32abba225d40 100644 --- a/tests/entrypoints/openai/test_tokenization.py +++ b/tests/entrypoints/openai/test_tokenization.py @@ -12,18 +12,18 @@ @pytest.fixture(scope="module") def server(): - with RemoteOpenAIServer([ - "--model", - MODEL_NAME, - # use half precision for speed and memory savings in CI environment - "--dtype", - "bfloat16", - "--max-model-len", - "8192", - "--enforce-eager", - "--max-num-seqs", - "128", - ]) as remote_server: + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--enforce-eager", + "--max-num-seqs", + "128", + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 563b68566bd2c..cc5c8d619183f 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -23,17 +23,17 @@ @pytest.fixture(scope="module") def server(): - with RemoteOpenAIServer([ - "--model", - MODEL_NAME, - "--dtype", - "bfloat16", - "--max-model-len", - "4096", - "--enforce-eager", - "--chat-template", - str(LLAVA_CHAT_TEMPLATE), - ]) as remote_server: + args = [ + "--dtype", + "bfloat16", + "--max-model-len", + "4096", + "--enforce-eager", + "--chat-template", + str(LLAVA_CHAT_TEMPLATE), + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index a43f9132585b5..b7030e3cd6d42 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -214,12 +214,12 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path): ## Start OpenAI API server openai_args = [ - "--model", model_ref, "--dtype", "float16", "--load-format", + "--dtype", "float16", "--load-format", "tensorizer", "--model-loader-extra-config", json.dumps(model_loader_extra_config), ] - with RemoteOpenAIServer(openai_args) as server: + with RemoteOpenAIServer(model_ref, openai_args) as server: print("Server ready.") client = server.get_client() diff --git a/tests/utils.py b/tests/utils.py index 8780d45a31b29..80e0895c551b2 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -49,7 +49,13 @@ class RemoteOpenAIServer: DUMMY_API_KEY = "token-abc123" # vLLM's OpenAI server does not need API key MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 60 seconds - def __init__(self, cli_args: List[str], *, auto_port: bool = True) -> None: + def __init__( + self, + model: str, + cli_args: List[str], + *, + auto_port: bool = True, + ) -> None: if auto_port: if "-p" in cli_args or "--port" in cli_args: raise ValueError("You have manually specified the port" @@ -68,12 +74,10 @@ def __init__(self, cli_args: List[str], *, auto_port: bool = True) -> None: # the current process might initialize cuda, # to be safe, we should use spawn method env['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn' - self.proc = subprocess.Popen( - [sys.executable, "-m", "vllm.entrypoints.openai.api_server"] + - cli_args, - env=env, - stdout=sys.stdout, - stderr=sys.stderr) + self.proc = subprocess.Popen(["vllm", "serve"] + [model] + cli_args, + env=env, + stdout=sys.stdout, + stderr=sys.stderr) 
self._wait_for_server(url=self.url_for("health"), timeout=self.MAX_SERVER_START_WAIT_S)
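
For illustration of the call pattern these last hunks converge on: tests now pass the model name as the first positional argument to `RemoteOpenAIServer` and keep only the extra CLI flags in the args list, and the helper launches `vllm serve <model> <flags>` in a subprocess and waits on the `/health` endpoint before yielding. Below is a minimal sketch of that pattern, assuming it is run from the vLLM repo root so `tests.utils` is importable; the model name and flag values are illustrative placeholders, not required values.

```python
# Minimal sketch of the updated RemoteOpenAIServer call pattern (illustrative only).
# Assumes the vLLM repo root is on sys.path so `tests.utils` can be imported;
# the model and CLI flags below are placeholders.
import openai  # official client, as in the tests above
import pytest

from tests.utils import RemoteOpenAIServer

MODEL_NAME = "facebook/opt-125m"  # any small model is enough for a smoke test


@pytest.fixture(scope="module")
def server():
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype", "float16",
        "--max-model-len", "2048",
        "--enforce-eager",
    ]
    # The helper runs `vllm serve facebook/opt-125m <args>` in a subprocess
    # and waits for the /health endpoint before yielding.
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


@pytest.fixture(scope="module")
def client(server):
    return server.get_async_client()


@pytest.mark.asyncio
async def test_single_completion(client: openai.AsyncOpenAI):
    completion = await client.completions.create(model=MODEL_NAME,
                                                 prompt="Hello, my name is",
                                                 max_tokens=5,
                                                 temperature=0.0)
    assert len(completion.choices) == 1
    assert completion.choices[0].text
```

Run with `pytest` from the repo root; the same fixture shape applies to the chat, embedding, tokenization, and vision tests updated in this patch.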