From 7cbd9ec7a9bfd4952ad522355b6bbb8e82b54fc9 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Mon, 29 Jul 2024 18:16:30 +0800 Subject: [PATCH 01/79] [Model] Initialize support for InternVL2 series models (#6514) Co-authored-by: Roger Wang --- docs/source/models/supported_models.rst | 4 + examples/offline_inference_vision_language.py | 15 + examples/openai_vision_api_client.py | 2 + requirements-test.txt | 1 + tests/models/test_internvl.py | 201 ++++++++ vllm/entrypoints/chat_utils.py | 2 +- vllm/model_executor/models/__init__.py | 1 + vllm/model_executor/models/intern_vit.py | 270 ++++++++++ vllm/model_executor/models/internlm2.py | 10 +- vllm/model_executor/models/internvl.py | 471 ++++++++++++++++++ vllm/model_executor/models/qwen2.py | 10 +- vllm/transformers_utils/config.py | 8 +- vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/configs/internvl.py | 51 ++ 14 files changed, 1042 insertions(+), 6 deletions(-) create mode 100644 tests/models/test_internvl.py create mode 100644 vllm/model_executor/models/intern_vit.py create mode 100644 vllm/model_executor/models/internvl.py create mode 100644 vllm/transformers_utils/configs/internvl.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 83c1b9c8bce86..4fe33e5ab5d80 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -200,6 +200,10 @@ Vision Language Models - Fuyu - :code:`adept/fuyu-8b` etc. - + * - :code:`InternVLChatModel` + - InternVL2 + - :code:`OpenGVLab/InternVL2-4B`, :code:`OpenGVLab/InternVL2-8B`, etc. + - * - :code:`LlavaForConditionalGeneration` - LLaVA-1.5 - :code:`llava-hf/llava-1.5-7b-hf`, :code:`llava-hf/llava-1.5-13b-hf`, etc. diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 04ba1a96314c9..846246a2062a6 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -106,6 +106,20 @@ def run_minicpmv(question): return llm, prompt +# InternVL +def run_internvl(question): + # Generally, InternVL can use chatml template for conversation + TEMPLATE = "<|im_start|>User\n{prompt}<|im_end|>\n<|im_start|>Assistant\n" + prompt = f"<image>\n{question}\n" + prompt = TEMPLATE.format(prompt=prompt) + llm = LLM( + model="OpenGVLab/InternVL2-4B", + trust_remote_code=True, + max_num_seqs=5, + ) + return llm, prompt + + # BLIP-2 def run_blip2(question): @@ -125,6 +139,7 @@ def run_blip2(question): "chameleon": run_chameleon, "minicpmv": run_minicpmv, "blip-2": run_blip2, + "internvl_chat": run_internvl, } diff --git a/examples/openai_vision_api_client.py b/examples/openai_vision_api_client.py index 2082c378e267c..be90394511f89 100644 --- a/examples/openai_vision_api_client.py +++ b/examples/openai_vision_api_client.py @@ -42,6 +42,7 @@ ], }], model=model, + max_tokens=64, ) result = chat_completion_from_url.choices[0].message.content @@ -78,6 +79,7 @@ def encode_image_base64_from_url(image_url: str) -> str: ], }], model=model, + max_tokens=64, ) result = chat_completion_from_base64.choices[0].message.content diff --git a/requirements-test.txt b/requirements-test.txt index a7604d2e1015e..9b88fcce3e842 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -16,6 +16,7 @@ ray sentence-transformers # required for embedding sparseml==1.8.0 # required for compressed-tensors compressed-tensors==0.4.0 # required for compressed-tensors +timm # required for internvl test # Benchmarking aiohttp 
diff --git a/tests/models/test_internvl.py b/tests/models/test_internvl.py new file mode 100644 index 0000000000000..66cb8dda248db --- /dev/null +++ b/tests/models/test_internvl.py @@ -0,0 +1,201 @@ +import types +from typing import List, Optional, Type + +import pytest +import torch +from huggingface_hub import snapshot_download +from PIL.Image import Image + +from vllm.model_executor.models.internvl import (IMG_CONTEXT, IMG_END, + IMG_START, + image_to_pixel_values) +from vllm.multimodal.utils import rescale_image_size +from vllm.utils import is_cpu + +from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets +from .utils import check_logprobs_close + +pytestmark = pytest.mark.vlm + +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": + "<|im_start|>User\n<image>\nWhat's the content in the center of the image?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 + "cherry_blossom": + "<|im_start|>User\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 +}) + +# we use snapshot_download to prevent conflicts between +# dynamic_module and trust_remote_code for hf_runner +models = [ + snapshot_download("OpenGVLab/InternVL2-1B"), + snapshot_download("OpenGVLab/InternVL2-2B"), + # snapshot_download("OpenGVLab/InternVL2-4B"), # broken +] + + +class InternVLProcessor: + """A simple processor for InternVL2 HF model which misses a processor.""" + + def __init__(self, hf_runner: HfRunner): + self.num_image_token = hf_runner.model.num_image_token + self.tokenizer = hf_runner.tokenizer + self.dtype = hf_runner.model.dtype + + def __call__(self, text: str, images: Image, **kwargs): + pixel_values = image_to_pixel_values(images).to(self.dtype) + num_patches_list = [pixel_values.shape[0]] + for num_patches in num_patches_list: + context_tokens = IMG_CONTEXT * self.num_image_token * num_patches + image_tokens = IMG_START + context_tokens + IMG_END + text = text.replace('<image>', image_tokens, 1) + prompt = self.tokenizer(text, return_tensors="pt") + prompt.update({"pixel_values": pixel_values}) + return prompt + + +# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py +def generate( + self, + pixel_values: torch.FloatTensor, + input_ids: torch.FloatTensor, + attention_mask: Optional[torch.LongTensor] = None, + **generate_kwargs, +) -> torch.LongTensor: + """Generate method for InternVL2 model without fixed use_cache.""" + assert self.img_context_token_id is not None + vit_embeds = self.extract_feature(pixel_values) + input_embeds = self.language_model.get_input_embeddings()(input_ids) + B, N, C = input_embeds.shape + input_embeds = input_embeds.reshape(B * N, C) + + input_ids = input_ids.reshape(B * N) + selected = (input_ids == self.img_context_token_id) + assert selected.sum() != 0 + input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device) + + input_embeds = input_embeds.reshape(B, N, C) + + outputs = self.language_model.generate( + inputs_embeds=input_embeds, + attention_mask=attention_mask, + **generate_kwargs, + ) + + return outputs + + +def run_test( + hf_runner: Type[HfRunner], + vllm_runner: Type[VllmRunner], + image_assets: _ImageAssets, + model: str, + *, + size_factors: List[float], + dtype: str, + max_tokens: int, + num_logprobs: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +): + """Inference result should be the same between hf and vllm. + + All the image fixtures for the test is under tests/images. + For huggingface runner, we provide the PIL images as input. 
+ For vllm runner, we provide MultiModalDataDict objects + and corresponding vision language config as input. + Note, the text input is also adjusted to abide by vllm contract. + The text output is sanitized to be able to compare with hf. + """ + images = [asset.pil_image for asset in image_assets] + + inputs_per_image = [( + [prompt for _ in size_factors], + [rescale_image_size(image, factor) for factor in size_factors], + ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] + + # NOTE: take care of the order. run vLLM first, and then run HF. + # vLLM needs a fresh new process without cuda initialization. + # if we run HF first, the cuda initialization will be done and it + # will hurt multiprocessing backend with fork method (the default method). + + # max_model_len should be greater than image_feature_size + with vllm_runner(model, + max_model_len=4096, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True) as vllm_model: + vllm_outputs_per_image = [ + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images) + for prompts, images in inputs_per_image + ] + + with hf_runner(model, dtype=dtype) as hf_model: + img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids( + "<IMG_CONTEXT>") + hf_model.model.img_context_token_id = img_context_token_id + hf_model.processor = InternVLProcessor(hf_model) + hf_model.model.get_output_embeddings = lambda: \ + hf_model.model.language_model.get_output_embeddings() + hf_model.model.generate = types.MethodType(generate, hf_model.model) + eos_token_id = hf_model.tokenizer.eos_token_id + hf_outputs_per_image = [ + hf_model.generate_greedy_logprobs_limit(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=hf_images, + eos_token_id=eos_token_id) + for prompts, hf_images in inputs_per_image + ] + + for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, + vllm_outputs_per_image): + # TODO: Check whether using original CLIPVisionModel can improve + # consistency against HF + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + + +target_dtype = "half" +if is_cpu(): + target_dtype = "bfloat16" + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + # No image + [], + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + # Multi-scale + [0.25, 0.5, 1.0], + ], +) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [5]) +@torch.inference_mode() +def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, + dtype: str, max_tokens: int, num_logprobs: int) -> None: + run_test( + hf_runner, + vllm_runner, + image_assets, + model, + size_factors=size_factors, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 1f6d77b828459..fbb7f70b55e16 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -107,7 +107,7 @@ def _image_token_str(model_config: ModelConfig, return None if model_type.startswith("llava"): return tokenizer.decode(model_config.hf_config.image_token_index) - if model_type == "chameleon": + if model_type in ("chameleon", "internvl_chat"): return "<image>" raise TypeError(f"Unknown model type: {model_type}") diff --git 
a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index fe04c6db5fbc2..94c3cea98be7b 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -37,6 +37,7 @@ "GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"), "InternLMForCausalLM": ("llama", "LlamaForCausalLM"), "InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"), + "InternVLChatModel": ("internvl", "InternVLChatModel"), "JAISLMHeadModel": ("jais", "JAISLMHeadModel"), "LlamaForCausalLM": ("llama", "LlamaForCausalLM"), "LlavaForConditionalGeneration": diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py new file mode 100644 index 0000000000000..86d0930d80126 --- /dev/null +++ b/vllm/model_executor/models/intern_vit.py @@ -0,0 +1,270 @@ +# adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_intern_vit.py +# -------------------------------------------------------- +# InternVL +# Copyright (c) 2023 OpenGVLab +# Licensed under The MIT License [see LICENSE for details] +# -------------------------------------------------------- +from typing import Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +from transformers import PretrainedConfig + +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.quantization import QuantizationConfig + +NORM2FN = { + 'rms_norm': RMSNorm, + 'layer_norm': nn.LayerNorm, +} + + +class InternVisionEmbeddings(nn.Module): + + def __init__(self, config: PretrainedConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim)) + + self.patch_embedding = nn.Conv2d(in_channels=3, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size) + + self.num_patches = (self.image_size // self.patch_size)**2 + self.num_positions = self.num_patches + 1 + + self.position_embedding = nn.Parameter( + torch.randn(1, self.num_positions, self.embed_dim)) + + def _get_pos_embed(self, pos_embed, H, W): + target_dtype = pos_embed.dtype + pos_embed = pos_embed.float().reshape( + 1, self.image_size // self.patch_size, + self.image_size // self.patch_size, -1).permute(0, 3, 1, 2) + pos_embed = F.interpolate(pos_embed, + size=(H, W), + mode='bicubic', + align_corners=False) + pos_embed = pos_embed.reshape(1, -1, H * W).permute(0, 2, + 1).to(target_dtype) + return pos_embed + + def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.to( + target_dtype)) # shape = [*, channel, width, height] + batch_size, _, height, width = patch_embeds.shape + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + class_embeds = self.class_embedding.expand(batch_size, 1, + -1).to(target_dtype) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + position_embedding = torch.cat([ + self.position_embedding[:, :1, :], + self._get_pos_embed(self.position_embedding[:, 1:, :], height, + width) + ], + dim=1) + embeddings = embeddings + position_embedding.to(target_dtype) + return embeddings + + +class InternAttention(nn.Module): + """Multi-headed attention from 
'Attention Is All You Need' paper""" + + def __init__(self, config: PretrainedConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f'embed_dim must be divisible by num_heads ' + f'(got `embed_dim`: {self.embed_dim} and `num_heads`:' + f' {self.num_heads}).') + + self.scale = self.head_dim**-0.5 + self.qkv = nn.Linear(self.embed_dim, + 3 * self.embed_dim, + bias=config.qkv_bias) + + self.qk_normalization = config.qk_normalization + + if self.qk_normalization: + self.q_norm = RMSNorm(self.embed_dim, eps=config.layer_norm_eps) + self.k_norm = RMSNorm(self.embed_dim, eps=config.layer_norm_eps) + + self.proj = nn.Linear(self.embed_dim, self.embed_dim) + + def forward(self, x): + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv.unbind(0) + + if self.qk_normalization: + B_, H_, N_, D_ = q.shape + q = self.q_norm(q.transpose(1, 2).flatten(-2, -1)).view( + B_, N_, H_, D_).transpose(1, 2) + k = self.k_norm(k.transpose(1, 2).flatten(-2, -1)).view( + B_, N_, H_, D_).transpose(1, 2) + + x = F.scaled_dot_product_attention(q, k, v, scale=self.scale) + x = x.transpose(1, 2).reshape(B, N, C) + + x = self.proj(x) + return x + + +class InternMLP(nn.Module): + + def __init__(self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + self.config = config + self.activation_fn = get_act_fn(config.hidden_act) + self.fc1 = ColumnParallelLinear(config.hidden_size, + config.intermediate_size, + bias=True, + quant_config=quant_config) + self.fc2 = RowParallelLinear(config.intermediate_size, + config.hidden_size, + bias=True, + quant_config=quant_config) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states, _ = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states, _ = self.fc2(hidden_states) + + return hidden_states + + +class InternVisionEncoderLayer(nn.Module): + + def __init__(self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + self.embed_dim = config.hidden_size + self.intermediate_size = config.intermediate_size + self.norm_type = config.norm_type + + self.attn = InternAttention(config) + self.mlp = InternMLP(config, quant_config=quant_config) + self.norm1 = NORM2FN[self.norm_type](self.embed_dim, + eps=config.layer_norm_eps) + self.norm2 = NORM2FN[self.norm_type](self.embed_dim, + eps=config.layer_norm_eps) + + self.ls1 = nn.Parameter(config.initializer_factor * + torch.ones(self.embed_dim)) + self.ls2 = nn.Parameter(config.initializer_factor * + torch.ones(self.embed_dim)) + + def forward( + self, + hidden_states: torch.Tensor, + ): + hidden_states = hidden_states + self.attn( + self.norm1(hidden_states)) * self.ls1 + + hidden_states = hidden_states + self.mlp( + self.norm2(hidden_states)) * self.ls2 + + return hidden_states + + +class InternVisionEncoder(nn.Module): + + def __init__(self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + num_hidden_layers_override: Optional[int] = None): + super().__init__() + self.config = config + + if num_hidden_layers_override is None: + num_hidden_layers = config.num_hidden_layers + else: + num_hidden_layers = num_hidden_layers_override + self.layers = nn.ModuleList([ + 
InternVisionEncoderLayer(config=config, quant_config=quant_config) + for _ in range(num_hidden_layers) + ]) + + def forward(self, inputs_embeds: torch.Tensor): + + hidden_states = inputs_embeds + for encoder_layer in self.layers: + hidden_states = encoder_layer(hidden_states) + + return hidden_states + + +class InternVisionModel(nn.Module): + + def __init__(self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + num_hidden_layers_override: Optional[int] = None): + super().__init__() + self.config = config + + self.embeddings = InternVisionEmbeddings(config) + self.encoder = InternVisionEncoder( + config=config, + quant_config=quant_config, + num_hidden_layers_override=num_hidden_layers_override) + + def resize_pos_embeddings(self, old_size, new_size, patch_size): + pos_emb = self.embeddings.position_embedding + _, num_positions, embed_dim = pos_emb.shape + cls_emb = pos_emb[:, :1, :] + pos_emb = pos_emb[:, 1:, :].reshape(1, old_size // patch_size, + old_size // patch_size, + -1).permute(0, 3, 1, 2) + pos_emb = F.interpolate(pos_emb.float(), + size=new_size // patch_size, + mode='bicubic', + align_corners=False) + pos_emb = pos_emb.to(cls_emb.dtype).reshape(1, embed_dim, + -1).permute(0, 2, 1) + pos_emb = torch.cat([cls_emb, pos_emb], dim=1) + self.embeddings.position_embedding = nn.Parameter(pos_emb) + self.embeddings.image_size = new_size + + def get_input_embeddings(self): + return self.embeddings + + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + pixel_embeds: Optional[torch.Tensor] = None, + ) -> torch.FloatTensor: + if pixel_values is None and pixel_embeds is None: + raise ValueError( + 'You have to specify pixel_values or pixel_embeds') + + if pixel_embeds is not None: + hidden_states = pixel_embeds + elif pixel_values is not None: + if pixel_values.ndim == 4: + hidden_states = self.embeddings(pixel_values) + else: + raise ValueError( + f'wrong pixel_values size: {pixel_values.shape}') + + encoder_outputs = self.encoder(inputs_embeds=hidden_states) + + return encoder_outputs diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 22132f40fc5e6..745fbf99a902d 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -219,14 +219,22 @@ def __init__( ]) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.tok_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, + intermediate_tensors: IntermediateTensors = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - hidden_states = self.tok_embeddings(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.tok_embeddings(input_ids) residual = None for i in range(len(self.layers)): layer = self.layers[i] diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py new file mode 100644 index 0000000000000..f64c78c15f8ee --- /dev/null +++ b/vllm/model_executor/models/internvl.py @@ -0,0 +1,471 @@ +# adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_internvl_chat.py +# -------------------------------------------------------- +# InternVL +# Copyright (c) 2023 OpenGVLab +# Licensed under The MIT License [see LICENSE for details] +# 
-------------------------------------------------------- +from typing import Iterable, List, Literal, Optional, Tuple, TypedDict, Union + +import torch +import torch.nn as nn +import torchvision.transforms as T +from PIL import Image +from transformers import PretrainedConfig + +from vllm.attention import AttentionMetadata +from vllm.config import CacheConfig, MultiModalConfig +from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models import ModelRegistry +from vllm.model_executor.models.intern_vit import InternVisionModel +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensors +from vllm.multimodal.base import MultiModalInputs +from vllm.multimodal.image import cached_get_tokenizer +from vllm.sequence import IntermediateTensors, SamplerOutput + +from .clip import (dummy_image_for_clip, dummy_seq_data_for_clip, + get_clip_num_patches) +from .interfaces import SupportsVision +from .utils import merge_vision_embeddings + +IMG_START = '<img>' +IMG_END = '</img>' +IMG_CONTEXT = '<IMG_CONTEXT>' + +IMAGENET_MEAN = (0.485, 0.456, 0.406) +IMAGENET_STD = (0.229, 0.224, 0.225) + +MAX_IMAGE_FEATURE_SIZE_WIDTH = 3000 +MAX_IMAGE_FEATURE_SIZE_HEIGHT = 500 + + +class InternVLImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + data: BatchedTensors + """ + Shape: `(batch_size, 1 + num_patches, num_channels, height, width)` + + Note that `num_patches` may be different for each batch, in which case + the data is passed as a list instead of a batched tensor. + """ + + +# copied from https://huggingface.co/OpenGVLab/InternVL2-1B +def build_transform(input_size): + MEAN, STD = IMAGENET_MEAN, IMAGENET_STD + transform = T.Compose([ + T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img), + T.Resize((input_size, input_size), + interpolation=T.InterpolationMode.BICUBIC), + T.ToTensor(), + T.Normalize(mean=MEAN, std=STD) + ]) + return transform + + +# copied from https://huggingface.co/OpenGVLab/InternVL2-1B +def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, + image_size): + best_ratio_diff = float('inf') + best_ratio = (1, 1) + area = width * height + for ratio in target_ratios: + target_aspect_ratio = ratio[0] / ratio[1] + ratio_diff = abs(aspect_ratio - target_aspect_ratio) + if ratio_diff < best_ratio_diff: + best_ratio_diff = ratio_diff + best_ratio = ratio + elif ratio_diff == best_ratio_diff: + if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]: + best_ratio = ratio + return best_ratio + + +def calculate_num_blocks(orig_width: int, + orig_height: int, + min_num=1, + max_num=6, + image_size=448): + aspect_ratio = orig_width / orig_height + + # calculate the existing image aspect ratio + target_ratios = set((i, j) for n in range(min_num, max_num + 1) + for i in range(1, n + 1) for j in range(1, n + 1) + if i * j <= max_num and i * j >= min_num) + target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) + + # find the closest aspect ratio to the target + target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio, + target_ratios, orig_width, + orig_height, image_size) + + # calculate the target width and height + target_width = image_size * target_aspect_ratio[0] + target_height = image_size * target_aspect_ratio[1] + blocks = target_aspect_ratio[0] * target_aspect_ratio[1] + return blocks, 
target_width, target_height + + +# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B +def dynamic_preprocess(image, + min_num=1, + max_num=6, + image_size=448, + use_thumbnail=False): + orig_width, orig_height = image.size + + blocks, target_width, target_height = calculate_num_blocks( + orig_width, orig_height, min_num, max_num, image_size) + # resize the image + resized_img = image.resize((target_width, target_height)) + processed_images = [] + for i in range(blocks): + box = ((i % (target_width // image_size)) * image_size, + (i // (target_width // image_size)) * image_size, + ((i % (target_width // image_size)) + 1) * image_size, + ((i // (target_width // image_size)) + 1) * image_size) + # split the image + split_img = resized_img.crop(box) + processed_images.append(split_img) + assert len(processed_images) == blocks + if use_thumbnail and len(processed_images) != 1: + thumbnail_img = image.resize((image_size, image_size)) + processed_images.append(thumbnail_img) + return processed_images + + +# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B +def image_to_pixel_values(image: Image.Image, input_size=448, max_num=6): + transform = build_transform(input_size=input_size) + images = dynamic_preprocess(image, + image_size=input_size, + use_thumbnail=True, + max_num=max_num) + pixel_values = [transform(image) for image in images] + pixel_values = torch.stack(pixel_values) + return pixel_values + + +def get_internvl_num_patches(image_size: int, patch_size: int, + downsample_ratio: float): + return int( + get_clip_num_patches(image_size=image_size, patch_size=patch_size) * + (downsample_ratio**2)) + + +def get_max_internvl_image_tokens(ctx: InputContext): + hf_config = ctx.get_hf_config(PretrainedConfig) + vision_config = hf_config.vision_config + image_size = vision_config.image_size + patch_size = vision_config.patch_size + downsample_ratio = hf_config.downsample_ratio + num_patches = get_internvl_num_patches(image_size, patch_size, + downsample_ratio) + return num_patches * 7 + + +def input_processor_for_internvl(ctx: InputContext, llm_inputs: LLMInputs): + multi_modal_data = llm_inputs.get("multi_modal_data") + if multi_modal_data is None or "image" not in multi_modal_data: + return llm_inputs + + model_config = ctx.model_config + hf_config = ctx.get_hf_config(PretrainedConfig) + vision_config = hf_config.vision_config + + image_data = multi_modal_data["image"] + if isinstance(image_data, Image.Image): + width, height = image_data.size + num_blocks, _, _ = calculate_num_blocks(width, height) + elif isinstance(image_data, torch.Tensor): + raise NotImplementedError("Embeddings input is not supported yet") + else: + raise TypeError(f"Invalid image type: {type(image_data)}") + + image_size = vision_config.image_size + patch_size = vision_config.patch_size + downsample_ratio = hf_config.downsample_ratio + num_patches = get_internvl_num_patches(image_size, patch_size, + downsample_ratio) + + tokenizer = cached_get_tokenizer(model_config.tokenizer, + trust_remote_code=True) + + prompt = llm_inputs["prompt"] + prompt_token_ids = llm_inputs["prompt_token_ids"] + if prompt is None: + prompt = tokenizer.decode(prompt_token_ids) + image_prompt = IMG_START + IMG_CONTEXT * (num_blocks + + 1) * num_patches + IMG_END + new_prompt = prompt.replace('<image>', image_prompt, 1) + new_prompt_token_ids = tokenizer.encode(new_prompt) + + return LLMInputs(prompt=prompt, + prompt_token_ids=new_prompt_token_ids, + multi_modal_data=multi_modal_data) + + +def input_mapper_for_internvl(ctx: InputContext, 
data: object): + if isinstance(data, Image.Image): + data = image_to_pixel_values(data) + model_config = ctx.model_config + tokenizer = cached_get_tokenizer(model_config.tokenizer, + trust_remote_code=True) + image_token_id = tokenizer.encode(IMG_CONTEXT, + add_special_tokens=False, + return_tensors="pt")[0] + + return MultiModalInputs({ + "pixel_values": data, + "image_token_id": image_token_id + }) + + +def dummy_data_for_internvl(ctx: InputContext, seq_len: int): + + image_feature_size = get_max_internvl_image_tokens(ctx) + model_config = ctx.model_config + hf_config = ctx.get_hf_config(PretrainedConfig) + vision_config = hf_config.vision_config + tokenizer = cached_get_tokenizer(model_config.tokenizer, + trust_remote_code=True) + + seq_data = dummy_seq_data_for_clip( + vision_config, + seq_len, + image_token_id=tokenizer.encode(IMG_CONTEXT, + add_special_tokens=False)[0], + image_feature_size_override=image_feature_size, + ) + mm_data = dummy_image_for_clip( + vision_config, + image_width_override=MAX_IMAGE_FEATURE_SIZE_WIDTH, + image_height_override=MAX_IMAGE_FEATURE_SIZE_HEIGHT, + ) + + return seq_data, mm_data + + +@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_internvl) +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_internvl_image_tokens) +@INPUT_REGISTRY.register_dummy_data(dummy_data_for_internvl) +@INPUT_REGISTRY.register_input_processor(input_processor_for_internvl) +class InternVLChatModel(nn.Module, SupportsVision): + + def __init__(self, + config: PretrainedConfig, + multimodal_config: MultiModalConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None) -> None: + super().__init__() + + self.config = config + self.multimodal_config = multimodal_config + + image_size = config.force_image_size or config.vision_config.image_size + patch_size = config.vision_config.patch_size + self.patch_size = patch_size + self.select_layer = config.select_layer + self.num_image_token = int( + (image_size // patch_size)**2 * (config.downsample_ratio**2)) + self.downsample_ratio = config.downsample_ratio + self.ps_version = config.ps_version + + vision_feature_layer = self.select_layer + if vision_feature_layer < 0: + num_hidden_layers = config.vision_config.num_hidden_layers \ + + vision_feature_layer + 1 + else: + num_hidden_layers = vision_feature_layer + 1 + self.vision_model = InternVisionModel( + config.vision_config, num_hidden_layers_override=num_hidden_layers) + + llm_class = ModelRegistry.load_model_cls( + config.text_config.architectures[0]) + self.language_model = llm_class(config.text_config, cache_config, + quant_config) + + vit_hidden_size = config.vision_config.hidden_size + llm_hidden_size = config.text_config.hidden_size + + self.mlp1 = nn.Sequential( + nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio)**2), + nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio)**2, + llm_hidden_size), nn.GELU(), + nn.Linear(llm_hidden_size, llm_hidden_size)) + + self.img_context_token_id = None + + def pixel_shuffle(self, x, scale_factor=0.5): + n, w, h, c = x.size() + # N, W, H, C --> N, W, H * scale, C // scale + x = x.view(n, w, int(h * scale_factor), int(c / scale_factor)) + # N, W, H * scale, C // scale --> N, H * scale, W, C // scale + x = x.permute(0, 2, 1, 3).contiguous() + x = x.view(n, int(h * scale_factor), int(w * scale_factor), + int(c / (scale_factor * scale_factor))) + if self.ps_version == 'v1': + pass + else: + x = x.permute(0, 2, 1, 3).contiguous() + return x + + def 
extract_feature(self, pixel_values): + vit_embeds = self.vision_model(pixel_values=pixel_values) + vit_embeds = vit_embeds[:, 1:, :] + + h = w = int(vit_embeds.shape[1]**0.5) + vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1) + vit_embeds = self.pixel_shuffle(vit_embeds, + scale_factor=self.downsample_ratio) + vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, + vit_embeds.shape[-1]) + vit_embeds = self.mlp1(vit_embeds) + return vit_embeds + + def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor: + if list(data.shape[1:]) != [2]: + raise ValueError( + f"The expected image sizes shape is batch dimension plus " + f"{[2]}. You supplied {data.shape}.") + + return data + + def _validate_pixel_values( + self, data: Union[torch.Tensor, List[torch.Tensor]] + ) -> Union[torch.Tensor, List[torch.Tensor]]: + + h = w = self.config.vision_config.image_size + expected_dims = (3, h, w) + + def _validate_shape(d: torch.Tensor): + actual_dims = tuple(d.shape) + + if actual_dims != expected_dims: + expected_expr = ("num_patches", *map(str, expected_dims)) + raise ValueError( + "The expected shape of pixel values in each batch element " + f"is {expected_expr}. You supplied {tuple(d.shape)}.") + + for d in data: + _validate_shape(d) + + return data + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[InternVLImagePixelInputs]: + pixel_values = kwargs.pop("pixel_values", None) + image_token_id = kwargs.pop("image_token_id", None) + + if pixel_values is None: + return None + + self.img_context_token_id = image_token_id[0] + + if not isinstance(pixel_values, (torch.Tensor, list)): + raise ValueError("Incorrect type of pixel values. " + f"Got type: {type(pixel_values)}") + + return InternVLImagePixelInputs( + type="pixel_values", + data=self._validate_pixel_values(pixel_values), + ) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + **kwargs: object, + ) -> SamplerOutput: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is not None: + inputs_embeds = self.language_model.model.get_input_embeddings( + input_ids) + vit_embeds = self.extract_feature(image_input["data"]) + inputs_embeds = merge_vision_embeddings(input_ids, inputs_embeds, + vit_embeds, + self.img_context_token_id) + input_ids = None + else: + inputs_embeds = None + + hidden_states = self.language_model.model(input_ids, + positions, + kv_caches, + attn_metadata, + None, + inputs_embeds=inputs_embeds) + return hidden_states + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + return self.language_model.compute_logits(hidden_states, + sampling_metadata) + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + return self.language_model.sample(logits, sampling_metadata) + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".qkv_proj", ".q_proj", "q"), + (".qkv_proj", ".k_proj", "k"), + (".qkv_proj", ".v_proj", "v"), + (".gate_up_proj", ".gate_proj", 0), + (".gate_up_proj", ".up_proj", 1), + (".gate_up_proj", ".w1", 0), + (".gate_up_proj", ".w3", 1), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + if 
self.config.text_config.tie_word_embeddings \ + and "lm_head.weight" in name: + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + # We only do sharding for language model + # and not vision model for now. + if "vision_embed_tokens" in name and self.vision_embed_tokens: + continue + if weight_name not in name: + continue + param = params_dict[name.replace(weight_name, param_name)] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + if "wqkv" in name: + config = self.config.text_config + kv_groups = (config.num_attention_heads // + config.num_key_value_heads) + head_dim = config.hidden_size // config.num_attention_heads + loaded_weight = loaded_weight.view(-1, 2 + kv_groups, + head_dim, + loaded_weight.shape[-1]) + wq, wk, wv = torch.split(loaded_weight, [kv_groups, 1, 1], + dim=1) + wq = wq.reshape(-1, wq.shape[-1]) + wk = wk.reshape(-1, wk.shape[-1]) + wv = wv.reshape(-1, wv.shape[-1]) + weight_loader = param.weight_loader + weight_loader(param, wq, 'q') + weight_loader(param, wk, 'k') + weight_loader(param, wv, 'v') + continue + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index e9aa4416eded4..3deb3d8840cc4 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -243,14 +243,22 @@ def __init__( ]) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.embed_tokens(input_ids) residual = None for i in range(len(self.layers)): layer = self.layers[i] diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 3ba2e01985598..3d13631b9b2b6 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -6,9 +6,10 @@ from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig, - JAISConfig, MedusaConfig, - MLPSpeculatorConfig, MPTConfig, - NemotronConfig, RWConfig) + InternVLChatConfig, JAISConfig, + MedusaConfig, MLPSpeculatorConfig, + MPTConfig, NemotronConfig, + RWConfig) if VLLM_USE_MODELSCOPE: from modelscope import AutoConfig @@ -26,6 +27,7 @@ "jais": JAISConfig, "mlp_speculator": MLPSpeculatorConfig, "medusa": MedusaConfig, + "internvl_chat": InternVLChatConfig, "nemotron": NemotronConfig, } diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 1750950b3c38b..5ccacd4a4c40a 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -4,6 +4,7 @@ # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the # `FalconConfig` class from the official HuggingFace transformers library. 
from vllm.transformers_utils.configs.falcon import RWConfig +from vllm.transformers_utils.configs.internvl import InternVLChatConfig from vllm.transformers_utils.configs.jais import JAISConfig from vllm.transformers_utils.configs.medusa import MedusaConfig from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig @@ -15,6 +16,7 @@ "DbrxConfig", "MPTConfig", "RWConfig", + "InternVLChatConfig", "JAISConfig", "MedusaConfig", "MLPSpeculatorConfig", diff --git a/vllm/transformers_utils/configs/internvl.py b/vllm/transformers_utils/configs/internvl.py new file mode 100644 index 0000000000000..ac2492317aa36 --- /dev/null +++ b/vllm/transformers_utils/configs/internvl.py @@ -0,0 +1,51 @@ +# Adapted from +# https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/configuration_internvl_chat.py +# -------------------------------------------------------- +# InternVL +# Copyright (c) 2024 OpenGVLab +# Licensed under The MIT License [see LICENSE for details] +# -------------------------------------------------------- +from transformers.configuration_utils import PretrainedConfig + + +class InternVLChatConfig(PretrainedConfig): + model_type = 'internvl_chat' + is_composition = True + + def __init__(self, + vision_config=None, + llm_config=None, + use_backbone_lora=0, + use_llm_lora=0, + select_layer=-1, + force_image_size=None, + downsample_ratio=0.5, + template=None, + dynamic_image_size=False, + use_thumbnail=False, + ps_version='v1', + min_dynamic_patch=1, + max_dynamic_patch=6, + **kwargs): + super().__init__(**kwargs) + + if vision_config is None: + vision_config = {} + + if llm_config is None: + llm_config = {} + + self.vision_config = PretrainedConfig(**vision_config) + self.text_config = PretrainedConfig(**llm_config) + + self.use_backbone_lora = use_backbone_lora + self.use_llm_lora = use_llm_lora + self.select_layer = select_layer + self.force_image_size = force_image_size + self.downsample_ratio = downsample_ratio + self.template = template + self.dynamic_image_size = dynamic_image_size + self.use_thumbnail = use_thumbnail + self.ps_version = ps_version # pixel shuffle version + self.min_dynamic_patch = min_dynamic_patch + self.max_dynamic_patch = max_dynamic_patch From 766435e660a786933392eb8ef0a873bc38cf0c8b Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Mon, 29 Jul 2024 11:42:35 -0400 Subject: [PATCH 02/79] [Kernel] Tuned FP8 Kernels for Ada Lovelace (#6677) Co-authored-by: Varun Sundar Rabindranath --- .../cutlass_benchmarks/w8a8_benchmarks.py | 2 +- .../cutlass_w8a8/scaled_mm_c2x.cu | 520 ++---------------- .../cutlass_w8a8/scaled_mm_c2x.cuh | 340 ++++++++++++ .../scaled_mm_c2x_sm80_dispatch.cuh | 139 +++++ .../scaled_mm_c2x_sm89_dispatch.cuh | 362 ++++++++++++ tests/kernels/test_cutlass.py | 4 +- 6 files changed, 877 insertions(+), 490 deletions(-) create mode 100644 csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh create mode 100644 csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm80_dispatch.cuh create mode 100644 csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm89_dispatch.cuh diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index 234c2c8a1074c..70247e94e63cf 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -13,7 +13,7 @@ from vllm import _custom_ops as ops from vllm.utils import FlexibleArgumentParser -DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())[1:] +DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) DEFAULT_BATCH_SIZES = [1, 
16, 32, 64, 128, 256, 512] DEFAULT_TP_SIZES = [1] diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu index 6ce25c5ac897b..d26c43de522c9 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu @@ -1,470 +1,16 @@ #include #include - -#include - -// clang-format will break include orders -// clang-format off -#include "cute/tensor.hpp" -#include "cute/atom/mma_atom.hpp" -#include "cutlass/numeric_types.h" - -#include "cutlass/util/device_memory.h" - #include "cutlass/cutlass.h" -#include "cutlass/gemm_coord.h" -#include "cutlass/arch/mma_sm75.h" -#include "cutlass/arch/arch.h" -#include "cutlass/arch/mma.h" -#include "cutlass/gemm/device/gemm.h" -#include "cutlass/gemm/device/gemm_universal_adapter.h" -#include "cutlass/epilogue/threadblock/fusion/visitors.hpp" -#include "cutlass/gemm/kernel/default_gemm_universal_with_visitor.h" - -#include "broadcast_load_epilogue_c2x.hpp" -#include "common.hpp" -// clang-format on - -using namespace cute; +#include "scaled_mm_c2x.cuh" +#include "scaled_mm_c2x_sm80_dispatch.cuh" +#include "scaled_mm_c2x_sm89_dispatch.cuh" /* This file defines quantized GEMM operations using the CUTLASS 2.x API, for NVIDIA GPUs with SM versions prior to sm90 (Hopper). - - Epilogue functions can be defined to post-process the output before it is - written to GPU memory. - Epilogues must contain a public type named EVTCompute of type Sm80EVT, - as well as a static prepare_args function that constructs an - EVTCompute::Arguments struct. */ -namespace { - -// Wrappers for the GEMM kernel that is used to guard against compilation on -// architectures that will never use the kernel. The purpose of this is to -// reduce the size of the compiled binary. -// __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef -// into code that will be executed on the device where it is defined. -template -struct enable_sm75_to_sm80 : Kernel { - template - CUTLASS_DEVICE static void invoke(Args&&... args) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 750 && __CUDA_ARCH__ < 800 - Kernel::invoke(std::forward(args)...); -#endif - } -}; - -template -struct enable_sm80_to_sm89 : Kernel { - template - CUTLASS_DEVICE static void invoke(Args&&... args) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 800 && __CUDA_ARCH__ < 890 - Kernel::invoke(std::forward(args)...); -#endif - } -}; - -template -struct enable_sm89_to_sm90 : Kernel { - template - CUTLASS_DEVICE static void invoke(Args&&... args) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 890 && __CUDA_ARCH__ < 900 - Kernel::invoke(std::forward(args)...); -#endif - } -}; - -/* - * This class provides the common ScaleA and ScaleB descriptors for the - * ScaledEpilogue and ScaledEpilogueBias classes. - */ -template -struct ScaledEpilogueBase { - protected: - using Accum = cutlass::epilogue::threadblock::VisitorAccFetch; - - using ScaleA = cutlass::epilogue::threadblock::VisitorColOrScalarBroadcast< - OutputTileThreadMap, float, Stride, Int<0>, Int<0>>>; - - using ScaleB = cutlass::epilogue::threadblock::VisitorRowOrScalarBroadcast< - OutputTileThreadMap, float, Stride, Int<1>, Int<0>>>; -}; - -/* - This epilogue function defines a quantized GEMM operation similar to - torch._scaled_mm. - - A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or - per-row. B can be quantized per-tensor or per-column. - Any combination of per-tensor and per-row or column is supported. 
- A and B must have symmetric quantization (zero point == 0). - - So the GEMM operation is D = (a_scales * A) (b_scales * B), where the - scales are applied elementwise with numpy-style broadcasting. - - ScaleA and ScaleB define the epilogue functions that apply the scales for - the A and B operands respectively. These scales may be either per-tensor or - per row or column. -*/ -template -struct ScaledEpilogue - : private ScaledEpilogueBase { - private: - using SUPER = ScaledEpilogueBase; - using Accum = typename SUPER::Accum; - using ScaleA = typename SUPER::ScaleA; - using ScaleB = typename SUPER::ScaleB; - - using Compute0 = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::multiplies, float, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTCompute0 = - cutlass::epilogue::threadblock::Sm80EVT; - - using Compute1 = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::multiplies, ElementD, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - public: - using EVTCompute = - cutlass::epilogue::threadblock::Sm80EVT; - using ArgumentType = typename EVTCompute::Arguments; - - static ArgumentType prepare_args(torch::Tensor const& a_scales, - torch::Tensor const& b_scales) { - using ScaleAArgs = typename ScaleA::Arguments; - using ScaleBArgs = typename ScaleB::Arguments; - - ScaleBArgs b_args{b_scales.data_ptr(), b_scales.numel() != 1, {}}; - ScaleAArgs a_args{a_scales.data_ptr(), a_scales.numel() != 1, {}}; - - typename EVTCompute0::Arguments evt0_compute_args{b_args}; - - typename EVTCompute::Arguments evt_compute_args{a_args, evt0_compute_args}; - return evt_compute_args; - } -}; - -template -struct ScaledEpilogueBias - : private ScaledEpilogueBase { - private: - using SUPER = ScaledEpilogueBase; - using Accum = typename SUPER::Accum; - using ScaleA = typename SUPER::ScaleA; - using ScaleB = typename SUPER::ScaleB; - - using Compute0 = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::multiplies, float, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTCompute0 = - cutlass::epilogue::threadblock::Sm80EVT; - - using Compute1 = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::multiply_add, ElementD, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - using Bias = cutlass::epilogue::threadblock::VisitorRowBroadcast< - OutputTileThreadMap, ElementD, Stride, Int<1>, Int<0>>>; - - public: - using EVTCompute = cutlass::epilogue::threadblock::Sm80EVT; - using ArgumentType = typename EVTCompute::Arguments; - - static ArgumentType prepare_args(torch::Tensor const& a_scales, - torch::Tensor const& b_scales, - torch::Tensor const& bias) { - using ScaleAArgs = typename ScaleA::Arguments; - using ScaleBArgs = typename ScaleB::Arguments; - using BiasArgs = typename Bias::Arguments; - - ScaleBArgs b_args{b_scales.data_ptr(), b_scales.numel() != 1, {}}; - ScaleAArgs a_args{a_scales.data_ptr(), a_scales.numel() != 1, {}}; - BiasArgs bias_args{static_cast(bias.data_ptr()), {}}; - - typename EVTCompute0::Arguments evt0_compute_args{b_args}; - - typename EVTCompute::Arguments evt_compute_args{a_args, evt0_compute_args, - bias_args}; - return evt_compute_args; - } -}; - -template typename ArchGuard, - typename ElementAB_, typename ElementD_, - template typename Epilogue_, typename TileShape, - typename WarpShape, typename InstructionShape, int32_t MainLoopStages> -struct cutlass_2x_gemm { - using ElementAB = ElementAB_; - using ElementD = ElementD_; - - using ElementAcc = - typename std::conditional, int32_t, - float>::type; - - using 
Operator = - typename std::conditional, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::arch::OpMultiplyAdd>::type; - - using OutputTileThreadMap = - cutlass::epilogue::threadblock::OutputTileThreadLayout< - TileShape, WarpShape, float, 4, 1 /* epilogue stages */ - >; - - using Epilogue = Epilogue_; - using EVTCompute = typename Epilogue::EVTCompute; - - using D = cutlass::epilogue::threadblock::VisitorAuxStore< - OutputTileThreadMap, ElementD, cutlass::FloatRoundStyle::round_to_nearest, - Stride, Int<0>>>; - - using EVTD = cutlass::epilogue::threadblock::Sm80EVT; - - // clang-format off - using RowMajor = typename cutlass::layout::RowMajor; - using ColumnMajor = typename cutlass::layout::ColumnMajor; - using KernelType = - ArchGuard::GemmKernel>; - // clang-format on - - using Op = cutlass::gemm::device::GemmUniversalAdapter; -}; - -template -void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& b, - EpilogueArgs&&... epilogue_params) { - using ElementAB = typename Gemm::ElementAB; - using ElementD = typename Gemm::ElementD; - - int32_t m = a.size(0); - int32_t n = b.size(1); - int32_t k = a.size(1); - cutlass::gemm::GemmCoord problem_size{m, n, k}; - - int64_t lda = a.stride(0); - int64_t ldb = b.stride(1); - int64_t ldc = out.stride(0); - - using StrideC = Stride, Int<0>>; - StrideC c_stride{ldc, Int<1>{}, Int<0>{}}; - - auto a_ptr = static_cast(a.data_ptr()); - auto b_ptr = static_cast(b.data_ptr()); - auto c_ptr = static_cast(out.data_ptr()); - - typename Gemm::D::Arguments d_args{c_ptr, c_stride}; - - using Epilogue = typename Gemm::Epilogue; - auto evt_args = - Epilogue::prepare_args(std::forward(epilogue_params)...); - - typename Gemm::EVTD::Arguments epilogue_args{ - evt_args, - d_args, - }; - - typename Gemm::Op::Arguments args{ - cutlass::gemm::GemmUniversalMode::kGemmSplitKParallel, // universal mode - problem_size, // problem size - 1, // batch count - epilogue_args, - a_ptr, - b_ptr, - nullptr, - nullptr, - 0, - 0, - 0, - 0, - lda, - ldb, - ldc, - ldc}; - - // Launch the CUTLASS GEMM kernel. - typename Gemm::Op gemm_op; - size_t workspace_size = gemm_op.get_workspace_size(args); - cutlass::device_memory::allocation workspace(workspace_size); - - auto stream = at::cuda::getCurrentCUDAStream(a.get_device()); - - CUTLASS_CHECK(gemm_op.can_implement(args)); - cutlass::Status status = gemm_op(args, workspace.get(), stream); - CUTLASS_CHECK(status); -} - -template -void fallback_cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& b, - EpilogueArgs&&... args) { - // In some cases, the GPU isn't able to accommodate the - // shared memory requirements of the Gemm. In such cases, use - // the FallbackGemm instead. 
- static const int max_shared_mem_per_block_opt_in = - get_cuda_max_shared_memory_per_block_opt_in(0); - - size_t const gemm_shared_mem_size = - sizeof(typename Gemm::KernelType::SharedStorage); - size_t const fallback_gemm_shared_mem_size = - sizeof(typename FallbackGemm::KernelType::SharedStorage); - - if (gemm_shared_mem_size <= max_shared_mem_per_block_opt_in) { - return cutlass_gemm_caller(out, a, b, - std::forward(args)...); - } else { - TORCH_CHECK(fallback_gemm_shared_mem_size <= - max_shared_mem_per_block_opt_in); - return cutlass_gemm_caller( - out, a, b, std::forward(args)...); - } -} - -template typename Epilogue> -struct sm80_config_default { - // This config is used in 2 cases, - // - M in (128, inf) - // - M in (64, 128] and N >= 8192 - // Shared Memory required by this Gemm - 81920 bytes - static_assert(std::is_same()); - using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>; - using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>; - using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; - using Cutlass2xGemm = - cutlass_2x_gemm; -}; - -template typename Epilogue> -struct sm80_config_M64 { - // This config is used in 2 cases, - // - M in (32, 64] - // - M in (64, 128] and N < 8192 - // Shared Memory required by this Gemm - 122880 bytes - static_assert(std::is_same()); - using TileShape = typename cutlass::gemm::GemmShape<64, 128, 128>; - using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>; - using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; - using Cutlass2xGemm = - cutlass_2x_gemm; -}; - -template typename Epilogue> -struct sm80_config_M32 { - // M in (16, 32] - // Shared Memory required by this Gemm - 61440 bytes - static_assert(std::is_same()); - using TileShape = typename cutlass::gemm::GemmShape<32, 64, 128>; - using WarpShape = typename cutlass::gemm::GemmShape<32, 64, 64>; - using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; - using Cutlass2xGemm = - cutlass_2x_gemm; -}; - -template typename Epilogue> -struct sm80_config_M16 { - // M in [1, 16] - // Shared Memory required by this Gemm - 51200 bytes - static_assert(std::is_same()); - using TileShape = typename cutlass::gemm::GemmShape<16, 64, 128>; - using WarpShape = typename cutlass::gemm::GemmShape<16, 64, 64>; - using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; - using Cutlass2xGemm = - cutlass_2x_gemm; -}; - -} // namespace - -template typename Epilogue, - typename... EpilogueArgs> -void cutlass_gemm_sm80_dispatch(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& b, - EpilogueArgs&&... args) { - static_assert(std::is_same()); - TORCH_CHECK(a.dtype() == torch::kInt8); - TORCH_CHECK(b.dtype() == torch::kInt8); - - using Cutlass2xGemmDefault = - typename sm80_config_default::Cutlass2xGemm; - using Cutlass2xGemmM128BigN = - typename sm80_config_default::Cutlass2xGemm; - using Cutlass2xGemmM128SmallN = - typename sm80_config_M64::Cutlass2xGemm; - using Cutlass2xGemmM64 = - typename sm80_config_M64::Cutlass2xGemm; - using Cutlass2xGemmM32 = - typename sm80_config_M32::Cutlass2xGemm; - using Cutlass2xGemmM16 = - typename sm80_config_M16::Cutlass2xGemm; - - // Due to shared memory requirements, some Gemms may fail to run on some - // GPUs. As the name indicates, the Fallback Gemm is used as an alternative - // in such cases. - // sm80_config_M16 has the least shared-memory requirement. 
However, - // based on some profiling, we select sm80_config_M32 as a better alternative - // performance wise. - using FallbackGemm = - typename sm80_config_M32::Cutlass2xGemm; - - uint32_t const m = a.size(0); - uint32_t const mp2 = - std::max(static_cast(16), next_pow_2(m)); // next power of 2 - if (mp2 <= 16) { - // M in [1, 16] - return fallback_cutlass_gemm_caller( - out, a, b, std::forward(args)...); - } else if (mp2 <= 32) { - // M in (16, 32] - return fallback_cutlass_gemm_caller( - out, a, b, std::forward(args)...); - } else if (mp2 <= 64) { - // M in (32, 64] - return fallback_cutlass_gemm_caller( - out, a, b, std::forward(args)...); - } else if (mp2 <= 128) { - // M in (64, 128] - uint32_t const n = out.size(1); - bool const small_n = n < 8192; - if (small_n) { - return fallback_cutlass_gemm_caller( - out, a, b, std::forward(args)...); - } else { - return fallback_cutlass_gemm_caller( - out, a, b, std::forward(args)...); - } - } else { - // M in (128, inf) - return fallback_cutlass_gemm_caller( - out, a, b, std::forward(args)...); - } -} - template