diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh
index 972c62a091aea..6659440135ff4 100755
--- a/.buildkite/run-amd-test.sh
+++ b/.buildkite/run-amd-test.sh
@@ -71,13 +71,36 @@ mkdir -p ${HF_CACHE}
 HF_MOUNT="/root/.cache/huggingface"
 
 commands=$@
+echo "Commands:$commands"
+#ignore certain kernels tests
+if [[ $commands == *" kernels "* ]]; then
+  commands="${commands} \
+  --ignore=kernels/test_attention.py \
+  --ignore=kernels/test_attention_selector.py \
+  --ignore=kernels/test_blocksparse_attention.py \
+  --ignore=kernels/test_causal_conv1d.py \
+  --ignore=kernels/test_cutlass.py \
+  --ignore=kernels/test_encoder_decoder_attn.py \
+  --ignore=kernels/test_flash_attn.py \
+  --ignore=kernels/test_flashinfer.py \
+  --ignore=kernels/test_int8_quant.py \
+  --ignore=kernels/test_machete_gemm.py \
+  --ignore=kernels/test_mamba_ssm.py \
+  --ignore=kernels/test_marlin_gemm.py \
+  --ignore=kernels/test_moe.py \
+  --ignore=kernels/test_prefix_prefill.py \
+  --ignore=kernels/test_rand.py \
+  --ignore=kernels/test_sampler.py"
+fi
+
 PARALLEL_JOB_COUNT=8
 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. 
 if [[ $commands == *"--shard-id="* ]]; then
   for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
     #replace shard arguments
-    commands=${@//"--shard-id= "/"--shard-id=${GPU} "}
+    commands=${commands//"--shard-id= "/"--shard-id=${GPU} "}
     commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
+    echo "Shard ${GPU} commands:$commands"
     docker run \
         --device /dev/kfd --device /dev/dri \
         --network host \
diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh
index a01cf3fe67489..49ae838cf0690 100755
--- a/.buildkite/run-cpu-test-ppc64le.sh
+++ b/.buildkite/run-cpu-test-ppc64le.sh
@@ -11,8 +11,9 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image, setting --shm-size=4g for tensor parallel.
+source /etc/environment
 #docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --name cpu-test cpu-test
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN=$HF_TOKEN --name cpu-test cpu-test
 
 # Run basic model test
 docker exec cpu-test bash -c "
diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index ca9cf15780e25..73ce82c5857ab 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -22,13 +22,17 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
 
 # Run basic model test
 docker exec cpu-test bash -c "
-  pip install pytest matplotlib einops transformers_stream_generator
-  pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py \
-      --ignore=tests/models/test_oot_registration.py \
-      --ignore=tests/models/test_registry.py \
-      --ignore=tests/models/test_fp8.py \
-      --ignore=tests/models/test_jamba.py \
-      --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
+  pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator
+  pytest -v -s tests/models/decoder_only/language \
+    --ignore=tests/models/test_fp8.py \
+    --ignore=tests/models/decoder_only/language/test_jamba.py \
+    --ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
+
+# Run compressed-tensor test
+docker exec cpu-test bash -c "
+  pytest -s -v \
+  tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
+  tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynanmic_per_token"
 
 # online inference
 docker exec cpu-test bash -c "
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index d0317b2fc48c9..9b0cb6663a55b 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -50,6 +50,7 @@ steps:
   - tests/worker
   commands:
   - pytest -v -s async_engine # Async Engine
+  - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
   - pytest -v -s test_inputs.py
   - pytest -v -s multimodal
   - pytest -v -s test_utils.py # Utils
@@ -91,7 +92,7 @@ steps:
   - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
   - pytest -v -s entrypoints/openai
   - pytest -v -s entrypoints/test_chat_utils.py
-
+  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 
 - label: Distributed Tests (4 GPUs) # 10min
   working_dir: "/vllm-workspace/tests"
@@ -162,15 +163,6 @@ steps:
     - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
     - python3 offline_inference_encoder_decoder.py
 
-- label: Models Test # 1hr10min
-  source_file_dependencies:
-  - vllm/
-  - tests/models
-  commands:
-    - pip install -e ./plugins/vllm_add_dummy_model
-    - pytest -v -s models/test_oot_registration.py # it needs a clean process
-    - pytest -v -s models -m \"not vlm\" --ignore=models/test_oot_registration.py
-
 - label: torch compile integration test
   source_file_dependencies:
   - vllm/
@@ -178,14 +170,6 @@ steps:
     - pytest -v -s ./compile/test_full_graph.py
     - pytest -v -s ./compile/test_wrapper.py
 
-
-- label: Vision Language Models Test # 42min
-  #mirror_hardwares: [amd]
-  source_file_dependencies:
-  - vllm/
-  commands:
-    - pytest -v -s models -m vlm
-
 - label: Prefix Caching Test # 7min
   #mirror_hardwares: [amd]
   source_file_dependencies:
@@ -217,7 +201,8 @@ steps:
   commands:
     # See https://github.com/vllm-project/vllm/issues/5152
     - export VLLM_ATTENTION_BACKEND=XFORMERS
-    - pytest -v -s spec_decode
+    - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
+    - pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
 
 - label: LoRA Test %N # 30min each
   mirror_hardwares: [amd]
@@ -228,6 +213,7 @@ steps:
   parallelism: 4
 
 - label: Kernels Test %N # 30min each
+  mirror_hardwares: [amd]
   source_file_dependencies:
   - csrc/
   - vllm/attention
@@ -282,6 +268,45 @@ steps:
   commands:
     - pytest -v -s tool_use
 
+#####  models test  #####
+
+- label: Basic Models Test # 3min
+  source_file_dependencies:
+  - vllm/
+  - tests/models
+  commands:
+    - pip install -e ./plugins/vllm_add_dummy_model
+    - pytest -v -s models/test_oot_registration.py # it needs a clean process
+    - pytest -v -s models/*.py --ignore=models/test_oot_registration.py
+
+- label: Decoder-only Language Models Test # 1h3min
+  #mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/
+  - tests/models/decoder_only/language
+  commands:
+    - pytest -v -s models/decoder_only/language
+
+- label: Decoder-only Multi-Modal Models Test # 56min
+  #mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/
+  - tests/models/decoder_only/audio_language
+  - tests/models/decoder_only/vision_language
+  commands:
+    - pytest -v -s models/decoder_only/audio_language
+    - pytest -v -s models/decoder_only/vision_language
+
+- label: Other Models Test # 5min
+  #mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/
+  - tests/models/embedding/language
+  - tests/models/encoder_decoder/language
+  commands:
+    - pytest -v -s models/embedding/language
+    - pytest -v -s models/encoder_decoder/language
+
 #####  1 GPU test  #####
 #####  multi gpus test  #####
 
@@ -307,11 +332,11 @@ steps:
   - tests/distributed/
   commands:
   - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
-    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
+    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
     - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
     - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
   - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
-    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
+    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
 
 - label: Distributed Tests (2 GPUs) # 28min
   #mirror_hardwares: [amd]
@@ -324,11 +349,10 @@ steps:
   - vllm/model_executor/models/
   - tests/distributed/
   commands:
-  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
-  - TARGET_TEST_SUITE=L4 pytest -v -s distributed/test_basic_distributed_correctness.py
-  - pytest -v -s distributed/test_basic_distributed_correctness_enc_dec.py
-  - pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - pytest -v -s distributed/test_multimodal_broadcast.py
+  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
+  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
+  # Avoid importing model tests that cause CUDA reinitialization error
+  - pytest models/encoder_decoder/language/test_bart.py models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - pip install -e ./plugins/vllm_add_dummy_model
   - pytest -v -s distributed/test_distributed_oot.py
@@ -386,7 +410,18 @@ steps:
   - vllm/
   - tests/weight_loading
   commands:
-    - bash weight_loading/run_model_weight_loading_test.sh
+    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
+
+- label: Weight Loading Multiple GPU Test - Large Models # optional
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  gpu: a100
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/weight_loading
+  commands:
+    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt 
 
 
 ##### multi gpus test #####
diff --git a/.github/ISSUE_TEMPLATE/400-bug report.yml b/.github/ISSUE_TEMPLATE/400-bug report.yml
index d4113da8b5b81..30db1721a9df7 100644
--- a/.github/ISSUE_TEMPLATE/400-bug report.yml	
+++ b/.github/ISSUE_TEMPLATE/400-bug report.yml	
@@ -30,6 +30,15 @@ body:
       </details>
   validations:
     required: true
+- type: textarea
+  attributes:
+    label: Model Input Dumps
+    description: |
+      If you are facing crashing due to illegal memory access or other issues with model execution, vLLM may dump the problematic input of the model. In this case, you will see the message `Error in model execution (input dumped to /tmp/err_xxx.pkl)`. If you see this message, please zip the file (because GitHub doesn't support .pkl file format) and upload it here. This will help us to reproduce the issue and facilitate the debugging process.
+    placeholder: |
+      Upload the dumped input file.
+  validations:
+    required: false
 - type: textarea
   attributes:
     label: 🐛 Describe the bug
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index 262ce8e1530a8..be0afc6305044 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -39,6 +39,16 @@ FIX #xxxx (*link existing issues this PR will resolve*)
     <li>Please add documentation to <code>docs/source/</code> if the PR modifies the user-facing behaviors of vLLM. It helps vLLM user understand and utilize the new features or changes.</li>
 </ul>
 
+<h3>Adding or changing kernels</h3>
+<p>Each custom kernel needs a schema and one or more implementations to be registered with PyTorch.</p>
+<ul>
+    <li>Make sure custom ops are registered following PyTorch guidelines: <a href="https://pytorch.org/tutorials/advanced/cpp_custom_ops.html#cpp-custom-ops-tutorial">Custom C++ and CUDA Operators</a> and <a href="https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU">The Custom Operators Manual</a></li>
+    <li>Custom operations that return <code>Tensors</code> require meta-functions. Meta-functions should be implemented and registered in python so that dynamic dims can be handled automatically. See above documents for a description of meta-functions.</li>
+    <li>Use <a href="https://pytorch.org/docs/stable/library.html#torch.library.opcheck"><code>torch.libary.opcheck()</code></a> to test the function registration and meta-function for any registered ops.  See <code>tests/kernels</code> for examples.</li>
+    <li>When changing the C++ signature of an existing op, the schema must be updated to reflect the changes.</li>
+    <li>If a new custom type is needed, see the following document: <a href="https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA">Custom Class Support in PT2</a>.
+</ul>
+
 <h3>Notes for Large Changes</h3>
 <p>Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag it with <code>rfc-required</code> and might not go through the PR.</p>
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 71f160acc4dcc..7a0fa967155bb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -208,9 +208,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   FetchContent_Declare(
         cutlass
         GIT_REPOSITORY https://github.com/nvidia/cutlass.git
-        # CUTLASS 3.5.1
-        GIT_TAG 06b21349bcf6ddf6a1686a47a137ad1446579db9 
+        GIT_TAG v3.5.1
         GIT_PROGRESS TRUE
+
+        # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
+        # Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
+        # So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
+        GIT_SHALLOW TRUE
   )
   FetchContent_MakeAvailable(cutlass)
 
@@ -244,6 +248,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
           "-gencode arch=compute_90a,code=sm_90a")
   endif()
 
+
   #
   # Machete kernels
 
@@ -307,28 +312,11 @@ define_gpu_extension_target(
   USE_SABI 3
   WITH_SOABI)
 
-if(VLLM_GPU_LANG STREQUAL "HIP")
-  #
-  # custom extension
-  #
-  set(CUSTOM_SRC
-  "csrc/custom/torch_bindings.cpp"
-  "csrc/custom/custom_kernels.cu"
-  "csrc/custom/fused_kernels.cu"
-  "csrc/custom/custom.cu"
-  "csrc/custom/paged_attention/attention_ll4mi.cu"
-  )
-
-  define_gpu_extension_target(
-    _custom_C
-    DESTINATION vllm
-    LANGUAGE ${VLLM_GPU_LANG}
-    SOURCES ${CUSTOM_SRC}
-    COMPILE_FLAGS ${VLLM_GPU_FLAGS}
-    ARCHITECTURES ${VLLM_GPU_ARCHES}
-    USE_SABI 3
-    WITH_SOABI)
-endif()
+# If CUTLASS is compiled on NVCC >= 12.5, it by default uses 
+# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the 
+# driver API. This causes problems when linking with earlier versions of CUDA.
+# Setting this variable sidesteps the issue by calling the driver directly.
+target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
 
 #
 # _moe_C extension
@@ -354,6 +342,28 @@ define_gpu_extension_target(
   WITH_SOABI)
 
 
+if(VLLM_GPU_LANG STREQUAL "HIP")
+  #
+  # _rocm_C extension
+  #
+  set(VLLM_ROCM_EXT_SRC
+    "csrc/rocm/torch_bindings.cpp"
+    "csrc/rocm/attention.cu"
+    "csrc/rocm/custom_kernels.cu"
+    "csrc/rocm/fused_kernels.cu"
+    "csrc/rocm/custom.cu")
+
+  define_gpu_extension_target(
+    _rocm_C
+    DESTINATION vllm
+    LANGUAGE ${VLLM_GPU_LANG}
+    SOURCES ${VLLM_ROCM_EXT_SRC}
+    COMPILE_FLAGS ${VLLM_GPU_FLAGS}
+    ARCHITECTURES ${VLLM_GPU_ARCHES}
+    USE_SABI 3
+    WITH_SOABI)
+endif()
+
 
 if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
   message(STATUS "Enabling C extension.")
@@ -364,6 +374,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
 endif()
 
 if(VLLM_GPU_LANG STREQUAL "HIP")
-  message(STATUS "Enabling custom extension.")
-  add_dependencies(default _custom_C)
+  message(STATUS "Enabling rocm extension.")
+  add_dependencies(default _rocm_C)
 endif()
diff --git a/Dockerfile b/Dockerfile
index 0ec6655ed449e..5484be5bc5785 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -145,6 +145,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
     && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
     && apt-get update -y \
     && apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \
+    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
     && add-apt-repository ppa:deadsnakes/ppa \
     && apt-get update -y \
     && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
diff --git a/Dockerfile.cpu b/Dockerfile.cpu
index 9a570f988f3db..34b4c95e34ffc 100644
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@@ -2,9 +2,14 @@
 
 FROM ubuntu:22.04 AS cpu-test-1
 
+ENV CCACHE_DIR=/root/.cache/ccache
+
+ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
+
 RUN --mount=type=cache,target=/var/cache/apt \
     apt-get update -y \
     && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
+    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
     && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
 
 # https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
@@ -25,6 +30,19 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     pip install --upgrade pip && \
     pip install -r requirements-build.txt
 
+# install oneDNN
+RUN git clone -b rls-v3.5 https://github.com/oneapi-src/oneDNN.git
+
+RUN --mount=type=cache,target=/root/.cache/ccache \
+    cmake -B ./oneDNN/build -S ./oneDNN -G Ninja -DONEDNN_LIBRARY_TYPE=STATIC \ 
+    -DONEDNN_BUILD_DOC=OFF \ 
+    -DONEDNN_BUILD_EXAMPLES=OFF \ 
+    -DONEDNN_BUILD_TESTS=OFF \ 
+    -DONEDNN_BUILD_GRAPH=OFF \ 
+    -DONEDNN_ENABLE_WORKLOAD=INFERENCE \ 
+    -DONEDNN_ENABLE_PRIMITIVE=MATMUL && \
+    cmake --build ./oneDNN/build --target install --config Release
+
 FROM cpu-test-1 AS build
 
 WORKDIR /workspace/vllm
@@ -40,7 +58,6 @@ COPY ./ ./
 ARG VLLM_CPU_DISABLE_AVX512
 ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
 
-ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=cache,target=/root/.cache/ccache \
     VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
diff --git a/Dockerfile.neuron b/Dockerfile.neuron
index caa1b1d6c4424..f0c3479625a70 100644
--- a/Dockerfile.neuron
+++ b/Dockerfile.neuron
@@ -6,7 +6,9 @@ FROM $BASE_IMAGE
 RUN echo "Base image is $BASE_IMAGE"
 
 # Install some basic utilities
-RUN apt-get update && apt-get install python3 python3-pip -y
+RUN apt-get update \
+    && apt-get install python3 python3-pip -y \
+    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 
 
 ### Mount Point ###
 # When launching the container, mount the code directory to /app
diff --git a/Dockerfile.openvino b/Dockerfile.openvino
index 06ca4638dfeb9..96b9593a2bfa8 100644
--- a/Dockerfile.openvino
+++ b/Dockerfile.openvino
@@ -4,7 +4,8 @@
 FROM ubuntu:22.04 AS dev
 
 RUN apt-get update -y && \
-    apt-get install -y python3-pip git
+    apt-get install -y python3-pip git && \
+    apt-get install -y ffmpeg libsm6 libxext6 libgl1 
 WORKDIR /workspace
 
 # copy requirements
diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le
index 16780f8ab950c..3313162bf28e1 100644
--- a/Dockerfile.ppc64le
+++ b/Dockerfile.ppc64le
@@ -4,7 +4,7 @@ USER root
 
 ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"
 
-RUN apt-get update -y && apt-get install -y git wget vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential
+RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 
 
 # Some packages in requirements-cpu are installed here
 # IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
@@ -16,7 +16,7 @@ COPY ./ /workspace/vllm
 WORKDIR /workspace/vllm
 
 # These packages will be in rocketce eventually
-RUN pip install -v cmake torch==2.3.1 uvloop==0.20.0 -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
+RUN pip install -v cmake xformers torch==2.3.1 uvloop==0.20.0 -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
 
 RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
 
@@ -25,4 +25,3 @@ WORKDIR /workspace/
 RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
 
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
-
diff --git a/Dockerfile.tpu b/Dockerfile.tpu
index 3a11c6721ead9..04cd4d79f4045 100644
--- a/Dockerfile.tpu
+++ b/Dockerfile.tpu
@@ -4,6 +4,9 @@ ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:night
 FROM $BASE_IMAGE
 WORKDIR /workspace
 
+# Install some basic utilities
+RUN apt-get update && apt-get install -y ffmpeg libsm6 libxext6 libgl1 
+
 # Install the TPU and Pallas dependencies.
 RUN python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
 RUN python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
diff --git a/Dockerfile.xpu b/Dockerfile.xpu
index f91baa11a3753..50bbd8f7dad87 100644
--- a/Dockerfile.xpu
+++ b/Dockerfile.xpu
@@ -1,15 +1,22 @@
-FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu20.04
+FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04
 
 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
     echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
     chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
-    rm /etc/apt/sources.list.d/intel-graphics.list && \
     wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
     echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
     chmod 644 /usr/share/keyrings/intel-graphics.gpg
 
 RUN apt-get update  -y \
-&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip
+&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip ffmpeg libsm6 libxext6 libgl1 
+
+RUN git clone https://github.com/intel/pti-gpu && \
+    cd pti-gpu/sdk && \
+    mkdir build && \
+    cd build && \
+    cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/icpx_toolchain.cmake -DBUILD_TESTING=OFF .. && \
+    make -j && \
+    cmake --install . --config Release --prefix "/usr/local"
 
 COPY ./ /workspace/vllm
 
diff --git a/README.md b/README.md
index 9ae30f8d2de55..53749cb36b972 100644
--- a/README.md
+++ b/README.md
@@ -17,15 +17,16 @@ Easy, fast, and cheap LLM serving for everyone
 
 ---
 
-**vLLM & NVIDIA Triton User Meetup (Monday, September 9, 5pm-9pm PT) at Fort Mason, San Francisco**
+**vLLM, AMD, Anyscale Meet & Greet at [Ray Summit 2024](http://raysummit.anyscale.com) (Monday, Sept 30th, 5-7pm PT) at Marriott Marquis San Francisco**
 
-We are excited to announce our sixth vLLM Meetup, in collaboration with NVIDIA Triton Team.
-Join us to hear the vLLM's recent update about performance.
-Register now [here](https://lu.ma/87q3nvnh) and be part of the event!
+We are excited to announce our special vLLM event in collaboration with AMD and Anyscale.
+Join us to learn more about recent advancements of vLLM on MI300X.
+Register [here](https://lu.ma/db5ld9n5) and be a part of the event!
 
 ---
 
 *Latest News* 🔥
+- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing).
 - [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
 - [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
 - [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
@@ -130,3 +131,10 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
   year={2023}
 }
 ```
+
+## Contact Us
+
+* For technical questions and feature requests, please use Github issues or discussions.
+* For discussing with fellow users, please use Discord.
+* For security disclosures, please use Github's security advisory feature.
+* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
\ No newline at end of file
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 97afd301c8f24..a39d1cf842f06 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -10,7 +10,7 @@
 from tqdm import tqdm
 
 from vllm import LLM, SamplingParams
-from vllm.engine.arg_utils import EngineArgs
+from vllm.engine.arg_utils import DEVICE_OPTIONS, EngineArgs
 from vllm.inputs import PromptInputs
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.utils import FlexibleArgumentParser
@@ -205,13 +205,11 @@ def run_to_completion(profile_dir: Optional[str] = None):
         default=None,
         help=('path to save the pytorch profiler output. Can be visualized '
               'with ui.perfetto.dev or Tensorboard.'))
-    parser.add_argument(
-        "--device",
-        type=str,
-        default="auto",
-        choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
-        help='device type for vLLM execution, supporting CUDA, OpenVINO and '
-        'CPU.')
+    parser.add_argument("--device",
+                        type=str,
+                        default="auto",
+                        choices=DEVICE_OPTIONS,
+                        help='device type for vLLM execution')
     parser.add_argument('--block-size',
                         type=int,
                         default=16,
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 94549d84fb4e4..3f531ee82cc94 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -11,7 +11,7 @@
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
                           PreTrainedTokenizerBase)
 
-from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
+from vllm.engine.arg_utils import DEVICE_OPTIONS, AsyncEngineArgs, EngineArgs
 from vllm.entrypoints.openai.api_server import (
     build_async_engine_client_from_engine_args)
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
@@ -451,13 +451,11 @@ def main(args: argparse.Namespace):
         'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
         'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
         'instead supported for common inference criteria.')
-    parser.add_argument(
-        "--device",
-        type=str,
-        default="auto",
-        choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
-        help='device type for vLLM execution, supporting CUDA, OpenVINO and '
-        'CPU.')
+    parser.add_argument("--device",
+                        type=str,
+                        default="auto",
+                        choices=DEVICE_OPTIONS,
+                        help='device type for vLLM execution')
     parser.add_argument(
         "--num-scheduler-steps",
         type=int,
diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py
index ce0d9db3068c1..b0c23fee5b373 100644
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -145,7 +145,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
                         v_scale,
                     )
                 else:
-                    ops.paged_attention_custom(
+                    ops.paged_attention_rocm(
                         output,
                         exp_sums,
                         max_logits,
@@ -161,6 +161,8 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
                         max_seq_len,
                         alibi_slopes,
                         kv_cache_dtype,
+                        k_scale,
+                        v_scale,
                     )
             else:
                 raise ValueError(f"Invalid version: {version}")
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index 3ba3a2b6a93cd..8470e9ea9ebd9 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -1,4 +1,5 @@
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+set(CMAKE_CXX_STANDARD 17)
 
 #
 # Define environment variables for special configurations
@@ -83,12 +84,7 @@ endif()
 
 message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
 
-list(APPEND LIBS "numa")
-
-
-#
-# Define extension targets
-#
+list(APPEND LIBS dnnl numa)
 
 #
 # _C extension
@@ -102,6 +98,16 @@ set(VLLM_EXT_SRC
     "csrc/cpu/pos_encoding.cpp"
     "csrc/cpu/torch_bindings.cpp")
 
+if (AVX512_FOUND AND NOT AVX512_DISABLED)
+    set(VLLM_EXT_SRC
+        "csrc/cpu/quant.cpp"
+        ${VLLM_EXT_SRC})
+endif()
+
+#
+# Define extension targets
+#
+
 define_gpu_extension_target(
     _C
     DESTINATION vllm
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 69998b45be70a..1ea6d2b0f090e 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -350,6 +350,7 @@ function (define_gpu_extension_target GPU_MOD_NAME)
   target_include_directories(${GPU_MOD_NAME} PRIVATE csrc
     ${GPU_INCLUDE_DIRECTORIES})
 
+  # TODO: is torch_python_LIBRARY needed?
   target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${torch_python_LIBRARY}
     ${GPU_LIBRARIES})
 
diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp
index f50620a5287d4..5b1d3d6442b2b 100644
--- a/csrc/cpu/cpu_types_x86.hpp
+++ b/csrc/cpu/cpu_types_x86.hpp
@@ -24,8 +24,8 @@ namespace vec_op {
 #define CPU_KERNEL_GUARD_OUT(NAME)
 #else
 #define CPU_KERNEL_GUARD_IN(NAME)                                              \
-  std::cout << #NAME << " invoked." << std::endl;
-#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl;
+  RECORD_FUNCTION(#NAME, c10::ArrayRef<c10::IValue>({}));
+#define CPU_KERNEL_GUARD_OUT(NAME)
 #endif
 
 #define FORCE_INLINE __attribute__((always_inline)) inline
@@ -106,6 +106,12 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
   explicit BF16Vec16(const FP32Vec16 &);
 
   void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; }
+
+  void save(void* ptr, const int elem_num) const {
+    constexpr uint32_t M = 0xFFFFFFFF;
+    __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
+    _mm256_mask_storeu_epi16(ptr, mask, reg);
+  }
 };
 
 #ifdef __AVX512F__
@@ -313,8 +319,28 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
     return FP32Vec16(_mm512_div_ps(reg, b.reg));
   }
 
+  FP32Vec16 clamp(const FP32Vec16& min, const FP32Vec16& max) const {
+    return FP32Vec16(_mm512_min_ps(max.reg, _mm512_max_ps(min.reg, reg)));
+  }
+
+  FP32Vec16 max(const FP32Vec16& b) const {
+    return FP32Vec16(_mm512_max_ps(reg, b.reg));
+  }
+
+  FP32Vec16 max(const FP32Vec16& b, const int elem_num) const {
+    constexpr uint32_t M = 0xFFFFFFFF;
+    __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
+    return FP32Vec16(_mm512_mask_max_ps(reg, mask, reg, b.reg));
+  }
+
+  FP32Vec16 abs() const {
+    return FP32Vec16(_mm512_abs_ps(reg));
+  } 
+
   float reduce_sum() const { return _mm512_reduce_add_ps(reg); }
 
+  float reduce_max() const { return _mm512_reduce_max_ps(reg); }
+
   template <int group_size> float reduce_sub_sum(int idx) {
     static_assert(VEC_ELEM_NUM % group_size == 0);
     constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
@@ -323,6 +349,12 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
   }
 
   void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); }
+
+  void save(float* ptr, const int elem_num) const {
+    constexpr uint32_t M = 0xFFFFFFFF;
+    __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
+    _mm512_mask_storeu_ps(ptr, mask, reg);
+  }
 };
 #else
 struct FP32Vec16 : public Vec<FP32Vec16> {
@@ -433,6 +465,32 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
 };
 #endif
 
+#ifdef __AVX512F__
+struct INT8Vec16: public Vec<INT8Vec16> {
+  constexpr static int VEC_ELEM_NUM = 16;
+  union AliasReg {
+    __m128i reg;
+    int8_t values[VEC_ELEM_NUM];
+  };
+
+  __m128i reg;
+  
+  explicit INT8Vec16(const FP32Vec16& vec) : reg(
+    _mm512_cvtepi32_epi8(_mm512_cvt_roundps_epi32(vec.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))
+  ) {}
+
+  void save(int8_t* ptr) const {
+    _mm_storeu_epi8(ptr, reg);
+  }
+
+  void save(int8_t* ptr, const int elem_num) const {
+    constexpr uint32_t M = 0xFFFFFFFF;
+    __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
+    _mm_mask_storeu_epi8(ptr, mask, reg);
+  }
+};
+#endif
+
 template <typename T> struct VecType { using vec_type = void; };
 
 template <typename T> using vec_t = typename VecType<T>::vec_type;
diff --git a/csrc/cpu/dnnl_helper.hpp b/csrc/cpu/dnnl_helper.hpp
new file mode 100644
index 0000000000000..024ad4ae43da8
--- /dev/null
+++ b/csrc/cpu/dnnl_helper.hpp
@@ -0,0 +1,168 @@
+#ifndef DNNL_HELPER_HPP
+#define DNNL_HELPER_HPP
+
+#include <c10/util/BFloat16.h>
+
+#include "oneapi/dnnl/dnnl.hpp"
+
+namespace {
+template <typename T>
+struct DNNLType {
+  static constexpr dnnl::memory::data_type type =
+      dnnl::memory::data_type::undef;
+};
+
+template <>
+struct DNNLType<int8_t> {
+  static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s8;
+};
+
+template <>
+struct DNNLType<int32_t> {
+  static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s32;
+};
+
+template <>
+struct DNNLType<float> {
+  static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f32;
+};
+
+template <>
+struct DNNLType<c10::BFloat16> {
+  static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16;
+};
+
+template <typename T>
+constexpr inline dnnl::memory::data_type get_dnnl_type() {
+  return DNNLType<std::decay_t<T>>::type;
+}
+};  // namespace
+
+template <bool InputNoScale>
+class DNNLPrimitiveHelper {
+ public:
+  // I8 input GEMM kernel (C = a_scales * A @ (b_scales * B^T) + bias)
+  // A: [M, K], row-major
+  // B: [K, N], column-major
+  // C: [M, N], row-major
+  // bias: [N], row-major, optional
+  // a_scales: [MS]
+  // b_scales: [NS]
+  // Note: Due to the limitation of oneDNN
+  // (https://github.com/oneapi-src/oneDNN/issues/1636), the quantized bias is
+  // not supported.
+  template <typename OutputT, typename BiasT>
+  static void gemm_s8s8_jit(const int8_t* a, const int8_t* b, OutputT* c,
+                            const BiasT* bias, dnnl_dim_t M, dnnl_dim_t N,
+                            dnnl_dim_t K, const float* a_scales,
+                            const float* b_scales, dnnl_dim_t MS,
+                            dnnl_dim_t NS) {
+    auto&& OutputType = get_dnnl_type<OutputT>();
+    auto&& BiasType = get_dnnl_type<BiasT>();
+
+    dnnl::memory::desc a_md({M, K}, dnnl::memory::data_type::s8, {K, 1});
+    dnnl::memory::desc b_md({K, N}, dnnl::memory::data_type::s8, {1, K});
+    dnnl::memory::desc c_md({M, N}, OutputType, {N, 1});
+
+    dnnl::primitive_attr attr;
+    if constexpr (!InputNoScale) {
+      if (MS == 1) {
+        // per-tensor
+        attr.set_scales_mask(DNNL_ARG_SRC, 0);
+      } else {
+        // per-token
+        TORCH_CHECK(false, "per-token quantization is unsupported.");
+      }
+    }
+
+    if (NS == 1) {
+      // per-tensor
+      attr.set_scales_mask(DNNL_ARG_WEIGHTS, 0);
+    } else {
+      // per-channel
+      attr.set_scales_mask(DNNL_ARG_WEIGHTS, 2);
+    }
+
+    dnnl::matmul::primitive_desc matmul_pd;
+    if (bias) {
+      dnnl::memory::desc bias_md({1, N}, BiasType, {N, 1});
+      matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md,
+                                               bias_md, c_md, attr);
+    } else {
+      matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md,
+                                               c_md, attr);
+    }
+    dnnl::matmul matmul(matmul_pd);
+
+    auto& engine = default_engine();
+
+    dnnl::memory a_m(a_md, engine, (void*)a);
+    dnnl::memory b_m(b_md, engine, (void*)b);
+    dnnl::memory c_m(c_md, engine, (void*)c);
+    dnnl::memory a_scales_m({{MS}, dnnl::memory::data_type::f32, {1}}, engine,
+                            (void*)a_scales);
+    dnnl::memory b_scales_m({{NS}, dnnl::memory::data_type::f32, {1}}, engine,
+                            (void*)b_scales);
+
+    auto& stream = default_stream();
+    if constexpr (InputNoScale) {
+      if (bias) {
+        dnnl::memory::desc bias_md({N}, BiasType, {1});
+        dnnl::memory bias_m(bias_md, engine, (void*)bias);
+        matmul.execute(
+            stream, {
+                        {DNNL_ARG_SRC, a_m},
+                        {DNNL_ARG_WEIGHTS, b_m},
+                        {DNNL_ARG_BIAS, bias_m},
+                        {DNNL_ARG_DST, c_m},
+                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
+                    });
+      } else {
+        matmul.execute(
+            stream, {
+                        {DNNL_ARG_SRC, a_m},
+                        {DNNL_ARG_WEIGHTS, b_m},
+                        {DNNL_ARG_DST, c_m},
+                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
+                    });
+      }
+    } else {
+      if (bias) {
+        dnnl::memory::desc bias_md({N}, BiasType, {1});
+        dnnl::memory bias_m(bias_md, engine, (void*)bias);
+        matmul.execute(
+            stream, {
+                        {DNNL_ARG_SRC, a_m},
+                        {DNNL_ARG_WEIGHTS, b_m},
+                        {DNNL_ARG_BIAS, bias_m},
+                        {DNNL_ARG_DST, c_m},
+                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m},
+                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
+                    });
+      } else {
+        matmul.execute(
+            stream, {
+                        {DNNL_ARG_SRC, a_m},
+                        {DNNL_ARG_WEIGHTS, b_m},
+                        {DNNL_ARG_DST, c_m},
+                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m},
+                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
+                    });
+      }
+    }
+    stream.wait();
+  }
+
+ private:
+  static dnnl::engine& default_engine() {
+    static dnnl::engine engine(dnnl::engine::kind::cpu, 0);
+    return engine;
+  }
+
+  static dnnl::stream& default_stream() {
+    static dnnl::stream stream(default_engine());
+    return stream;
+  }
+};
+
+#endif
diff --git a/csrc/cpu/quant.cpp b/csrc/cpu/quant.cpp
new file mode 100644
index 0000000000000..0cfc19097fded
--- /dev/null
+++ b/csrc/cpu/quant.cpp
@@ -0,0 +1,294 @@
+#include "cpu_types.hpp"
+#include "dnnl_helper.hpp"
+
+namespace {
+template <typename scalar_t>
+struct KernelVecType {
+  using load_vec_type = void;
+  using cvt_vec_type = void;
+};
+
+template <>
+struct KernelVecType<float> {
+  using load_vec_type = vec_op::FP32Vec16;
+  using cvt_vec_type = vec_op::FP32Vec16;
+};
+
+template <>
+struct KernelVecType<c10::BFloat16> {
+  using load_vec_type = vec_op::BF16Vec16;
+  using cvt_vec_type = vec_op::FP32Vec16;
+};
+
+#ifdef __AVX512F__
+template <typename scalar_t>
+void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
+                                   const float* scale, const int num_tokens,
+                                   const int hidden_size) {
+  using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
+  using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
+  constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
+
+  constexpr float i8_min =
+      static_cast<float>(std::numeric_limits<int8_t>::min());
+  constexpr float i8_max =
+      static_cast<float>(std::numeric_limits<int8_t>::max());
+  const cvt_vec_t inv_scale(1.0 / *scale);
+  const cvt_vec_t i8_min_vec(i8_min);
+  const cvt_vec_t i8_max_vec(i8_max);
+
+  #pragma omp parallel for
+  for (int i = 0; i < num_tokens; ++i) {
+    int j = 0;
+    for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
+      load_vec_t elems(input + i * hidden_size + j);
+      cvt_vec_t elems_fp32(elems);
+      elems_fp32 = (elems_fp32 * inv_scale).clamp(i8_min_vec, i8_max_vec);
+      vec_op::INT8Vec16 elems_int8(elems_fp32);
+      elems_int8.save(output + i * hidden_size + j);
+    }
+
+    load_vec_t elems(input + i * hidden_size + j);
+    cvt_vec_t elems_fp32(elems);
+    elems_fp32 = (elems_fp32 * inv_scale).clamp(i8_min_vec, i8_max_vec);
+    vec_op::INT8Vec16 elems_int8(elems_fp32);
+
+    if (j + vec_elem_num == hidden_size) {
+      elems_int8.save(output + i * hidden_size + j);
+    } else {
+      elems_int8.save(output + i * hidden_size + j, hidden_size - j);
+    }
+  }
+}
+
+template <typename scalar_t>
+void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
+                                    float* scale, const int num_tokens,
+                                    const int hidden_size) {
+  using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
+  using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
+  constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
+
+  #pragma omp parallel for
+  for (int i = 0; i < num_tokens; ++i) {
+    cvt_vec_t max_abs(0.0);
+    {
+      int j = 0;
+      for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
+        load_vec_t elems(input + i * hidden_size + j);
+        cvt_vec_t elems_fp32(elems);
+        max_abs = max_abs.max(elems_fp32.abs());
+      }
+
+      load_vec_t elems(input + i * hidden_size + j);
+      cvt_vec_t elems_fp32(elems);
+
+      if (j + vec_elem_num == hidden_size) {
+        max_abs = max_abs.max(elems_fp32.abs());
+      } else {
+        max_abs = max_abs.max(elems_fp32.abs(), hidden_size - j);
+      }
+    }
+
+    float scale_val = max_abs.reduce_max() / 127.0f;
+    scale[i] = scale_val;
+    const cvt_vec_t inv_scale(1.0 / scale_val);
+
+    {
+      int j = 0;
+      for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
+        load_vec_t elems(input + i * hidden_size + j);
+        cvt_vec_t elems_fp32(elems);
+        elems_fp32 = (elems_fp32 * inv_scale);
+        vec_op::INT8Vec16 elems_int8(elems_fp32);
+        elems_int8.save(output + i * hidden_size + j);
+      }
+
+      load_vec_t elems(input + i * hidden_size + j);
+      cvt_vec_t elems_fp32(elems);
+      elems_fp32 = (elems_fp32 * inv_scale);
+      vec_op::INT8Vec16 elems_int8(elems_fp32);
+
+      if (j + vec_elem_num == hidden_size) {
+        elems_int8.save(output + i * hidden_size + j);
+      } else {
+        elems_int8.save(output + i * hidden_size + j, hidden_size - j);
+      }
+    }
+  }
+}
+
+template <bool Bias, typename scalar_t>
+void dynamic_output_scale_impl(const float* input, scalar_t* output,
+                               const float* scale, const scalar_t* bias,
+                               const int num_tokens, const int hidden_size) {
+  CPU_KERNEL_GUARD_IN(dynamic_output_scale_impl)
+  using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
+  using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
+  constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
+
+  #pragma omp parallel for
+  for (int i = 0; i < num_tokens; ++i) {
+    int j = 0;
+    cvt_vec_t token_scale_vec(scale[i]);
+    for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
+      cvt_vec_t elems_fp32(input + i * hidden_size + j);
+      elems_fp32 = elems_fp32 * token_scale_vec;
+
+      if constexpr (Bias) {
+        load_vec_t bias_vec(bias + j);
+        cvt_vec_t bias_vec_fp32(bias_vec);
+        elems_fp32 = elems_fp32 + bias_vec_fp32;
+      }
+
+      load_vec_t elems_out(elems_fp32);
+      elems_out.save(output + i * hidden_size + j);
+    }
+
+    cvt_vec_t elems_fp32(input + i * hidden_size + j);
+    elems_fp32 = elems_fp32 * token_scale_vec;
+
+    if constexpr (Bias) {
+      load_vec_t bias_vec(bias + j);
+      cvt_vec_t bias_vec_fp32(bias_vec);
+      elems_fp32 = elems_fp32 + bias_vec_fp32;
+    }
+
+    load_vec_t elems_out(elems_fp32);
+
+    if (j + vec_elem_num == hidden_size) {
+      elems_out.save(output + i * hidden_size + j);
+    } else {
+      elems_out.save(output + i * hidden_size + j, hidden_size - j);
+    }
+  }
+}
+#else
+template <typename scalar_t>
+void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
+                                   const float* scale, const int num_tokens,
+                                   const int hidden_size) {
+  TORCH_CHECK(false, "static_scaled_int8_quant_impl requires AVX512 support.")
+}
+
+template <typename scalar_t>
+void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
+                                    float* scale, const int num_tokens,
+                                    const int hidden_size) {
+  TORCH_CHECK(false, "dynamic_scaled_int8_quant_impl requires AVX512 support.")
+}
+
+template <typename scalar_t>
+void dynamic_output_scale_impl() {
+  TORCH_CHECK(false, "dynamic_output_scale_impl requires AVX512 support.")
+}
+#endif
+}  // namespace
+
+void int8_scaled_mm(torch::Tensor& c,               // [M, OC], row-major
+                    const torch::Tensor& a,         // [M, IC], row-major
+                    const torch::Tensor& b,         // [IC, OC], column-major
+                    const torch::Tensor& a_scales,  // [1] or [M]
+                    const torch::Tensor& b_scales,  // [1] or [OC]
+                    const c10::optional<torch::Tensor>& bias  // [OC]
+) {
+  CPU_KERNEL_GUARD_IN(cutlass_scaled_mm)
+  // Checks for conformality
+  TORCH_CHECK(a.dtype() == torch::kInt8 && b.dtype() == torch::kInt8,
+              "int8_scaled_mm only supports INT8 inputs.")
+  TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2);
+  TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) &&
+              b.size(1) == c.size(1));
+  TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0));
+  TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1));
+
+  // Check for strides and alignment
+  TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1);  // Row-major
+  TORCH_CHECK(b.stride(0) == 1);                      // Column-major
+  TORCH_CHECK(c.stride(0) % 16 == 0 &&
+              b.stride(1) % 16 == 0);  // 16 Byte Alignment
+  TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
+
+  if (bias) {
+    TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous() &&
+                bias->dim() == 1);
+  }
+
+  VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "cutlass_scaled_mm", [&] {
+    if (a_scales.numel() != 1) {
+      // per-token
+      // Note: oneDNN doesn't support per-token activation quantization
+      torch::Tensor tmp_fp32_out =
+          torch::empty_like(c, ::at::ScalarType::Float);
+      DNNLPrimitiveHelper<true>::gemm_s8s8_jit(
+          a.data_ptr<int8_t>(), b.data_ptr<int8_t>(),
+          tmp_fp32_out.data_ptr<float>(), (void*)(0), a.size(0), b.size(1),
+          a.size(1), (float*)(0), b_scales.data_ptr<float>(), 0,
+          b_scales.numel());
+      if (bias.has_value()) {
+        dynamic_output_scale_impl<true>(
+            tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
+            a_scales.data_ptr<float>(), bias->data_ptr<scalar_t>(), c.size(0),
+            c.size(1));
+      } else {
+        dynamic_output_scale_impl<false>(
+            tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
+            a_scales.data_ptr<float>(), (scalar_t*)(0), c.size(0), c.size(1));
+      }
+    } else {
+      // per-tensor
+      if (bias.has_value()) {
+        DNNLPrimitiveHelper<false>::gemm_s8s8_jit(
+            a.data_ptr<int8_t>(), b.data_ptr<int8_t>(), c.data_ptr<scalar_t>(),
+            bias->data_ptr<scalar_t>(), a.size(0), b.size(1), a.size(1),
+            a_scales.data_ptr<float>(), b_scales.data_ptr<float>(),
+            a_scales.numel(), b_scales.numel());
+      } else {
+        DNNLPrimitiveHelper<false>::gemm_s8s8_jit(
+            a.data_ptr<int8_t>(), b.data_ptr<int8_t>(), c.data_ptr<scalar_t>(),
+            (void*)(0), a.size(0), b.size(1), a.size(1),
+            a_scales.data_ptr<float>(), b_scales.data_ptr<float>(),
+            a_scales.numel(), b_scales.numel());
+      }
+    }
+  });
+}
+
+// static-per-tensor quantization.
+void static_scaled_int8_quant(torch::Tensor& out,          // [..., hidden_size]
+                              const torch::Tensor& input,  // [..., hidden_size]
+                              const torch::Tensor& scale) {
+  CPU_KERNEL_GUARD_IN(static_scaled_int8_quant)
+  TORCH_CHECK(input.is_contiguous());
+  TORCH_CHECK(out.is_contiguous());
+  TORCH_CHECK(scale.numel() == 1);
+
+  const int hidden_size = input.size(-1);
+  const int num_tokens = input.numel() / hidden_size;
+  VLLM_DISPATCH_FLOATING_TYPES(
+      input.scalar_type(), "static_scaled_int8_quant_impl", [&] {
+        static_scaled_int8_quant_impl(
+            input.data_ptr<scalar_t>(), out.data_ptr<int8_t>(),
+            scale.data_ptr<float>(), num_tokens, hidden_size);
+      });
+}
+
+// dynamic-per-token quantization.
+void dynamic_scaled_int8_quant(
+    torch::Tensor& out,          // [..., hidden_size]
+    const torch::Tensor& input,  // [..., hidden_size]
+    torch::Tensor& scale         // [..., 1]
+) {
+  CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant)
+  TORCH_CHECK(input.is_contiguous());
+  TORCH_CHECK(out.is_contiguous());
+
+  int const hidden_size = input.size(-1);
+  int const num_tokens = input.numel() / hidden_size;
+  VLLM_DISPATCH_FLOATING_TYPES(
+      input.scalar_type(), "dynamic_scaled_int8_quant_impl", [&] {
+        dynamic_scaled_int8_quant_impl(
+            input.data_ptr<scalar_t>(), out.data_ptr<int8_t>(),
+            scale.data_ptr<float>(), num_tokens, hidden_size);
+      });
+}
diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp
index cf7d977da7c1c..b45da1b386b5b 100644
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@@ -4,7 +4,12 @@
 
 #include <torch/library.h>
 
-void init_cpu_threads_env(const std::string& cpu_ids);
+std::string init_cpu_threads_env(const std::string& cpu_ids);
+
+void int8_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
+                    const torch::Tensor& b, const torch::Tensor& a_scales,
+                    const torch::Tensor& b_scales,
+                    const c10::optional<torch::Tensor>& bias);
 
 TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   // vLLM custom ops
@@ -27,8 +32,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   // PagedAttention V2.
   ops.def(
       "paged_attention_v2("
-      "    Tensor! out, Tensor exp_sums, Tensor max_logits,"
-      "    Tensor tmp_out, Tensor query, Tensor key_cache,"
+      "    Tensor! out, Tensor! exp_sums, Tensor! max_logits,"
+      "    Tensor! tmp_out, Tensor query, Tensor key_cache,"
       "    Tensor value_cache, int num_kv_heads, float scale,"
       "    Tensor block_tables, Tensor seq_lens, int block_size,"
       "    int max_seq_len, Tensor? alibi_slopes,"
@@ -84,6 +89,28 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "                 Tensor! key, int head_size,"
       "                 Tensor cos_sin_cache, bool is_neox) -> ()");
   ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding);
+
+  // Quantization
+#ifdef __AVX512F__
+  // Compute int8 quantized tensor for given scaling factor.
+  ops.def(
+      "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale) -> "
+      "()");
+  ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant);
+  // Compute int8 quantized tensor and scaling factor
+  ops.def(
+      "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale) -> "
+      "()");
+  ops.impl("dynamic_scaled_int8_quant", torch::kCPU,
+           &dynamic_scaled_int8_quant);
+  // W8A8 GEMM, supporting symmetric per-tensor or per-row/column
+  // quantization.
+  ops.def(
+      "cutlass_scaled_mm(Tensor! out, Tensor a,"
+      "                  Tensor b, Tensor a_scales,"
+      "                  Tensor b_scales, Tensor? bias) -> ()");
+  ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm);
+#endif
 }
 
 TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
@@ -95,8 +122,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
 
   // Copy the cache blocks from src to dst.
   cache_ops.def(
-      "copy_blocks(Tensor[]! key_caches, Tensor[]! value_caches, Tensor "
-      "block_mapping) -> ()");
+      "copy_blocks(Tensor(a!)[] key_caches, Tensor[](b!) value_caches, "
+      "Tensor block_mapping) -> ()");
   cache_ops.impl("copy_blocks", torch::kCPU, &copy_blocks);
 
   // Reshape the key and value tensors and cache them.
@@ -111,7 +138,7 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
 
 TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {
   // CPU utils
-  utils.def("init_cpu_threads_env(str cpu_ids) -> ()", &init_cpu_threads_env);
+  utils.def("init_cpu_threads_env(str cpu_ids) -> str", &init_cpu_threads_env);
 }
 
 REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
diff --git a/csrc/cpu/utils.cpp b/csrc/cpu/utils.cpp
index 5782580baa861..1138a55df2f05 100644
--- a/csrc/cpu/utils.cpp
+++ b/csrc/cpu/utils.cpp
@@ -5,7 +5,7 @@
 
 #include "cpu_types.hpp"
 
-void init_cpu_threads_env(const std::string& cpu_ids) {
+std::string init_cpu_threads_env(const std::string& cpu_ids) {
   bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str());
   TORCH_CHECK(omp_cpu_mask->size > 0);
   std::vector<int> omp_cpu_ids;
@@ -51,15 +51,40 @@ void init_cpu_threads_env(const std::string& cpu_ids) {
   torch::set_num_threads((int)omp_cpu_ids.size());
   TORCH_CHECK_EQ(omp_cpu_ids.size(), torch::get_num_threads());
   TORCH_CHECK_EQ(omp_cpu_ids.size(), omp_get_max_threads());
+
+  std::vector<std::pair<int, int>> thread_core_mapping;
+  thread_core_mapping.reserve(omp_cpu_ids.size());
+  omp_lock_t writelock;
+  omp_init_lock(&writelock);
+
 #pragma omp parallel for schedule(static, 1)
   for (size_t i = 0; i < omp_cpu_ids.size(); ++i) {
-    cpu_set_t* mask = CPU_ALLOC(omp_cpu_mask->size);
-    size_t size = CPU_ALLOC_SIZE(omp_cpu_mask->size);
-    CPU_ZERO_S(size, mask);
-    CPU_SET_S(omp_cpu_ids[i], size, mask);
-    sched_setaffinity(0, sizeof(cpu_set_t), mask);
-    CPU_FREE(mask);
+    cpu_set_t mask;
+    CPU_ZERO(&mask);
+    CPU_SET(omp_cpu_ids[i], &mask);
+    int ret = sched_setaffinity(0, sizeof(cpu_set_t), &mask);
+    if (ret == -1) {
+      TORCH_CHECK(false,
+                  "sched_setaffinity failed. errno: " + std::to_string(errno));
+    }
+
+    omp_set_lock(&writelock);
+    thread_core_mapping.emplace_back(gettid(), omp_cpu_ids[i]);
+    omp_unset_lock(&writelock);
   }
 
+  omp_destroy_lock(&writelock);
+
   numa_free_nodemask(omp_cpu_mask);
+
+  std::stringstream ss;
+  ss << "OMP threads binding of Process " << getpid() << ":\n";
+  std::sort(thread_core_mapping.begin(), thread_core_mapping.end(),
+            [](auto&& a, auto&& b) { return a.second < b.second; });
+  for (auto&& item : thread_core_mapping) {
+    ss << "\t"
+       << "OMP tid: " << item.first << ", core " << item.second << "\n";
+  }
+
+  return ss.str();
 }
diff --git a/csrc/custom/custom_ops.h b/csrc/custom/custom_ops.h
deleted file mode 100644
index f8ab5ee5544df..0000000000000
--- a/csrc/custom/custom_ops.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#pragma once
-#include <torch/all.h>
-
-void LLMM_Silu(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c,
-               const int64_t rows_per_block);
-
-void LLMM1(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c,
-           const int64_t rows_per_block);
-
-void wvSpltK(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c,
-             const int64_t N_in, const int64_t CuCount);
-
-void paged_attention_custom(torch::Tensor& out, torch::Tensor& exp_sums,
-                            torch::Tensor& max_logits, torch::Tensor& tmp_out,
-                            torch::Tensor& query, torch::Tensor& key_cache,
-                            torch::Tensor& value_cache, int64_t num_kv_heads,
-                            double scale, torch::Tensor& block_tables,
-                            torch::Tensor& context_lens, int64_t block_size,
-                            int64_t max_context_len,
-                            const c10::optional<torch::Tensor>& alibi_slopes,
-                            const std::string& kv_cache_dtype, double k_scale,
-                            double v_scale);
diff --git a/csrc/custom/torch_bindings.cpp b/csrc/custom/torch_bindings.cpp
deleted file mode 100644
index dc26ac5e57204..0000000000000
--- a/csrc/custom/torch_bindings.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-#include "core/registration.h"
-#include "custom_ops.h"
-
-TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, custom_ops) {
-  custom_ops.def(
-      "LLMM1(Tensor in_a, Tensor in_b, Tensor! out_c, int rows_per_block) -> "
-      "()");
-  custom_ops.impl("LLMM1", torch::kCUDA, &LLMM1);
-  custom_ops.def(
-      "LLMM_Silu(Tensor in_a, Tensor in_b, Tensor! out_c, int rows_per_block) "
-      "-> ()");
-  custom_ops.impl("LLMM_Silu", torch::kCUDA, &LLMM_Silu);
-  custom_ops.def(
-      "paged_attention_custom(Tensor! out, Tensor exp_sums,"
-      "                       Tensor max_logits, Tensor tmp_out,"
-      "                       Tensor query, Tensor key_cache,"
-      "                       Tensor value_cache, int num_kv_heads,"
-      "                       float scale, Tensor block_tables,"
-      "                       Tensor context_lens, int block_size,"
-      "                       int max_context_len,"
-      "                       Tensor? alibi_slopes,"
-      "                       str kv_cache_dtype,"
-      "                       float k_scale, float v_scale) -> ()");
-  custom_ops.impl("paged_attention_custom", torch::kCUDA,
-                  &paged_attention_custom);
-  custom_ops.def(
-      "wvSpltK(Tensor in_a, Tensor in_b, Tensor! out_c, int N_in,"
-      "        int CuCount) -> ()");
-  custom_ops.impl("wvSpltK", torch::kCUDA, &wvSpltK);
-}
-REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
diff --git a/csrc/moe/marlin_moe_ops.cu b/csrc/moe/marlin_moe_ops.cu
index 1e170e80d2f70..92184f43c9eb0 100644
--- a/csrc/moe/marlin_moe_ops.cu
+++ b/csrc/moe/marlin_moe_ops.cu
@@ -1737,4 +1737,4 @@ torch::Tensor marlin_gemm_moe(
       moe_block_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k,
       thread_n, sms, max_par, replicate_input, apply_weights);
   return c;
-}
\ No newline at end of file
+}
diff --git a/csrc/moe/marlin_moe_ops.h b/csrc/moe/marlin_moe_ops.h
index 01ba8ff69850d..43d264e0770d6 100644
--- a/csrc/moe/marlin_moe_ops.h
+++ b/csrc/moe/marlin_moe_ops.h
@@ -9,4 +9,4 @@ torch::Tensor marlin_gemm_moe(
     const torch::Tensor& g_idx, const torch::Tensor& perm,
     torch::Tensor& workspace, int64_t size_m, int64_t size_n, int64_t size_k,
     bool is_k_full, int64_t num_experts, int64_t topk, int64_t moe_block_size,
-    bool replicate_input, bool apply_weights);
\ No newline at end of file
+    bool replicate_input, bool apply_weights);
diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp
index d4d43e2c601b5..8a0e625b43fa1 100644
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -16,7 +16,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
       "g_idx, Tensor! perm, Tensor! workspace, int size_m, int size_n, int "
       "size_k, bool is_k_full, int num_experts, int topk, int moe_block_size, "
       "bool replicate_input, bool apply_weights) -> Tensor");
-
   m.impl("marlin_gemm_moe", torch::kCUDA, &marlin_gemm_moe);
 #endif
 }
diff --git a/csrc/ops.h b/csrc/ops.h
index 10f337f98ccbc..dab0b8c6dbf2f 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -65,10 +65,21 @@ void gelu_fast(torch::Tensor& out, torch::Tensor& input);
 
 void gelu_quick(torch::Tensor& out, torch::Tensor& input);
 
-void advance_step(int64_t num_seqs, int64_t num_queries, int64_t block_size,
-                  torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
-                  torch::Tensor& input_positions, torch::Tensor& seq_lens,
-                  torch::Tensor& slot_mapping, torch::Tensor& block_tables);
+void advance_step_flashattn(int64_t num_seqs, int64_t num_queries,
+                            int64_t block_size, torch::Tensor& input_tokens,
+                            torch::Tensor& sampled_token_ids,
+                            torch::Tensor& input_positions,
+                            torch::Tensor& seq_lens,
+                            torch::Tensor& slot_mapping,
+                            torch::Tensor& block_tables);
+
+void advance_step_flashinfer(
+    int64_t num_seqs, int64_t num_queries, int64_t block_size,
+    torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
+    torch::Tensor& input_positions, torch::Tensor& seq_lens,
+    torch::Tensor& slot_mapping, torch::Tensor& block_tables,
+    torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr,
+    torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bounds);
 
 #ifndef USE_ROCM
 torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
@@ -134,9 +145,17 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
                                  int64_t size_k, int64_t size_n,
                                  int64_t num_bits);
 
+torch::Tensor gptq_marlin_repack_meta(torch::Tensor& b_q_weight,
+                                      torch::Tensor& perm, c10::SymInt size_k,
+                                      c10::SymInt size_n, int64_t num_bits);
+
 torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
                                 int64_t size_n, int64_t num_bits);
 
+torch::Tensor awq_marlin_repack_meta(torch::Tensor& b_q_weight,
+                                     c10::SymInt size_k, c10::SymInt size_n,
+                                     int64_t num_bits);
+
 torch::Tensor ggml_dequantize(torch::Tensor W, int64_t type, int64_t m,
                               int64_t n);
 
diff --git a/csrc/prepare_inputs/advance_step.cu b/csrc/prepare_inputs/advance_step.cu
index 0e537ddd6c4cd..a9d08ca0dc14c 100644
--- a/csrc/prepare_inputs/advance_step.cu
+++ b/csrc/prepare_inputs/advance_step.cu
@@ -12,13 +12,11 @@ namespace prepare_inputs {
 
 //
 template <int const num_threads>
-__global__ void advance_step_kernel(int num_seqs, int num_queries,
-                                    int block_size, long* input_tokens_ptr,
-                                    long const* sampled_token_ids_ptr,
-                                    long* input_positions_ptr,
-                                    int* seq_lens_ptr, long* slot_mapping_ptr,
-                                    int const* block_tables_ptr,
-                                    int64_t const block_tables_stride) {
+__global__ void advance_step_flashattn_kernel(
+    int num_seqs, int num_queries, int block_size, long* input_tokens_ptr,
+    long const* sampled_token_ids_ptr, long* input_positions_ptr,
+    int* seq_lens_ptr, long* slot_mapping_ptr, int const* block_tables_ptr,
+    int64_t const block_tables_stride) {
   int num_query_blocks = div_ceil(num_queries, num_threads);
 
   if (blockIdx.x >= num_query_blocks) {
@@ -79,16 +77,91 @@ inline void verify_tensor(std::string const& name, torch::Tensor& t,
   }
 }
 
-void advance_step(int num_seqs, int num_queries, int block_size,
-                  torch::Tensor& input_tokens,       // type: long
-                  torch::Tensor& sampled_token_ids,  // type: long
-                  torch::Tensor& input_positions,    // type: long
-                  torch::Tensor& seq_lens,           // type: int
-                  torch::Tensor& slot_mapping,       // type: long
-                  torch::Tensor& block_tables) {     // type: int
+__global__ void advance_step_flashinfer_kernel(
+    int num_threads, int num_seqs, int num_queries, int block_size,
+    long* input_tokens_ptr, long const* sampled_token_ids_ptr,
+    long* input_positions_ptr, int* seq_lens_ptr, long* slot_mapping_ptr,
+    int const* block_tables_ptr, int64_t const block_tables_stride,
+    int* paged_kv_last_page_len_ptr, int* block_table_bound_ptr) {
+  int num_query_blocks = div_ceil(num_queries, num_threads);
+
+  if (blockIdx.x < num_query_blocks) {
+    int cur_query_id = blockIdx.x * num_threads + threadIdx.x;
+
+    if (cur_query_id < num_queries) {
+      // Update input_tokens
+      input_tokens_ptr[cur_query_id] = sampled_token_ids_ptr[cur_query_id];
+
+      int seq_len = seq_lens_ptr[cur_query_id];
+      int next_seq_len = seq_len + 1;
+      int next_input_pos = next_seq_len - 1;
+
+      // Update seq_lens
+      seq_lens_ptr[cur_query_id] = next_seq_len;
+      // Update input_positions
+      input_positions_ptr[cur_query_id] = next_input_pos;
+
+      int const* seq_block_tables_ptr =
+          block_tables_ptr + block_tables_stride * cur_query_id;
+
+      int block_index = next_input_pos / block_size;
+      int block_offset = next_input_pos % block_size;
+
+      // Update paged_kv_last_page_len
+      paged_kv_last_page_len_ptr[cur_query_id] = block_offset + 1;
+
+      int slot_num =
+          seq_block_tables_ptr[block_index] * block_size + block_offset;
+      // Update slot_mapping
+      slot_mapping_ptr[cur_query_id] = slot_num;
+      block_table_bound_ptr[cur_query_id] = div_ceil(next_seq_len, block_size);
+    }
+  }
+}
+
+__global__ void advance_step_flashinfer_indptr_kernel(
+    int num_threads, int num_seqs, int num_queries, int* paged_kv_indptr_ptr,
+    int* block_table_bound_ptr) {
+  int idx = blockIdx.x * num_threads + threadIdx.x;
+
+  // Update paged_kv_indptr
+  if (idx < num_queries) {
+    int sum = 0;
+    for (int i = 0; i <= idx; ++i) {
+      sum += block_table_bound_ptr[i];
+    }
+    paged_kv_indptr_ptr[idx + 1] = sum;
+  }
+}
+
+__global__ void advance_step_flashinfer_indices_kernel(
+    int num_threads, int num_seqs, int num_queries, int const* block_tables_ptr,
+    int64_t const block_tables_stride, int* paged_kv_indices_ptr,
+    int* paged_kv_indptr_ptr, int* block_table_bound_ptr) {
+  int idx = blockIdx.x * num_threads + threadIdx.x;
+  int row = idx / block_tables_stride;
+  int col = idx % block_tables_stride;
+
+  if (row < num_queries && col < block_table_bound_ptr[row]) {
+    paged_kv_indices_ptr[paged_kv_indptr_ptr[row] + col] =
+        block_tables_ptr[row * block_tables_stride + col];
+  }
+  // if cudagraph, fill padded seqs with the last valid seq's indptr
+  if (num_queries < row && row <= num_seqs) {
+    paged_kv_indptr_ptr[row] = paged_kv_indptr_ptr[num_queries];
+  }
+}
+
+void advance_step_flashattn(int num_seqs, int num_queries, int block_size,
+                            torch::Tensor& input_tokens,       // type: long
+                            torch::Tensor& sampled_token_ids,  // type: long
+                            torch::Tensor& input_positions,    // type: long
+                            torch::Tensor& seq_lens,           // type: int
+                            torch::Tensor& slot_mapping,       // type: long
+                            torch::Tensor& block_tables) {     // type: int
 
   if (logging) {
-    printf("advance_step:\n");
+    printf("advance_step_flashattn:\n");
     printf("  num_seqs = %d\n", num_seqs);
     printf("  num_queries = %d\n", num_queries);
     printf("  block_size = %d\n", block_size);
@@ -108,24 +181,126 @@ void advance_step(int num_seqs, int num_queries, int block_size,
   int blocks;
   cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
 
-  advance_step_kernel<max_threads><<<blocks, max_threads, 0, stream>>>(
-      num_seqs, num_queries, block_size,
+  advance_step_flashattn_kernel<max_threads>
+      <<<blocks, max_threads, 0, stream>>>(
+          num_seqs, num_queries, block_size,
+          reinterpret_cast<long*>(input_tokens.data_ptr()),
+          reinterpret_cast<long const*>(sampled_token_ids.data_ptr()),
+          reinterpret_cast<long*>(input_positions.data_ptr()),
+          reinterpret_cast<int*>(seq_lens.data_ptr()),
+          reinterpret_cast<long*>(slot_mapping.data_ptr()),
+          reinterpret_cast<int const*>(block_tables.data_ptr()),
+          block_tables.stride(0));
+}
+
+void advance_step_flashinfer(
+    int num_seqs, int num_queries, int block_size,
+    torch::Tensor& input_tokens,            // type: long
+    torch::Tensor& sampled_token_ids,       // type: long
+    torch::Tensor& input_positions,         // type: long
+    torch::Tensor& seq_lens,                // type: int
+    torch::Tensor& slot_mapping,            // type: long
+    torch::Tensor& block_tables,            // type: int
+    torch::Tensor& paged_kv_indices,        // type: int
+    torch::Tensor& paged_kv_indptr,         // type: int
+    torch::Tensor& paged_kv_last_page_len,  // type: int
+    torch::Tensor& block_table_bound) {     // type: int
+
+  if (logging) {
+    printf("advance_step_flashinfer:\n");
+    printf("  num_seqs = %d\n", num_seqs);
+    printf("  num_queries = %d\n", num_queries);
+    printf("  block_size = %d\n", block_size);
+    printf("  block_tables.stride(0) = %d\n", block_tables.stride(0));
+  }
+  // Verify all tensors
+  verify_tensor("input_tokens", input_tokens, num_seqs, -1, at::kLong);
+  // verify_tensor("sampled_token_ids", sampled_token_ids, num_queries, 1,
+  //               at::kLong);
+  verify_tensor("input_positions", input_positions, num_seqs, -1, at::kLong);
+  verify_tensor("seq_lens", seq_lens, num_seqs, -1, at::kInt);
+  verify_tensor("slot_mapping", slot_mapping, num_seqs, -1, at::kLong);
+  verify_tensor("block_tables", block_tables, num_seqs, -1, at::kInt);
+
+  verify_tensor("paged_kv_indices", paged_kv_indices, -1, -1, at::kInt);
+  verify_tensor("paged_kv_indptr", paged_kv_indptr, num_seqs + 1, -1, at::kInt);
+  verify_tensor("paged_kv_last_page_len", paged_kv_last_page_len, num_seqs, -1,
+                at::kInt);
+
+  verify_tensor("block_table_bound", block_table_bound, num_seqs, -1, at::kInt);
+
+  int dev = sampled_token_ids.get_device();
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream(dev);
+
+  int blocks;
+  int threads;
+  cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
+  cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev);
+  if (logging) {
+    printf("launching kernel with %d blocks\n", blocks);
+  }
+
+  // TODO(will): support arbitrary block_tables stride
+  if ((blocks * threads) / block_tables.stride(0) < num_queries) {
+    TORCH_CHECK(false,
+                "multi-step: not enough threads to map block_table to"
+                "FlashInfer's paged_kv_indices on GPU. Try reducing the number "
+                "of seqs,",
+                " increasing the block size or take smaller steps.",
+                " num_queries = ", num_queries,
+                " block_tables.stride(0) = ", block_tables.stride(0),
+                " blocks = ", blocks, " max_threads = ", threads);
+  }
+
+  advance_step_flashinfer_kernel<<<blocks, threads, 0, stream>>>(
+      threads, num_seqs, num_queries, block_size,
       reinterpret_cast<long*>(input_tokens.data_ptr()),
       reinterpret_cast<long const*>(sampled_token_ids.data_ptr()),
       reinterpret_cast<long*>(input_positions.data_ptr()),
       reinterpret_cast<int*>(seq_lens.data_ptr()),
       reinterpret_cast<long*>(slot_mapping.data_ptr()),
       reinterpret_cast<int const*>(block_tables.data_ptr()),
-      block_tables.stride(0));
+      block_tables.stride(0),
+      reinterpret_cast<int*>(paged_kv_last_page_len.data_ptr()),
+      reinterpret_cast<int*>(block_table_bound.data_ptr()));
+
+  advance_step_flashinfer_indptr_kernel<<<blocks, threads, 0, stream>>>(
+      threads, num_seqs, num_queries,
+      reinterpret_cast<int*>(paged_kv_indptr.data_ptr()),
+      reinterpret_cast<int*>(block_table_bound.data_ptr()));
+
+  advance_step_flashinfer_indices_kernel<<<blocks, threads, 0, stream>>>(
+      threads, num_seqs, num_queries,
+      reinterpret_cast<int const*>(block_tables.data_ptr()),
+      block_tables.stride(0),
+      reinterpret_cast<int*>(paged_kv_indices.data_ptr()),
+      reinterpret_cast<int*>(paged_kv_indptr.data_ptr()),
+      reinterpret_cast<int*>(block_table_bound.data_ptr()));
 }
 
 }  // namespace prepare_inputs
 
-void advance_step(int64_t num_seqs, int64_t num_queries, int64_t block_size,
-                  torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
-                  torch::Tensor& input_positions, torch::Tensor& seq_lens,
-                  torch::Tensor& slot_mapping, torch::Tensor& block_tables) {
-  prepare_inputs::advance_step(num_seqs, num_queries, block_size, input_tokens,
-                               sampled_token_ids, input_positions, seq_lens,
-                               slot_mapping, block_tables);
+void advance_step_flashattn(int64_t num_seqs, int64_t num_queries,
+                            int64_t block_size, torch::Tensor& input_tokens,
+                            torch::Tensor& sampled_token_ids,
+                            torch::Tensor& input_positions,
+                            torch::Tensor& seq_lens,
+                            torch::Tensor& slot_mapping,
+                            torch::Tensor& block_tables) {
+  prepare_inputs::advance_step_flashattn(
+      num_seqs, num_queries, block_size, input_tokens, sampled_token_ids,
+      input_positions, seq_lens, slot_mapping, block_tables);
+}
+
+void advance_step_flashinfer(
+    int64_t num_seqs, int64_t num_queries, int64_t block_size,
+    torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
+    torch::Tensor& input_positions, torch::Tensor& seq_lens,
+    torch::Tensor& slot_mapping, torch::Tensor& block_tables,
+    torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr,
+    torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bound) {
+  prepare_inputs::advance_step_flashinfer(
+      num_seqs, num_queries, block_size, input_tokens, sampled_token_ids,
+      input_positions, seq_lens, slot_mapping, block_tables, paged_kv_indices,
+      paged_kv_indptr, paged_kv_last_page_len, block_table_bound);
 }
\ No newline at end of file
diff --git a/csrc/quantization/gguf/dequantize.cuh b/csrc/quantization/gguf/dequantize.cuh
index 2069fba759ea0..c012262e49015 100644
--- a/csrc/quantization/gguf/dequantize.cuh
+++ b/csrc/quantization/gguf/dequantize.cuh
@@ -353,18 +353,47 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
 template<typename dst_t>
 static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
 
-    const int i   = blockIdx.x;
+    const int64_t i   = blockIdx.x;
     const block_iq1_s * x = (const block_iq1_s  *) vx;
 
-    const int tid = threadIdx.x;
-    const int il = tid/8; // 0...3
-    const int ib = tid%8; // 0...7
+    const int64_t tid = threadIdx.x;
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
+    const float d = __half2float(x[i].d) * (2*((x[i].qh[ib] >> 12) & 7) + 1);
+    uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
+    grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)];
+    grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
+    grid32[0] &= 0x0f0f0f0f;
+    for (int j = 0; j < 8; ++j) {
+        y[j] = __float2half(d * (q[j] + delta));
+    }
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const int64_t i   = blockIdx.x;
+    const block_iq1_m * x = (const block_iq1_m  *) vx;
+
+    const int64_t tid = threadIdx.x;
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
-    const int i8 = 4*ib+il;
-    uint8_t h = x[i].scales[i8/2] >> 4*(i8%2);
-    const int8_t * grid = (const int8_t *)(iq1s_grid + (x[i].qs[i8] | ((h & 8) << 5)));
-    const float d = __half2float(x[i].d) * (2*(h & 7) + 1);
-    for (int j = 0; j < 8; ++j) y[j] = __float2half(d * grid[j]);
+    const uint16_t * sc = (const uint16_t *)x[i].scales;
+    iq1m_scale_t scale;
+    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+    const int64_t ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4);
+    const float d = __half2float(scale.f16) * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1);
+    const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA;
+    uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
+    grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[2*ib+il/2] >> 4*(il%2)) & 7) << 8)];
+    grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
+    grid32[0] &= 0x0f0f0f0f;
+    for (int j = 0; j < 8; ++j) {
+        y[j] = __float2half(d * (q[j] + delta));
+    }
 }
 
 template<typename dst_t>
@@ -475,6 +504,12 @@ static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, c
     dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
 }
 
+template<typename dst_t>
+static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_iq1_m<<<nb, 32, 0, stream>>>(vx, y);
+}
+
 template<typename dst_t>
 static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int nb = (k + QK_K - 1) / QK_K;
@@ -525,6 +560,8 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(int64_t type) {
             return dequantize_row_iq2_s_cuda;
         case 23:
             return dequantize_row_iq4_xs_cuda;
+        case 29:
+            return dequantize_row_iq1_m_cuda;
         default:
             return nullptr;
     }
diff --git a/csrc/quantization/gguf/ggml-common.h b/csrc/quantization/gguf/ggml-common.h
index d7989d84bf68e..fba94fd1d157b 100644
--- a/csrc/quantization/gguf/ggml-common.h
+++ b/csrc/quantization/gguf/ggml-common.h
@@ -149,14 +149,30 @@ typedef struct {
     uint8_t scales[IQ3S_N_SCALE];
 } block_iq3_s;
 
+// 1.5625 bpw
 #define QR1_S 8
 #define QI1_S (QK_K / (4*QR1_S))
 typedef struct {
     half d;
-    uint8_t qs[QK_K/8];
-    uint8_t scales[QK_K/16];
+    uint8_t  qs[QK_K/8];
+    uint16_t qh[QK_K/32];
 } block_iq1_s;
 
+// 1.75 bpw
+#define QR1_M 8
+#define QI1_M (QK_K / (4*QR1_M))
+typedef struct {
+    uint8_t  qs[QK_K/8];      // grid index, low 8 bits
+    uint8_t  qh[QK_K/16];     // grid index, high 3 bits + grid shift bit (for two groups of 8)
+    uint8_t  scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
+} block_iq1_m;
+
+// Used by IQ1_M quants
+typedef union {
+    half f16;
+    uint16_t  u16;
+} iq1m_scale_t;
+
 #define QK4_NL 32
 #define QR4_NL 2
 #define QI4_NL (QK4_NL / (4*QR4_NL))
@@ -733,135 +749,265 @@ static const __device__ uint32_t iq3xs_grid[512] = {
     0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404,
 };
 
-static const __device__ uint64_t iq1s_grid[512] = {
-    0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
-    0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01,
-    0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100,
-    0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00,
-    0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101,
-    0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100,
-    0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00,
-    0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff,
-    0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000,
-    0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000,
-    0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001,
-    0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff,
-    0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01,
-    0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001,
-    0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00,
-    0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001,
-    0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100,
-    0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000,
-    0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000,
-    0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000,
-    0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff,
-    0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff,
-    0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01,
-    0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100,
-    0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff,
-    0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000,
-    0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101,
-    0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff,
-    0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff,
-    0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001,
-    0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01,
-    0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101,
-    0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100,
-    0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00,
-    0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001,
-    0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff,
-    0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000,
-    0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000,
-    0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100,
-    0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100,
-    0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01,
-    0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff,
-    0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101,
-    0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000,
-    0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff,
-    0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000,
-    0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff,
-    0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00,
-    0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101,
-    0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000,
-    0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000,
-    0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000,
-    0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100,
-    0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000,
-    0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001,
-    0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff,
-    0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000,
-    0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000,
-    0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000,
-    0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000,
-    0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff,
-    0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000,
-    0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001,
-    0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01,
-    0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100,
-    0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000,
-    0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00,
-    0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100,
-    0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000,
-    0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001,
-    0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00,
-    0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff,
-    0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100,
-    0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff,
-    0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000,
-    0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff,
-    0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff,
-    0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00,
-    0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001,
-    0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001,
-    0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01,
-    0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000,
-    0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101,
-    0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00,
-    0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100,
-    0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101,
-    0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101,
-    0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000,
-    0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff,
-    0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff,
-    0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101,
-    0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff,
-    0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101,
-    0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001,
-    0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff,
-    0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff,
-    0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01,
-    0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff,
-    0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100,
-    0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001,
-    0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00,
-    0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff,
-    0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff,
-    0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000,
-    0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000,
-    0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101,
-    0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001,
-    0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000,
-    0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101,
-    0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000,
-    0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001,
-    0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000,
-    0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100,
-    0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000,
-    0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000,
-    0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100,
-    0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff,
-    0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff,
-    0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00,
-    0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101,
-    0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000,
-    0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00,
-    0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000,
-    0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff,
-    0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101,
-    0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff,
-    0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00,
-    0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff,
+#define IQ1S_DELTA 0.125f
+#define IQ1M_DELTA 0.125f
+static const __device__ uint64_t iq1s_grid_gpu[2048] = {
+    0x00000000, 0x00000002, 0x00000101, 0x00000200, 0x00000202, 0x00010001, 0x00010101, 0x00020000,
+    0x00020002, 0x00020200, 0x00020202, 0x01000101, 0x01010001, 0x01010100, 0x01010102, 0x01020101,
+    0x02000000, 0x02000002, 0x02000200, 0x02000202, 0x02010101, 0x02020000, 0x02020002, 0x02020200,
+    0x02020202, 0x00000110, 0x00000111, 0x00010011, 0x00010110, 0x00010112, 0x00010211, 0x00010212,
+    0x00020111, 0x01000011, 0x01000112, 0x01000211, 0x01010012, 0x01010111, 0x01010212, 0x01020011,
+    0x01020110, 0x01020112, 0x01020210, 0x02000111, 0x02010011, 0x02010110, 0x02010112, 0x02020111,
+    0x00000020, 0x00000022, 0x00000220, 0x00000222, 0x00010121, 0x00020020, 0x00020022, 0x00020220,
+    0x00020222, 0x01000121, 0x01010021, 0x01010221, 0x01020120, 0x01020221, 0x02000020, 0x02000022,
+    0x02000220, 0x02000222, 0x02010021, 0x02010121, 0x02010221, 0x02020020, 0x02020022, 0x02020220,
+    0x02020222, 0x00011001, 0x00011100, 0x00011102, 0x00021101, 0x01001001, 0x01001201, 0x01011101,
+    0x01011202, 0x01021100, 0x01021101, 0x02011001, 0x02011201, 0x02021101, 0x00001011, 0x00001110,
+    0x00001111, 0x00001112, 0x00011111, 0x00011210, 0x00011212, 0x00021211, 0x01001010, 0x01001111,
+    0x01001212, 0x01011010, 0x01011011, 0x01011110, 0x01011111, 0x01011112, 0x01011211, 0x01021010,
+    0x01021012, 0x01021111, 0x01021210, 0x01021212, 0x02001011, 0x02011011, 0x02011111, 0x02011210,
+    0x02011212, 0x02021011, 0x02021110, 0x02021111, 0x02021112, 0x02021211, 0x00011120, 0x00011221,
+    0x01001021, 0x01001120, 0x01011020, 0x01011022, 0x01011121, 0x01011220, 0x01021020, 0x01021021,
+    0x01021122, 0x01021221, 0x02001121, 0x02011021, 0x02011120, 0x02011221, 0x00002000, 0x00002002,
+    0x00002200, 0x00002202, 0x00012101, 0x00022000, 0x00022002, 0x00022200, 0x00022202, 0x01002101,
+    0x01012001, 0x01012102, 0x01022101, 0x02002000, 0x02002002, 0x02002200, 0x02002202, 0x02012101,
+    0x02022000, 0x02022002, 0x02022200, 0x02022202, 0x00002111, 0x00012011, 0x00012110, 0x00012211,
+    0x00022110, 0x00022111, 0x01002011, 0x01012010, 0x01012011, 0x01012111, 0x01022011, 0x01022110,
+    0x01022211, 0x02012011, 0x02012110, 0x02012112, 0x02012211, 0x02022111, 0x00002020, 0x00002022,
+    0x00002220, 0x00002222, 0x00012121, 0x00022020, 0x00022022, 0x00022220, 0x00022222, 0x01002121,
+    0x01012021, 0x01012221, 0x01022021, 0x01022121, 0x02002020, 0x02002022, 0x02002121, 0x02002220,
+    0x02002222, 0x02012121, 0x02022020, 0x02022022, 0x02022220, 0x02022222, 0x00110000, 0x00110001,
+    0x00110100, 0x00110201, 0x00120100, 0x00120101, 0x01100001, 0x01100100, 0x01110000, 0x01110101,
+    0x01110200, 0x01120001, 0x01120100, 0x01120101, 0x01120201, 0x02110001, 0x02110100, 0x02110102,
+    0x02120001, 0x02120101, 0x00100011, 0x00100110, 0x00100112, 0x00100211, 0x00110010, 0x00110012,
+    0x00110111, 0x00110210, 0x00120011, 0x00120110, 0x00120211, 0x01100111, 0x01100212, 0x01110010,
+    0x01110011, 0x01110012, 0x01110110, 0x01110111, 0x01110112, 0x01110211, 0x01120010, 0x01120111,
+    0x02100110, 0x02110012, 0x02110111, 0x02120011, 0x02120110, 0x00110021, 0x00110120, 0x00110122,
+    0x00120121, 0x01100020, 0x01100122, 0x01100221, 0x01110022, 0x01110121, 0x01110220, 0x01110222,
+    0x01120120, 0x01120122, 0x02100121, 0x02110021, 0x02110120, 0x02110122, 0x02120121, 0x00101001,
+    0x00101102, 0x00101201, 0x00111100, 0x00111101, 0x00111200, 0x00111201, 0x00121001, 0x00121102,
+    0x01101001, 0x01101101, 0x01101102, 0x01101200, 0x01101202, 0x01111001, 0x01111100, 0x01111101,
+    0x01111102, 0x01111201, 0x01121002, 0x01121101, 0x01121200, 0x02101100, 0x02101201, 0x02111000,
+    0x02111100, 0x02111101, 0x02111200, 0x02111201, 0x02111202, 0x02121001, 0x02121100, 0x02121101,
+    0x02121201, 0x00101012, 0x00101111, 0x00101212, 0x00111011, 0x00111110, 0x00111111, 0x00111112,
+    0x00111211, 0x00121010, 0x00121012, 0x00121111, 0x00121210, 0x00121212, 0x01101011, 0x01101110,
+    0x01101111, 0x01101112, 0x01111011, 0x01111012, 0x01111110, 0x01111111, 0x01111112, 0x01111211,
+    0x01111212, 0x01121011, 0x01121110, 0x01121111, 0x01121112, 0x01121211, 0x02101010, 0x02101012,
+    0x02101110, 0x02101111, 0x02101210, 0x02101212, 0x02111010, 0x02111011, 0x02111110, 0x02111111,
+    0x02111112, 0x02111211, 0x02111212, 0x02121010, 0x02121012, 0x02121111, 0x00101021, 0x00101120,
+    0x00101121, 0x00101122, 0x00111121, 0x00111122, 0x00111220, 0x00111222, 0x00121021, 0x00121122,
+    0x01101020, 0x01101022, 0x01101120, 0x01101121, 0x01101220, 0x01101222, 0x01111021, 0x01111121,
+    0x01111122, 0x01111220, 0x01111221, 0x01121021, 0x01121120, 0x01121121, 0x01121220, 0x01121221,
+    0x01121222, 0x02101122, 0x02101222, 0x02111022, 0x02111121, 0x02121120, 0x02121221, 0x00112001,
+    0x00112102, 0x00122101, 0x01102001, 0x01102100, 0x01102102, 0x01102201, 0x01112000, 0x01112101,
+    0x01112200, 0x01112202, 0x01122000, 0x01122001, 0x01122100, 0x01122102, 0x01122201, 0x02102101,
+    0x02112001, 0x02112100, 0x02122101, 0x00112010, 0x00112012, 0x00112111, 0x00112212, 0x00122011,
+    0x00122111, 0x01102012, 0x01102110, 0x01102111, 0x01102210, 0x01112011, 0x01112110, 0x01112111,
+    0x01112112, 0x01112211, 0x01112212, 0x01122010, 0x01122111, 0x01122212, 0x02102211, 0x02112011,
+    0x02112012, 0x02112111, 0x02112210, 0x02122011, 0x02122112, 0x02122211, 0x00102221, 0x00112122,
+    0x00122120, 0x00122122, 0x01102120, 0x01102122, 0x01102221, 0x01112020, 0x01112022, 0x01112121,
+    0x01112220, 0x01122021, 0x01122122, 0x01122221, 0x02102121, 0x02112021, 0x02112122, 0x02112222,
+    0x00200000, 0x00200002, 0x00200200, 0x00200202, 0x00210101, 0x00220000, 0x00220002, 0x00220101,
+    0x00220200, 0x00220202, 0x01200101, 0x01210001, 0x01210201, 0x01220001, 0x01220101, 0x02200000,
+    0x02200002, 0x02200200, 0x02200202, 0x02210101, 0x02220000, 0x02220002, 0x02220101, 0x02220200,
+    0x02220202, 0x00200111, 0x00210011, 0x00210110, 0x00210211, 0x00220111, 0x01200012, 0x01200110,
+    0x01200211, 0x01210111, 0x01210210, 0x01210212, 0x01220011, 0x01220110, 0x01220111, 0x01220112,
+    0x02200111, 0x02210010, 0x02210112, 0x02210211, 0x02220111, 0x00200021, 0x00200220, 0x00200222,
+    0x00210021, 0x00210121, 0x00220020, 0x00220022, 0x00220220, 0x00220222, 0x01200121, 0x01210021,
+    0x01210122, 0x01210221, 0x01220121, 0x02200021, 0x02200220, 0x02200222, 0x02210021, 0x02210121,
+    0x02220020, 0x02220022, 0x02220220, 0x02220222, 0x00201101, 0x00211100, 0x00211102, 0x00211201,
+    0x00221101, 0x01201100, 0x01201101, 0x01201102, 0x01201201, 0x01211002, 0x01211101, 0x01211200,
+    0x01211202, 0x01221102, 0x02201101, 0x02211001, 0x02211100, 0x02211201, 0x02221001, 0x02221101,
+    0x00201211, 0x00211111, 0x00221011, 0x00221211, 0x01201010, 0x01201111, 0x01201210, 0x01211011,
+    0x01211110, 0x01211111, 0x01211211, 0x01221012, 0x01221111, 0x01221210, 0x02201211, 0x02211010,
+    0x02211110, 0x02211111, 0x02211210, 0x02211212, 0x02221011, 0x02221110, 0x02221112, 0x02221211,
+    0x00201121, 0x00211020, 0x00211022, 0x00211221, 0x00221121, 0x01201021, 0x01201221, 0x01211121,
+    0x01221020, 0x01221021, 0x01221221, 0x02201120, 0x02201122, 0x02211020, 0x02211222, 0x00202000,
+    0x00202002, 0x00202200, 0x00202202, 0x00212101, 0x00222000, 0x00222002, 0x00222200, 0x00222202,
+    0x01202101, 0x01212001, 0x01212100, 0x01222101, 0x02202000, 0x02202002, 0x02202200, 0x02202202,
+    0x02222000, 0x02222002, 0x02222200, 0x02222202, 0x00202211, 0x00212011, 0x00212110, 0x00212211,
+    0x00222111, 0x01202112, 0x01202211, 0x01212012, 0x01212111, 0x01222011, 0x01222110, 0x01222112,
+    0x01222211, 0x02202111, 0x02212010, 0x02212112, 0x02212211, 0x02222110, 0x02222111, 0x00202020,
+    0x00202022, 0x00202220, 0x00202222, 0x00222020, 0x00222022, 0x00222220, 0x00222222, 0x01202121,
+    0x01212021, 0x01212122, 0x01212221, 0x01222121, 0x02202020, 0x02202022, 0x02202220, 0x02202222,
+    0x02212121, 0x02222020, 0x02222022, 0x02222220, 0x02222222, 0x10000101, 0x10010001, 0x10010102,
+    0x10020101, 0x11000201, 0x11010002, 0x11010101, 0x11010200, 0x11010202, 0x11020001, 0x11020100,
+    0x11020102, 0x12010100, 0x12010201, 0x12020001, 0x12020102, 0x10000010, 0x10000011, 0x10000110,
+    0x10000112, 0x10000211, 0x10010012, 0x10010111, 0x10010112, 0x10010210, 0x10010212, 0x10020011,
+    0x10020112, 0x10020211, 0x11000111, 0x11000210, 0x11000212, 0x11010011, 0x11010110, 0x11010111,
+    0x11010112, 0x11010211, 0x11010212, 0x11020111, 0x11020210, 0x11020212, 0x12000011, 0x12000110,
+    0x12000112, 0x12010010, 0x12010012, 0x12010111, 0x12020010, 0x12020011, 0x12020012, 0x10000121,
+    0x10010021, 0x10010120, 0x10010122, 0x10020121, 0x11000021, 0x11010022, 0x11010121, 0x11010222,
+    0x11020120, 0x11020221, 0x12000221, 0x12010120, 0x12020121, 0x10001001, 0x10011101, 0x10011201,
+    0x10021201, 0x11001101, 0x11001200, 0x11001202, 0x11011001, 0x11011100, 0x11011101, 0x11011102,
+    0x11021001, 0x11021002, 0x11021101, 0x11021200, 0x11021202, 0x12001001, 0x12001102, 0x12001201,
+    0x12011000, 0x12011002, 0x12011101, 0x12021000, 0x12021001, 0x12021201, 0x10001011, 0x10001012,
+    0x10001111, 0x10001212, 0x10011011, 0x10011110, 0x10011111, 0x10011112, 0x10011211, 0x10021010,
+    0x10021111, 0x10021212, 0x11001011, 0x11001110, 0x11001111, 0x11001112, 0x11001211, 0x11011010,
+    0x11011011, 0x11011110, 0x11011111, 0x11011112, 0x11011210, 0x11011211, 0x11021011, 0x11021110,
+    0x11021111, 0x11021112, 0x11021211, 0x12001012, 0x12001110, 0x12001111, 0x12001210, 0x12011011,
+    0x12011110, 0x12011111, 0x12011112, 0x12011211, 0x12011212, 0x12021111, 0x12021210, 0x12021212,
+    0x10001021, 0x10001121, 0x10001221, 0x10011120, 0x10011121, 0x10011220, 0x10011222, 0x10021021,
+    0x10021120, 0x10021221, 0x11001020, 0x11001022, 0x11001121, 0x11001220, 0x11011020, 0x11011021,
+    0x11011022, 0x11011121, 0x11011122, 0x11011221, 0x11021022, 0x11021121, 0x11021220, 0x12001021,
+    0x12001121, 0x12001222, 0x12011120, 0x12011121, 0x12021021, 0x12021120, 0x12021122, 0x10002101,
+    0x10012001, 0x10012101, 0x10012202, 0x10022101, 0x11002002, 0x11002201, 0x11012000, 0x11012101,
+    0x11012200, 0x11022001, 0x11022100, 0x11022102, 0x11022201, 0x12002101, 0x12012001, 0x12012100,
+    0x12012102, 0x12012201, 0x12022101, 0x10002011, 0x10002111, 0x10002112, 0x10002212, 0x10012010,
+    0x10012110, 0x10012111, 0x10012210, 0x10022011, 0x10022110, 0x10022112, 0x11002010, 0x11002111,
+    0x11002212, 0x11012011, 0x11012012, 0x11012110, 0x11012111, 0x11012112, 0x11012211, 0x11022010,
+    0x11022012, 0x11022111, 0x11022112, 0x11022212, 0x12002112, 0x12002211, 0x12012012, 0x12012111,
+    0x12012112, 0x12012210, 0x12022011, 0x12022110, 0x12022112, 0x12022211, 0x10012122, 0x11002120,
+    0x11002122, 0x11002221, 0x11012121, 0x11012220, 0x11012222, 0x11022120, 0x11022221, 0x12012120,
+    0x12022121, 0x10100001, 0x10100100, 0x10100101, 0x10100102, 0x10100201, 0x10110002, 0x10110101,
+    0x10110202, 0x10120001, 0x10120100, 0x10120201, 0x11100000, 0x11100101, 0x11100200, 0x11110001,
+    0x11110100, 0x11110101, 0x11110102, 0x11110201, 0x11120101, 0x11120200, 0x12100102, 0x12100201,
+    0x12110101, 0x12110200, 0x12120000, 0x12120001, 0x12120102, 0x12120201, 0x10100111, 0x10100210,
+    0x10100211, 0x10100212, 0x10110011, 0x10110110, 0x10110111, 0x10110112, 0x10110210, 0x10110211,
+    0x10120010, 0x10120111, 0x10120112, 0x10120210, 0x10120212, 0x11100011, 0x11100110, 0x11100111,
+    0x11100112, 0x11100211, 0x11110010, 0x11110011, 0x11110012, 0x11110110, 0x11110111, 0x11110112,
+    0x11110210, 0x11110211, 0x11110212, 0x11120011, 0x11120110, 0x11120111, 0x11120112, 0x11120211,
+    0x12100012, 0x12100111, 0x12110011, 0x12110110, 0x12110111, 0x12110112, 0x12110211, 0x12120010,
+    0x12120111, 0x12120212, 0x10100021, 0x10100122, 0x10110022, 0x10110121, 0x10110222, 0x10120021,
+    0x10120120, 0x11100022, 0x11100121, 0x11100222, 0x11110021, 0x11110120, 0x11110121, 0x11110122,
+    0x11110221, 0x11120022, 0x11120121, 0x12100121, 0x12110020, 0x12110022, 0x12110121, 0x12110221,
+    0x12110222, 0x12120120, 0x10101100, 0x10101101, 0x10111001, 0x10111100, 0x10111101, 0x10111102,
+    0x10111200, 0x10111201, 0x10121001, 0x10121101, 0x10121200, 0x10121202, 0x11101001, 0x11101100,
+    0x11101101, 0x11101102, 0x11101201, 0x11101202, 0x11111000, 0x11111001, 0x11111100, 0x11111101,
+    0x11111102, 0x11111200, 0x11111201, 0x11111202, 0x11121001, 0x11121002, 0x11121100, 0x11121101,
+    0x11121102, 0x11121201, 0x12101000, 0x12101200, 0x12101202, 0x12111001, 0x12111100, 0x12111101,
+    0x12111102, 0x12111201, 0x12121001, 0x12121100, 0x12121101, 0x12121202, 0x10101011, 0x10101012,
+    0x10101110, 0x10101111, 0x10101112, 0x10101211, 0x10111010, 0x10111011, 0x10111012, 0x10111110,
+    0x10111111, 0x10111112, 0x10111211, 0x10111212, 0x10121011, 0x10121110, 0x10121111, 0x10121112,
+    0x10121211, 0x11101010, 0x11101011, 0x11101012, 0x11101110, 0x11101111, 0x11101112, 0x11101210,
+    0x11101211, 0x11111010, 0x11111011, 0x11111012, 0x11111110, 0x11111111, 0x11111112, 0x11111210,
+    0x11111211, 0x11111212, 0x11121010, 0x11121011, 0x11121110, 0x11121111, 0x11121112, 0x11121210,
+    0x11121211, 0x11121212, 0x12101011, 0x12101110, 0x12101111, 0x12101211, 0x12101212, 0x12111010,
+    0x12111011, 0x12111110, 0x12111111, 0x12111112, 0x12111210, 0x12111211, 0x12121011, 0x12121110,
+    0x12121111, 0x12121112, 0x12121211, 0x10101020, 0x10101021, 0x10101022, 0x10101120, 0x10101122,
+    0x10101220, 0x10101221, 0x10111021, 0x10111120, 0x10111121, 0x10111220, 0x10111221, 0x10121020,
+    0x10121021, 0x10121022, 0x10121120, 0x10121121, 0x10121122, 0x10121220, 0x10121221, 0x11101021,
+    0x11101121, 0x11101122, 0x11101220, 0x11101221, 0x11101222, 0x11111020, 0x11111021, 0x11111022,
+    0x11111120, 0x11111121, 0x11111122, 0x11111220, 0x11111221, 0x11111222, 0x11121021, 0x11121120,
+    0x11121121, 0x11121221, 0x12101022, 0x12101121, 0x12101122, 0x12101220, 0x12101221, 0x12101222,
+    0x12111021, 0x12111121, 0x12111222, 0x12121022, 0x12121121, 0x12121122, 0x12121220, 0x12121221,
+    0x10102100, 0x10102101, 0x10102102, 0x10102201, 0x10112000, 0x10112101, 0x10112200, 0x10122001,
+    0x10122202, 0x11102101, 0x11102200, 0x11102202, 0x11112001, 0x11112100, 0x11112101, 0x11112102,
+    0x11112200, 0x11112201, 0x11122000, 0x11122002, 0x11122100, 0x11122101, 0x12102002, 0x12102201,
+    0x12112000, 0x12112002, 0x12112101, 0x12112200, 0x12122001, 0x12122201, 0x10102011, 0x10102012,
+    0x10102111, 0x10102212, 0x10112011, 0x10112110, 0x10112111, 0x10112112, 0x10112211, 0x10122111,
+    0x11102011, 0x11102110, 0x11102111, 0x11102112, 0x11102211, 0x11112010, 0x11112011, 0x11112012,
+    0x11112110, 0x11112111, 0x11112112, 0x11112210, 0x11112211, 0x11112212, 0x11122011, 0x11122110,
+    0x11122111, 0x11122112, 0x11122211, 0x12102011, 0x12102111, 0x12102211, 0x12112011, 0x12112110,
+    0x12112111, 0x12112112, 0x12112210, 0x12112211, 0x12122111, 0x10102120, 0x10102220, 0x10112121,
+    0x10112222, 0x10122020, 0x10122121, 0x10122122, 0x10122221, 0x11102121, 0x11102220, 0x11102221,
+    0x11112021, 0x11112121, 0x11112122, 0x11112220, 0x11112221, 0x11122022, 0x11122121, 0x11122220,
+    0x11122222, 0x12102021, 0x12102222, 0x12112022, 0x12112121, 0x12112122, 0x12112220, 0x12112222,
+    0x12122021, 0x10200101, 0x10210100, 0x10210102, 0x10210201, 0x10220101, 0x11200100, 0x11210000,
+    0x11210101, 0x11210102, 0x11210200, 0x11210202, 0x11220001, 0x11220100, 0x11220102, 0x11220201,
+    0x12200001, 0x12210102, 0x12220101, 0x10200011, 0x10200110, 0x10200112, 0x10200211, 0x10210012,
+    0x10210111, 0x10220011, 0x10220012, 0x10220112, 0x10220211, 0x11200111, 0x11200211, 0x11210011,
+    0x11210111, 0x11210112, 0x11210211, 0x11220111, 0x11220112, 0x11220212, 0x12200110, 0x12200212,
+    0x12210012, 0x12210111, 0x12220011, 0x12220112, 0x12220211, 0x10210021, 0x10210122, 0x10210221,
+    0x11200020, 0x11200021, 0x11200122, 0x11210121, 0x11210122, 0x11210220, 0x11220020, 0x12200121,
+    0x12210021, 0x12210122, 0x12220121, 0x10211001, 0x10211002, 0x10211101, 0x10211102, 0x10211202,
+    0x10221001, 0x10221102, 0x10221201, 0x11201000, 0x11201002, 0x11201101, 0x11201200, 0x11201202,
+    0x11211001, 0x11211100, 0x11211101, 0x11211102, 0x11211201, 0x11211202, 0x11221000, 0x11221002,
+    0x11221101, 0x12201100, 0x12201101, 0x12201201, 0x12211000, 0x12211002, 0x12211100, 0x12211101,
+    0x12211102, 0x12211200, 0x12211202, 0x12221001, 0x12221100, 0x12221201, 0x10201111, 0x10201210,
+    0x10201212, 0x10211011, 0x10211111, 0x10211112, 0x10211211, 0x11201110, 0x11201111, 0x11201112,
+    0x11201211, 0x11211010, 0x11211011, 0x11211110, 0x11211111, 0x11211112, 0x11211211, 0x11221011,
+    0x11221110, 0x11221111, 0x11221112, 0x11221211, 0x12201112, 0x12201211, 0x12201212, 0x12211011,
+    0x12211111, 0x12211112, 0x12211211, 0x12211212, 0x12221012, 0x12221111, 0x12221112, 0x12221210,
+    0x10201022, 0x10201221, 0x10211121, 0x10221020, 0x10221122, 0x10221220, 0x10221221, 0x11201020,
+    0x11201121, 0x11201220, 0x11201222, 0x11211021, 0x11211120, 0x11211121, 0x11211122, 0x11211220,
+    0x11211222, 0x11221020, 0x11221121, 0x11221220, 0x12201020, 0x12201022, 0x12201121, 0x12201222,
+    0x12211120, 0x12211122, 0x12211220, 0x12211221, 0x12221020, 0x12221120, 0x12221122, 0x12221222,
+    0x10212102, 0x10212201, 0x10222101, 0x11202001, 0x11212002, 0x11212101, 0x11212202, 0x11222001,
+    0x11222201, 0x12202101, 0x12212001, 0x12212200, 0x12222102, 0x10202011, 0x10202110, 0x10212010,
+    0x10212111, 0x10222011, 0x10222110, 0x10222112, 0x10222211, 0x11202010, 0x11202011, 0x11202111,
+    0x11202112, 0x11202210, 0x11212011, 0x11212110, 0x11212111, 0x11212112, 0x11212211, 0x11222010,
+    0x11222111, 0x11222212, 0x12202012, 0x12202110, 0x12202212, 0x12212111, 0x12222011, 0x12222110,
+    0x12222111, 0x12222211, 0x10212021, 0x10212122, 0x10212220, 0x11202021, 0x11202120, 0x11202221,
+    0x11212020, 0x11212121, 0x11212220, 0x11212222, 0x11222120, 0x11222121, 0x11222221, 0x12202122,
+    0x12212120, 0x12212220, 0x12212222, 0x12222122, 0x20000000, 0x20000002, 0x20000200, 0x20000202,
+    0x20020000, 0x20020002, 0x20020200, 0x20020202, 0x21000101, 0x21010000, 0x21010001, 0x21010100,
+    0x21010102, 0x21010201, 0x21020101, 0x22000000, 0x22000002, 0x22000200, 0x22000202, 0x22010101,
+    0x22020000, 0x22020002, 0x22020200, 0x22020202, 0x20000111, 0x20010011, 0x20010110, 0x20010112,
+    0x20010211, 0x20020111, 0x21000011, 0x21000110, 0x21000211, 0x21010010, 0x21010012, 0x21010111,
+    0x21010112, 0x21010210, 0x21010211, 0x21020110, 0x21020112, 0x21020211, 0x22000111, 0x22000211,
+    0x22010110, 0x22010112, 0x22010211, 0x22020111, 0x20000020, 0x20000022, 0x20000220, 0x20000222,
+    0x20010121, 0x20020020, 0x20020022, 0x20020220, 0x20020222, 0x21010021, 0x21010120, 0x21010221,
+    0x21020121, 0x22000020, 0x22000022, 0x22000220, 0x22000222, 0x22010121, 0x22020020, 0x22020022,
+    0x22020220, 0x22020222, 0x20011100, 0x20011201, 0x21001001, 0x21001100, 0x21011001, 0x21011101,
+    0x21011202, 0x21021001, 0x21021100, 0x21021201, 0x22011100, 0x22011201, 0x20001011, 0x20001211,
+    0x20011012, 0x20011111, 0x20011212, 0x20021112, 0x20021211, 0x21001010, 0x21001011, 0x21001111,
+    0x21001210, 0x21011011, 0x21011110, 0x21011111, 0x21011112, 0x21011211, 0x21011212, 0x21021111,
+    0x21021112, 0x21021210, 0x21021212, 0x22001011, 0x22001110, 0x22001112, 0x22001211, 0x22011010,
+    0x22011012, 0x22011111, 0x22011210, 0x22021112, 0x20011021, 0x20011122, 0x20011221, 0x20021121,
+    0x21001021, 0x21001120, 0x21001221, 0x21001222, 0x21011020, 0x21011121, 0x21011221, 0x21011222,
+    0x21021021, 0x21021122, 0x21021222, 0x22001121, 0x22011021, 0x22011222, 0x22021120, 0x20002000,
+    0x20002002, 0x20002200, 0x20002202, 0x20012101, 0x20022000, 0x20022002, 0x20022200, 0x20022202,
+    0x21002001, 0x21002101, 0x21012001, 0x21012100, 0x21012201, 0x21022101, 0x21022201, 0x22002000,
+    0x22002002, 0x22002200, 0x22002202, 0x22012101, 0x22022000, 0x22022002, 0x22022200, 0x22022202,
+    0x20002111, 0x20002112, 0x20012011, 0x20012110, 0x20012112, 0x20022111, 0x21002011, 0x21002110,
+    0x21002112, 0x21002211, 0x21012010, 0x21012012, 0x21012111, 0x21012212, 0x21022011, 0x21022110,
+    0x22002111, 0x22012112, 0x22012211, 0x22022111, 0x20002020, 0x20002022, 0x20002220, 0x20002222,
+    0x20012121, 0x20022020, 0x20022022, 0x20022220, 0x20022222, 0x21002121, 0x21012021, 0x21012120,
+    0x21012122, 0x22002020, 0x22002022, 0x22002220, 0x22002222, 0x22012121, 0x22022020, 0x22022022,
+    0x22022220, 0x22022222, 0x20100101, 0x20110001, 0x20110102, 0x20110200, 0x20110201, 0x20120101,
+    0x21100001, 0x21100102, 0x21100201, 0x21110101, 0x21110200, 0x21110202, 0x21120201, 0x21120202,
+    0x22100101, 0x22110001, 0x22110100, 0x22110102, 0x22110201, 0x22120101, 0x20100011, 0x20100110,
+    0x20100112, 0x20100211, 0x20110010, 0x20110111, 0x20110210, 0x20110212, 0x20120011, 0x20120110,
+    0x20120112, 0x20120211, 0x21100010, 0x21100111, 0x21110010, 0x21110011, 0x21110110, 0x21110111,
+    0x21110112, 0x21110211, 0x21120012, 0x21120111, 0x22100110, 0x22100112, 0x22110012, 0x22110111,
+    0x22110210, 0x22120011, 0x22120110, 0x22120112, 0x22120211, 0x20100121, 0x20110021, 0x20110120,
+    0x20110221, 0x20120121, 0x21100120, 0x21100122, 0x21100221, 0x21110020, 0x21110022, 0x21110121,
+    0x21110220, 0x21120122, 0x21120221, 0x22100121, 0x22110120, 0x22110122, 0x22120221, 0x20101001,
+    0x20101100, 0x20101102, 0x20111000, 0x20111101, 0x20111200, 0x20121102, 0x21101000, 0x21101202,
+    0x21111001, 0x21111100, 0x21111101, 0x21111102, 0x21111200, 0x21111201, 0x21121000, 0x21121001,
+    0x21121002, 0x21121101, 0x22101100, 0x22101102, 0x22111002, 0x22111100, 0x22111101, 0x22111200,
+    0x22121001, 0x22121201, 0x20101010, 0x20101111, 0x20101210, 0x20101212, 0x20111010, 0x20111011,
+    0x20111110, 0x20111111, 0x20111112, 0x20111211, 0x20121011, 0x20121111, 0x20121211, 0x20121212,
+    0x21101011, 0x21101110, 0x21101111, 0x21101112, 0x21101211, 0x21111010, 0x21111011, 0x21111012,
+    0x21111110, 0x21111111, 0x21111112, 0x21111210, 0x21111211, 0x21111212, 0x21121011, 0x21121110,
+    0x21121111, 0x21121112, 0x21121211, 0x22101011, 0x22101111, 0x22101210, 0x22111011, 0x22111012,
+    0x22111110, 0x22111111, 0x22111112, 0x22111211, 0x22111212, 0x22121010, 0x22121012, 0x22121111,
+    0x22121210, 0x22121212, 0x20101021, 0x20101120, 0x20111020, 0x20111121, 0x20111221, 0x20121020,
+    0x20121122, 0x20121221, 0x21101121, 0x21101220, 0x21101221, 0x21111021, 0x21111022, 0x21111121,
+    0x21111122, 0x21111221, 0x21121121, 0x21121220, 0x22101022, 0x22101120, 0x22101221, 0x22101222,
+    0x22111022, 0x22111120, 0x22111121, 0x22121120, 0x22121122, 0x22121221, 0x20102101, 0x20112102,
+    0x20112201, 0x20122101, 0x21102001, 0x21102102, 0x21112000, 0x21112002, 0x21112101, 0x21112102,
+    0x21112202, 0x21122100, 0x21122101, 0x22102101, 0x22112001, 0x22112102, 0x22112201, 0x22122101,
+    0x20102110, 0x20102112, 0x20102211, 0x20112010, 0x20112012, 0x20112111, 0x20112210, 0x20112212,
+    0x20122010, 0x20122011, 0x20122110, 0x20122112, 0x21102010, 0x21102012, 0x21102111, 0x21102210,
+    0x21102212, 0x21112011, 0x21112110, 0x21112111, 0x21112112, 0x21112211, 0x21122012, 0x21122111,
+    0x21122112, 0x21122212, 0x22102011, 0x22102110, 0x22112010, 0x22112012, 0x22112111, 0x22112212,
+    0x22122011, 0x22122112, 0x20102121, 0x20112121, 0x20122121, 0x21102120, 0x21102122, 0x21102221,
+    0x21112020, 0x21112121, 0x21112220, 0x21122021, 0x22102121, 0x22112021, 0x22112120, 0x22112121,
+    0x22112122, 0x20200000, 0x20200002, 0x20200200, 0x20200202, 0x20210101, 0x20220000, 0x20220002,
+    0x20220200, 0x20220202, 0x21200101, 0x21210001, 0x21210100, 0x21210102, 0x21210201, 0x22200000,
+    0x22200002, 0x22200200, 0x22200202, 0x22210101, 0x22220000, 0x22220002, 0x22220200, 0x22220202,
+    0x20200111, 0x20200211, 0x20210011, 0x20210110, 0x20210112, 0x20210211, 0x20210212, 0x21200112,
+    0x21200211, 0x21210011, 0x21210111, 0x21210210, 0x21210212, 0x21220011, 0x21220110, 0x22200111,
+    0x22210010, 0x22210012, 0x22210112, 0x22210211, 0x20200022, 0x20200220, 0x20200222, 0x20210020,
+    0x20210221, 0x20220022, 0x20220220, 0x20220222, 0x21200121, 0x21210021, 0x21210122, 0x21210221,
+    0x21220121, 0x22200020, 0x22200022, 0x22200220, 0x22200222, 0x22210121, 0x22220020, 0x22220022,
+    0x22220220, 0x22220222, 0x20211201, 0x20221101, 0x21201001, 0x21201100, 0x21211000, 0x21211100,
+    0x21211101, 0x21211200, 0x21211202, 0x21221001, 0x21221101, 0x21221102, 0x21221200, 0x21221201,
+    0x22201101, 0x20201112, 0x20201211, 0x20211010, 0x20211012, 0x20211111, 0x20211210, 0x20221112,
+    0x20221211, 0x21201012, 0x21201111, 0x21211011, 0x21211110, 0x21211111, 0x21211112, 0x21211211,
+    0x21221111, 0x21221212, 0x22201011, 0x22201110, 0x22201111, 0x22201112, 0x22201211, 0x22211012,
+    0x22211111, 0x22211210, 0x20201121, 0x20211021, 0x20211122, 0x20211222, 0x20221021, 0x20221121,
+    0x21201120, 0x21201122, 0x21201222, 0x21211022, 0x21211121, 0x21211122, 0x21211220, 0x21221020,
+    0x21221022, 0x22201122, 0x22211020, 0x22211121, 0x22211122, 0x22211221, 0x22221021, 0x22221120,
+    0x22221122, 0x20202000, 0x20202002, 0x20202200, 0x20202202, 0x20222000, 0x20222002, 0x20222200,
+    0x20222202, 0x21212001, 0x21212100, 0x21212102, 0x21212201, 0x22202000, 0x22202002, 0x22202200,
+    0x22202202, 0x22212101, 0x22222000, 0x22222002, 0x22222200, 0x22222202, 0x20202111, 0x20212110,
+    0x20212211, 0x20222011, 0x20222111, 0x21202011, 0x21212010, 0x21212111, 0x21212212, 0x21222011,
+    0x21222112, 0x21222211, 0x22212010, 0x22212112, 0x20202020, 0x20202022, 0x20202220, 0x20202222,
+    0x20222020, 0x20222022, 0x20222220, 0x20222222, 0x21212021, 0x21212120, 0x21212122, 0x22202020,
+    0x22202022, 0x22202220, 0x22202222, 0x22212121, 0x22222020, 0x22222022, 0x22222220, 0x22222222,
 };
 
 static const __device__ uint8_t ksigns_iq2xs[128] = {
diff --git a/csrc/quantization/gguf/gguf_kernel.cu b/csrc/quantization/gguf/gguf_kernel.cu
index 966d9992b25fd..37e4de4e14dd3 100644
--- a/csrc/quantization/gguf/gguf_kernel.cu
+++ b/csrc/quantization/gguf/gguf_kernel.cu
@@ -166,6 +166,11 @@ torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W,  // quant weight
                                    (void*)quant_X.data_ptr(),
                                    (half*)Y.data_ptr(), col, row, stream);
       break;
+    case 29:
+      mul_mat_vec_iq1_m_q8_1_cuda((void*)W.data_ptr(),
+                                  (void*)quant_X.data_ptr(),
+                                  (half*)Y.data_ptr(), col, row, stream);
+      break;
   }
   return Y;
 }
diff --git a/csrc/quantization/gguf/mmvq.cuh b/csrc/quantization/gguf/mmvq.cuh
index ef2ea072392d2..b221ae7896138 100644
--- a/csrc/quantization/gguf/mmvq.cuh
+++ b/csrc/quantization/gguf/mmvq.cuh
@@ -157,6 +157,14 @@ static void mul_mat_vec_iq1_s_q8_1_cuda(const void * vx, const void * vy, half *
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
+static void mul_mat_vec_iq1_m_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(block_num_y, 1, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI1_M, block_iq1_m, 1, vec_dot_iq1_m_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
 static void mul_mat_vec_iq4_nl_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(block_num_y, 1, 1);
diff --git a/csrc/quantization/gguf/vecdotq.cuh b/csrc/quantization/gguf/vecdotq.cuh
index 78c749d3f3bc1..ff339753bcbb5 100644
--- a/csrc/quantization/gguf/vecdotq.cuh
+++ b/csrc/quantization/gguf/vecdotq.cuh
@@ -1,5 +1,18 @@
 // copied and adapted from https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-cuda/vecdotq.cuh
 // and https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-cuda/mmq.cu
+static __device__ __forceinline__ int get_int_b2(const void * x, const int & i32) {
+    const uint16_t * x16 = (const uint16_t *) x; // assume at least 2 byte alignment
+
+    int x32  = x16[2*i32 + 0] <<  0;
+    x32     |= x16[2*i32 + 1] << 16;
+
+    return x32;
+}
+
+static __device__ __forceinline__ int get_int_b4(const void * x, const int & i32) {
+    return ((const int *) x)[i32]; // assume at least 4 byte alignment
+}
+
 static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
     const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
     int x32 = 0;
@@ -1658,28 +1671,76 @@ static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
 
 static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
     const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
 
-    const int ib32 = iqs;
-    int sumi1 = 0, sumi2 = 0, sumi3 = 0, sumi4 = 0;
-    const uint8_t h1 = bq1->scales[2*ib32+0];
-    const uint8_t h2 = bq1->scales[2*ib32+1];
-    const int * q8 = (const int *)bq8_1[ib32].qs;
-    const int * grid1 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+0] | ((h1 & 0x08) << 5)));
-    const int * grid2 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+1] | ((h1 & 0x80) << 1)));
-    const int * grid3 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+2] | ((h2 & 0x08) << 5)));
-    const int * grid4 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+3] | ((h2 & 0x80) << 1)));
-    for (int j = 0; j < 2; ++j) {
-        sumi1 = __dp4a(q8[j+0], grid1[j], sumi1);
-        sumi2 = __dp4a(q8[j+2], grid2[j], sumi2);
-        sumi3 = __dp4a(q8[j+4], grid3[j], sumi3);
-        sumi4 = __dp4a(q8[j+6], grid4[j], sumi4);
-    }
-    const float d = __half2float(bq1->d) * __low2float(bq8_1[ib32].ds);
-    return d * (sumi1 * (2*(h1 & 7) + 1) + sumi2 * (2*((h1 >> 4) & 7) + 1) +
-                sumi3 * (2*(h2 & 7) + 1) + sumi4 * (2*((h2 >> 4) & 7) + 1));
-#endif
+    const int       qs_packed = get_int_b2(bq1->qs, iqs);
+    const uint8_t * qs        = (const uint8_t *) &qs_packed;
+
+    const int qh = bq1->qh[iqs];
+
+    int sumi = 0;
+#pragma unroll
+    for (int l0 = 0; l0 < 8; l0 += 2) {
+        const int grid = iq1s_grid_gpu[qs[l0/2] | (((qh >> 3*(l0/2)) & 0x07) << 8)];
+
+        const int grid0 = (grid >> 0) & 0x0F0F0F0F;
+        const int grid1 = (grid >> 4) & 0x0F0F0F0F;
+
+        const int u0 = get_int_b4(bq8_1[iqs].qs, l0 + 0);
+        const int u1 = get_int_b4(bq8_1[iqs].qs, l0 + 1);
+
+        sumi = __dp4a(grid0, u0, sumi);
+        sumi = __dp4a(grid1, u1, sumi);
+    }
+
+    const float  d1q   = __half2float(bq1->d) * (((qh >> 11) & 0x0E) + 1);
+    const float  delta = -1.0f + IQ1S_DELTA - (qh & 0x8000) * (2.0f*IQ1S_DELTA/0x8000);
+    const float2 ds    = __half22float2(bq8_1[iqs].ds);
+    return d1q * (ds.x*sumi + ds.y*delta);
+}
+
+static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
+    const block_iq1_m * bq1 = (const block_iq1_m *) vbq;
+
+    const int       qs_packed = get_int_b4(bq1->qs, iqs);
+    const uint8_t * qs        = (const uint8_t *) &qs_packed;
+
+    int   sumi[2] = {0};
+    float sumf[2] = {0.0f};
+#pragma unroll
+    for (int l0 = 0; l0 < 8; l0 += 2) {
+        const int qhl = bq1->qh[2*iqs + l0/4] >> (4 * ((l0/2) % 2));
+
+        const int grid = iq1s_grid_gpu[qs[l0/2] | ((qhl & 0x07) << 8)];
+
+        const int grid0 = (grid >> 0) & 0x0F0F0F0F;
+        const int grid1 = (grid >> 4) & 0x0F0F0F0F;
+
+        const int u0 = get_int_b4(bq8_1[iqs].qs, l0 + 0);
+        const int u1 = get_int_b4(bq8_1[iqs].qs, l0 + 1);
+
+        sumi[l0/4] = __dp4a(grid0, u0, sumi[l0/4]);
+        sumi[l0/4] = __dp4a(grid1, u1, sumi[l0/4]);
+
+        const float delta = -1.0f + IQ1M_DELTA - (qhl & 0x08) * (2.0f*IQ1M_DELTA/0x08);
+        int sumy = 0;
+        sumy = __dp4a(u0, 0x01010101, sumy);
+        sumy = __dp4a(u1, 0x01010101, sumy);
+        sumf[l0/4] += delta*sumy;
+    }
+
+    const uint16_t * sc = (const uint16_t *) bq1->scales;
+
+    iq1m_scale_t scale;
+    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00F0) | ((sc[2] >> 4) & 0x0F00) | (sc[3] & 0xF000);
+    const float d = __half2float(scale.f16) * __low2float(bq8_1[iqs].ds);
+
+    const int tmp = sc[iqs/2] >> (6*(iqs%2));
+    const int sc0 = 2*((tmp >> 0) & 0x07) + 1;
+    const int sc1 = 2*((tmp >> 3) & 0x07) + 1;
+    return d * ((sumi[0] + sumf[0]) * sc0 + (sumi[1] + sumf[1]) * sc1);
 }
 
 static __device__ __forceinline__ void get_int_from_table_16(const uint32_t & q4, const uint8_t * values,
diff --git a/csrc/quantization/gptq_marlin/awq_marlin_repack.cu b/csrc/quantization/gptq_marlin/awq_marlin_repack.cu
index c58216d8e00c5..de8d9ef2ee63e 100644
--- a/csrc/quantization/gptq_marlin/awq_marlin_repack.cu
+++ b/csrc/quantization/gptq_marlin/awq_marlin_repack.cu
@@ -267,3 +267,15 @@ torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
 }
 
 #endif
+
+torch::Tensor awq_marlin_repack_meta(torch::Tensor& b_q_weight,
+                                     c10::SymInt size_k, c10::SymInt size_n,
+                                     int64_t num_bits) {
+  int const pack_factor = 32 / num_bits;
+  auto options = torch::TensorOptions()
+                     .dtype(b_q_weight.dtype())
+                     .device(b_q_weight.device());
+  return torch::empty_symint(
+      {size_k / marlin::tile_size, size_n * marlin::tile_size / pack_factor},
+      options);
+}
diff --git a/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu b/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu
index c71b1bf573263..70d48de12ab05 100644
--- a/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu
+++ b/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu
@@ -342,3 +342,15 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
 }
 
 #endif
+
+torch::Tensor gptq_marlin_repack_meta(torch::Tensor& b_q_weight,
+                                      torch::Tensor& perm, c10::SymInt size_k,
+                                      c10::SymInt size_n, int64_t num_bits) {
+  int const pack_factor = 32 / num_bits;
+  auto options = torch::TensorOptions()
+                     .dtype(b_q_weight.dtype())
+                     .device(b_q_weight.device());
+  return torch::empty_symint(
+      {size_k / marlin::tile_size, size_n * marlin::tile_size / pack_factor},
+      options);
+}
diff --git a/csrc/custom/paged_attention/attention_ll4mi.cu b/csrc/rocm/attention.cu
similarity index 94%
rename from csrc/custom/paged_attention/attention_ll4mi.cu
rename to csrc/rocm/attention.cu
index b38ec30dfcdc1..eb7c278435ab9 100644
--- a/csrc/custom/paged_attention/attention_ll4mi.cu
+++ b/csrc/rocm/attention.cu
@@ -1,4 +1,19 @@
-// TODO: add license terms
+/*
+ * Copyright (c) 2024, The vLLM team.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 #include <torch/all.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
@@ -6,8 +21,8 @@
 #include "cuda_compat.h"
 
 #include <algorithm>
-#include "../../attention/dtype_fp8.cuh"
-#include "../../quantization/fp8/amd/quant_utils.cuh"
+#include "../attention/dtype_fp8.cuh"
+#include "../quantization/fp8/amd/quant_utils.cuh"
 
 #if defined(__HIPCC__) && (defined(__gfx90a__) || defined(__gfx940__) || \
                            defined(__gfx941__) || defined(__gfx942__))
@@ -51,8 +66,6 @@ using _B8x8 = uint2;
 
 ////// Non temporal load stores ///////
 
-  #if 1
-
 template <typename T>
 __device__ __forceinline__ T load(T* addr) {
   return addr[0];
@@ -63,83 +76,6 @@ __device__ __forceinline__ void store(T value, T* addr) {
   addr[0] = value;
 }
 
-  #else
-
-template <typename T>
-__device__ __forceinline__ T load(const T* addr) {
-  return __builtin_nontemporal_load(addr);
-}
-
-template <>
-__device__ __forceinline__ float2 load(const float2* addr) {
-  auto addr_alias{reinterpret_cast<const uint64_t*>(addr)};
-  auto result = __builtin_nontemporal_load(addr_alias);
-  auto ret = reinterpret_cast<float2*>(&result);
-  return ret[0];
-}
-
-template <>
-__device__ __forceinline__ float4 load(const float4* addr) {
-  auto addr_alias{reinterpret_cast<const uint64_t*>(addr)};
-  auto result1 = __builtin_nontemporal_load(addr_alias);
-  auto result2 = __builtin_nontemporal_load(addr_alias + 1);
-  float4 ret{};
-  auto ret_alias = reinterpret_cast<float2*>(&result1);
-  ret.x = ret_alias->x;
-  ret.y = ret_alias->y;
-  ret_alias = reinterpret_cast<float2*>(&result2);
-  ret.z = ret_alias->x;
-  ret.w = ret_alias->y;
-  return ret;
-}
-
-template <>
-__device__ __forceinline__ __half load(const __half* addr) {
-  auto addr_alias{reinterpret_cast<const uint16_t*>(addr)};
-  auto result = __builtin_nontemporal_load(addr_alias);
-  auto ret = reinterpret_cast<__half*>(&result);
-  return ret[0];
-}
-
-template <>
-__device__ __forceinline__ __half2 load(const __half2* addr) {
-  auto addr_alias{reinterpret_cast<const uint32_t*>(addr)};
-  auto result = __builtin_nontemporal_load(addr_alias);
-  auto ret = reinterpret_cast<__half2*>(&result);
-  return ret[0];
-}
-
-template <>
-__device__ __forceinline__ vllm::Half4_ load(const vllm::Half4_* addr) {
-  auto addr_alias{reinterpret_cast<const uint64_t*>(addr)};
-  auto result = __builtin_nontemporal_load(addr_alias);
-  auto ret = reinterpret_cast<vllm::Half4_*>(&result);
-  return ret[0];
-}
-
-template <>
-__device__ __forceinline__ vllm::Half8_ load(const vllm::Half8_* addr) {
-  auto addr_alias{reinterpret_cast<const uint64_t*>(addr)};
-  auto result1 = __builtin_nontemporal_load(addr_alias);
-  auto result2 = __builtin_nontemporal_load(addr_alias + 1);
-  vllm::Half8_ ret{};
-  auto ret_alias = reinterpret_cast<vllm::Half4_*>(&result1);
-  ret.x = ret_alias->x;
-  ret.y = ret_alias->y;
-  ret_alias = reinterpret_cast<vllm::Half4_*>(&result2);
-  ret.z = ret_alias->x;
-  ret.w = ret_alias->y;
-  return ret;
-}
-
-//// Not using nontemporal stores for now
-template <typename T>
-__device__ __forceinline__ void store(T value, T* addr) {
-  return __builtin_nontemporal_store(value, addr);
-}
-
-  #endif
-
 template <typename T, int absz, int cbid, int blgp>
 __device__ __forceinline__ floatx4 gcn_mfma_instr(const _B16x4& inpA,
                                                   const _B16x4& inpB,
@@ -673,7 +609,6 @@ __global__ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_kernel(
       }
     }
   } else {  // warp in context
-
   // iterate across heads
   #pragma unroll
     for (int qh = 0; qh < QHLOOP; qh++) {
@@ -1136,7 +1071,7 @@ void paged_attention_custom_launcher(
       break;                                                    \
   }
 
-void paged_attention_custom(
+void paged_attention(
     torch::Tensor& out,         // [num_seqs, num_heads, head_size]
     torch::Tensor& exp_sums,    // [num_seqs, num_heads, max_num_partitions]
     torch::Tensor& max_logits,  // [num_seqs, num_heads, max_num_partitions]
diff --git a/csrc/custom/custom.cu b/csrc/rocm/custom.cu
similarity index 100%
rename from csrc/custom/custom.cu
rename to csrc/rocm/custom.cu
diff --git a/csrc/custom/custom_kernels.cu b/csrc/rocm/custom_kernels.cu
similarity index 100%
rename from csrc/custom/custom_kernels.cu
rename to csrc/rocm/custom_kernels.cu
diff --git a/csrc/custom/fused_kernels.cu b/csrc/rocm/fused_kernels.cu
similarity index 100%
rename from csrc/custom/fused_kernels.cu
rename to csrc/rocm/fused_kernels.cu
diff --git a/csrc/rocm/ops.h b/csrc/rocm/ops.h
new file mode 100644
index 0000000000000..18c72f937f90a
--- /dev/null
+++ b/csrc/rocm/ops.h
@@ -0,0 +1,23 @@
+#pragma once
+
+#include <torch/all.h>
+
+void LLMM_Silu(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c,
+               const int64_t rows_per_block);
+
+void LLMM1(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c,
+           const int64_t rows_per_block);
+
+void wvSpltK(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c,
+             const int64_t N_in, const int64_t CuCount);
+
+void paged_attention(torch::Tensor& out, torch::Tensor& exp_sums,
+                     torch::Tensor& max_logits, torch::Tensor& tmp_out,
+                     torch::Tensor& query, torch::Tensor& key_cache,
+                     torch::Tensor& value_cache, int64_t num_kv_heads,
+                     double scale, torch::Tensor& block_tables,
+                     torch::Tensor& context_lens, int64_t block_size,
+                     int64_t max_context_len,
+                     const c10::optional<torch::Tensor>& alibi_slopes,
+                     const std::string& kv_cache_dtype, double k_scale,
+                     double v_scale);
diff --git a/csrc/rocm/torch_bindings.cpp b/csrc/rocm/torch_bindings.cpp
new file mode 100644
index 0000000000000..2efa03e87e214
--- /dev/null
+++ b/csrc/rocm/torch_bindings.cpp
@@ -0,0 +1,46 @@
+#include "core/registration.h"
+#include "rocm/ops.h"
+
+// Note on op signatures:
+// The X_meta signatures are for the meta functions corresponding to op X.
+// They must be kept in sync with the signature for X. Generally, only
+// functions that return Tensors require a meta function.
+//
+// See the following links for detailed docs on op registration and function
+// schemas.
+// https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU/edit#heading=h.ptttacy8y1u9
+// https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/README.md#annotations
+
+TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) {
+  // vLLM custom ops for rocm
+  rocm_ops.def(
+      "LLMM1(Tensor in_a, Tensor in_b, Tensor! out_c, int rows_per_block) -> "
+      "()");
+  rocm_ops.impl("LLMM1", torch::kCUDA, &LLMM1);
+  rocm_ops.def(
+      "LLMM_Silu(Tensor in_a, Tensor in_b, Tensor! out_c, int rows_per_block) "
+      "-> ()");
+  rocm_ops.impl("LLMM_Silu", torch::kCUDA, &LLMM_Silu);
+
+  // Custom attention op
+  // Compute the attention between an input query and the cached
+  // keys/values using PagedAttention.
+  rocm_ops.def(
+      "paged_attention(Tensor! out, Tensor exp_sums,"
+      "                Tensor max_logits, Tensor tmp_out,"
+      "                Tensor query, Tensor key_cache,"
+      "                Tensor value_cache, int num_kv_heads,"
+      "                float scale, Tensor block_tables,"
+      "                Tensor context_lens, int block_size,"
+      "                int max_context_len,"
+      "                Tensor? alibi_slopes,"
+      "                str kv_cache_dtype,"
+      "                float k_scale, float v_scale) -> ()");
+  rocm_ops.impl("paged_attention", torch::kCUDA, &paged_attention);
+  rocm_ops.def(
+      "wvSpltK(Tensor in_a, Tensor in_b, Tensor! out_c, int N_in,"
+      "        int CuCount) -> ()");
+  rocm_ops.impl("wvSpltK", torch::kCUDA, &wvSpltK);
+}
+
+REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index 21408e03fc340..51b03df5d5976 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -36,8 +36,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   // PagedAttention V2.
   ops.def(
       "paged_attention_v2("
-      "    Tensor! out, Tensor exp_sums, Tensor max_logits,"
-      "    Tensor tmp_out, Tensor query, Tensor key_cache,"
+      "    Tensor! out, Tensor! exp_sums, Tensor! max_logits,"
+      "    Tensor! tmp_out, Tensor query, Tensor key_cache,"
       "    Tensor value_cache, int num_kv_heads, float scale,"
       "    Tensor block_tables, Tensor seq_lens, int block_size,"
       "    int max_seq_len, Tensor? alibi_slopes,"
@@ -77,8 +77,23 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.impl("gelu_quick", torch::kCUDA, &gelu_quick);
 
   // prepare_inputs advance_step
-  ops.def("advance_step", &advance_step);
-  ops.impl("advance_step", torch::kCUDA, &advance_step);
+  ops.def(
+      "advance_step_flashattn(int num_seqs, int num_queries, int block_size, "
+      "Tensor! input_tokens, Tensor sampled_token_ids, "
+      "Tensor! input_positions, Tensor! seq_lens, Tensor! slot_mapping, "
+      "Tensor block_tables) -> ()");
+  ops.impl("advance_step_flashattn", torch::kCUDA, &advance_step_flashattn);
+
+  ops.def(
+      "advance_step_flashinfer("
+      "    int num_seqs, int num_queries, int block_size,"
+      "    Tensor! input_tokens, Tensor sampled_token_ids,"
+      "    Tensor! input_positions, Tensor! seq_lens, Tensor! slot_mapping,"
+      "    Tensor block_tables, Tensor! paged_kv_indices,"
+      "    Tensor! paged_kv_indptr, Tensor! paged_kv_last_page_len,"
+      "    Tensor! block_table_bounds"
+      ") -> ()");
+  ops.impl("advance_step_flashinfer", torch::kCUDA, &advance_step_flashinfer);
 
   // Layernorm
   // Apply Root Mean Square (RMS) Normalization to the input tensor.
@@ -130,27 +145,56 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   // Quantization ops
 #ifndef USE_ROCM
   // Quantized GEMM for AQLM.
-  ops.def("aqlm_gemm", &aqlm_gemm);
+  ops.def(
+      "aqlm_gemm(Tensor input, Tensor codes, Tensor codebooks, "
+      "Tensor scales, int[] codebook_partition_sizes, Tensor? bias) "
+      "-> Tensor");
   ops.impl("aqlm_gemm", torch::kCUDA, &aqlm_gemm);
 
   // Decompression method for AQLM.
-  ops.def("aqlm_dequant", &aqlm_dequant);
+  ops.def(
+      "aqlm_dequant(Tensor codes, Tensor codebooks, "
+      "int[] codebook_partition_sizes) -> Tensor");
   ops.impl("aqlm_dequant", torch::kCUDA, &aqlm_dequant);
 
   // Quantized GEMM for AWQ.
-  ops.def("awq_gemm", &awq_gemm);
+  ops.def(
+      "awq_gemm(Tensor _in_feats, Tensor _kernel, Tensor _scaling_factors, "
+      "Tensor _zeros, int split_k_iters) -> Tensor");
   ops.impl("awq_gemm", torch::kCUDA, &awq_gemm);
 
   // Dequantization for AWQ.
-  ops.def("awq_dequantize", &awq_dequantize);
+  ops.def(
+      "awq_dequantize(Tensor _kernel, Tensor _scaling_factors, "
+      "Tensor _zeros, int split_k_iters, int thx, int thy) -> Tensor");
   ops.impl("awq_dequantize", torch::kCUDA, &awq_dequantize);
 
+  // Note about marlin kernel 'workspace' arguments:
+  // Technically these should be mutable since they are modified by the kernel.
+  // But since they are set back to zero once the kernel is finished we can
+  // hand wave and say that they have no net effect.
+  //
+  // The reason to mark 'workspace' as immutable is so that they don't interfere
+  // with using ScalarType arguments in the ops. If they are marked as mutable,
+  // pytorch throws an assert in
+  // 'torch._higher_order_ops._register_effectful_op' that prevents these
+  // kernels from being torch.compile'd.
+  // See the following document for more info on custom types and ops that use
+  // custom types:
+  // https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA
+
   // Marlin (Dense) Optimized Quantized GEMM for GPTQ.
-  ops.def("marlin_gemm", &marlin_gemm);
+  ops.def(
+      "marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
+      "Tensor! workspace, int size_m, int size_n, int size_k) -> Tensor");
   ops.impl("marlin_gemm", torch::kCUDA, &marlin_gemm);
 
   // Marlin_24 (Sparse) Optimized Quantized GEMM for GPTQ.
-  ops.def("gptq_marlin_24_gemm", &gptq_marlin_24_gemm);
+  ops.def(
+      "gptq_marlin_24_gemm(Tensor a, Tensor b_q_weight, Tensor b_meta, "
+      "Tensor b_scales, Tensor workspace, "
+      "__torch__.torch.classes._core_C.ScalarType b_q_type, "
+      "int size_m, int size_n, int size_k) -> Tensor");
   ops.impl("gptq_marlin_24_gemm", torch::kCUDA, &gptq_marlin_24_gemm);
 
   // Machete (Dense) Optimized Mixed Precision GEMM for Hopper.
@@ -169,35 +213,55 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.impl("machete_prepack_B", torch::kCUDA, &machete::prepack_B);
 
   // gptq_marlin Optimized Quantized GEMM for GPTQ.
-  ops.def("gptq_marlin_gemm", &gptq_marlin_gemm);
+  ops.def(
+      "gptq_marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
+      "Tensor b_zeros, Tensor g_idx, Tensor perm, Tensor workspace, "
+      "__torch__.torch.classes._core_C.ScalarType b_q_type, "
+      "int size_m, int size_n, int size_k, bool is_k_full, "
+      "bool has_zp, bool use_fp32_reduce) -> Tensor");
   ops.impl("gptq_marlin_gemm", torch::kCUDA, &gptq_marlin_gemm);
 
   // gptq_marlin repack from GPTQ.
-  ops.def("gptq_marlin_repack", &gptq_marlin_repack);
+  ops.def(
+      "gptq_marlin_repack(Tensor b_q_weight, Tensor perm, "
+      "SymInt size_k, SymInt size_n, int num_bits) -> Tensor");
   ops.impl("gptq_marlin_repack", torch::kCUDA, &gptq_marlin_repack);
+  ops.impl("gptq_marlin_repack", torch::kMeta, &gptq_marlin_repack_meta);
 
   // awq_marlin repack from AWQ.
-  ops.def("awq_marlin_repack", &awq_marlin_repack);
+  ops.def(
+      "awq_marlin_repack(Tensor b_q_weight, SymInt size_k, "
+      "SymInt size_n, int num_bits) -> Tensor");
   ops.impl("awq_marlin_repack", torch::kCUDA, &awq_marlin_repack);
+  ops.impl("awq_marlin_repack", torch::kMeta, &awq_marlin_repack_meta);
 
   // Dequantization for GGML.
-  ops.def("ggml_dequantize", &ggml_dequantize);
+  ops.def("ggml_dequantize(Tensor W, int type, int m, int n) -> Tensor");
   ops.impl("ggml_dequantize", torch::kCUDA, &ggml_dequantize);
 
   // mmvq kernel for GGML.
-  ops.def("ggml_mul_mat_vec_a8", &ggml_mul_mat_vec_a8);
+  ops.def(
+      "ggml_mul_mat_vec_a8(Tensor W, Tensor X, int type, int row) "
+      "-> Tensor");
   ops.impl("ggml_mul_mat_vec_a8", torch::kCUDA, &ggml_mul_mat_vec_a8);
 
   // mmq kernel for GGML.
-  ops.def("ggml_mul_mat_a8", &ggml_mul_mat_a8);
+  ops.def("ggml_mul_mat_a8(Tensor W, Tensor X, int type, int row) -> Tensor");
   ops.impl("ggml_mul_mat_a8", torch::kCUDA, &ggml_mul_mat_a8);
 
   // fp8_marlin Optimized Quantized GEMM for FP8 weight-only.
-  ops.def("fp8_marlin_gemm", &fp8_marlin_gemm);
+  ops.def(
+      "fp8_marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
+      "Tensor! workspace, int num_bits, int size_m, int size_n, "
+      "int size_k) -> Tensor");
   ops.impl("fp8_marlin_gemm", torch::kCUDA, &fp8_marlin_gemm);
 
   // marlin_qqq_gemm for QQQ.
-  ops.def("marlin_qqq_gemm", &marlin_qqq_gemm);
+  ops.def(
+      "marlin_qqq_gemm(Tensor a, Tensor b_q_weight, "
+      "Tensor s_tok, Tensor s_ch, Tensor s_group, "
+      "Tensor! workspace, int size_m, int size_n, "
+      "int size_k) -> Tensor");
   ops.impl("marlin_qqq_gemm", torch::kCUDA, &marlin_qqq_gemm);
 
   // CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
@@ -219,16 +283,16 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
 
   // Check if cutlass scaled_mm is supported for CUDA devices of the given
   // capability
-  ops.def("cutlass_scaled_mm_supports_fp8", &cutlass_scaled_mm_supports_fp8);
-  ops.impl("cutlass_scaled_mm_supports_fp8", torch::kCUDA,
-           &cutlass_scaled_mm_supports_fp8);
+  ops.def("cutlass_scaled_mm_supports_fp8(int cuda_device_capability) -> bool");
+  ops.impl("cutlass_scaled_mm_supports_fp8", &cutlass_scaled_mm_supports_fp8);
+
   // Mamba selective scan kernel
   ops.def(
       "selective_scan_fwd(Tensor! u, Tensor! delta,"
       "Tensor! A, Tensor! B, Tensor! C,"
       "Tensor? D_, Tensor? z_, Tensor? delta_bias_,"
       "bool delta_softplus,"
-      "Tensor? index_, Tensor? x) -> Tensor[]");
+      "Tensor? index_, Tensor(a! -> *)? x) -> Tensor(a)[]");
   ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd);
 
   ops.def(
@@ -250,7 +314,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
 #endif
 
   // Quantized GEMM for GPTQ.
-  ops.def("gptq_gemm", &gptq_gemm);
+  // Note: even though the C++ inferred schema is correct for this op, it seems
+  // to prevent the meta function registry.
+  ops.def(
+      "gptq_gemm(Tensor a, Tensor b_q_weight, Tensor b_gptq_qzeros, "
+      "Tensor b_gptq_scales, Tensor b_g_idx, bool use_exllama, int bit) "
+      "-> Tensor");
   ops.impl("gptq_gemm", torch::kCUDA, &gptq_gemm);
 
   // Post processing for GPTQ.
@@ -270,8 +339,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
 
   // Compute dynamic-per-token FP8 quantized tensor and scaling factor.
   ops.def(
-      "dynamic_per_token_scaled_fp8_quant(Tensor! out, Tensor input, Tensor! "
-      "scale, Tensor? scale_ub) -> "
+      "dynamic_per_token_scaled_fp8_quant(Tensor! out, Tensor input, "
+      "Tensor! scale, Tensor? scale_ub) -> "
       "()");
   ops.impl("dynamic_per_token_scaled_fp8_quant", torch::kCUDA,
            &dynamic_per_token_scaled_fp8_quant);
@@ -308,8 +377,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
 
   // Copy the cache blocks from src to dst.
   cache_ops.def(
-      "copy_blocks(Tensor[]! key_caches, Tensor[]! value_caches, Tensor "
-      "block_mapping) -> ()");
+      "copy_blocks(Tensor(a!)[] key_caches, Tensor[](b!) value_caches, "
+      "Tensor block_mapping) -> ()");
   cache_ops.impl("copy_blocks", torch::kCUDA, &copy_blocks);
 
   // Reshape the key and value tensors and cache them.
@@ -334,8 +403,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
 
   // Convert the key and value cache to fp8 data type.
   cache_ops.def(
-      "convert_fp8(Tensor! dst_cache, Tensor src_cache, float scale, str "
-      "kv_cache_dtype) -> ()");
+      "convert_fp8(Tensor! dst_cache, Tensor src_cache, float scale, "
+      "str kv_cache_dtype) -> ()");
   cache_ops.impl("convert_fp8", torch::kCUDA, &convert_fp8);
 }
 
@@ -343,23 +412,27 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) {
   // Cuda utils
 
   // Gets the specified device attribute.
-  cuda_utils.def("get_device_attribute", &get_device_attribute);
-  cuda_utils.impl("get_device_attribute", torch::kCUDA, &get_device_attribute);
+  cuda_utils.def("get_device_attribute(int attribute, int device_id) -> int");
+  cuda_utils.impl("get_device_attribute", &get_device_attribute);
 
   // Gets the maximum shared memory per block device attribute.
-  cuda_utils.def("get_max_shared_memory_per_block_device_attribute",
-                 &get_max_shared_memory_per_block_device_attribute);
+  cuda_utils.def(
+      "get_max_shared_memory_per_block_device_attribute(int device_id) -> int");
   cuda_utils.impl("get_max_shared_memory_per_block_device_attribute",
-                  torch::kCUDA,
                   &get_max_shared_memory_per_block_device_attribute);
 }
 
 TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
   // Custom all-reduce kernels
-  custom_ar.def("init_custom_ar", &init_custom_ar);
+  custom_ar.def(
+      "init_custom_ar(Tensor meta, Tensor rank_data, "
+      "str[] handles, int[] offsets, int rank, "
+      "bool full_nvlink) -> int");
   custom_ar.impl("init_custom_ar", torch::kCUDA, &init_custom_ar);
 
-  custom_ar.def("should_custom_ar", &should_custom_ar);
+  custom_ar.def(
+      "should_custom_ar(Tensor inp, int max_size, int world_size, "
+      "bool full_nvlink) -> bool");
   custom_ar.impl("should_custom_ar", torch::kCUDA, &should_custom_ar);
 
   custom_ar.def("all_reduce_reg(int fa, Tensor inp, Tensor! out) -> ()");
@@ -371,21 +444,15 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
   custom_ar.impl("all_reduce_unreg", torch::kCUDA, &all_reduce_unreg);
 
   custom_ar.def("dispose", &dispose);
-  custom_ar.impl("dispose", torch::kCPU, &dispose);
-
   custom_ar.def("meta_size", &meta_size);
-  custom_ar.impl("meta_size", torch::kCPU, &meta_size);
 
-  custom_ar.def("register_buffer", &register_buffer);
+  custom_ar.def(
+      "register_buffer(int fa, Tensor t, str[] handles, "
+      "int[] offsets) -> ()");
   custom_ar.impl("register_buffer", torch::kCUDA, &register_buffer);
 
   custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta);
-  custom_ar.impl("get_graph_buffer_ipc_meta", torch::kCPU,
-                 &get_graph_buffer_ipc_meta);
-
   custom_ar.def("register_graph_buffers", &register_graph_buffers);
-  custom_ar.impl("register_graph_buffers", torch::kCPU,
-                 &register_graph_buffers);
 #ifdef USE_ROCM
   custom_ar.def("allocate_meta_buffer", &allocate_meta_buffer);
   custom_ar.impl("allocate_meta_buffer", torch::kCUDA, &allocate_meta_buffer);
diff --git a/docs/source/community/meetups.rst b/docs/source/community/meetups.rst
index 3b01b109ebf2c..a3962e96e7913 100644
--- a/docs/source/community/meetups.rst
+++ b/docs/source/community/meetups.rst
@@ -5,6 +5,7 @@ vLLM Meetups
 
 We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
 
+- `The sixth vLLM meetup <https://lu.ma/87q3nvnh>`__, with NVIDIA, September 9th 2024. `[Slides] <https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing>`__
 - `The fifth vLLM meetup <https://lu.ma/lp0gyjqr>`__, with AWS, July 24th 2024. `[Slides] <https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing>`__
 - `The fourth vLLM meetup <https://lu.ma/agivllm>`__, with Cloudflare and BentoML, June 11th 2024. `[Slides] <https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing>`__
 - `The third vLLM meetup <https://robloxandvllmmeetup2024.splashthat.com/>`__, with Roblox, April 2nd 2024. `[Slides] <https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing>`__
diff --git a/docs/source/conf.py b/docs/source/conf.py
index b4f5b4ab9d569..8435129e752e1 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -99,6 +99,7 @@ def setup(app):
     "aiohttp",
     "compressed_tensors",
     "cpuinfo",
+    "cv2",
     "torch",
     "transformers",
     "psutil",
diff --git a/docs/source/getting_started/cpu-installation.rst b/docs/source/getting_started/cpu-installation.rst
index 7fc469e06844f..816e0a29ef28b 100644
--- a/docs/source/getting_started/cpu-installation.rst
+++ b/docs/source/getting_started/cpu-installation.rst
@@ -59,6 +59,20 @@ Build from source
     $ pip install wheel packaging ninja "setuptools>=49.4.0" numpy
     $ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
 
+- Third, build and install oneDNN library from source:
+
+.. code-block:: console
+
+    $ git clone -b rls-v3.5 https://github.com/oneapi-src/oneDNN.git
+    $ cmake -B ./oneDNN/build -S ./oneDNN -G Ninja -DONEDNN_LIBRARY_TYPE=STATIC \ 
+        -DONEDNN_BUILD_DOC=OFF \ 
+        -DONEDNN_BUILD_EXAMPLES=OFF \ 
+        -DONEDNN_BUILD_TESTS=OFF \ 
+        -DONEDNN_BUILD_GRAPH=OFF \ 
+        -DONEDNN_ENABLE_WORKLOAD=INFERENCE \ 
+        -DONEDNN_ENABLE_PRIMITIVE=MATMUL
+    $ cmake --build ./oneDNN/build --target install --config Release
+
 - Finally, build and install vLLM CPU backend: 
 
 .. code-block:: console
diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst
index f0e54c29fcad7..50a761b49490c 100644
--- a/docs/source/getting_started/installation.rst
+++ b/docs/source/getting_started/installation.rst
@@ -26,6 +26,10 @@ You can install vLLM using pip:
     $ # Install vLLM with CUDA 12.1.
     $ pip install vllm
 
+.. note::
+
+    Although we recommend using ``conda`` to create and manage Python environments, it is highly recommended to use ``pip`` to install vLLM. This is because ``pip`` can install ``torch`` with separate library packages like ``NCCL``, while ``conda`` installs ``torch`` with statically linked ``NCCL``. This can cause issues when vLLM tries to use ``NCCL``. See `this issue <https://github.com/vllm-project/vllm/issues/8420>`_ for more details.
+
 .. note::
 
     As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default.
@@ -34,7 +38,7 @@ You can install vLLM using pip:
     .. code-block:: console
 
         $ # Install vLLM with CUDA 11.8.
-        $ export VLLM_VERSION=0.4.0
+        $ export VLLM_VERSION=0.6.1.post1
         $ export PYTHON_VERSION=310
         $ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
 
@@ -48,7 +52,7 @@ You can install vLLM using pip:
 
     .. code-block:: console
 
-        $ export VLLM_VERSION=0.5.4 # vLLM's main branch version is currently set to latest released tag
+        $ export VLLM_VERSION=0.6.1.post1 # vLLM's main branch version is currently set to latest released tag
         $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-${VLLM_VERSION}-cp38-abi3-manylinux1_x86_64.whl
         $ # You can also access a specific commit
         $ # export VLLM_COMMIT=...
@@ -80,11 +84,11 @@ You can also build and install vLLM from source:
 
 .. tip::
 
-    Building from source requires quite a lot compilation. If you are building from source for multiple times, it is beneficial to cache the compilation results. For example, you can install `ccache <https://github.com/ccache/ccache>`_ via either `conda install ccache` or `apt install ccache` . As long as `which ccache` command can find the `ccache` binary, it will be used automatically by the build system. After the first build, the subsequent builds will be much faster.
+    Building from source requires quite a lot compilation. If you are building from source for multiple times, it is beneficial to cache the compilation results. For example, you can install `ccache <https://github.com/ccache/ccache>`_ via either ``conda install ccache`` or ``apt install ccache`` . As long as ``which ccache`` command can find the ``ccache`` binary, it will be used automatically by the build system. After the first build, the subsequent builds will be much faster.
 
 .. tip::
     To avoid your system being overloaded, you can limit the number of compilation jobs
-    to be run simultaneously, via the environment variable `MAX_JOBS`. For example:
+    to be run simultaneously, via the environment variable ``MAX_JOBS``. For example:
 
     .. code-block:: console
 
@@ -99,7 +103,7 @@ You can also build and install vLLM from source:
         $ # Use `--ipc=host` to make sure the shared memory is large enough.
         $ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3
 
-    If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from `the official website <https://developer.nvidia.com/cuda-toolkit-archive>`_. After installation, set the environment variable `CUDA_HOME` to the installation path of CUDA Toolkit, and make sure that the `nvcc` compiler is in your `PATH`, e.g.:
+    If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from `the official website <https://developer.nvidia.com/cuda-toolkit-archive>`_. After installation, set the environment variable ``CUDA_HOME`` to the installation path of CUDA Toolkit, and make sure that the ``nvcc`` compiler is in your ``PATH``, e.g.:
 
     .. code-block:: console
 
diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index 1bb3a448f2c92..3dcc242803752 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -107,6 +107,10 @@ Decoder-only Language Models
     - MiniCPM
     - :code:`openbmb/MiniCPM-2B-sft-bf16`, :code:`openbmb/MiniCPM-2B-dpo-bf16`, etc.
     -
+  * - :code:`MiniCPM3ForCausalLM`
+    - MiniCPM3
+    - :code:`openbmb/MiniCPM3-4B`, etc.
+    -
   * - :code:`MistralForCausalLM`
     - Mistral, Mistral-Instruct
     - :code:`mistralai/Mistral-7B-v0.1`, :code:`mistralai/Mistral-7B-Instruct-v0.1`, etc.
@@ -227,6 +231,11 @@ Multimodal Language Models
     - Image\ :sup:`E+`
     - :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc.
     -
+  * - :code:`LlavaNextVideoForConditionalGeneration`
+    - LLaVA-NeXT-Video
+    - Video
+    - :code:`llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. (see note)
+    -
   * - :code:`MiniCPMV`
     - MiniCPM-V
     - Image\ :sup:`+`
@@ -242,11 +251,21 @@ Multimodal Language Models
     - Image\ :sup:`E+`
     - :code:`microsoft/Phi-3-vision-128k-instruct`, :code:`microsoft/Phi-3.5-vision-instruct` etc.
     -
+  * - :code:`PixtralForConditionalGeneration`
+    - Pixtral
+    - Image\ :sup:`+`
+    - :code:`mistralai/Pixtral-12B-2409`
+    -
   * - :code:`QWenLMHeadModel`
     - Qwen-VL
-    - Image\ :sup:`E`
+    - Image\ :sup:`E+`
     - :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc.
     -
+  * - :code:`Qwen2VLForConditionalGeneration`
+    - Qwen2-VL (see note)
+    - Image\ :sup:`+` / Video\ :sup:`+`
+    - :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc.
+    -
   * - :code:`UltravoxModel`
     - Ultravox
     - Audio\ :sup:`E+`
@@ -260,6 +279,14 @@ Multimodal Language Models
   For :code:`openbmb/MiniCPM-V-2`, the official repo doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now.
   For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630
 
+.. note::
+  For :code:`LLaVA-NeXT-Video` and :code:`Qwen2-VL`, the latest release of :code:`huggingface/transformers` doesn't work yet, so we need to use a developer version (:code:`21fac7abba2a37fae86106f87fcf9974fd1e3830`) for now.
+  This can be installed by running the following command: 
+
+  .. code-block:: bash
+    
+    pip install git+https://github.com/huggingface/transformers.git@21fac7abba2a37fae86106f87fcf9974fd1e3830
+
 ----
 
 If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
@@ -319,7 +346,7 @@ Note that, as an inference engine, vLLM does not introduce new models. Therefore
 
 We have the following levels of testing for models:
 
-1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to `test_models.py <https://github.com/vllm-project/vllm/blob/main/tests/models/test_models.py>`_ and `test_big_models.py <https://github.com/vllm-project/vllm/blob/main/tests/models/test_big_models.py>`_ for the models that have passed this test.
+1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to `models tests <https://github.com/vllm-project/vllm/blob/main/tests/models>`_ for the models that have passed this test.
 2. **Output Sensibility**: We check if the output of the model is sensible and coherent, by measuring the perplexity of the output and checking for any obvious errors. This is a less stringent test.
 3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to `functionality tests <https://github.com/vllm-project/vllm/tree/main/tests>`_ and `examples <https://github.com/vllm-project/vllm/tree/main/examples>`_ for the models that have passed this test.
 4. **Community Feedback**: We rely on the community to provide feedback on the models. If a model is broken or not working as expected, we encourage users to raise issues to report it or open pull requests to fix it. The rest of the models fall under this category.
diff --git a/examples/fp8/quantizer/README.md b/examples/fp8/quantizer/README.md
index 0b6944f688b49..d0895e97dc341 100644
--- a/examples/fp8/quantizer/README.md
+++ b/examples/fp8/quantizer/README.md
@@ -1,6 +1,6 @@
 ### Quantizer Utilities
-`quantize.py`: NVIDIA Quantization utilities using AMMO, ported from TensorRT-LLM:
-`https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/quantize.py`
+`quantize.py`: NVIDIA Quantization utilities using TensorRT-Model-Optimizer, ported
+from TensorRT-LLM: [`examples/quantization/quantize.py`](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/quantize.py)
 
 ### Prerequisite
 
diff --git a/examples/offline_inference_pixtral.py b/examples/offline_inference_pixtral.py
new file mode 100644
index 0000000000000..c12ff7021cf51
--- /dev/null
+++ b/examples/offline_inference_pixtral.py
@@ -0,0 +1,165 @@
+# ruff: noqa
+import argparse
+
+from vllm import LLM
+from vllm.sampling_params import SamplingParams
+
+# This script is an offline demo for running Pixtral.
+#
+# If you want to run a server/client setup, please follow this code:
+#
+# - Server:
+#
+# ```bash
+# vllm serve mistralai/Pixtral-12B-2409 --tokenizer-mode mistral --limit-mm-per-prompt 'image=4' --max-model-len 16384
+# ```
+#
+# - Client:
+#
+# ```bash
+# curl --location 'http://<your-node-url>:8000/v1/chat/completions' \
+# --header 'Content-Type: application/json' \
+# --header 'Authorization: Bearer token' \
+# --data '{
+#     "model": "mistralai/Pixtral-12B-2409",
+#     "messages": [
+#       {
+#         "role": "user",
+#         "content": [
+#             {"type" : "text", "text": "Describe this image in detail please."},
+#             {"type": "image_url", "image_url": {"url": "https://s3.amazonaws.com/cms.ipressroom.com/338/files/201808/5b894ee1a138352221103195_A680%7Ejogging-edit/A680%7Ejogging-edit_hero.jpg"}},
+#             {"type" : "text", "text": "and this one as well. Answer in French."},
+#             {"type": "image_url", "image_url": {"url": "https://www.wolframcloud.com/obj/resourcesystem/images/a0e/a0ee3983-46c6-4c92-b85d-059044639928/6af8cfb971db031b.png"}}
+#         ]
+#       }
+#     ]
+#   }'
+# ```
+#
+# Usage:
+#     python demo.py simple
+#     python demo.py advanced
+
+
+def run_simple_demo():
+    model_name = "mistralai/Pixtral-12B-2409"
+    sampling_params = SamplingParams(max_tokens=8192)
+
+    # Lower max_num_seqs or max_model_len on low-VRAM GPUs.
+    llm = LLM(model=model_name, tokenizer_mode="mistral")
+
+    prompt = "Describe this image in one sentence."
+    image_url = "https://picsum.photos/id/237/200/300"
+
+    messages = [
+        {
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": prompt
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url
+                    }
+                },
+            ],
+        },
+    ]
+    outputs = llm.chat(messages, sampling_params=sampling_params)
+
+    print(outputs[0].outputs[0].text)
+
+
+def run_advanced_demo():
+    model_name = "mistralai/Pixtral-12B-2409"
+    max_img_per_msg = 5
+    max_tokens_per_img = 4096
+
+    sampling_params = SamplingParams(max_tokens=8192, temperature=0.7)
+    llm = LLM(
+        model=model_name,
+        tokenizer_mode="mistral",
+        limit_mm_per_prompt={"image": max_img_per_msg},
+        max_model_len=max_img_per_msg * max_tokens_per_img,
+    )
+
+    prompt = "Describe the following image."
+
+    url_1 = "https://huggingface.co/datasets/patrickvonplaten/random_img/resolve/main/yosemite.png"
+    url_2 = "https://picsum.photos/seed/picsum/200/300"
+    url_3 = "https://picsum.photos/id/32/512/512"
+
+    messages = [
+        {
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": prompt
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": url_1
+                    }
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": url_2
+                    }
+                },
+            ],
+        },
+        {
+            "role": "assistant",
+            "content": "The images show nature.",
+        },
+        {
+            "role": "user",
+            "content": "More details please and answer only in French!.",
+        },
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": url_3
+                    }
+                },
+            ],
+        },
+    ]
+
+    outputs = llm.chat(messages=messages, sampling_params=sampling_params)
+    print(outputs[0].outputs[0].text)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Run a demo in simple or advanced mode.")
+
+    parser.add_argument(
+        "mode",
+        choices=["simple", "advanced"],
+        help="Specify the demo mode: 'simple' or 'advanced'",
+    )
+
+    args = parser.parse_args()
+
+    if args.mode == "simple":
+        print("Running simple demo...")
+        run_simple_demo()
+    elif args.mode == "advanced":
+        print("Running advanced demo...")
+        run_advanced_demo()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py
index aa1580343aee7..464eaf334e3de 100644
--- a/examples/offline_inference_vision_language.py
+++ b/examples/offline_inference_vision_language.py
@@ -9,12 +9,9 @@
 
 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
+from vllm.assets.video import VideoAsset
 from vllm.utils import FlexibleArgumentParser
 
-# Input image and question
-image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
-question = "What is the content of this image?"
-
 
 # LLaVA-1.5
 def run_llava(question):
@@ -30,7 +27,16 @@ def run_llava(question):
 def run_llava_next(question):
 
     prompt = f"[INST] <image>\n{question} [/INST]"
-    llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf")
+    llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192)
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
+
+
+# LlaVA-NeXT-Video
+# Currently only support for video input
+def run_llava_next_video(question):
+    prompt = f"USER: <video>\n{question} ASSISTANT:"
+    llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf", max_model_len=8192)
     stop_token_ids = None
     return llm, prompt, stop_token_ids
 
@@ -173,9 +179,27 @@ def run_qwen_vl(question):
     return llm, prompt, stop_token_ids
 
 
+# Qwen2-VL
+def run_qwen2_vl(question):
+    model_name = "Qwen/Qwen2-VL-7B-Instruct"
+
+    llm = LLM(
+        model=model_name,
+        max_num_seqs=5,
+    )
+
+    prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+              "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
+              f"{question}<|im_end|>\n"
+              "<|im_start|>assistant\n")
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
+
+
 model_example_map = {
     "llava": run_llava,
     "llava-next": run_llava_next,
+    "llava-next-video": run_llava_next_video,
     "fuyu": run_fuyu,
     "phi3_v": run_phi3v,
     "paligemma": run_paligemma,
@@ -184,14 +208,53 @@ def run_qwen_vl(question):
     "blip-2": run_blip2,
     "internvl_chat": run_internvl,
     "qwen_vl": run_qwen_vl,
+    "qwen2_vl": run_qwen2_vl,
 }
 
 
+def get_multi_modal_input(args):
+    """
+    return {
+        "data": image or video,
+        "question": question,
+    }
+    """
+    if args.modality == "image":
+        # Input image and question
+        image = ImageAsset("cherry_blossom") \
+            .pil_image.convert("RGB")
+        img_question = "What is the content of this image?"
+
+        return {
+            "data": image,
+            "question": img_question,
+        }
+
+    if args.modality == "video":
+        # Input video and question
+        video = VideoAsset(name="sample_demo_1.mp4",
+                           num_frames=args.num_frames).np_ndarrays
+        vid_question = "Why is this video funny?"
+
+        return {
+            "data": video,
+            "question": vid_question,
+        }
+
+    msg = f"Modality {args.modality} is not supported."
+    raise ValueError(msg)
+
+
 def main(args):
     model = args.model_type
     if model not in model_example_map:
         raise ValueError(f"Model type {model} is not supported.")
 
+    modality = args.modality
+    mm_input = get_multi_modal_input(args)
+    data = mm_input["data"]
+    question = mm_input["question"]
+
     llm, prompt, stop_token_ids = model_example_map[model](question)
 
     # We set temperature to 0.2 so that outputs can be different
@@ -206,7 +269,7 @@ def main(args):
         inputs = {
             "prompt": prompt,
             "multi_modal_data": {
-                "image": image
+                modality: data
             },
         }
 
@@ -215,7 +278,7 @@ def main(args):
         inputs = [{
             "prompt": prompt,
             "multi_modal_data": {
-                "image": image
+                modality: data
             },
         } for _ in range(args.num_prompts)]
 
@@ -238,8 +301,15 @@ def main(args):
                         help='Huggingface "model_type".')
     parser.add_argument('--num-prompts',
                         type=int,
-                        default=1,
+                        default=4,
                         help='Number of prompts to run.')
-
+    parser.add_argument('--modality',
+                        type=str,
+                        default="image",
+                        help='Modality of the input.')
+    parser.add_argument('--num-frames',
+                        type=int,
+                        default=16,
+                        help='Number of frames to extract from the video.')
     args = parser.parse_args()
     main(args)
diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference_vision_language_multi_image.py
index dd84627b9dc58..454872c628373 100644
--- a/examples/offline_inference_vision_language_multi_image.py
+++ b/examples/offline_inference_vision_language_multi_image.py
@@ -6,7 +6,7 @@
 from argparse import Namespace
 from typing import List
 
-from transformers import AutoTokenizer
+from transformers import AutoProcessor, AutoTokenizer
 
 from vllm import LLM, SamplingParams
 from vllm.multimodal.utils import fetch_image
@@ -19,7 +19,39 @@
 ]
 
 
-def load_phi3v(question, image_urls: List[str]):
+def load_qwenvl_chat(question: str, image_urls: List[str]):
+    model_name = "Qwen/Qwen-VL-Chat"
+    llm = LLM(
+        model=model_name,
+        trust_remote_code=True,
+        max_num_seqs=5,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+    placeholders = "".join(f"Picture {i}: <img></img>\n"
+                           for i, _ in enumerate(image_urls, start=1))
+
+    # This model does not have a chat_template attribute on its tokenizer,
+    # so we need to explicitly pass it. We use ChatML since it's used in the
+    # generation utils of the model:
+    # https://huggingface.co/Qwen/Qwen-VL-Chat/blob/main/qwen_generation_utils.py#L265
+    tokenizer = AutoTokenizer.from_pretrained(model_name,
+                                              trust_remote_code=True)
+
+    # Copied from: https://huggingface.co/docs/transformers/main/en/chat_templating
+    chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"  # noqa: E501
+
+    messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
+    prompt = tokenizer.apply_chat_template(messages,
+                                           tokenize=False,
+                                           add_generation_prompt=True,
+                                           chat_template=chat_template)
+
+    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"]
+    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
+    return llm, prompt, stop_token_ids, None, chat_template
+
+
+def load_phi3v(question: str, image_urls: List[str]):
     llm = LLM(
         model="microsoft/Phi-3.5-vision-instruct",
         trust_remote_code=True,
@@ -30,10 +62,10 @@ def load_phi3v(question, image_urls: List[str]):
                              for i, _ in enumerate(image_urls, start=1))
     prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"
     stop_token_ids = None
-    return llm, prompt, stop_token_ids
+    return llm, prompt, stop_token_ids, None, None
 
 
-def load_internvl(question, image_urls: List[str]):
+def load_internvl(question: str, image_urls: List[str]):
     model_name = "OpenGVLab/InternVL2-2B"
 
     llm = LLM(
@@ -60,18 +92,73 @@ def load_internvl(question, image_urls: List[str]):
     # https://huggingface.co/OpenGVLab/InternVL2-2B#service
     stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
     stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
-    return llm, prompt, stop_token_ids
+
+    return llm, prompt, stop_token_ids, None, None
+
+
+def load_qwen2_vl(question, image_urls: List[str]):
+    try:
+        from qwen_vl_utils import process_vision_info
+    except ModuleNotFoundError:
+        print('WARNING: `qwen-vl-utils` not installed, input images will not '
+              'be automatically resized. You can enable this functionality by '
+              '`pip install qwen-vl-utils`.')
+        process_vision_info = None
+
+    model_name = "Qwen/Qwen2-VL-7B-Instruct"
+
+    llm = LLM(
+        model=model_name,
+        max_num_seqs=5,
+        max_model_len=32768 if process_vision_info is None else 4096,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [{
+        "role": "system",
+        "content": "You are a helpful assistant."
+    }, {
+        "role":
+        "user",
+        "content": [
+            *placeholders,
+            {
+                "type": "text",
+                "text": question
+            },
+        ],
+    }]
+
+    processor = AutoProcessor.from_pretrained(model_name)
+
+    prompt = processor.apply_chat_template(messages,
+                                           tokenize=False,
+                                           add_generation_prompt=True)
+
+    stop_token_ids = None
+
+    if process_vision_info is None:
+        image_data = [fetch_image(url) for url in image_urls]
+    else:
+        image_data, _ = process_vision_info(messages)
+
+    return llm, prompt, stop_token_ids, image_data, None
 
 
 model_example_map = {
     "phi3_v": load_phi3v,
     "internvl_chat": load_internvl,
+    "qwen2_vl": load_qwen2_vl,
+    "qwen_vl_chat": load_qwenvl_chat,
 }
 
 
 def run_generate(model, question: str, image_urls: List[str]):
-    llm, prompt, stop_token_ids = model_example_map[model](question,
-                                                           image_urls)
+    llm, prompt, stop_token_ids, image_data, _ = model_example_map[model](
+        question, image_urls)
+    if image_data is None:
+        image_data = [fetch_image(url) for url in image_urls]
 
     sampling_params = SamplingParams(temperature=0.0,
                                      max_tokens=128,
@@ -81,7 +168,7 @@ def run_generate(model, question: str, image_urls: List[str]):
         {
             "prompt": prompt,
             "multi_modal_data": {
-                "image": [fetch_image(url) for url in image_urls]
+                "image": image_data
             },
         },
         sampling_params=sampling_params)
@@ -92,29 +179,32 @@ def run_generate(model, question: str, image_urls: List[str]):
 
 
 def run_chat(model: str, question: str, image_urls: List[str]):
-    llm, _, stop_token_ids = model_example_map[model](question, image_urls)
+    llm, _, stop_token_ids, _, chat_template = model_example_map[model](
+        question, image_urls)
 
     sampling_params = SamplingParams(temperature=0.0,
                                      max_tokens=128,
                                      stop_token_ids=stop_token_ids)
-
-    outputs = llm.chat([{
-        "role":
-        "user",
-        "content": [
-            {
-                "type": "text",
-                "text": question,
-            },
-            *({
-                "type": "image_url",
-                "image_url": {
-                    "url": image_url
+    outputs = llm.chat(
+        [{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": question,
                 },
-            } for image_url in image_urls),
-        ],
-    }],
-                       sampling_params=sampling_params)
+                *({
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url
+                    },
+                } for image_url in image_urls),
+            ],
+        }],
+        sampling_params=sampling_params,
+        chat_template=chat_template,
+    )
 
     for o in outputs:
         generated_text = o.outputs[0].text
diff --git a/examples/offline_inference_with_profiler.py b/examples/offline_inference_with_profiler.py
index 906c9502800d8..1f00d26808771 100644
--- a/examples/offline_inference_with_profiler.py
+++ b/examples/offline_inference_with_profiler.py
@@ -16,7 +16,7 @@
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
 # Create an LLM.
-llm = LLM(model="facebook/opt-125m")
+llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1)
 
 llm.start_profile()
 
diff --git a/pyproject.toml b/pyproject.toml
index 842344b96282b..68925796d5a53 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -76,7 +76,7 @@ exclude = [
 
 [tool.codespell]
 ignore-words-list = "dout, te, indicies, subtile"
-skip = "./tests/prompts,./benchmarks/sonnet.txt,./tests/lora/data,./build,./gradlib,./csrc/custom"
+skip = "./tests/models/fixtures,./tests/prompts,./benchmarks/sonnet.txt,./tests/lora/data,./build,./gradlib,./csrc/rocm"
 
 [tool.isort]
 use_parentheses = true
@@ -85,5 +85,6 @@ skip_gitignore = true
 [tool.pytest.ini_options]
 markers = [
     "skip_global_cleanup",
-    "vlm: run tests for vision language models only",
+    "core_model: run this model test in each PR instead of just daily",
+    "distributed_2_gpus: run this test only in distributed tests for 2 GPUs",
 ]
diff --git a/requirements-common.txt b/requirements-common.txt
index 49a290317f818..ad950d0313454 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -7,11 +7,12 @@ py-cpuinfo
 transformers >= 4.43.2  # Required for Chameleon and Llama 3.1 hotfox.
 tokenizers >= 0.19.1  # Required for Llama 3.
 protobuf # Required by LlamaTokenizer.
-fastapi
+fastapi < 0.113.0; python_version < '3.9'
+fastapi >= 0.114.1; python_version >= '3.9'
 aiohttp
 openai >= 1.40.0 # Ensure modern openai package (ensure types module present)
 uvicorn[standard]
-pydantic >= 2.8  # Required for OpenAI server.
+pydantic >= 2.9  # Required for fastapi >= 0.113.0
 pillow  # Required for image processing
 prometheus_client >= 0.18.0
 prometheus-fastapi-instrumentator >= 7.0.0
@@ -23,8 +24,9 @@ filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
 partial-json-parser # used for parsing partial JSON outputs
 pyzmq
 msgspec
-gguf == 0.9.1
+gguf == 0.10.0
 importlib_metadata
-mistral_common >= 1.3.4
+mistral_common >= 1.4.0
 pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
+einops # Required for Qwen2-VL.
diff --git a/requirements-test.txt b/requirements-test.txt
index 44ba99fe84bd4..16a883b81ce50 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -11,6 +11,7 @@ awscli
 einops # required for MPT, qwen-vl and Mamba
 httpx
 librosa # required for audio test
+opencv-python # required for video test
 peft
 requests
 ray[adag]>=2.35
@@ -20,6 +21,7 @@ compressed-tensors==0.4.0 # required for compressed-tensors
 timm # required for internvl test
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
+datamodel_code_generator # required for minicpm3 test
 
 # TODO: Add this after fully implementing llava(mantis)
 # git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test
diff --git a/requirements-xpu.txt b/requirements-xpu.txt
index 48d899ec70eda..f07211b48b68d 100644
--- a/requirements-xpu.txt
+++ b/requirements-xpu.txt
@@ -3,9 +3,10 @@
 
 setuptools < 70.0.0 # IPEX's torch have some dependency. to be removed.
 
-torch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/torch-2.1.0.post1%2Bcxx11.abi-cp310-cp310-linux_x86_64.whl
-intel_extension_for_pytorch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.1.30a0-cp310-cp310-linux_x86_64.whl
-oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/oneccl_bind_pt-2.1.200%2Bxpu-cp310-cp310-linux_x86_64.whl
+torch == 2.3.1+cxx11.abi
+intel-extension-for-pytorch == 2.3.110+xpu
+oneccl_bind_pt == 2.3.100+xpu
 
-triton @ https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v2.1.0/triton-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
+triton-xpu == 3.0.0b2
 
+--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
diff --git a/setup.py b/setup.py
index c38a1bd9f0713..8930ea7239dc9 100644
--- a/setup.py
+++ b/setup.py
@@ -170,14 +170,17 @@ def configure(self, ext: CMakeExtension) -> None:
 
         if is_sccache_available():
             cmake_args += [
+                '-DCMAKE_C_COMPILER_LAUNCHER=sccache',
                 '-DCMAKE_CXX_COMPILER_LAUNCHER=sccache',
                 '-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache',
-                '-DCMAKE_C_COMPILER_LAUNCHER=sccache',
+                '-DCMAKE_HIP_COMPILER_LAUNCHER=sccache',
             ]
         elif is_ccache_available():
             cmake_args += [
+                '-DCMAKE_C_COMPILER_LAUNCHER=ccache',
                 '-DCMAKE_CXX_COMPILER_LAUNCHER=ccache',
                 '-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache',
+                '-DCMAKE_HIP_COMPILER_LAUNCHER=ccache',
             ]
 
         # Pass the python executable to cmake so it can find an exact
@@ -460,7 +463,7 @@ def _read_requirements(filename: str) -> List[str]:
     ext_modules.append(CMakeExtension(name="vllm._moe_C"))
 
 if _is_hip():
-    ext_modules.append(CMakeExtension(name="vllm._custom_C"))
+    ext_modules.append(CMakeExtension(name="vllm._rocm_C"))
 
 if _build_custom_ops():
     ext_modules.append(CMakeExtension(name="vllm._C"))
@@ -505,6 +508,7 @@ def _read_requirements(filename: str) -> List[str]:
     ext_modules=ext_modules,
     extras_require={
         "tensorizer": ["tensorizer>=2.9.0"],
+        "video": ["opencv-python"],  # Required for video processing
         "audio": ["librosa", "soundfile"]  # Required for audio processing
     },
     cmdclass={"build_ext": cmake_build_ext} if len(ext_modules) > 0 else {},
diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py
index a89fa445bf96a..83c71b5cf6eb7 100644
--- a/tests/async_engine/test_api_server.py
+++ b/tests/async_engine/test_api_server.py
@@ -1,4 +1,3 @@
-import os
 import subprocess
 import sys
 import time
@@ -26,8 +25,7 @@ def _query_server_long(prompt: str) -> dict:
 
 
 @pytest.fixture
-def api_server(tokenizer_pool_size: int, engine_use_ray: bool,
-               worker_use_ray: bool):
+def api_server(tokenizer_pool_size: int, worker_use_ray: bool):
     script_path = Path(__file__).parent.joinpath(
         "api_server_async_engine.py").absolute()
     commands = [
@@ -37,25 +35,17 @@ def api_server(tokenizer_pool_size: int, engine_use_ray: bool,
         str(tokenizer_pool_size)
     ]
 
-    # Copy the environment variables and append `VLLM_ALLOW_ENGINE_USE_RAY=1`
-    # to prevent `--engine-use-ray` raises an exception due to it deprecation
-    env_vars = os.environ.copy()
-    env_vars["VLLM_ALLOW_ENGINE_USE_RAY"] = "1"
-
-    if engine_use_ray:
-        commands.append("--engine-use-ray")
     if worker_use_ray:
         commands.append("--worker-use-ray")
-    uvicorn_process = subprocess.Popen(commands, env=env_vars)
+    uvicorn_process = subprocess.Popen(commands)
     yield
     uvicorn_process.terminate()
 
 
 @pytest.mark.parametrize("tokenizer_pool_size", [0, 2])
 @pytest.mark.parametrize("worker_use_ray", [False, True])
-@pytest.mark.parametrize("engine_use_ray", [False, True])
-def test_api_server(api_server, tokenizer_pool_size: int, worker_use_ray: bool,
-                    engine_use_ray: bool):
+def test_api_server(api_server, tokenizer_pool_size: int,
+                    worker_use_ray: bool):
     """
     Run the API server and test it.
 
diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py
index 03494581431d4..a093a2b29278a 100644
--- a/tests/async_engine/test_async_llm_engine.py
+++ b/tests/async_engine/test_async_llm_engine.py
@@ -1,8 +1,10 @@
 import asyncio
 import os
+import uuid
 from asyncio import CancelledError
+from copy import copy
 from dataclasses import dataclass
-from typing import Optional
+from typing import List, Optional
 
 import pytest
 import pytest_asyncio
@@ -12,6 +14,7 @@
 from vllm.config import ParallelConfig
 from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine
 from vllm.outputs import RequestOutput as RealRequestOutput
+from vllm.sampling_params import RequestOutputKind
 
 from ..conftest import cleanup
 from ..utils import wait_for_gpu_memory_to_clear
@@ -72,14 +75,12 @@ def has_unfinished_requests_for_virtual_engine(self, virtual_engine):
 
 
 class MockAsyncLLMEngine(AsyncLLMEngine):
-
-    def _init_engine(self, *args, **kwargs):
-        return MockEngine()
+    _engine_class = MockEngine
 
 
 @pytest.mark.asyncio
 async def test_new_requests_event():
-    engine = MockAsyncLLMEngine(worker_use_ray=False, engine_use_ray=False)
+    engine = MockAsyncLLMEngine(worker_use_ray=False)
     engine.start_background_loop()
     await asyncio.sleep(0.01)
     assert engine.engine.step_calls == 0
@@ -112,16 +113,11 @@ async def test_new_requests_event():
     assert engine.engine.add_request_calls == 3
     assert engine.engine.step_calls == old_step_calls + 1
 
-    # Allow deprecated engine_use_ray to not raise exception
-    os.environ["VLLM_ALLOW_ENGINE_USE_RAY"] = "1"
-
-    engine = MockAsyncLLMEngine(worker_use_ray=True, engine_use_ray=True)
+    engine = MockAsyncLLMEngine(worker_use_ray=True)
     assert engine.get_model_config() is not None
     assert engine.get_tokenizer() is not None
     assert engine.get_decoding_config() is not None
 
-    os.environ.pop("VLLM_ALLOW_ENGINE_USE_RAY")
-
 
 def start_engine():
     wait_for_gpu_memory_to_clear(
@@ -130,8 +126,17 @@ def start_engine():
         timeout_s=60,
     )
 
+    num_scheduler_steps = int(os.getenv("NUM_SCHEDULER_STEPS", "1"))
+    print(f"Starting engine with num_scheduler_steps={num_scheduler_steps}")
+
     return AsyncLLMEngine.from_engine_args(
-        AsyncEngineArgs(model="facebook/opt-125m", enforce_eager=True))
+        AsyncEngineArgs(model="facebook/opt-125m",
+                        enforce_eager=True,
+                        num_scheduler_steps=num_scheduler_steps))
+
+
+def uid() -> str:
+    return str(uuid.uuid4())
 
 
 @pytest_asyncio.fixture(scope="module")
@@ -154,59 +159,195 @@ def should_do_global_cleanup_after_test(request) -> bool:
 
 
 @pytest.mark.asyncio(scope="module")
-async def test_asyncio_run(async_engine):
+@pytest.mark.parametrize("stop", [None, ["a stop string"]])
+async def test_asyncio_run(async_engine, stop):
+
+    scheduler_config = await async_engine.get_scheduler_config()
+    num_scheduler_steps = scheduler_config.num_scheduler_steps
 
     async def run(prompt: str):
         sampling_params = SamplingParams(
             temperature=0,
             max_tokens=32,
+            min_tokens=32,
+            stop=stop,
         )
 
+        output_count = 0
+        final_output = None
         async for output in async_engine.generate(prompt,
                                                   sampling_params,
-                                                  request_id=prompt):
+                                                  request_id=uid()):
+            output_count += 1
             final_output = output
-        return final_output
+        return final_output, output_count
 
     results = await asyncio.gather(
         run("test0"),
-        run("test1"),
+        run("test0"),
     )
     assert len(results) == 2
+    first, second = results
+
+    # remove nondeterministic fields for comparison
+    first[0].metrics = None
+    second[0].metrics = None
+    first[0].request_id = None
+    second[0].request_id = None
+
+    assert str(first) == str(second)
+
+    output_count = results[0][1]
+    if num_scheduler_steps == 1:
+        assert output_count == 32
+    else:
+        assert 1 < output_count < 32
 
 
 @pytest.mark.asyncio(scope="module")
-async def test_cancellation(async_engine):
+@pytest.mark.parametrize("stop", [None, ["a stop string"]])
+async def test_output_kinds(async_engine, stop):
+    """Test that output_kind works as expected and that
+    results are equivalent across different kinds."""
+
+    scheduler_config = await async_engine.get_scheduler_config()
+    num_scheduler_steps = scheduler_config.num_scheduler_steps
+
     sampling_params = SamplingParams(
         temperature=0,
-        min_tokens=10,
-        max_tokens=10,
+        max_tokens=32,
+        min_tokens=32,
+        stop=stop,
+    )
+
+    async def run(prompt: str, kind: RequestOutputKind):
+        params = copy(sampling_params)
+        params.output_kind = kind
+
+        output_count = 0
+        final_output = None
+        async for output in async_engine.generate(prompt,
+                                                  params,
+                                                  request_id=uid()):
+            output_count += 1
+            final_output = output
+
+        assert final_output is not None
+        assert final_output.finished
+
+        return (final_output.prompt_token_ids,
+                final_output.outputs[0].token_ids,
+                final_output.outputs[0].text, output_count)
+
+    async def run_deltas(prompt: str):
+        params = copy(sampling_params)
+        params.output_kind = RequestOutputKind.DELTA
+
+        prompt_tokens = None
+        output_tokens: List[int] = []
+        output_text = ""
+        output_count = 0
+        final_output = None
+        async for output in async_engine.generate(prompt,
+                                                  params,
+                                                  request_id=uid()):
+            token_ids = output.outputs[0].token_ids
+            text = output.outputs[0].text
+            final_output = output
+
+            # Ensure we get prompt ids iff we haven't yet received output tokens
+            if output_tokens:
+                assert 1 <= len(token_ids) <= num_scheduler_steps
+                assert stop or text
+                assert not output.prompt_token_ids
+            else:
+                assert output.prompt_token_ids
+                prompt_tokens = output.prompt_token_ids
+
+            output_tokens.extend(token_ids)
+            output_text += text
+
+            output_count += 1
+
+        assert final_output is not None
+        assert final_output.finished
+
+        return prompt_tokens, output_tokens, output_text, output_count
+
+    results = await asyncio.gather(
+        run("common input prompt", RequestOutputKind.CUMULATIVE),
+        run("common input prompt", RequestOutputKind.FINAL_ONLY),
+        run_deltas("common input prompt"))
+
+    # Make sure outputs are the same
+    prompt_set = set(tuple(prompt_ids) for prompt_ids, _, _, _ in results)
+    assert len(prompt_set) == 1
+
+    text_set = set(text for _, _, text, _ in results)
+    assert len(text_set) == 1
+
+    tokens_set = set(tuple(ids) for _, ids, _, _ in results)
+    assert len(tokens_set) == 1
+
+    cumulative, final, deltas = results
+
+    # output message counts
+    assert cumulative[3] == deltas[3]
+
+    if num_scheduler_steps == 1:
+        assert cumulative[3] == 32
+    else:
+        assert 1 < cumulative[3] < 32
+
+    assert final[3] == 1
+
+
+@pytest.mark.asyncio(scope="module")
+@pytest.mark.parametrize("stop", [None, ["a stop string"]])
+async def test_cancellation(async_engine, stop):
+    scheduler_config = await async_engine.get_scheduler_config()
+    num_scheduler_steps = scheduler_config.num_scheduler_steps
+
+    sampling_params = SamplingParams(
+        temperature=0,
+        min_tokens=13,
+        max_tokens=13,
+        stop=stop,
     )
 
+    stop_at = 5 if num_scheduler_steps == 1 else 1
+
+    request_id = uid()
+
     i = 0
     with pytest.raises(CancelledError):
         async for output in async_engine.generate("test2",
                                                   sampling_params,
-                                                  request_id="test2"):
+                                                  request_id=request_id):
             assert not output.finished
             i += 1
-            if i == 5:
-                await async_engine.abort("test2")
+            if i == stop_at:
+                await async_engine.abort(request_id)
 
-    assert i == 5
+    assert i == stop_at
 
 
 @pytest.mark.asyncio(scope="module")
-async def test_delayed_generator(async_engine):
+@pytest.mark.parametrize("stop", [None, ["a stop string"]])
+async def test_delayed_generator(async_engine, stop):
+    scheduler_config = await async_engine.get_scheduler_config()
+
+    if scheduler_config.num_scheduler_steps != 1:
+        pytest.skip("no need to test this one with multistep")
+
     sampling_params = SamplingParams(
         temperature=0,
         min_tokens=10,
         max_tokens=10,
+        stop=stop,
     )
 
-    stream = async_engine.generate("test3",
-                                   sampling_params,
-                                   request_id="test3")
+    stream = async_engine.generate("test3", sampling_params, request_id=uid())
     i = 0
     final_output: Optional[RealRequestOutput] = None
     async for output in stream:
diff --git a/tests/async_engine/test_chat_template.py b/tests/async_engine/test_chat_template.py
index 4df6c02973284..61a6d77cd8756 100644
--- a/tests/async_engine/test_chat_template.py
+++ b/tests/async_engine/test_chat_template.py
@@ -1,6 +1,7 @@
 import pytest
 
-from vllm.entrypoints.chat_utils import apply_chat_template, load_chat_template
+from vllm.entrypoints.chat_utils import (apply_hf_chat_template,
+                                         load_chat_template)
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
@@ -87,7 +88,7 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
         add_generation_prompt=add_generation_prompt)
 
     # Call the function and get the result
-    result = apply_chat_template(
+    result = apply_hf_chat_template(
         tokenizer,
         conversation=mock_request.messages,
         chat_template=mock_request.chat_template or template_content,
diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server.py
similarity index 91%
rename from tests/async_engine/test_openapi_server_ray.py
rename to tests/async_engine/test_openapi_server.py
index f70118546c7b6..9e5c7c04287eb 100644
--- a/tests/async_engine/test_openapi_server_ray.py
+++ b/tests/async_engine/test_openapi_server.py
@@ -19,16 +19,11 @@ def server():
         "--max-model-len",
         "2048",
         "--enforce-eager",
-        "--engine-use-ray",
         "--chat-template",
         str(chatml_jinja_path),
     ]
 
-    # Allow `--engine-use-ray`, otherwise the launch of the server throw
-    # an error due to try to use a deprecated feature
-    env_dict = {"VLLM_ALLOW_ENGINE_USE_RAY": "1"}
-    with RemoteOpenAIServer(MODEL_NAME, args,
-                            env_dict=env_dict) as remote_server:
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
         yield remote_server
 
 
diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index ec7c2ba3e3ce0..0fe88e792520a 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -3,20 +3,27 @@
 Run `pytest tests/basic_correctness/test_basic_correctness.py`.
 """
 import os
+import pickle
+import re
 import weakref
+from unittest.mock import patch
 
 import pytest
 
 from vllm import LLM
 from vllm.utils import is_hip
+from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata
 
 from ..models.utils import check_outputs_equal
+from ..utils import multi_gpu_test
 
 MODELS = [
     "facebook/opt-125m",
     "meta-llama/Llama-2-7b-hf",
 ]
 
+TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
+
 
 def test_vllm_gc_ed():
     """Verify vllm instance is GC'ed when it is deleted"""
@@ -64,3 +71,88 @@ def test_models(
         name_0="hf",
         name_1="vllm",
     )
+
+
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize(
+    "model, distributed_executor_backend, attention_backend, "
+    "test_suite", [
+        ("facebook/opt-125m", "ray", "", "L4"),
+        ("facebook/opt-125m", "mp", "", "L4"),
+        ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
+        ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
+        ("facebook/opt-125m", "ray", "", "A100"),
+        ("facebook/opt-125m", "mp", "", "A100"),
+        ("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
+        ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
+    ])
+def test_models_distributed(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    distributed_executor_backend: str,
+    attention_backend: str,
+    test_suite: str,
+) -> None:
+
+    if test_suite != TARGET_TEST_SUITE:
+        pytest.skip(f"Skip test for {test_suite}")
+
+    if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
+        # test ray adag
+        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
+        os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
+
+    if attention_backend:
+        os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
+
+    dtype = "half"
+    max_tokens = 5
+
+    # NOTE: take care of the order. run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).
+    with vllm_runner(model,
+                     dtype=dtype,
+                     tensor_parallel_size=2,
+                     distributed_executor_backend=distributed_executor_backend
+                     ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+
+    check_outputs_equal(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
+
+
+def test_model_with_failure(vllm_runner) -> None:
+    try:
+        with patch("vllm.model_executor.models.opt.OPTForCausalLM.forward",
+                   side_effect=ValueError()):
+            with pytest.raises(ValueError) as exc_info:
+                vllm_runner("facebook/opt-125m",
+                            dtype="half",
+                            enforce_eager=False,
+                            gpu_memory_utilization=0.7)
+            matches = re.search(r"input dumped to (.+).pkl",
+                                str(exc_info.value))
+            assert matches is not None
+            filename = f"{matches.group(1)}.pkl"
+
+        with open(filename, "rb") as filep:
+            inputs = pickle.load(filep)
+
+        if any(key not in inputs for key in ("arg_1", "arg_2", "arg_3")):
+            raise AssertionError("Missing keys in dumped inputs. Dumped keys: "
+                                 f"{list(inputs.keys())}")
+        assert isinstance(inputs["arg_1"],
+                          ModelInputForGPUWithSamplingMetadata)
+    finally:
+        os.remove(filename)
diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py
index 9c34b2a13fd53..14c5447680729 100644
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -6,11 +6,13 @@
 
 Run `pytest tests/models/test_chunked_prefill.py`.
 """
+import os
 from contextlib import nullcontext
 
 import pytest
 
 from ..models.utils import check_logprobs_close, check_outputs_equal
+from ..utils import multi_gpu_test
 
 MODELS = [
     "facebook/opt-125m",
@@ -66,6 +68,59 @@ def test_models(
     )
 
 
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
+@pytest.mark.parametrize("model", MODELS)
+def test_models_distributed(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    distributed_executor_backend: str,
+) -> None:
+    if (model == "meta-llama/Llama-2-7b-hf"
+            and distributed_executor_backend == "ray"):
+        # test ray adag
+        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
+        os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
+
+    dtype = "half"
+    max_tokens = 5
+    chunked_prefill_token_size = 16
+
+    # Add a chunked prefill config.
+    max_num_seqs = min(chunked_prefill_token_size, 256)
+    assert chunked_prefill_token_size != -1
+    enable_chunked_prefill = True
+    max_num_batched_tokens = chunked_prefill_token_size
+
+    # NOTE: take care of the order. run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).
+
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            tensor_parallel_size=2,
+            max_num_seqs=max_num_seqs,
+            enable_chunked_prefill=enable_chunked_prefill,
+            max_num_batched_tokens=max_num_batched_tokens,
+            distributed_executor_backend=distributed_executor_backend,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+
+    check_outputs_equal(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
+
+
 @pytest.mark.parametrize(
     "kv_cache_dtype,model",
     [("fp8_e4m3",
diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py
index 7e77037da07d3..00806c3e129b1 100644
--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
@@ -19,10 +19,13 @@
     "facebook/opt-125m",
 ]
 
-assert ENABLE_ARTIFICIAL_PREEMPT is True, (
-    "Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. "
-    "`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest "
-    "tests/basic_correctness/test_preemption.py`")
+
+@pytest.fixture(scope="module", autouse=True)
+def check_settings():
+    assert ENABLE_ARTIFICIAL_PREEMPT is True, (
+        "Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. "
+        "`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest "
+        "tests/basic_correctness/test_preemption.py`")
 
 
 @pytest.fixture
@@ -64,6 +67,7 @@ def test_chunked_prefill_recompute(
             enable_chunked_prefill=enable_chunked_prefill,
             max_num_seqs=max_num_seqs,
             worker_use_ray=worker_use_ray,
+            disable_log_stats=False,
     ) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
         assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py
index d5b59db8c7887..5452ce6be8110 100644
--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
@@ -6,7 +6,8 @@
 @pytest.mark.parametrize("model", ["meta-llama/Meta-Llama-3-8B"])
 def test_full_graph(model):
     # make sure these models can be captured in full graph mode
-    os.environ["VLLM_TEST_DYNAMO_GRAPH_CAPTURE"] = "1"
+    if "VLLM_TEST_DYNAMO_GRAPH_CAPTURE" not in os.environ:
+        os.environ["VLLM_TEST_DYNAMO_GRAPH_CAPTURE"] = "1"
 
     from vllm import LLM, SamplingParams
     prompts = [
@@ -16,5 +17,12 @@ def test_full_graph(model):
         "The future of AI is",
     ]
     sampling_params = SamplingParams(temperature=0)
-    llm = LLM(model="meta-llama/Meta-Llama-3-8B")
-    llm.generate(prompts, sampling_params)
+    llm = LLM(model=model, enforce_eager=True)
+
+    outputs = llm.generate(prompts, sampling_params)
+
+    # Print the outputs.
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
diff --git a/tests/conftest.py b/tests/conftest.py
index cd0091b7cba68..e4c7b96e82429 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -6,8 +6,8 @@
 import tempfile
 from collections import UserList
 from enum import Enum
-from typing import (Any, Callable, Dict, List, Optional, Tuple, TypedDict,
-                    TypeVar, Union)
+from typing import (Any, Callable, Dict, List, Optional, Tuple, Type,
+                    TypedDict, TypeVar, Union)
 
 import numpy as np
 import pytest
@@ -18,9 +18,11 @@
 from PIL import Image
 from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding,
                           BatchFeature)
+from transformers.models.auto.auto_factory import _BaseAutoModelClass
 
 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
+from vllm.assets.video import VideoAsset
 from vllm.config import TokenizerPoolConfig
 from vllm.connections import global_http_connection
 from vllm.distributed import (destroy_distributed_environment,
@@ -44,6 +46,7 @@
 PromptImageInput = Union[List[Image.Image], List[List[Image.Image]]]
 PromptAudioInput = Union[List[Tuple[np.ndarray, int]],
                          List[List[Tuple[np.ndarray, int]]]]
+PromptVideoInput = Union[List[np.ndarray], List[List[np.ndarray]]]
 
 
 def _read_prompts(filename: str) -> List[str]:
@@ -85,8 +88,35 @@ def prompts(self, prompts: _ImageAssetPrompts) -> List[str]:
         return [prompts["stop_sign"], prompts["cherry_blossom"]]
 
 
+class _VideoAssetPrompts(TypedDict):
+    sample_demo_1: str
+
+
+if sys.version_info < (3, 9):
+    # UserList cannot be subscripted
+    class _VideoAssetsBase(UserList):
+        pass
+else:
+
+    class _VideoAssetsBase(UserList[VideoAsset]):
+        pass
+
+
+class _VideoAssets(_VideoAssetsBase):
+
+    def __init__(self) -> None:
+        super().__init__([
+            VideoAsset("sample_demo_1.mp4"),
+        ])
+
+    def prompts(self, prompts: _VideoAssetPrompts) -> List[str]:
+        return [prompts["sample_demo_1"]]
+
+
 IMAGE_ASSETS = _ImageAssets()
 """Singleton instance of :class:`_ImageAssets`."""
+VIDEO_ASSETS = _VideoAssets()
+"""Singleton instance of :class:`_VideoAssets`."""
 
 
 @pytest.fixture(autouse=True)
@@ -202,6 +232,11 @@ def image_assets() -> _ImageAssets:
     return IMAGE_ASSETS
 
 
+@pytest.fixture(scope="session")
+def video_assets() -> _VideoAssets:
+    return VIDEO_ASSETS
+
+
 _T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature)
 
 
@@ -226,7 +261,7 @@ def __init__(
         *,
         model_kwargs: Optional[Dict[str, Any]] = None,
         is_embedding_model: bool = False,
-        auto_cls=AutoModelForCausalLM,
+        auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM,
         postprocess_inputs: Callable[[BatchEncoding],
                                      BatchEncoding] = identity,
     ) -> None:
@@ -258,20 +293,14 @@ def __init__(
             trust_remote_code=True,
         )
 
-        try:
-            # don't put this import at the top level
-            # it will call torch.cuda.device_count()
-            from transformers import AutoProcessor  # noqa: F401
-            self.processor = AutoProcessor.from_pretrained(
-                model_name,
-                torch_dtype=torch_dtype,
-                trust_remote_code=True,
-            )
-        except Exception as exc:
-            logger.warning(
-                "Unable to auto-load HuggingFace processor for model (%s). "
-                "Using tokenizer instead. Reason: %s", model_name, exc)
-            self.processor = self.tokenizer
+        # don't put this import at the top level
+        # it will call torch.cuda.device_count()
+        from transformers import AutoProcessor  # noqa: F401
+        self.processor = AutoProcessor.from_pretrained(
+            model_name,
+            torch_dtype=torch_dtype,
+            trust_remote_code=True,
+        )
 
         self.postprocess_inputs = postprocess_inputs
 
@@ -279,6 +308,7 @@ def generate(
         self,
         prompts: List[str],
         images: Optional[PromptImageInput] = None,
+        videos: Optional[List[np.ndarray]] = None,
         **kwargs: Any,
     ) -> List[Tuple[List[List[int]], List[str]]]:
         if images:
@@ -292,6 +322,8 @@ def generate(
             }
             if images is not None and images[i] is not None:
                 processor_kwargs["images"] = images[i]
+            if videos is not None and videos[i] is not None:
+                processor_kwargs["videos"] = videos[i]
 
             inputs = self.processor(**processor_kwargs)
             inputs = self.postprocess_inputs(inputs)
@@ -352,6 +384,7 @@ def generate_greedy_logprobs(
         prompts: List[str],
         max_tokens: int,
         images: Optional[PromptImageInput] = None,
+        videos: Optional[List[np.ndarray]] = None,
         **kwargs: Any,
     ) -> List[List[torch.Tensor]]:
         all_logprobs: List[List[torch.Tensor]] = []
@@ -362,6 +395,8 @@ def generate_greedy_logprobs(
             }
             if images is not None and images[i] is not None:
                 processor_kwargs["images"] = images[i]
+            if videos is not None and videos[i] is not None:
+                processor_kwargs["videos"] = videos[i]
 
             inputs = self.processor(**processor_kwargs)
             inputs = self.postprocess_inputs(inputs)
@@ -435,6 +470,7 @@ def generate_greedy_logprobs_limit(
         num_logprobs: int,
         images: Optional[PromptImageInput] = None,
         audios: Optional[PromptAudioInput] = None,
+        videos: Optional[List[np.ndarray]] = None,
         **kwargs: Any,
     ) -> List[Tuple[List[int], str, List[Dict[int, float]]]]:
         all_logprobs: List[List[Dict[int, float]]] = []
@@ -454,6 +490,8 @@ def generate_greedy_logprobs_limit(
                 processor_kwargs["audio"] = audio
                 processor_kwargs["sampling_rate"] = sr
 
+            if videos is not None:
+                processor_kwargs["videos"] = videos[i]
             inputs = self.processor(**processor_kwargs)
             inputs = self.postprocess_inputs(inputs)
 
@@ -615,8 +653,8 @@ def generate(
             outputs.append((req_sample_output_ids, req_sample_output_strs))
         return outputs
 
+    @staticmethod
     def _final_steps_generate_w_logprobs(
-        self,
         req_outputs: List[RequestOutput],
     ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
         outputs: List[Tuple[List[int], str, Optional[SampleLogprobs]]] = []
@@ -634,12 +672,16 @@ def generate_w_logprobs(
         sampling_params: SamplingParams,
         images: Optional[PromptImageInput] = None,
         audios: Optional[PromptAudioInput] = None,
+        videos: Optional[PromptVideoInput] = None,
     ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
         assert sampling_params.logprobs is not None
 
         if images is not None:
             assert len(prompts) == len(images)
 
+        if videos is not None:
+            assert len(prompts) == len(videos)
+
         inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
         if images is not None:
             for i, image in enumerate(images):
@@ -649,6 +691,11 @@ def generate_w_logprobs(
             for i, audio in enumerate(audios):
                 inputs[i]["multi_modal_data"] = {"audio": audio}
 
+        if videos is not None:
+            for i, video in enumerate(videos):
+                inputs[i]["multi_modal_data"] = {"video": video}
+        print(f"[INPUTS!!!!]: {inputs}, {sampling_params}")
+
         req_outputs = self.model.generate(inputs,
                                           sampling_params=sampling_params)
         return self._final_steps_generate_w_logprobs(req_outputs)
@@ -685,6 +732,7 @@ def generate_greedy_logprobs(
         num_logprobs: int,
         images: Optional[PromptImageInput] = None,
         audios: Optional[PromptAudioInput] = None,
+        videos: Optional[PromptVideoInput] = None,
         stop_token_ids: Optional[List[int]] = None,
     ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
         greedy_logprobs_params = SamplingParams(temperature=0.0,
@@ -694,7 +742,8 @@ def generate_greedy_logprobs(
         outputs = self.generate_w_logprobs(prompts,
                                            greedy_logprobs_params,
                                            images=images,
-                                           audios=audios)
+                                           audios=audios,
+                                           videos=videos)
 
         return [(output_ids, output_str, output_logprobs)
                 for output_ids, output_str, output_logprobs in outputs]
diff --git a/tests/distributed/test_basic_distributed_correctness.py b/tests/distributed/test_basic_distributed_correctness.py
deleted file mode 100644
index e254686f269b1..0000000000000
--- a/tests/distributed/test_basic_distributed_correctness.py
+++ /dev/null
@@ -1,80 +0,0 @@
-"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
-
-Run:
-```sh
-cd $VLLM_PATH/tests
-
-pytest distributed/test_basic_distributed_correctness.py
-```
-"""
-import os
-
-import pytest
-
-from vllm.utils import cuda_device_count_stateless
-
-from ..models.utils import check_outputs_equal
-from ..utils import fork_new_process_for_each_test
-
-TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
-
-
-@pytest.mark.skipif(cuda_device_count_stateless() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
-@pytest.mark.parametrize(
-    "model, distributed_executor_backend, attention_backend, "
-    "test_suite", [
-        ("facebook/opt-125m", "ray", "", "L4"),
-        ("facebook/opt-125m", "mp", "", "L4"),
-        ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
-        ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
-        ("facebook/opt-125m", "ray", "", "A100"),
-        ("facebook/opt-125m", "mp", "", "A100"),
-        ("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
-        ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
-    ])
-@fork_new_process_for_each_test
-def test_models(
-    hf_runner,
-    vllm_runner,
-    example_prompts,
-    model: str,
-    distributed_executor_backend: str,
-    attention_backend: str,
-    test_suite: str,
-) -> None:
-
-    if test_suite != TARGET_TEST_SUITE:
-        pytest.skip(f"Skip test for {test_suite}")
-
-    if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
-        # test ray adag
-        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
-        os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
-
-    if attention_backend:
-        os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
-
-    dtype = "half"
-    max_tokens = 5
-
-    # NOTE: take care of the order. run vLLM first, and then run HF.
-    # vLLM needs a fresh new process without cuda initialization.
-    # if we run HF first, the cuda initialization will be done and it
-    # will hurt multiprocessing backend with fork method (the default method).
-    with vllm_runner(model,
-                     dtype=dtype,
-                     tensor_parallel_size=2,
-                     distributed_executor_backend=distributed_executor_backend
-                     ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-
-    check_outputs_equal(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
diff --git a/tests/distributed/test_basic_distributed_correctness_enc_dec.py b/tests/distributed/test_basic_distributed_correctness_enc_dec.py
deleted file mode 100644
index f00d5ef584a2a..0000000000000
--- a/tests/distributed/test_basic_distributed_correctness_enc_dec.py
+++ /dev/null
@@ -1,102 +0,0 @@
-"""For encoder/decoder models only:
-Compare the outputs of HF and distributed vLLM when using greedy sampling.
-
-Run:
-```sh
-cd $VLLM_PATH/tests
-
-pytest distributed/test_basic_distributed_correctness_enc_dec.py
-```
-"""
-
-import pytest
-from transformers import AutoModelForSeq2SeqLM
-
-from vllm.utils import cuda_device_count_stateless
-
-from ..conftest import DecoderPromptType
-from ..models.utils import check_logprobs_close
-from ..utils import fork_new_process_for_each_test
-
-
-@pytest.mark.skipif(cuda_device_count_stateless() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
-@pytest.mark.parametrize("model, distributed_executor_backend", [
-    ("facebook/bart-large-cnn", "ray"),
-    ("facebook/bart-large-cnn", "mp"),
-])
-@fork_new_process_for_each_test
-def test_models(
-    model: str,
-    distributed_executor_backend: str,
-    hf_runner,
-    vllm_runner,
-    example_encoder_decoder_prompts,
-) -> None:
-    '''
-    Test vLLM BART inference on more than one GPU, comparing
-    outputs against HF as a baseline.
-
-    Fork a new process for each test, to prevent CUDA from
-    being re-initialized by successive tests within the same
-    process.
-
-    Arguments:
-
-    * model: the HF ID of the specific BART variant under test
-    * distributed_executor_backend
-    * hf_runner: HuggingFace (HF) test model runner
-    * vllm_runner: vLLM test model runner
-    * example_encoder_decoder_prompts: test fixture which provides a 
-                                        dictionary of dummy prompts
-    '''
-
-    dtype = "float"
-    max_tokens = 64
-    num_logprobs = 5
-
-    # Example inputs with non-trivial (i.e. not None/empty) encoder &
-    # decoder prompts.
-    test_prompts = example_encoder_decoder_prompts[DecoderPromptType.CUSTOM]
-
-    # NOTE: take care of the order. run vLLM first, and then run HF.
-    # vLLM needs a fresh new process without cuda initialization.
-    # if we run HF first, the cuda initialization will be done and it
-    # will hurt multiprocessing backend with fork method (the default method).
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            tensor_parallel_size=2,
-            distributed_executor_backend=distributed_executor_backend,
-            enforce_eager=True,
-    ) as vllm_model:
-        vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
-            test_prompts, max_tokens, num_logprobs)
-
-    # Configuration settings for HF baseline
-    hf_kwargs = {
-        "top_k": None,
-        "num_beams": 1,
-        "repetition_penalty": 1.0,
-        "top_p": 1.0,
-        "length_penalty": 1.0,
-        "early_stopping": False,
-        "no_repeat_ngram_size": None,
-        "min_length": 0
-    }
-
-    with hf_runner(model, dtype=dtype,
-                   auto_cls=AutoModelForSeq2SeqLM) as hf_model:
-        hf_outputs = (hf_model.generate_encoder_decoder_greedy_logprobs_limit(
-            test_prompts,
-            max_tokens,
-            num_logprobs,
-            **hf_kwargs,
-        ))
-
-    check_logprobs_close(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
diff --git a/tests/distributed/test_chunked_prefill_distributed.py b/tests/distributed/test_chunked_prefill_distributed.py
deleted file mode 100644
index 262845f19822f..0000000000000
--- a/tests/distributed/test_chunked_prefill_distributed.py
+++ /dev/null
@@ -1,75 +0,0 @@
-"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
-
-Run:
-```sh
-pytest test_chunked_prefill_distributed.py
-```
-"""
-
-import os
-
-import pytest
-
-from vllm.utils import cuda_device_count_stateless
-
-from ..models.utils import check_outputs_equal
-from ..utils import fork_new_process_for_each_test
-
-
-@pytest.mark.skipif(cuda_device_count_stateless() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
-@pytest.mark.parametrize("model, distributed_executor_backend", [
-    ("facebook/opt-125m", "ray"),
-    ("meta-llama/Llama-2-7b-hf", "ray"),
-    ("facebook/opt-125m", "mp"),
-    ("meta-llama/Llama-2-7b-hf", "mp"),
-])
-@fork_new_process_for_each_test
-def test_models(
-    hf_runner,
-    vllm_runner,
-    example_prompts,
-    model: str,
-    distributed_executor_backend: str,
-) -> None:
-    if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray":  # noqa
-        assert distributed_executor_backend == "ray"
-        # test ray adag
-        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
-        os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
-
-    dtype = "half"
-    max_tokens = 5
-    chunked_prefill_token_size = 16
-
-    # Add a chunked prefill config.
-    max_num_seqs = min(chunked_prefill_token_size, 256)
-    assert chunked_prefill_token_size != -1
-    enable_chunked_prefill = True
-    max_num_batched_tokens = chunked_prefill_token_size
-
-    # NOTE: take care of the order. run vLLM first, and then run HF.
-    # vLLM needs a fresh new process without cuda initialization.
-    # if we run HF first, the cuda initialization will be done and it
-    # will hurt multiprocessing backend with fork method (the default method).
-
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            tensor_parallel_size=2,
-            max_num_seqs=max_num_seqs,
-            enable_chunked_prefill=enable_chunked_prefill,
-            max_num_batched_tokens=max_num_batched_tokens,
-            distributed_executor_backend=distributed_executor_backend,
-    ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-
-    check_outputs_equal(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
diff --git a/tests/distributed/test_multimodal_broadcast.py b/tests/distributed/test_multimodal_broadcast.py
deleted file mode 100644
index 73ef863c2f193..0000000000000
--- a/tests/distributed/test_multimodal_broadcast.py
+++ /dev/null
@@ -1,58 +0,0 @@
-"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
-
-Run:
-```sh
-pytest -s -v test_multimodal_broadcast.py
-```
-"""
-
-import pytest
-
-from vllm.utils import cuda_device_count_stateless
-
-from ..utils import fork_new_process_for_each_test
-
-
-@pytest.mark.skipif(cuda_device_count_stateless() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
-@pytest.mark.parametrize("model, distributed_executor_backend", [
-    ("llava-hf/llava-1.5-7b-hf", "ray"),
-    ("llava-hf/llava-v1.6-mistral-7b-hf", "ray"),
-    ("facebook/chameleon-7b", "ray"),
-    ("llava-hf/llava-1.5-7b-hf", "mp"),
-    ("llava-hf/llava-v1.6-mistral-7b-hf", "mp"),
-    ("facebook/chameleon-7b", "mp"),
-])
-@fork_new_process_for_each_test
-def test_models(hf_runner, vllm_runner, image_assets, model: str,
-                distributed_executor_backend: str) -> None:
-
-    dtype = "half"
-    max_tokens = 5
-    num_logprobs = 5
-    tensor_parallel_size = 2
-
-    if model.startswith("llava-hf/llava-1.5"):
-        from ..models.test_llava import models, run_test
-    elif model.startswith("llava-hf/llava-v1.6"):
-        from ..models.test_llava_next import run_test  # type: ignore[no-redef]
-        from ..models.test_llava_next import models
-    elif model.startswith("facebook/chameleon"):
-        from ..models.test_chameleon import run_test  # type: ignore[no-redef]
-        from ..models.test_chameleon import models
-    else:
-        raise NotImplementedError(f"Unsupported model: {model}")
-
-    run_test(
-        hf_runner,
-        vllm_runner,
-        image_assets,
-        model=models[0],
-        # So that LLaVA-NeXT processor may return nested list
-        size_factors=[0.25, 0.5, 1.0],
-        dtype=dtype,
-        max_tokens=max_tokens,
-        num_logprobs=num_logprobs,
-        tensor_parallel_size=tensor_parallel_size,
-        distributed_executor_backend=distributed_executor_backend,
-    )
diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py
index 637d2b30f6b1f..02288dc9dac90 100644
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -32,7 +32,11 @@
         (1, 4, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
         (2, 2, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
         (2, 2, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
-        (2, 2, 1, 1, 1, "internlm/internlm2_5-7b-chat", "ray"),
+        # NOTE: InternVL2 multi-node tests are flaky,
+        # use mp backend to skip the multi-node tests
+        (1, 2, 1, 1, 1, "OpenGVLab/InternVL2-1B", "mp"),
+        (1, 2, 1, 1, 1, "OpenGVLab/InternVL2-2B", "mp"),
+        (1, 2, 1, 0, 1, "OpenGVLab/InternVL2-4B", "mp"),
     ],
 )
 @fork_new_process_for_each_test
@@ -46,6 +50,8 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL,
         # use half precision for speed and memory savings in CI environment
         "--dtype",
         "float16",
+        "--max-model-len",
+        "8192",
         "--pipeline-parallel-size",
         str(PP_SIZE),
         "--tensor-parallel-size",
@@ -62,7 +68,9 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL,
     tp_args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
-        "bfloat16",
+        "float16",
+        "--max-model-len",
+        "8192",
         "--tensor-parallel-size",
         str(max(TP_SIZE, 2)),  # We only use 2 GPUs in the CI.
         "--distributed-executor-backend",
diff --git a/tests/distributed/test_same_node.py b/tests/distributed/test_same_node.py
index 07e84d0ad54cd..defc4e23c8ce2 100644
--- a/tests/distributed/test_same_node.py
+++ b/tests/distributed/test_same_node.py
@@ -1,13 +1,13 @@
 import os
 
-import torch
+import torch.distributed as dist
 
 from vllm.distributed.parallel_state import in_the_same_node_as
 
-torch.distributed.init_process_group(backend="gloo")
-test_result = all(
-    in_the_same_node_as(torch.distributed.group.WORLD, source_rank=0))
+if __name__ == "__main__":
+    dist.init_process_group(backend="gloo")
+    test_result = all(in_the_same_node_as(dist.group.WORLD, source_rank=0))
 
-expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
-assert test_result == expected, f"Expected {expected}, got {test_result}"
-print("Same node test passed!")
+    expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
+    assert test_result == expected, f"Expected {expected}, got {test_result}"
+    print("Same node test passed!")
diff --git a/tests/engine/test_skip_tokenizer_init.py b/tests/engine/test_skip_tokenizer_init.py
index 338b208723ba9..b8818af5614cf 100644
--- a/tests/engine/test_skip_tokenizer_init.py
+++ b/tests/engine/test_skip_tokenizer_init.py
@@ -11,9 +11,10 @@ def test_skip_tokenizer_initialization(model: str):
     # token ids.
     llm = LLM(model=model, skip_tokenizer_init=True)
     sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)
-    with pytest.raises(ValueError) as err:
+
+    with pytest.raises(ValueError, match="cannot pass text prompts when"):
         llm.generate("abc", sampling_params)
-    assert "prompts must be None if" in str(err.value)
+
     outputs = llm.generate({"prompt_token_ids": [1, 2, 3]},
                            sampling_params=sampling_params)
     assert len(outputs) > 0
diff --git a/tests/entrypoints/offline_mode/__init__.py b/tests/entrypoints/offline_mode/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/entrypoints/offline_mode/test_offline_mode.py b/tests/entrypoints/offline_mode/test_offline_mode.py
new file mode 100644
index 0000000000000..0b6026a89c758
--- /dev/null
+++ b/tests/entrypoints/offline_mode/test_offline_mode.py
@@ -0,0 +1,77 @@
+"""Tests for HF_HUB_OFFLINE mode"""
+import importlib
+import sys
+import weakref
+
+import pytest
+
+from vllm import LLM
+
+from ...conftest import cleanup
+
+MODEL_NAME = "facebook/opt-125m"
+
+
+@pytest.fixture(scope="module")
+def llm():
+    # pytest caches the fixture so we use weakref.proxy to
+    # enable garbage collection
+    llm = LLM(model=MODEL_NAME,
+              max_num_batched_tokens=4096,
+              tensor_parallel_size=1,
+              gpu_memory_utilization=0.10,
+              enforce_eager=True)
+
+    with llm.deprecate_legacy_api():
+        yield weakref.proxy(llm)
+
+        del llm
+
+    cleanup()
+
+
+@pytest.mark.skip_global_cleanup
+def test_offline_mode(llm: LLM, monkeypatch):
+    # we use the llm fixture to ensure the model files are in-cache
+    del llm
+
+    # Set HF to offline mode and ensure we can still construct an LLM
+    try:
+        monkeypatch.setenv("HF_HUB_OFFLINE", "1")
+        # Need to re-import huggingface_hub and friends to setup offline mode
+        _re_import_modules()
+        # Cached model files should be used in offline mode
+        LLM(model=MODEL_NAME,
+            max_num_batched_tokens=4096,
+            tensor_parallel_size=1,
+            gpu_memory_utilization=0.10,
+            enforce_eager=True)
+    finally:
+        # Reset the environment after the test
+        # NB: Assuming tests are run in online mode
+        monkeypatch.delenv("HF_HUB_OFFLINE")
+        _re_import_modules()
+        pass
+
+
+def _re_import_modules():
+    hf_hub_module_names = [
+        k for k in sys.modules if k.startswith("huggingface_hub")
+    ]
+    transformers_module_names = [
+        k for k in sys.modules if k.startswith("transformers")
+        and not k.startswith("transformers_modules")
+    ]
+
+    reload_exception = None
+    for module_name in hf_hub_module_names + transformers_module_names:
+        try:
+            importlib.reload(sys.modules[module_name])
+        except Exception as e:
+            reload_exception = e
+            # Try to continue clean up so that other tests are less likely to
+            # be affected
+
+    # Error this test if reloading a module failed
+    if reload_exception is not None:
+        raise reload_exception
diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py
index d252b8ad3a918..097d6b1a32349 100644
--- a/tests/entrypoints/openai/test_run_batch.py
+++ b/tests/entrypoints/openai/test_run_batch.py
@@ -8,7 +8,9 @@
 INPUT_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
 
-{"custom_id": "request-3", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NonExistModel", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
+{"custom_id": "request-3", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NonExistModel", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
+{"custom_id": "request-4", "method": "POST", "url": "/bad_url", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
+{"custom_id": "request-5", "method": "POST", "url": "/v1/chat/completions", "body": {"stream": "True", "model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
 
 INVALID_INPUT_BATCH = """{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py
index 38b0477063528..ed050ce851535 100644
--- a/tests/kernels/test_activation.py
+++ b/tests/kernels/test_activation.py
@@ -3,8 +3,10 @@
 import pytest
 import torch
 
+from tests.kernels.utils import opcheck
 from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul,
-                                                   NewGELU, SiluAndMul)
+                                                   NewGELU, QuickGELU,
+                                                   SiluAndMul)
 
 from .allclose_default import get_default_atol, get_default_rtol
 
@@ -39,18 +41,28 @@ def test_act_and_mul(
     x = torch.randn(num_tokens, 2 * d, dtype=dtype)
     if activation == "silu":
         layer = SiluAndMul()
+        fn = torch.ops._C.silu_and_mul
     elif activation == "gelu":
         layer = GeluAndMul(approximate="none")
+        fn = torch.ops._C.gelu_and_mul
     elif activation == "gelu_tanh":
         layer = GeluAndMul(approximate="tanh")
+        fn = torch.ops._C.gelu_tanh_and_mul
     out = layer(x)
     ref_out = layer.forward_native(x)
     # The SiLU and GELU implementations are equivalent to the native PyTorch
     # implementations, so we can do exact comparison.
     torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0)
 
+    d = x.shape[-1] // 2
+    output_shape = (x.shape[:-1] + (d, ))
+    out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+    opcheck(fn, (out, x))
 
-@pytest.mark.parametrize("activation", [FastGELU, NewGELU])
+
+@pytest.mark.parametrize("activation", [(FastGELU, torch.ops._C.gelu_fast),
+                                        (NewGELU, torch.ops._C.gelu_new),
+                                        (QuickGELU, torch.ops._C.gelu_quick)])
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 @pytest.mark.parametrize("d", D)
 @pytest.mark.parametrize("dtype", DTYPES)
@@ -70,10 +82,14 @@ def test_activation(
         torch.cuda.manual_seed(seed)
     torch.set_default_device(device)
     x = torch.randn(num_tokens, d, dtype=dtype)
-    layer = activation()
+    layer = activation[0]()
+    fn = activation[1]
     out = layer(x)
     ref_out = layer.forward_native(x)
     torch.testing.assert_close(out,
                                ref_out,
                                atol=get_default_atol(out),
                                rtol=get_default_rtol(out))
+
+    out = torch.empty_like(x)
+    opcheck(fn, (out, x))
diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py
index 8aa2d4a53aaa0..46831b506aff3 100644
--- a/tests/kernels/test_attention.py
+++ b/tests/kernels/test_attention.py
@@ -3,14 +3,17 @@
 
 import pytest
 import torch
-from xformers import ops as xops
-from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
 
+from tests.kernels.utils import opcheck
 from vllm import _custom_ops as ops
 from vllm.utils import get_max_shared_memory_bytes, is_hip
 
 from .allclose_default import get_default_atol, get_default_rtol
 
+if not is_hip():
+    from xformers import ops as xops
+    from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
+
 FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
 # This will change depending on the compute capability.
 # - 512 as a buffer
@@ -198,6 +201,13 @@ def test_paged_attention(
             k_scale,
             v_scale,
         )
+
+        opcheck(torch.ops._C.paged_attention_v1,
+                (output, query, key_cache, value_cache, num_kv_heads, scale,
+                 block_tables, seq_lens, block_size, max_seq_len, alibi_slopes,
+                 kv_cache_dtype, k_scale, v_scale, 0, 0, 0, 64, 0),
+                cond=(head_size == HEAD_SIZES[0]))
+
     elif version == "v2":
         num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
         assert PARTITION_SIZE % block_size == 0
@@ -230,6 +240,14 @@ def test_paged_attention(
             k_scale,
             v_scale,
         )
+
+        opcheck(torch.ops._C.paged_attention_v2,
+                (output, exp_sums, max_logits, tmp_output, query, key_cache,
+                 value_cache, num_kv_heads, scale, block_tables, seq_lens,
+                 block_size, max_seq_len, alibi_slopes, kv_cache_dtype,
+                 k_scale, v_scale, 0, 0, 0, 64, 0),
+                cond=(head_size == HEAD_SIZES[0]))
+
     else:
         raise AssertionError(f"Unknown version: {version}")
 
@@ -312,6 +330,165 @@ def ref_multi_query_kv_attention(
     return torch.cat(ref_outputs, dim=0)
 
 
+@pytest.mark.parametrize("version", ["rocm"])
+@pytest.mark.parametrize("num_seqs", NUM_GEN_SEQS)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", [64, 128])  # only test 64 128
+@pytest.mark.parametrize("use_alibi", USE_ALIBI)
+@pytest.mark.parametrize("block_size", BLOCK_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("kv_cache_dtype", ["auto"])
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.skipif(not is_hip(), reason="only for rocm")
+def test_paged_attention_rocm(
+    kv_cache_factory,
+    version: str,
+    num_seqs: int,
+    num_heads: Tuple[int, int],
+    head_size: int,
+    use_alibi: bool,
+    block_size: int,
+    dtype: torch.dtype,
+    kv_cache_dtype: str,
+    seed: int,
+    device: str,
+) -> None:
+    random.seed(seed)
+    torch.random.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    torch.set_default_device(device)
+    scale = float(1.0 / (head_size**0.5))
+    num_query_heads, num_kv_heads = num_heads
+    query = torch.empty(num_seqs, num_query_heads, head_size, dtype=dtype)
+    query.uniform_(-scale, scale)
+
+    assert num_query_heads % num_kv_heads == 0
+    num_queries_per_kv = num_query_heads // num_kv_heads
+    alibi_slopes = None
+    if use_alibi:
+        alibi_slopes = torch.randn(num_query_heads, dtype=torch.float)
+
+    context_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)]
+    context_lens[-1] = MAX_SEQ_LEN
+    #context_lens = [8192 for _ in range(num_seqs)]
+    max_context_len = max(context_lens)
+    context_lens = torch.tensor(context_lens, dtype=torch.int)
+    #print('>>> ctx lens', context_lens)
+
+    # Create the block tables.
+    max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size
+    block_tables = []
+    for _ in range(num_seqs):
+        block_table = [
+            random.randint(0, NUM_BLOCKS - 1)
+            for _ in range(max_num_blocks_per_seq)
+        ]
+        block_tables.append(block_table)
+    block_tables = torch.tensor(block_tables, dtype=torch.int)
+
+    # Create the KV caches.
+    key_caches, value_caches = kv_cache_factory(NUM_BLOCKS, block_size, 1,
+                                                num_kv_heads, head_size,
+                                                kv_cache_dtype, dtype, seed,
+                                                device)
+    key_cache, value_cache = key_caches[0], value_caches[0]
+
+    # TODO(charlifu) enable fp8 kv cache
+    # Using default kv_scale
+    # kv_scale = 1.0
+
+    # Call the paged attention kernel.
+    output = torch.empty_like(query)
+    PARTITION_SIZE_ROCM = 256
+    num_partitions = ((max_context_len + PARTITION_SIZE_ROCM - 1) //
+                      PARTITION_SIZE_ROCM)
+    assert PARTITION_SIZE_ROCM % block_size == 0
+    num_seqs, num_heads, head_size = output.shape
+    tmp_output = torch.empty(
+        size=(num_seqs, num_heads, num_partitions, head_size),
+        dtype=output.dtype,
+    )
+    exp_sums = torch.empty(
+        size=(num_seqs, num_heads, num_partitions),
+        dtype=torch.float32,
+    )
+    max_logits = torch.empty_like(exp_sums)
+    if version == "rocm":
+        ops.paged_attention_rocm(
+            output,
+            exp_sums,
+            max_logits,
+            tmp_output,
+            query,
+            key_cache,
+            value_cache,
+            num_kv_heads,
+            scale,
+            block_tables,
+            context_lens,
+            block_size,
+            max_context_len,
+            alibi_slopes,
+            kv_cache_dtype,
+        )
+    else:
+        raise AssertionError(f"Unknown version: {version}")
+
+    # Run the reference implementation.
+    if kv_cache_dtype == "fp8":
+        # Convert cache data back to dtype.
+        x = 16 // torch.tensor([], dtype=dtype).element_size()
+        key_cache_shape = (NUM_BLOCKS, num_kv_heads, head_size // x,
+                           block_size, x)
+        dequantized_key_cache = torch.empty(size=key_cache_shape,
+                                            dtype=dtype,
+                                            device=device)
+        ops.convert_fp8(key_cache, dequantized_key_cache)
+        key_cache = dequantized_key_cache
+
+        value_cache_shape = value_cache.shape
+        dequantized_value_cache = torch.empty(size=value_cache_shape,
+                                              dtype=dtype,
+                                              device=device)
+        ops.convert_fp8(value_cache, dequantized_value_cache)
+        value_cache = dequantized_value_cache
+
+    ref_output = torch.empty_like(query)
+    ref_single_query_cached_kv_attention(
+        ref_output,
+        query,
+        num_queries_per_kv,
+        key_cache,
+        value_cache,
+        block_tables,
+        context_lens,
+        scale,
+        alibi_slopes,
+    )
+
+    # NOTE(woosuk): Due to the kernel-level differences in the two
+    # implementations, there is a small numerical difference in the two
+    # outputs. Thus, we use a relaxed tolerance for the test.
+    atol = get_default_atol(output) if is_hip() else 1e-3
+    rtol = get_default_rtol(output) if is_hip() else 1e-5
+
+    # NOTE(zhaoyang): FP8 KV Cache will introduce quantization error,
+    # so we use a relaxed tolerance for the test.
+    atol, rtol = 1e-4, 1e-5
+    if dtype == torch.bfloat16:
+        atol, rtol = 2e-4, 1e-5
+    if use_alibi:
+        if dtype == torch.half:
+            atol, rtol = 5e-4, 1e-5
+        if dtype == torch.bfloat16:
+            atol, rtol = 1e-3, 1e-5
+    if kv_cache_dtype == "fp8":
+        atol, rtol = 1e-2, 1e-5
+    assert torch.allclose(output, ref_output, atol=atol, rtol=rtol)
+
+
 # TODO(woosuk): Add tests for USE_ALIBI=True.
 @pytest.mark.parametrize("num_seqs", NUM_PREFILL_SEQS)
 @pytest.mark.parametrize("num_heads", NUM_HEADS)
@@ -319,6 +496,7 @@ def ref_multi_query_kv_attention(
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.skipif(is_hip(), reason="skip for rocm")
 @torch.inference_mode()
 def test_multi_query_kv_attention(
     num_seqs: int,
diff --git a/tests/kernels/test_attention_custom.py b/tests/kernels/test_attention_custom.py
index 5efaee2e200ed..65cfbb9d9872e 100644
--- a/tests/kernels/test_attention_custom.py
+++ b/tests/kernels/test_attention_custom.py
@@ -164,79 +164,27 @@ def test_paged_attention(
     key_cache, value_cache = key_caches[0], value_caches[0]
 
     # Using default kv_scale
-    kv_scale = 1.0
+    k_scale = v_scale = 1.0
 
     # Call the paged attention kernel.
     output = torch.empty_like(query)
-    if version == "v1":
-        ops.paged_attention_v1(
-            output,
-            query,
-            key_cache,
-            value_cache,
-            num_kv_heads,
-            scale,
-            block_tables,
-            context_lens,
-            block_size,
-            max_context_len,
-            alibi_slopes,
-            kv_cache_dtype,
-            kv_scale,
-        )
-    elif version == "v2" or version == "custom":
-        num_partitions = ((max_context_len + PARTITION_SIZE - 1) //
-                          PARTITION_SIZE)
-        assert PARTITION_SIZE % block_size == 0
-        num_seqs, num_heads, head_size = output.shape
-        tmp_output = torch.empty(
-            size=(num_seqs, num_heads, num_partitions, head_size),
-            dtype=output.dtype,
-        )
-        exp_sums = torch.empty(
-            size=(num_seqs, num_heads, num_partitions),
-            dtype=torch.float32,
-        )
-        max_logits = torch.empty_like(exp_sums)
-        if version == "v2":
-            ops.paged_attention_v2(
-                output,
-                exp_sums,
-                max_logits,
-                tmp_output,
-                query,
-                key_cache,
-                value_cache,
-                num_kv_heads,
-                scale,
-                block_tables,
-                context_lens,
-                block_size,
-                max_context_len,
-                alibi_slopes,
-                kv_cache_dtype,
-                kv_scale,
-            )
-        elif version == "custom":
-            ops.paged_attention_custom(
-                output,
-                exp_sums,
-                max_logits,
-                tmp_output,
-                query,
-                key_cache,
-                value_cache,
-                num_kv_heads,
-                scale,
-                block_tables,
-                context_lens,
-                block_size,
-                max_context_len,
-                alibi_slopes,
-                kv_cache_dtype,
-            )
-    else:
-        raise AssertionError(f"Unknown version: {version}")
+    num_partitions = ((max_context_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
+    assert PARTITION_SIZE % block_size == 0
+    num_seqs, num_heads, head_size = output.shape
+    tmp_output = torch.empty(
+        size=(num_seqs, num_heads, num_partitions, head_size),
+        dtype=output.dtype,
+    )
+    exp_sums = torch.empty(
+        size=(num_seqs, num_heads, num_partitions),
+        dtype=torch.float32,
+    )
+    max_logits = torch.empty_like(exp_sums)
+    ops.paged_attention_rocm(output, exp_sums, max_logits, tmp_output, query,
+                             key_cache, value_cache, num_kv_heads, scale,
+                             block_tables, context_lens, block_size,
+                             max_context_len, alibi_slopes, kv_cache_dtype,
+                             k_scale, v_scale)
 
     # Run the reference implementation.
     if kv_cache_dtype == "fp8":
diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py
index 71d18359164b1..19402a337b8d6 100644
--- a/tests/kernels/test_cache.py
+++ b/tests/kernels/test_cache.py
@@ -4,6 +4,7 @@
 import pytest
 import torch
 
+from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck
 from vllm import _custom_ops as ops
 
 COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')]
@@ -87,6 +88,11 @@ def test_copy_blocks(
     block_mapping_tensor = torch.tensor(block_mapping,
                                         dtype=torch.int64,
                                         device=device).view(-1, 2)
+
+    opcheck(torch.ops._C_cache_ops.copy_blocks,
+            (key_caches, value_caches, block_mapping_tensor),
+            test_utils=DEFAULT_OPCHECK_TEST_UTILS,
+            cond=(head_size == HEAD_SIZES[0]))
     ops.copy_blocks(key_caches, value_caches, block_mapping_tensor)
 
     # Run the reference implementation.
@@ -162,6 +168,10 @@ def test_reshape_and_cache(
     k_scale = v_scale = 1.0
 
     # Call the reshape_and_cache kernel.
+    opcheck(torch.ops._C_cache_ops.reshape_and_cache,
+            (key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype,
+             k_scale, v_scale),
+            cond=(head_size == HEAD_SIZES[0]))
     ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping,
                           kv_cache_dtype, k_scale, v_scale)
 
@@ -269,6 +279,10 @@ def test_reshape_and_cache_flash(
     k_scale = v_scale = 1.0
 
     # Call the reshape_and_cache kernel.
+    opcheck(torch.ops._C_cache_ops.reshape_and_cache_flash,
+            (key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype,
+             k_scale, v_scale),
+            cond=(head_size == HEAD_SIZES[0]))
     ops.reshape_and_cache_flash(key, value, key_cache, value_cache,
                                 slot_mapping, kv_cache_dtype, k_scale, v_scale)
 
@@ -366,6 +380,14 @@ def test_swap_blocks(
     src_value_caches_clone = src_value_caches[0].clone()
 
     # Call the swap_blocks kernel.
+    do_opcheck = (head_size == HEAD_SIZES[0])
+    opcheck(torch.ops._C_cache_ops.swap_blocks,
+            (src_key_caches[0], dist_key_caches[0], block_mapping_tensor),
+            cond=do_opcheck)
+    opcheck(torch.ops._C_cache_ops.swap_blocks,
+            (src_value_caches[0], dist_value_caches[0], block_mapping_tensor),
+            cond=do_opcheck)
+
     ops.swap_blocks(src_key_caches[0], dist_key_caches[0],
                     block_mapping_tensor)
     ops.swap_blocks(src_value_caches[0], dist_value_caches[0],
diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py
index e818651fe9c6a..d1f0524f83c4c 100644
--- a/tests/kernels/test_cutlass.py
+++ b/tests/kernels/test_cutlass.py
@@ -7,6 +7,7 @@
 import pytest
 import torch
 
+from tests.kernels.utils import opcheck
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
 
@@ -108,6 +109,9 @@ def cutlass_int8_gemm_helper(m: int,
 
     torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
 
+    opcheck(torch.ops._C.cutlass_scaled_mm,
+            (out, a, b, scale_a, scale_b, bias))
+
 
 @pytest.mark.parametrize("m", [1, 16, 32, 64, 128, 256, 512, 222, 100, 33])
 @pytest.mark.parametrize("n", [2048, 4096, 8192, 16384, 24576, 256, 1024])
@@ -341,6 +345,15 @@ def test_cutlass_int8_azp(m: int, n: int, k: int, out_dtype: torch.dtype,
     torch.testing.assert_close(out, baseline_dq, rtol=rtol, atol=atol)
     torch.testing.assert_close(out, baseline_q, rtol=rtol, atol=atol)
 
+    if azp_per_token:
+        opcheck(torch.ops._C.cutlass_scaled_mm_azp,
+                (out, aq_i8, bq_i8, scale_a, scale_b, azp_adj_i32, azp_i32,
+                 func_bias))
+    else:
+        opcheck(torch.ops._C.cutlass_scaled_mm_azp,
+                (out, aq_i8, bq_i8, scale_a, scale_b, azp_with_adj_i32, None,
+                 func_bias))
+
 
 # Test working with a subset of A and B
 def test_cutlass_subset():
diff --git a/tests/kernels/test_gguf.py b/tests/kernels/test_gguf.py
new file mode 100644
index 0000000000000..ee29ed93b61fc
--- /dev/null
+++ b/tests/kernels/test_gguf.py
@@ -0,0 +1,126 @@
+from pathlib import Path
+from typing import List
+
+import pytest
+import torch
+from gguf import GGMLQuantizationType, GGUFReader, ReaderTensor, dequantize
+from huggingface_hub import snapshot_download
+
+import vllm._custom_ops as ops
+
+GGUF_SAMPLE = snapshot_download("Isotr0py/test-gguf-sample")
+
+
+def get_gguf_sample_tensors(
+        hidden_size: int,
+        quant_type: GGMLQuantizationType) -> List[ReaderTensor]:
+    sample_dir = GGUF_SAMPLE
+    filename = f"Quant_{quant_type.name}_{hidden_size}.gguf"
+    sample_file = Path(sample_dir) / filename
+    return GGUFReader(sample_file).tensors
+
+
+DTYPES = [torch.half]
+# Hidden_size for testing, must match the sample file in HF repo,
+# we have `hidden_size = 256, 1024` for test in HF repo currently.
+HIDDEN_SIZES = [256, 1024]
+NUM_TOKENS = [7, 83, 128, 2048]  # Arbitrary values for testing
+SEEDS = [0]
+QUANT_TYPES = [
+    # i-matrix
+    GGMLQuantizationType.IQ1_M,
+    GGMLQuantizationType.IQ1_S,
+    GGMLQuantizationType.IQ2_S,
+    GGMLQuantizationType.IQ2_XS,
+    GGMLQuantizationType.IQ3_S,
+    GGMLQuantizationType.IQ3_XXS,
+    GGMLQuantizationType.IQ4_NL,
+    GGMLQuantizationType.IQ4_XS,
+    # k-quants
+    GGMLQuantizationType.Q2_K,
+    GGMLQuantizationType.Q3_K,
+    GGMLQuantizationType.Q4_K,
+    GGMLQuantizationType.Q5_K,
+    GGMLQuantizationType.Q6_K,
+    # standard quantization
+    GGMLQuantizationType.Q4_0,
+    GGMLQuantizationType.Q5_0,
+    GGMLQuantizationType.Q8_0,
+]
+
+
+@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("quant_type", QUANT_TYPES)
+@torch.inference_mode()
+def test_dequantize(hidden_size: int, dtype: torch.dtype,
+                    quant_type: GGMLQuantizationType):
+    tensors = get_gguf_sample_tensors(hidden_size, quant_type)
+    for tensor in tensors:
+        shape_str = tensor.name.split("_")[-1]
+        shape = map(int, shape_str.split("x"))
+
+        ref_output = torch.tensor(dequantize(tensor.data, quant_type),
+                                  device="cuda").to(dtype)
+        output = ops.ggml_dequantize(torch.tensor(tensor.data, device="cuda"),
+                                     quant_type, *list(shape)).to(dtype)
+
+        torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=4e-2)
+
+
+@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("quant_type", QUANT_TYPES)
+@torch.inference_mode()
+def test_mmvq(hidden_size: int, dtype: torch.dtype,
+              quant_type: GGMLQuantizationType):
+    torch.cuda.manual_seed_all(0)
+
+    tensors = get_gguf_sample_tensors(hidden_size, quant_type)
+    x = torch.rand((1, hidden_size), dtype=dtype, device="cuda")
+    for tensor in tensors:
+        weight = torch.tensor(dequantize(tensor.data, quant_type),
+                              device="cuda").to(dtype)
+        ref_output = x @ weight.T
+
+        qweight = torch.tensor(tensor.data, device="cuda")
+        output = ops.ggml_mul_mat_vec_a8(qweight, x, quant_type,
+                                         qweight.shape[0]).to(dtype)
+
+        torch.testing.assert_close(output, ref_output, atol=1, rtol=1e-1)
+
+
+@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
+@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize(
+    "quant_type",
+    [
+        # k-quants
+        GGMLQuantizationType.Q2_K,
+        GGMLQuantizationType.Q3_K,
+        GGMLQuantizationType.Q4_K,
+        GGMLQuantizationType.Q5_K,
+        GGMLQuantizationType.Q6_K,
+        # standard quants
+        GGMLQuantizationType.Q4_0,
+        GGMLQuantizationType.Q5_0,
+        GGMLQuantizationType.Q8_0,
+    ])
+@torch.inference_mode()
+def test_mmq(num_tokens: int, hidden_size: int, dtype: torch.dtype,
+             quant_type: GGMLQuantizationType):
+    torch.cuda.manual_seed_all(0)
+
+    tensors = get_gguf_sample_tensors(hidden_size, quant_type)
+    x = torch.rand((num_tokens, hidden_size), dtype=dtype, device="cuda")
+    for tensor in tensors:
+        weight = torch.tensor(dequantize(tensor.data, quant_type),
+                              device="cuda").to(dtype)
+        ref_output = x @ weight.T
+
+        qweight = torch.tensor(tensor.data, device="cuda")
+        output = ops.ggml_mul_mat_a8(qweight, x, quant_type,
+                                     qweight.shape[0]).to(dtype)
+
+        torch.testing.assert_close(output, ref_output, atol=1, rtol=1e-1)
diff --git a/tests/kernels/test_int8_quant.py b/tests/kernels/test_int8_quant.py
index 7376dcaf60902..a82ecb026482e 100644
--- a/tests/kernels/test_int8_quant.py
+++ b/tests/kernels/test_int8_quant.py
@@ -2,6 +2,7 @@
 import torch
 
 from tests.kernels.quant_utils import ref_dynamic_per_token_quant
+from tests.kernels.utils import opcheck
 from vllm._custom_ops import scaled_int8_quant
 
 DTYPES = [torch.half, torch.bfloat16, torch.float]
@@ -12,6 +13,16 @@
 SCALE = [0.1, 0.5, 0.8, 1.2, 2.1]
 
 
+def opcheck_int8_quant(output, input, scale=None):
+    if scale is not None:
+        opcheck(torch.ops._C.static_scaled_int8_quant, (output, input, scale))
+    else:
+        scale = torch.empty((input.numel() // input.shape[-1], 1),
+                            device=input.device,
+                            dtype=torch.float32)
+        opcheck(torch.ops._C.dynamic_scaled_int8_quant, (output, input, scale))
+
+
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
 @pytest.mark.parametrize("dtype", DTYPES)
@@ -34,6 +45,8 @@ def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
         ops_out, ref_out, atol=1,
         rtol=0.0)  # big atol to account for rounding errors
 
+    opcheck_int8_quant(ops_out, x)
+
 
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@@ -58,3 +71,5 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int,
     torch.testing.assert_close(
         out1, out2, atol=1,
         rtol=0.0)  # big atol to account for rounding errors
+
+    opcheck_int8_quant(out2, x, scale)
diff --git a/tests/kernels/test_layernorm.py b/tests/kernels/test_layernorm.py
index 21bc38d67b771..6eaf67ec75f41 100644
--- a/tests/kernels/test_layernorm.py
+++ b/tests/kernels/test_layernorm.py
@@ -1,6 +1,7 @@
 import pytest
 import torch
 
+from tests.kernels.utils import opcheck
 from vllm.model_executor.layers.layernorm import RMSNorm
 
 DTYPES = [torch.half, torch.bfloat16, torch.float]
@@ -52,3 +53,10 @@ def test_rms_norm(
         torch.testing.assert_close(out[1], ref_out[1], atol=1e-2, rtol=1e-2)
     else:
         torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
+
+    if residual is not None:
+        opcheck(torch.ops._C.fused_add_rms_norm,
+                (x, residual, layer.weight.data, layer.variance_epsilon))
+    else:
+        opcheck(torch.ops._C.rms_norm,
+                (out, x, layer.weight.data, layer.variance_epsilon))
diff --git a/tests/kernels/test_machete_gemm.py b/tests/kernels/test_machete_gemm.py
index dadf594409535..ce65aaef60ac6 100644
--- a/tests/kernels/test_machete_gemm.py
+++ b/tests/kernels/test_machete_gemm.py
@@ -9,6 +9,7 @@
 import pytest
 import torch
 
+from tests.kernels.utils import opcheck
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     pack_rows, quantize_weights)
@@ -76,6 +77,8 @@ def machete_quantize_and_pack(w: torch.Tensor,
     w_q = w_q.t().contiguous().t()  # convert to col major
     w_q_machete = ops.machete_prepack_B(w_q, wtype)
 
+    opcheck(torch.ops._C.machete_prepack_B, (w_q, wtype))
+
     return w_ref, w_q_machete, w_s, w_zp
 
 
@@ -146,6 +149,10 @@ def test_machete_all_schedules(shape, atype: torch.dtype,
             schedule=schedule,
         )
 
+        opcheck(torch.ops._C.machete_gemm,
+                (a, w_q_machete, wtype, w_s, maybe_convert_zeropoints(
+                    w_zp, w_s), group_size, None, None, None, schedule))
+
         # Relax atol as our reduction dim becomes larger (more rounding error)
         # Relax atol when we have zeropoints since the way machete applies
         #  zeropoints (after scales) causes noise around 0
diff --git a/tests/kernels/test_marlin_gemm.py b/tests/kernels/test_marlin_gemm.py
index 18b66abe7be74..721d3a6a819ac 100644
--- a/tests/kernels/test_marlin_gemm.py
+++ b/tests/kernels/test_marlin_gemm.py
@@ -5,6 +5,7 @@
 import pytest
 import torch
 
+from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck
 from tests.quantization.utils import is_quant_method_supported
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
@@ -73,12 +74,9 @@ def test_gptq_marlin_repack(k_chunk, n_chunk, quant_type, group_size,
                             act_order, mnk_factors):
     m_factor, n_factor, k_factor = mnk_factors
 
-    size_m = m_factor
     size_k = k_chunk * k_factor
     size_n = n_chunk * n_factor
 
-    print(f"MNK = {size_m} {size_n} {size_k}")
-
     # Filter act_order
     if act_order:
         if group_size == -1:
@@ -112,6 +110,9 @@ def test_gptq_marlin_repack(k_chunk, n_chunk, quant_type, group_size,
     marlin_q_w_1 = marlin_weights(q_w, size_k, size_n, quant_type.size_bits,
                                   weight_perm)
 
+    opcheck(torch.ops._C.gptq_marlin_repack,
+            (q_w_gptq, sort_indices, size_k, size_n, quant_type.size_bits))
+
     # Run Marlin repack GPU kernel
     marlin_q_w_2 = ops.gptq_marlin_repack(
         q_w_gptq,
@@ -137,12 +138,9 @@ def test_awq_marlin_repack(k_chunk, n_chunk, quant_type, group_size,
                            mnk_factors):
     m_factor, n_factor, k_factor = mnk_factors
 
-    size_m = m_factor
     size_k = k_chunk * k_factor
     size_n = n_chunk * n_factor
 
-    print(f"MNK = {size_m} {size_n} {size_k}")
-
     # Normalize group_size
     if group_size == -1:
         group_size = size_k
@@ -165,6 +163,9 @@ def test_awq_marlin_repack(k_chunk, n_chunk, quant_type, group_size,
     marlin_q_w_1 = marlin_weights(q_w, size_k, size_n, quant_type.size_bits,
                                   weight_perm)
 
+    opcheck(torch.ops._C.awq_marlin_repack,
+            (q_w_awq, size_k, size_n, quant_type.size_bits))
+
     # Run Marlin repack GPU kernel
     marlin_q_w_2 = ops.awq_marlin_repack(
         q_w_awq,
@@ -204,9 +205,6 @@ def test_gptq_marlin_gemm(
     size_k = k_chunk * k_factor
     size_n = n_chunk * n_factor
 
-    print(f"MNK = {size_m} {size_n} {size_k}")
-    print(f"groupsize = {group_size}")
-
     if act_order:
         if group_size == -1:
             return
@@ -224,6 +222,13 @@ def test_gptq_marlin_gemm(
     workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
                                 GPTQ_MARLIN_MAX_PARALLEL)
 
+    opcheck(
+        torch.ops._C.gptq_marlin_gemm,
+        (a_input, marlin_q_w, marlin_s, marlin_zp, g_idx, sort_indices,
+         workspace.scratch, quant_type, a_input.shape[0], b_weight.shape[1],
+         a_input.shape[1], is_k_full, False, use_fp32_reduce),
+        test_utils=DEFAULT_OPCHECK_TEST_UTILS)
+
     output = ops.gptq_marlin_gemm(
         a_input,
         marlin_q_w,
@@ -245,7 +250,6 @@ def test_gptq_marlin_gemm(
     torch.cuda.synchronize()
 
     max_diff = compute_max_diff(output, output_ref)
-    print("max_diff = {}".format(max_diff))
 
     assert max_diff < 0.04
 
@@ -265,9 +269,6 @@ def test_gptq_marlin_24_gemm(k_chunk, n_chunk, quant_type, group_size,
     size_k = k_chunk * k_factor
     size_n = n_chunk * n_factor
 
-    print(f"MNK = {size_m} {size_n} {size_k}")
-    print(f"groupsize = {group_size}")
-
     a_input = rand_data((size_m, size_k))
     b_weight = rand_data((size_k, size_n))
 
@@ -279,6 +280,12 @@ def test_gptq_marlin_24_gemm(k_chunk, n_chunk, quant_type, group_size,
 
     output_ref = torch.matmul(a_input, w_24_ref)
 
+    opcheck(torch.ops._C.gptq_marlin_24_gemm,
+            (a_input, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s,
+             workspace_24.scratch, quant_type, a_input.shape[0],
+             b_weight.shape[1], a_input.shape[1]),
+            test_utils=DEFAULT_OPCHECK_TEST_UTILS)
+
     output = ops.gptq_marlin_24_gemm(
         a_input,
         marlin_24_q_w_comp,
@@ -294,7 +301,6 @@ def test_gptq_marlin_24_gemm(k_chunk, n_chunk, quant_type, group_size,
     torch.cuda.synchronize()
 
     max_diff = compute_max_diff(output, output_ref)
-    print("max_diff = {}".format(max_diff))
 
     assert max_diff < 0.04
 
@@ -321,9 +327,6 @@ def test_fp8_marlin_gemm(
     size_k = k_chunk * k_factor
     size_n = n_chunk * n_factor
 
-    print(f"MNK = {size_m} {size_n} {size_k}")
-    print(f"groupsize = {group_size}")
-
     a_input = rand_data((size_m, size_k), dtype=dtype)
     b_weight = rand_data((size_k, size_n), dtype=dtype)
 
@@ -353,6 +356,10 @@ def test_fp8_marlin_gemm(
     workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
                                 GPTQ_MARLIN_MAX_PARALLEL)
 
+    opcheck(torch.ops._C.fp8_marlin_gemm,
+            (a_input, marlin_qweight, marlin_scales, workspace.scratch,
+             num_bits, a_input.shape[0], b_weight.shape[1], a_input.shape[1]))
+
     output = ops.fp8_marlin_gemm(
         a=a_input,
         b_q_weight=marlin_qweight,
@@ -368,7 +375,6 @@ def test_fp8_marlin_gemm(
     torch.cuda.synchronize()
 
     max_diff = compute_max_diff(output, output_ref)
-    print("max_diff = {}".format(max_diff))
 
     assert max_diff < 0.04
 
@@ -396,9 +402,6 @@ def test_awq_marlin_gemm(
     size_k = k_chunk * k_factor
     size_n = n_chunk * n_factor
 
-    print(f"MNK = {size_m} {size_n} {size_k}")
-    print(f"groupsize = {group_size}")
-
     a_input = rand_data((size_m, size_k))
     b_weight = rand_data((size_k, size_n))
 
@@ -434,7 +437,6 @@ def test_awq_marlin_gemm(
     torch.cuda.synchronize()
 
     max_diff = compute_max_diff(output, output_ref)
-    print("max_diff = {}".format(max_diff))
 
     assert max_diff < 0.04
 
@@ -460,9 +462,6 @@ def test_marlin_qqq_gemm(
     size_k = k_chunk * k_factor
     size_n = n_chunk * n_factor
 
-    print(f"MNK = {size_m} {size_n} {size_k}")
-    print(f"groupsize = {group_size}")
-
     a_input = rand_data((size_m, size_k))
     b_weight = rand_data((size_k, size_n))
 
@@ -479,6 +478,11 @@ def test_marlin_qqq_gemm(
     workspace = MarlinWorkspace(size_n, MARLIN_QQQ_MIN_THREAD_N,
                                 MARLIN_QQQ_MAX_PARALLEL)
 
+    opcheck(torch.ops._C.marlin_qqq_gemm,
+            (q_a, marlin_qqq_q_w, s_a, marlin_qqq_s_channel,
+             marlin_qqq_s_group, workspace.scratch, a_input.shape[0],
+             b_weight.shape[1], a_input.shape[1]))
+
     output = ops.marlin_qqq_gemm(
         q_a,
         marlin_qqq_q_w,
@@ -495,6 +499,5 @@ def test_marlin_qqq_gemm(
     torch.cuda.synchronize()
 
     max_diff = compute_max_diff(output, output_ref)
-    print("max_diff = {}".format(max_diff))
 
     assert max_diff < 0.04
diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py
index 79f94a331fdd8..88c3ef6ece511 100644
--- a/tests/kernels/test_moe.py
+++ b/tests/kernels/test_moe.py
@@ -2,6 +2,8 @@
 
 Run `pytest tests/kernels/test_moe.py`.
 """
+from typing import List
+
 import pytest
 import torch
 from torch.nn import Parameter
@@ -12,7 +14,14 @@
 import vllm.envs as envs
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import fused_moe
+from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+    fused_marlin_moe, single_marlin_moe)
+from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
+from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
+    marlin_quantize)
 from vllm.model_executor.models.mixtral import MixtralMoE
+from vllm.scalar_type import scalar_types
+from vllm.utils import is_hip
 
 
 def torch_moe(a, w1, w2, score, topk):
@@ -32,6 +41,20 @@ def torch_moe(a, w1, w2, score, topk):
             topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1)
 
 
+def torch_moe_single(a, w, score, topk):
+    B, D = a.shape
+    a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
+    out = torch.zeros(B * topk, w.shape[1], dtype=a.dtype, device=a.device)
+    score = torch.softmax(score, dim=-1, dtype=torch.float32)
+    _, topk_ids = torch.topk(score, topk)
+    topk_ids = topk_ids.view(-1)
+    for i in range(w.shape[0]):
+        mask = topk_ids == i
+        if mask.sum():
+            out[mask] = a[mask] @ w[i].transpose(0, 1)
+    return (out.view(B, -1, w.shape[1])).sum(dim=1)
+
+
 @pytest.mark.parametrize("m", [1024 * 128, 512, 222, 33, 1])
 @pytest.mark.parametrize("n", [2048, 256, 1024])
 @pytest.mark.parametrize("k", [128, 511, 1024])
@@ -46,11 +69,11 @@ def test_fused_moe(
     topk: int,
     dtype: torch.dtype,
 ):
-    a = torch.randn((m, k), device='cuda', dtype=dtype) / 10
-    w1 = torch.randn((e, 2 * n, k), device='cuda', dtype=dtype) / 10
-    w2 = torch.randn((e, k, n), device='cuda', dtype=dtype) / 10
+    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
+    w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
+    w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
 
-    score = torch.randn((m, e), device='cuda', dtype=dtype)
+    score = torch.randn((m, e), device="cuda", dtype=dtype)
     torch_output = torch_moe(a, w1, w2, score, topk)
 
     # Pad the input if use padding
@@ -120,3 +143,196 @@ def test_mixtral_moe(dtype: torch.dtype):
                                vllm_states,
                                rtol=mixtral_moe_tol[dtype],
                                atol=mixtral_moe_tol[dtype])
+
+
+def stack_and_dev(tensors: List[torch.Tensor]):
+    dev = tensors[0].device
+    return torch.stack(tensors, dim=0).to(dev)
+
+
+def compute_max_diff(output, output_ref):
+    return torch.mean(torch.abs(output - output_ref)) / torch.mean(
+        torch.abs(output_ref))
+
+
+@pytest.mark.skipif(is_hip(),
+                    reason="Make this test work with MoE padding on HIP")
+@pytest.mark.parametrize("m", [64, 512, 222, 33, 1])
+@pytest.mark.parametrize("n", [128, 2048, 256, 1024])
+@pytest.mark.parametrize("k", [128, 1024, 512])
+@pytest.mark.parametrize("e", [4, 8, 64])
+@pytest.mark.parametrize("topk", [2, 6])
+@pytest.mark.parametrize("group_size", [-1, 32, 64, 128])
+@pytest.mark.parametrize("act_order", [True, False])
+def test_fused_marlin_moe(
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    topk: int,
+    group_size: int,
+    act_order: bool,
+):
+    torch.manual_seed(7)
+
+    if topk > e:
+        return
+
+    # Filter act_order
+    if act_order:
+        if group_size == -1:
+            return
+        if group_size in (k, n):
+            return
+
+    quant_type = scalar_types.uint4b8
+    dtype = torch.float16
+    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
+    w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
+    w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
+    for i in range(w2.shape[0]):
+        w2[0] = torch.eye(k, n, device="cuda", dtype=dtype)
+
+    w_ref1_l = []
+    qweight1_l = []
+    scales1_l = []
+    g_idx1_l = []
+    sort_indices1_l = []
+
+    for i in range(w1.shape[0]):
+        test_perm = torch.randperm(k)
+        w_ref1, qweight1, scales1, g_idx1, sort_indices1, _ = marlin_quantize(
+            w1[i].transpose(1, 0), quant_type, group_size, act_order,
+            test_perm)
+        w_ref1_l.append(w_ref1)
+        qweight1_l.append(qweight1)
+        scales1_l.append(scales1)
+        g_idx1_l.append(g_idx1)
+        sort_indices1_l.append(sort_indices1)
+
+    w_ref1 = stack_and_dev(w_ref1_l)
+    qweight1 = stack_and_dev(qweight1_l).contiguous()
+    scales1 = stack_and_dev(scales1_l)
+    g_idx1 = stack_and_dev(g_idx1_l)
+    sort_indices1 = stack_and_dev(sort_indices1_l)
+
+    w_ref2_l = []
+    qweight2_l = []
+    scales2_l = []
+    g_idx2_l = []
+    sort_indices2_l = []
+
+    for i in range(w2.shape[0]):
+        test_perm = torch.randperm(n)
+        w_ref2, qweight2, scales2, g_idx2, sort_indices2, _ = marlin_quantize(
+            w2[i].transpose(1, 0), quant_type, group_size, act_order,
+            test_perm)
+        w_ref2_l.append(w_ref2)
+        qweight2_l.append(qweight2)
+        scales2_l.append(scales2)
+        g_idx2_l.append(g_idx2)
+        sort_indices2_l.append(sort_indices2)
+
+    w_ref2 = stack_and_dev(w_ref2_l)
+    qweight2 = stack_and_dev(qweight2_l).contiguous()
+    scales2 = stack_and_dev(scales2_l)
+    g_idx2 = stack_and_dev(g_idx2_l)
+    sort_indices2 = stack_and_dev(sort_indices2_l)
+
+    score = torch.randn((m, e), device="cuda", dtype=dtype)
+
+    topk_weights, topk_ids = fused_topk(a, score, topk, False)
+
+    triton_output = fused_moe(
+        a,
+        w_ref1.transpose(1, 2).contiguous(),
+        w_ref2.transpose(1, 2).contiguous(),
+        score,
+        topk,
+        renormalize=False,
+    )
+    marlin_output = fused_marlin_moe(
+        a,
+        qweight1,
+        qweight2,
+        score,
+        g_idx1,
+        g_idx2,
+        sort_indices1,
+        sort_indices2,
+        topk_weights,
+        topk_ids,
+        w1_scale=scales1,
+        w2_scale=scales2,
+    )
+
+    assert compute_max_diff(marlin_output, triton_output) < 4e-2
+
+
+@pytest.mark.skip("This test is here for the sake of debugging, "
+                  "don't run it in automated tests.")
+@pytest.mark.parametrize("m", [64, 512, 222, 33, 1])
+@pytest.mark.parametrize("n", [128, 2048, 256, 1024])
+@pytest.mark.parametrize("k", [128, 1024, 512])
+@pytest.mark.parametrize("e", [4, 8, 64])
+@pytest.mark.parametrize("topk", [2, 6])
+@pytest.mark.parametrize("group_size", [-1, 32, 64, 128])
+@pytest.mark.parametrize("act_order", [True, False])
+def test_marlin_moe_mmm(
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    topk: int,
+    group_size: int,
+    act_order: bool,
+):
+    if topk > e:
+        return
+
+    # Filter act_order
+    if act_order:
+        if group_size == -1:
+            return
+        if group_size == k:
+            return
+
+    quant_type = scalar_types.uint4b8
+    dtype = torch.float16
+    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
+    w = torch.randn((e, n, k), device="cuda", dtype=dtype) / 10
+
+    w_ref_l = []
+    qweights_l = []
+    scales_l = []
+    g_idx_l = []
+    sort_indices_l = []
+
+    for i in range(w.shape[0]):
+        test_perm = torch.randperm(k)
+        w_ref, qweight, scales, g_idx, sort_indices, _ = marlin_quantize(
+            w[i].transpose(1, 0), quant_type, group_size, act_order, test_perm)
+        w_ref_l.append(w_ref)
+        qweights_l.append(qweight)
+        scales_l.append(scales)
+        g_idx_l.append(g_idx)
+        sort_indices_l.append(sort_indices)
+
+    w_ref = stack_and_dev(w_ref_l)
+    qweight = stack_and_dev(qweights_l).contiguous()
+    scales = stack_and_dev(scales_l)
+    g_idx = stack_and_dev(g_idx_l)
+    sort_indices = stack_and_dev(sort_indices_l)
+
+    score = torch.randn((m, e), device="cuda", dtype=dtype)
+    marlin_output = single_marlin_moe(a,
+                                      qweight,
+                                      scales,
+                                      score,
+                                      g_idx,
+                                      sort_indices,
+                                      topk,
+                                      renormalize=False)
+    torch_output = torch_moe_single(a, w_ref.transpose(1, 2), score, topk)
+
+    assert compute_max_diff(marlin_output, torch_output) < 1e-2
diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py
index 3f8f6502039aa..5746932c30a45 100644
--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@@ -3,16 +3,31 @@
 import itertools
 import random
 from numbers import Number
-from typing import Any, List, NamedTuple, Optional, Tuple, Union
+from typing import (Any, Dict, List, NamedTuple, Optional, Sequence, Tuple,
+                    Union)
 
 import pytest
 import torch
 
 from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType
-from vllm.attention.backends.xformers import XFormersBackend
 from vllm.utils import (STR_BACKEND_ENV_VAR, STR_XFORMERS_ATTN_VAL,
                         make_tensor_with_pad)
 
+# For now, disable "test_aot_dispatch_dynamic" since there are some
+# bugs related to this test in PyTorch 2.4.
+DEFAULT_OPCHECK_TEST_UTILS: Tuple[str, ...] = (
+    "test_schema",
+    "test_autograd_registration",
+    "test_faketensor",
+)
+
+ALL_OPCHECK_TEST_UTILS: Tuple[str, ...] = (
+    "test_schema",
+    "test_autograd_registration",
+    "test_faketensor",
+    "test_aot_dispatch_dynamic",
+)
+
 
 class QKVInputs(NamedTuple):
     '''
@@ -505,6 +520,9 @@ def make_backend(backend_name: str) -> AttentionBackend:
     * Backend instance
     '''
     if backend_name == STR_XFORMERS_ATTN_VAL:
+        # NOTE: xFormers backend cannot be imported for CPU and AMD GPUs.
+        from vllm.attention.backends.xformers import XFormersBackend
+
         return XFormersBackend()
     raise AssertionError(
         f"Unrecognized backend_name {backend_name} for unit test")
@@ -926,3 +944,19 @@ def assert_actual_matches_ideal(test_params: PhaseTestParameters,
     ideal_output = test_params.packed_qkvo.ideal_output
     torch.testing.assert_close(ideal_output,
                                output_under_test.view_as(ideal_output))
+
+
+def opcheck(op: Union[torch._ops.OpOverload, torch._ops.OpOverloadPacket,
+                      torch._library.custom_ops.CustomOpDef],
+            args: Tuple[Any, ...],
+            kwargs: Optional[Dict[str, Any]] = None,
+            *,
+            test_utils: Union[str, Sequence[str]] = ALL_OPCHECK_TEST_UTILS,
+            raise_exception: bool = True,
+            cond: bool = True) -> Dict[str, str]:
+    return torch.library.opcheck(
+        op,
+        args,
+        kwargs,
+        test_utils=test_utils,
+        raise_exception=raise_exception) if cond else {}
diff --git a/tests/models/decoder_only/__init__.py b/tests/models/decoder_only/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/models/decoder_only/audio_language/__init__.py b/tests/models/decoder_only/audio_language/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/models/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py
similarity index 98%
rename from tests/models/test_ultravox.py
rename to tests/models/decoder_only/audio_language/test_ultravox.py
index e98db9b65f484..bfffd34d1142c 100644
--- a/tests/models/test_ultravox.py
+++ b/tests/models/decoder_only/audio_language/test_ultravox.py
@@ -7,10 +7,8 @@
 from vllm.sequence import SampleLogprobs
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
 
-from ..conftest import HfRunner, VllmRunner
-from .utils import check_logprobs_close
-
-pytestmark = pytest.mark.vlm
+from ....conftest import HfRunner, VllmRunner
+from ...utils import check_logprobs_close
 
 MODEL_NAME = "fixie-ai/ultravox-v0_3"
 
diff --git a/tests/models/decoder_only/language/__init__.py b/tests/models/decoder_only/language/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/models/test_aqlm.py b/tests/models/decoder_only/language/test_aqlm.py
similarity index 72%
rename from tests/models/test_aqlm.py
rename to tests/models/decoder_only/language/test_aqlm.py
index 80034a5118863..de46032113086 100644
--- a/tests/models/test_aqlm.py
+++ b/tests/models/decoder_only/language/test_aqlm.py
@@ -7,26 +7,6 @@
 
 from tests.quantization.utils import is_quant_method_supported
 
-# In this test we hardcode prompts and generations for the model so we don't
-# need to require the AQLM package as a dependency
-example_prompts = [
-    'vLLM is a high-throughput and memory-efficient inference and serving '
-    'engine for LLMs.\n',
-    'Briefly describe the major milestones in the development of artificial '
-    'intelligence from 1950 to 2020.\n',
-    'Compare and contrast artificial intelligence with human intelligence in '
-    'terms of processing information.\n',
-    'Describe the basic components of a neural network and how it can be '
-    'trained.\n',
-    'Write a short story about a robot that dreams for the first time.\n',
-    'Analyze the impact of the COVID-19 pandemic on global economic structures '
-    'and future business models.\n',
-    'Explain the cultural significance of the Mona Lisa painting, and how its '
-    'perception might vary in Western versus Eastern societies.\n',
-    "Translate the following English sentence into Japanese, French, and "
-    "Swahili: 'The early bird catches the worm.'\n"
-]
-
 # These ground truth generations were generated using `transformers==4.38.1
 # aqlm==1.1.0 torch==2.2.0`
 # and the below code:
diff --git a/tests/models/test_big_models.py b/tests/models/decoder_only/language/test_big_models.py
similarity index 77%
rename from tests/models/test_big_models.py
rename to tests/models/decoder_only/language/test_big_models.py
index c3e48b56ee58f..fcc158639748d 100644
--- a/tests/models/test_big_models.py
+++ b/tests/models/decoder_only/language/test_big_models.py
@@ -5,9 +5,10 @@
 Run `pytest tests/models/test_big_models.py`.
 """
 import pytest
-import torch
 
-from .utils import check_outputs_equal
+from vllm.platforms import current_platform
+
+from ...utils import check_outputs_equal
 
 MODELS = [
     "meta-llama/Llama-2-7b-hf",
@@ -19,10 +20,12 @@
     # "Qwen/Qwen1.5-0.5B"  # Broken,
 ]
 
+if not current_platform.is_cpu():
+    # MiniCPM requires fused_moe which is not supported by CPU
+    MODELS.append("openbmb/MiniCPM3-4B")
+
 #TODO: remove this after CPU float16 support ready
-target_dtype = "float"
-if torch.cuda.is_available():
-    target_dtype = "half"
+target_dtype = "float" if current_platform.is_cpu() else "half"
 
 
 @pytest.mark.parametrize("model", MODELS)
@@ -39,7 +42,7 @@ def test_models(
     with hf_runner(model, dtype=dtype) as hf_model:
         hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
-    with vllm_runner(model, dtype=dtype) as vllm_model:
+    with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
 
     check_outputs_equal(
@@ -57,7 +60,7 @@ def test_model_print(
     model: str,
     dtype: str,
 ) -> None:
-    with vllm_runner(model, dtype=dtype) as vllm_model:
+    with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
         # This test is for verifying whether the model's extra_repr
         # can be printed correctly.
         print(vllm_model.model.llm_engine.model_executor.driver_worker.
diff --git a/tests/models/test_danube3_4b.py b/tests/models/decoder_only/language/test_danube3_4b.py
similarity index 97%
rename from tests/models/test_danube3_4b.py
rename to tests/models/decoder_only/language/test_danube3_4b.py
index bfaa275f73c19..bdd498edc293d 100644
--- a/tests/models/test_danube3_4b.py
+++ b/tests/models/decoder_only/language/test_danube3_4b.py
@@ -6,7 +6,7 @@
 """
 import pytest
 
-from .utils import check_outputs_equal
+from ...utils import check_outputs_equal
 
 MODELS = ["h2oai/h2o-danube3-4b-base"]
 
diff --git a/tests/models/test_fp8.py b/tests/models/decoder_only/language/test_fp8.py
similarity index 98%
rename from tests/models/test_fp8.py
rename to tests/models/decoder_only/language/test_fp8.py
index 17acdb52322fd..5a947ce62c785 100644
--- a/tests/models/test_fp8.py
+++ b/tests/models/decoder_only/language/test_fp8.py
@@ -10,7 +10,7 @@
 from tests.kernels.utils import override_backend_env_variable
 from tests.quantization.utils import is_quant_method_supported
 
-from ..models.utils import check_logprobs_close
+from ...utils import check_logprobs_close
 
 os.environ["TOKENIZERS_PARALLELISM"] = "true"
 
diff --git a/tests/models/test_gguf.py b/tests/models/decoder_only/language/test_gguf.py
similarity index 98%
rename from tests/models/test_gguf.py
rename to tests/models/decoder_only/language/test_gguf.py
index 196cd88e039a1..8fc64a10c84af 100644
--- a/tests/models/test_gguf.py
+++ b/tests/models/decoder_only/language/test_gguf.py
@@ -11,7 +11,7 @@
 
 from tests.quantization.utils import is_quant_method_supported
 
-from .utils import check_logprobs_close
+from ...utils import check_logprobs_close
 
 os.environ["TOKENIZERS_PARALLELISM"] = "true"
 
diff --git a/tests/models/test_gptq_marlin.py b/tests/models/decoder_only/language/test_gptq_marlin.py
similarity index 98%
rename from tests/models/test_gptq_marlin.py
rename to tests/models/decoder_only/language/test_gptq_marlin.py
index 4abbc41c9c287..2155e83dbe915 100644
--- a/tests/models/test_gptq_marlin.py
+++ b/tests/models/decoder_only/language/test_gptq_marlin.py
@@ -15,7 +15,7 @@
 from tests.quantization.utils import is_quant_method_supported
 from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT
 
-from .utils import check_logprobs_close
+from ...utils import check_logprobs_close
 
 os.environ["TOKENIZERS_PARALLELISM"] = "true"
 
diff --git a/tests/models/test_gptq_marlin_24.py b/tests/models/decoder_only/language/test_gptq_marlin_24.py
similarity index 97%
rename from tests/models/test_gptq_marlin_24.py
rename to tests/models/decoder_only/language/test_gptq_marlin_24.py
index 60d9ae2f1c629..d65be05f141b4 100644
--- a/tests/models/test_gptq_marlin_24.py
+++ b/tests/models/decoder_only/language/test_gptq_marlin_24.py
@@ -10,9 +10,10 @@
 
 import pytest
 
-from tests.models.utils import check_logprobs_close
 from tests.quantization.utils import is_quant_method_supported
 
+from ...utils import check_logprobs_close
+
 
 @dataclass
 class ModelPair:
diff --git a/tests/models/test_granite.py b/tests/models/decoder_only/language/test_granite.py
similarity index 97%
rename from tests/models/test_granite.py
rename to tests/models/decoder_only/language/test_granite.py
index 2435b5dc3ff88..82c753855e714 100644
--- a/tests/models/test_granite.py
+++ b/tests/models/decoder_only/language/test_granite.py
@@ -6,7 +6,7 @@
 
 import pytest
 
-from .utils import check_logprobs_close
+from ...utils import check_logprobs_close
 
 TRANSFORMERS_VERSION = tuple(
     map(int,
diff --git a/tests/models/test_jamba.py b/tests/models/decoder_only/language/test_jamba.py
similarity index 99%
rename from tests/models/test_jamba.py
rename to tests/models/decoder_only/language/test_jamba.py
index efb7b1c607721..36fa67a22b0f6 100644
--- a/tests/models/test_jamba.py
+++ b/tests/models/decoder_only/language/test_jamba.py
@@ -1,8 +1,9 @@
 import pytest
 
-from tests.models.utils import check_outputs_equal
 from vllm.worker.model_runner import _get_graph_batch_size
 
+from ...utils import check_outputs_equal
+
 MODELS = ["ai21labs/Jamba-tiny-random"]
 
 
diff --git a/tests/models/test_marlin.py b/tests/models/decoder_only/language/test_marlin.py
similarity index 98%
rename from tests/models/test_marlin.py
rename to tests/models/decoder_only/language/test_marlin.py
index e86f6e29d1567..c802346dee8af 100644
--- a/tests/models/test_marlin.py
+++ b/tests/models/decoder_only/language/test_marlin.py
@@ -16,7 +16,7 @@
 
 from tests.quantization.utils import is_quant_method_supported
 
-from .utils import check_logprobs_close
+from ...utils import check_logprobs_close
 
 
 @dataclass
diff --git a/tests/models/test_mistral.py b/tests/models/decoder_only/language/test_mistral.py
similarity index 98%
rename from tests/models/test_mistral.py
rename to tests/models/decoder_only/language/test_mistral.py
index 0741174497e32..687ba6a03a691 100644
--- a/tests/models/test_mistral.py
+++ b/tests/models/decoder_only/language/test_mistral.py
@@ -4,7 +4,7 @@
 """
 import pytest
 
-from .utils import check_logprobs_close
+from ...utils import check_logprobs_close
 
 MODELS = [
     "mistralai/Mistral-7B-Instruct-v0.1",
diff --git a/tests/models/decoder_only/language/test_modelopt.py b/tests/models/decoder_only/language/test_modelopt.py
new file mode 100644
index 0000000000000..e643b115d0ea8
--- /dev/null
+++ b/tests/models/decoder_only/language/test_modelopt.py
@@ -0,0 +1,79 @@
+# flake8: noqa
+"""Tests Model Optimizer fp8 models against ground truth generation
+Note: these tests will only pass on H100
+"""
+import os
+from typing import List
+
+import pytest
+from transformers import AutoTokenizer
+
+from tests.quantization.utils import is_quant_method_supported
+from vllm import LLM, SamplingParams
+
+os.environ["TOKENIZERS_PARALLELISM"] = "true"
+
+MAX_MODEL_LEN = 1024
+
+MODELS = ["nvidia/Llama-3.1-8B-Instruct-FP8"]
+
+EXPECTED_STRS_MAP = {
+    "nvidia/Llama-3.1-8B-Instruct-FP8": [
+        "You're referring to VLLM, a high-performance Large Language Model (LLM) inference and",
+        'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
+        'The comparison between artificial intelligence (AI) and human intelligence in terms of processing information is a complex and',
+        'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne',
+        '**The Spark of Imagination**\n\nZeta-5, a sleek and efficient robot, whir',
+        'The COVID-19 pandemic has had a profound impact on global economic structures and business models, leading to',
+        'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
+        'Here are the translations:\n\n**Japanese:** 「早起きは早く獲物をとる'
+    ]
+}
+
+
+# This test compares against golden strings for exact match since
+# there is no baseline implementation to compare against
+# and is unstable w.r.t specifics of the fp8 implementation or
+# the hardware being run on.
+# Disabled to prevent it from breaking the build
+@pytest.mark.skip(
+    reason=
+    "Prevent unstable test based on golden strings from breaking the build.")
+@pytest.mark.skipif(not is_quant_method_supported("fp8"),
+                    reason="fp8 is not supported on this GPU type.")
+@pytest.mark.parametrize("model_name", MODELS)
+def test_models(example_prompts, model_name) -> None:
+    model = LLM(
+        model=model_name,
+        max_model_len=MAX_MODEL_LEN,
+        trust_remote_code=True,
+        enforce_eager=True,
+        quantization="modelopt",
+    )
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    formatted_prompts = [
+        tokenizer.apply_chat_template([{
+            "role": "user",
+            "content": prompt
+        }],
+                                      tokenize=False,
+                                      add_generation_prompt=True)
+        for prompt in example_prompts
+    ]
+    params = SamplingParams(max_tokens=20, temperature=0)
+    generations: List[str] = []
+    # Note: these need to be run 1 at a time due to numerical precision,
+    # since the expected strs were generated this way.
+    for prompt in formatted_prompts:
+        outputs = model.generate(prompt, params)
+        generations.append(outputs[0].outputs[0].text)
+    del model
+
+    print(model_name, generations)
+    expected_strs = EXPECTED_STRS_MAP[model_name]
+    for i in range(len(example_prompts)):
+        generated_str = generations[i]
+        expected_str = expected_strs[i]
+        assert expected_str == generated_str, (
+            f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}")
diff --git a/tests/models/test_models.py b/tests/models/decoder_only/language/test_models.py
similarity index 97%
rename from tests/models/test_models.py
rename to tests/models/decoder_only/language/test_models.py
index 4cd2cb665c8f0..68055cbe29095 100644
--- a/tests/models/test_models.py
+++ b/tests/models/decoder_only/language/test_models.py
@@ -7,7 +7,7 @@
 """
 import pytest
 
-from .utils import check_outputs_equal
+from ...utils import check_outputs_equal
 
 MODELS = [
     "facebook/opt-125m",
diff --git a/tests/models/test_phimoe.py b/tests/models/decoder_only/language/test_phimoe.py
similarity index 98%
rename from tests/models/test_phimoe.py
rename to tests/models/decoder_only/language/test_phimoe.py
index 2fb2eecc94672..dbdf5a1b934a6 100644
--- a/tests/models/test_phimoe.py
+++ b/tests/models/decoder_only/language/test_phimoe.py
@@ -7,7 +7,7 @@
 
 from vllm.utils import is_cpu
 
-from .utils import check_logprobs_close
+from ...utils import check_logprobs_close
 
 MODELS = [
     "microsoft/Phi-3.5-MoE-instruct",
diff --git a/tests/models/decoder_only/vision_language/__init__.py b/tests/models/decoder_only/vision_language/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/models/test_blip2.py b/tests/models/decoder_only/vision_language/test_blip2.py
similarity index 95%
rename from tests/models/test_blip2.py
rename to tests/models/decoder_only/vision_language/test_blip2.py
index 5d48bad0d7b35..e1e32b96d89ac 100644
--- a/tests/models/test_blip2.py
+++ b/tests/models/decoder_only/vision_language/test_blip2.py
@@ -6,10 +6,8 @@
 from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
 
-from ..conftest import IMAGE_ASSETS
-from .utils import check_logprobs_close
-
-pytestmark = pytest.mark.vlm
+from ....conftest import IMAGE_ASSETS
+from ...utils import check_logprobs_close
 
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
     "stop_sign":
@@ -56,7 +54,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
                 dtype: str, max_tokens: int, num_logprobs: int) -> None:
     """Inference result should be the same between hf and vllm.
 
-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
     For huggingface runner, we provide the PIL images as input.
     For vllm runner, we provide MultiModalData objects and corresponding
     MultiModalConfig as input.
diff --git a/tests/models/decoder_only/vision_language/test_broadcast.py b/tests/models/decoder_only/vision_language/test_broadcast.py
new file mode 100644
index 0000000000000..d01490d74bd4d
--- /dev/null
+++ b/tests/models/decoder_only/vision_language/test_broadcast.py
@@ -0,0 +1,42 @@
+import pytest
+
+from ....utils import multi_gpu_test
+
+
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
+@pytest.mark.parametrize("model", [
+    "llava-hf/llava-1.5-7b-hf",
+    "llava-hf/llava-v1.6-mistral-7b-hf",
+    "facebook/chameleon-7b",
+])
+def test_models(hf_runner, vllm_runner, image_assets,
+                distributed_executor_backend, model) -> None:
+
+    dtype = "half"
+    max_tokens = 5
+    num_logprobs = 5
+    tensor_parallel_size = 2
+
+    if model.startswith("llava-hf/llava-1.5"):
+        from .test_llava import models, run_test
+    elif model.startswith("llava-hf/llava-v1.6"):
+        from .test_llava_next import models, run_test  # type: ignore[no-redef]
+    elif model.startswith("facebook/chameleon"):
+        from .test_chameleon import models, run_test  # type: ignore[no-redef]
+    else:
+        raise NotImplementedError(f"Unsupported model: {model}")
+
+    run_test(
+        hf_runner,
+        vllm_runner,
+        image_assets,
+        model=models[0],
+        # So that LLaVA-NeXT processor may return nested list
+        size_factors=[0.25, 0.5, 1.0],
+        dtype=dtype,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        tensor_parallel_size=tensor_parallel_size,
+        distributed_executor_backend=distributed_executor_backend,
+    )
diff --git a/tests/models/test_chameleon.py b/tests/models/decoder_only/vision_language/test_chameleon.py
similarity index 95%
rename from tests/models/test_chameleon.py
rename to tests/models/decoder_only/vision_language/test_chameleon.py
index e02b4b1ed72bd..8334451970a4f 100644
--- a/tests/models/test_chameleon.py
+++ b/tests/models/decoder_only/vision_language/test_chameleon.py
@@ -6,10 +6,8 @@
 from vllm.multimodal.utils import rescale_image_size
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
 
-from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
-from .utils import check_outputs_equal
-
-pytestmark = pytest.mark.vlm
+from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
+from ...utils import check_outputs_equal
 
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
     "stop_sign":
@@ -36,7 +34,7 @@ def run_test(
 ):
     """Inference result should be the same between hf and vllm.
 
-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
     For huggingface runner, we provide the PIL images as input.
     For vllm runner, we provide MultiModalDataDict objects 
     and corresponding vision language config as input.
diff --git a/tests/models/test_fuyu.py b/tests/models/decoder_only/vision_language/test_fuyu.py
similarity index 95%
rename from tests/models/test_fuyu.py
rename to tests/models/decoder_only/vision_language/test_fuyu.py
index 0d666d8f71a92..94b8431424db5 100644
--- a/tests/models/test_fuyu.py
+++ b/tests/models/decoder_only/vision_language/test_fuyu.py
@@ -6,10 +6,8 @@
 from vllm.sequence import SampleLogprobs
 from vllm.utils import is_cpu
 
-from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
-from .utils import check_logprobs_close
-
-pytestmark = pytest.mark.vlm
+from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
+from ...utils import check_logprobs_close
 
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
     "stop_sign":
@@ -46,7 +44,7 @@ def run_test(
 ):
     """Inference result should be the same between hf and vllm.
 
-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
     For huggingface runner, we provide the PIL images as input.
     For vllm runner, we provide MultiModalDataDict objects 
     and corresponding MultiModalConfig as input.
diff --git a/tests/models/test_intern_vit.py b/tests/models/decoder_only/vision_language/test_intern_vit.py
similarity index 97%
rename from tests/models/test_intern_vit.py
rename to tests/models/decoder_only/vision_language/test_intern_vit.py
index 816f846f69bae..3c3b95b38baac 100644
--- a/tests/models/test_intern_vit.py
+++ b/tests/models/decoder_only/vision_language/test_intern_vit.py
@@ -6,9 +6,7 @@
 from huggingface_hub import snapshot_download
 from transformers import AutoConfig, AutoModel, CLIPImageProcessor
 
-from ..conftest import _ImageAssets, cleanup
-
-pytestmark = pytest.mark.vlm
+from ....conftest import _ImageAssets, cleanup
 
 # we use snapshot_download to prevent conflicts between
 # dynamic_module and trust_remote_code for hf_runner
diff --git a/tests/models/test_internvl.py b/tests/models/decoder_only/vision_language/test_internvl.py
similarity index 89%
rename from tests/models/test_internvl.py
rename to tests/models/decoder_only/vision_language/test_internvl.py
index fa3369dc53345..a756f8214edee 100644
--- a/tests/models/test_internvl.py
+++ b/tests/models/decoder_only/vision_language/test_internvl.py
@@ -9,11 +9,9 @@
 from vllm.multimodal.utils import rescale_image_size
 from vllm.utils import is_cpu
 
-from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
-                        _ImageAssets)
-from .utils import check_logprobs_close
-
-pytestmark = pytest.mark.vlm
+from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
+                          _ImageAssets)
+from ...utils import check_logprobs_close
 
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
     "stop_sign":
@@ -78,7 +76,7 @@ def run_test(
 ):
     """Inference result should be the same between hf and vllm.
 
-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
     For huggingface runner, we provide the PIL images as input.
     For vllm runner, we provide MultiModalDataDict objects 
     and corresponding MultiModalConfig as input.
@@ -331,6 +329,41 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
     )
 
 
+@pytest.mark.parametrize("model", ["OpenGVLab/InternVL2-2B"])
+@pytest.mark.parametrize("size_factors", [[0.5, 1.0]])
+@pytest.mark.parametrize("dtype", [target_dtype])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [5])
+@torch.inference_mode()
+def test_different_num_patches(hf_runner, vllm_runner, image_assets, model,
+                               size_factors, dtype: str, max_tokens: int,
+                               num_logprobs: int) -> None:
+    images = [asset.pil_image.resize((896, 896)) for asset in image_assets]
+
+    inputs_batching = [(
+        [prompt for _ in size_factors],
+        [rescale_image_size(image, factor) for factor in size_factors],
+    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
+
+    inputs_multi_images = [
+        ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
+         [[rescale_image_size(image, factor) for image in images]
+          for factor in size_factors])
+    ]
+    for inputs in [inputs_batching, inputs_multi_images]:
+        run_test(
+            hf_runner,
+            vllm_runner,
+            inputs,
+            model,
+            dtype=dtype,
+            max_tokens=max_tokens,
+            num_logprobs=num_logprobs,
+            mm_limit=2,
+            tensor_parallel_size=1,
+        )
+
+
 @pytest.mark.parametrize(
     "models", [("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")])
 @pytest.mark.parametrize(
diff --git a/tests/models/test_llava.py b/tests/models/decoder_only/vision_language/test_llava.py
similarity index 96%
rename from tests/models/test_llava.py
rename to tests/models/decoder_only/vision_language/test_llava.py
index 84ca23f6222a9..fd28a9367b4b2 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/decoder_only/vision_language/test_llava.py
@@ -8,11 +8,9 @@
 from vllm.sequence import SampleLogprobs
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
 
-from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
-                        _ImageAssets)
-from .utils import check_logprobs_close
-
-pytestmark = pytest.mark.vlm
+from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
+                          _ImageAssets)
+from ...utils import check_logprobs_close
 
 _LIMIT_IMAGE_PER_PROMPT = 4
 
@@ -143,7 +141,7 @@ def _run_test(
 ):
     """Inference result should be the same between hf and vllm.
 
-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
     For huggingface runner, we provide the PIL images as input.
     For vllm runner, we provide MultiModalDataDict objects 
     and corresponding MultiModalConfig as input.
@@ -239,7 +237,7 @@ def process(hf_inputs: BatchEncoding):
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
 def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
-                dtype: str, max_tokens: int, num_logprobs: int) -> None:
+                dtype, max_tokens, num_logprobs) -> None:
     run_test(
         hf_runner,
         vllm_runner,
diff --git a/tests/models/test_llava_image_embeds.py b/tests/models/decoder_only/vision_language/test_llava_image_embeds.py
similarity index 96%
rename from tests/models/test_llava_image_embeds.py
rename to tests/models/decoder_only/vision_language/test_llava_image_embeds.py
index cc444fe32e79b..66414032509ed 100644
--- a/tests/models/test_llava_image_embeds.py
+++ b/tests/models/decoder_only/vision_language/test_llava_image_embeds.py
@@ -5,10 +5,8 @@
 
 from vllm.sequence import SampleLogprobs
 
-from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
-from .utils import check_logprobs_close
-
-pytestmark = pytest.mark.vlm
+from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
+from ...utils import check_logprobs_close
 
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
     "stop_sign":
@@ -62,7 +60,7 @@ def run_test(
 ):
     """Inference result should be the same between hf and vllm.
 
-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
     For huggingface runner, we provide the PIL images as input.
     For vllm runner, we provide MultiModalDataDict objects 
     and corresponding vision language config as input.
diff --git a/tests/models/test_llava_next.py b/tests/models/decoder_only/vision_language/test_llava_next.py
similarity index 97%
rename from tests/models/test_llava_next.py
rename to tests/models/decoder_only/vision_language/test_llava_next.py
index d5fe0cbe32880..f833fe0c8bbb4 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/decoder_only/vision_language/test_llava_next.py
@@ -6,11 +6,9 @@
 from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
 
-from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
-                        _ImageAssets)
-from .utils import check_logprobs_close
-
-pytestmark = pytest.mark.vlm
+from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
+                          _ImageAssets)
+from ...utils import check_logprobs_close
 
 _LIMIT_IMAGE_PER_PROMPT = 4
 
@@ -197,7 +195,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
                 dtype, max_tokens, num_logprobs) -> None:
     """Inference result should be the same between hf and vllm.
 
-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
     For huggingface runner, we provide the PIL images as input.
     For vllm runner, we provide MultiModalDataDict objects
     and corresponding MultiModalConfig as input.
diff --git a/tests/models/decoder_only/vision_language/test_llava_next_video.py b/tests/models/decoder_only/vision_language/test_llava_next_video.py
new file mode 100644
index 0000000000000..373c8964054cd
--- /dev/null
+++ b/tests/models/decoder_only/vision_language/test_llava_next_video.py
@@ -0,0 +1,234 @@
+from typing import List, Optional, Tuple, Type, overload
+
+import pytest
+import transformers
+from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
+
+from vllm.multimodal.utils import (rescale_video_size, resize_video,
+                                   sample_frames_from_video)
+from vllm.sequence import SampleLogprobs
+
+from ....conftest import VIDEO_ASSETS, HfRunner, VllmRunner, _VideoAssets
+from ...utils import check_logprobs_close
+
+_PREFACE = (
+    "A chat between a curious human and an artificial intelligence assistant. "
+    "The assistant gives helpful, detailed, and polite answers to the human's "
+    "questions.")
+
+HF_VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
+    "sample_demo_1":
+    f"{_PREFACE}USER: <video>\nWhy is this video funny? ASSISTANT:"
+})
+
+models = ["llava-hf/LLaVA-NeXT-Video-7B-hf"]
+
+
+def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
+                                         Optional[SampleLogprobs]],
+                      model: str):
+    """Sanitize vllm output to be comparable with hf output."""
+    output_ids, output_str, out_logprobs = vllm_output
+
+    config = AutoConfig.from_pretrained(model)
+    video_token_id = config.video_token_index
+
+    tokenizer = AutoTokenizer.from_pretrained(model)
+    eos_token_id = tokenizer.eos_token_id
+
+    hf_output_ids = [
+        token_id for idx, token_id in enumerate(output_ids)
+        if token_id != video_token_id or output_ids[idx - 1] != video_token_id
+    ]
+
+    assert output_str[0] == " "
+    hf_output_str = output_str[1:]
+    if hf_output_ids[-1] == eos_token_id:
+        hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
+
+    return hf_output_ids, hf_output_str, out_logprobs
+
+
+@overload
+def run_test(
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    video_assets: _VideoAssets,
+    model: str,
+    *,
+    size_factors: List[float],
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    num_frames: int,
+    tensor_parallel_size: int,
+    distributed_executor_backend: Optional[str] = None,
+):
+    ...
+
+
+@overload
+def run_test(
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    video_assets: _VideoAssets,
+    model: str,
+    *,
+    sizes: List[Tuple[int, int]],
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    num_frames: int,
+    tensor_parallel_size: int,
+    distributed_executor_backend: Optional[str] = None,
+):
+    ...
+
+
+def run_test(
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    video_assets: _VideoAssets,
+    model: str,
+    *,
+    size_factors: Optional[List[float]] = None,
+    sizes: Optional[List[Tuple[int, int]]] = None,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    num_frames: int,
+    tensor_parallel_size: int,
+    distributed_executor_backend: Optional[str] = None,
+):
+    videos = [
+        sample_frames_from_video(asset.np_ndarrays, num_frames)
+        for asset in video_assets
+    ]
+
+    for video in videos:
+        print(video.shape)
+
+    if size_factors is not None:
+        inputs_per_video = [(
+            [prompt for _ in size_factors],
+            [rescale_video_size(video, factor) for factor in size_factors],
+        ) for video, prompt in zip(videos, HF_VIDEO_PROMPTS)]
+    elif sizes is not None:
+        inputs_per_video = [(
+            [prompt for _ in sizes],
+            [resize_video(video, size) for size in sizes],
+        ) for video, prompt in zip(videos, HF_VIDEO_PROMPTS)]
+    else:
+        raise ValueError("You must provide either `size_factors` or `sizes`")
+
+    # max_model_len should be greater than image_feature_size
+    with vllm_runner(model,
+                     dtype=dtype,
+                     max_model_len=4096,
+                     tensor_parallel_size=tensor_parallel_size,
+                     distributed_executor_backend=distributed_executor_backend,
+                     enforce_eager=True) as vllm_model:
+        vllm_outputs_per_video = [
+            vllm_model.generate_greedy_logprobs(prompts,
+                                                max_tokens,
+                                                num_logprobs=num_logprobs,
+                                                videos=videos)
+            for prompts, videos in inputs_per_video
+        ]
+
+    with hf_runner(model, dtype=dtype,
+                   auto_cls=AutoModelForVision2Seq) as hf_model:
+        hf_outputs_per_video = [
+            hf_model.generate_greedy_logprobs_limit(prompts,
+                                                    max_tokens,
+                                                    num_logprobs=num_logprobs,
+                                                    videos=videos)
+            for prompts, videos in inputs_per_video
+        ]
+
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_video,
+                                        vllm_outputs_per_video):
+        # TODO: Check whether using original CLIPVisionModel can improve
+        # consistency against HF
+        check_logprobs_close(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=[
+                vllm_to_hf_output(vllm_output, model)
+                for vllm_output in vllm_outputs
+            ],
+            name_0="hf",
+            name_1="vllm",
+        )
+
+
+@pytest.mark.skipif(transformers.__version__ < "4.45",
+                    reason="Waiting for next transformers release")
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize(
+    "size_factors",
+    [
+        # No video
+        [],
+        # Single-scale
+        [1.0],
+        # Single-scale, batched
+        [1.0, 1.0, 1.0],
+        # Multi-scale
+        [0.25, 0.5, 1.0],
+    ],
+)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("num_frames", [16])
+def test_models(hf_runner, vllm_runner, video_assets, model, size_factors,
+                dtype, max_tokens, num_logprobs, num_frames) -> None:
+    """Inference result should be the same between hf and vllm.
+
+    All the image fixtures for the test is under tests/videos.
+    For huggingface runner, we provide the np.ndarray as input.
+    For vllm runner, we provide MultiModalDataDict objects 
+    and corresponding MultiModalConfig as input.
+    Note, the text input is also adjusted to abide by vllm contract.
+    The text output is sanitized to be able to compare with hf.
+    """
+    run_test(
+        hf_runner,
+        vllm_runner,
+        video_assets,
+        model,
+        size_factors=size_factors,
+        dtype=dtype,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        num_frames=num_frames,
+        tensor_parallel_size=1,
+    )
+
+
+@pytest.mark.skipif(transformers.__version__ < "4.45",
+                    reason="Waiting for next transformers release")
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize(
+    "sizes",
+    [[(1669, 2560), (2560, 1669), (183, 488), (488, 183)]],
+)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("num_frames", [16])
+def test_models_fixed_sizes(hf_runner, vllm_runner, video_assets, model, sizes,
+                            dtype, max_tokens, num_logprobs,
+                            num_frames) -> None:
+    run_test(
+        hf_runner,
+        vllm_runner,
+        video_assets,
+        model,
+        sizes=sizes,
+        dtype=dtype,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        num_frames=num_frames,
+        tensor_parallel_size=1,
+    )
diff --git a/tests/models/test_minicpmv.py b/tests/models/decoder_only/vision_language/test_minicpmv.py
similarity index 97%
rename from tests/models/test_minicpmv.py
rename to tests/models/decoder_only/vision_language/test_minicpmv.py
index 99e49c14f1f26..7bf5d75f400f9 100644
--- a/tests/models/test_minicpmv.py
+++ b/tests/models/decoder_only/vision_language/test_minicpmv.py
@@ -9,10 +9,8 @@
 from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
 
-from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner
-from .utils import check_logprobs_close
-
-pytestmark = pytest.mark.vlm
+from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner
+from ...utils import check_logprobs_close
 
 # The image token is placed before "user" on purpose so that the test can pass
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
@@ -65,7 +63,7 @@ def run_test(
 ):
     """Inference result should be the same between hf and vllm.
 
-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
     For huggingface runner, we provide the PIL images as input.
     For vllm runner, we provide MultiModalDataDict objects 
     and corresponding MultiModalConfig as input.
diff --git a/tests/models/test_paligemma.py b/tests/models/decoder_only/vision_language/test_paligemma.py
similarity index 96%
rename from tests/models/test_paligemma.py
rename to tests/models/decoder_only/vision_language/test_paligemma.py
index beddaaf608a18..d7e29ea76ba4e 100644
--- a/tests/models/test_paligemma.py
+++ b/tests/models/decoder_only/vision_language/test_paligemma.py
@@ -8,10 +8,8 @@
 from vllm.sequence import SampleLogprobs
 from vllm.utils import is_hip
 
-from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
-from .utils import check_logprobs_close
-
-pytestmark = pytest.mark.vlm
+from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
+from ...utils import check_logprobs_close
 
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
     "stop_sign":
@@ -69,7 +67,7 @@ def run_test(
 ):
     """Inference result should be the same between hf and vllm.
 
-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
     For huggingface runner, we provide the PIL images as input.
     For vllm runner, we provide MultiModalDataDict objects 
     and corresponding MultiModalConfig as input.
diff --git a/tests/models/test_phi3v.py b/tests/models/decoder_only/vision_language/test_phi3v.py
similarity index 97%
rename from tests/models/test_phi3v.py
rename to tests/models/decoder_only/vision_language/test_phi3v.py
index 6ecbf07a08b7c..e248151c40a60 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/decoder_only/vision_language/test_phi3v.py
@@ -9,10 +9,8 @@
 from vllm.sequence import SampleLogprobs
 from vllm.utils import is_cpu, is_hip
 
-from ..conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
-from .utils import check_logprobs_close
-
-pytestmark = pytest.mark.vlm
+from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
+from ...utils import check_logprobs_close
 
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
     "stop_sign":
@@ -71,7 +69,7 @@ def run_test(
 ):
     """Inference result should be the same between hf and vllm.
 
-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
     For huggingface runner, we provide the PIL images as input.
     For vllm runner, we provide MultiModalDataDict objects 
     and corresponding MultiModalConfig as input.
diff --git a/tests/models/decoder_only/vision_language/test_pixtral.py b/tests/models/decoder_only/vision_language/test_pixtral.py
new file mode 100644
index 0000000000000..072bedfc01a1f
--- /dev/null
+++ b/tests/models/decoder_only/vision_language/test_pixtral.py
@@ -0,0 +1,199 @@
+"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
+
+Run `pytest tests/models/test_mistral.py`.
+"""
+import json
+import uuid
+from dataclasses import asdict
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
+
+import pytest
+from mistral_common.protocol.instruct.messages import ImageURLChunk
+from mistral_common.protocol.instruct.request import ChatCompletionRequest
+from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
+from mistral_common.tokens.tokenizers.multimodal import image_from_chunk
+
+from vllm import EngineArgs, LLMEngine, SamplingParams, TokensPrompt
+from vllm.multimodal import MultiModalDataBuiltins
+from vllm.sequence import Logprob, SampleLogprobs
+
+from ....utils import VLLM_PATH
+from ...utils import check_logprobs_close
+
+if TYPE_CHECKING:
+    from _typeshed import StrPath
+
+MODELS = ["mistralai/Pixtral-12B-2409"]
+IMG_URLS = [
+    "https://picsum.photos/id/237/400/300",
+    "https://picsum.photos/id/231/200/300",
+    "https://picsum.photos/id/27/500/500",
+    "https://picsum.photos/id/17/150/600",
+]
+PROMPT = "Describe each image in one short sentence."
+
+
+def _create_msg_format(urls: List[str]) -> List[Dict[str, Any]]:
+    return [{
+        "role":
+        "user",
+        "content": [{
+            "type": "text",
+            "text": PROMPT,
+        }] + [{
+            "type": "image_url",
+            "image_url": {
+                "url": url
+            }
+        } for url in urls],
+    }]
+
+
+def _create_engine_inputs(urls: List[str]) -> TokensPrompt:
+    msg = _create_msg_format(urls)
+
+    tokenizer = MistralTokenizer.from_model("pixtral")
+
+    request = ChatCompletionRequest(messages=msg)  # type: ignore[type-var]
+    tokenized = tokenizer.encode_chat_completion(request)
+
+    engine_inputs = TokensPrompt(prompt_token_ids=tokenized.tokens)
+
+    images = []
+    for chunk in request.messages[0].content:
+        if isinstance(chunk, ImageURLChunk):
+            images.append(image_from_chunk(chunk))
+
+    mm_data = MultiModalDataBuiltins(image=images)
+    engine_inputs["multi_modal_data"] = mm_data
+
+    return engine_inputs
+
+
+MSGS = [
+    _create_msg_format(IMG_URLS[:1]),
+    _create_msg_format(IMG_URLS[:2]),
+    _create_msg_format(IMG_URLS),
+]
+ENGINE_INPUTS = [
+    _create_engine_inputs(IMG_URLS[:1]),
+    _create_engine_inputs(IMG_URLS[:2]),
+    _create_engine_inputs(IMG_URLS),
+]
+
+SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
+LIMIT_MM_PER_PROMPT = dict(image=4)
+
+MAX_MODEL_LEN = [8192, 65536]
+
+FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures"
+assert FIXTURES_PATH.exists()
+
+FIXTURE_LOGPROBS_CHAT = FIXTURES_PATH / "pixtral_chat.json"
+FIXTURE_LOGPROBS_ENGINE = FIXTURES_PATH / "pixtral_chat_engine.json"
+
+OutputsLogprobs = List[Tuple[List[int], str, Optional[SampleLogprobs]]]
+
+
+# For the test author to store golden output in JSON
+def _dump_outputs_w_logprobs(
+    outputs: OutputsLogprobs,
+    filename: "StrPath",
+) -> None:
+    json_data = [(tokens, text,
+                  [{k: asdict(v)
+                    for k, v in token_logprobs.items()}
+                   for token_logprobs in (logprobs or [])])
+                 for tokens, text, logprobs in outputs]
+
+    with open(filename, "w") as f:
+        json.dump(json_data, f)
+
+
+def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
+    with open(filename, "rb") as f:
+        json_data = json.load(f)
+
+    return [(tokens, text,
+             [{int(k): Logprob(**v)
+               for k, v in token_logprobs.items()}
+              for token_logprobs in logprobs])
+            for tokens, text, logprobs in json_data]
+
+
+@pytest.mark.skip(
+    reason=
+    "Model is too big, test passed on A100 locally but will OOM on CI machine."
+)
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("max_model_len", MAX_MODEL_LEN)
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+def test_chat(
+    vllm_runner,
+    max_model_len: int,
+    model: str,
+    dtype: str,
+) -> None:
+    EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT)
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            tokenizer_mode="mistral",
+            enable_chunked_prefill=False,
+            max_model_len=max_model_len,
+            limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
+    ) as vllm_model:
+        outputs = []
+        for msg in MSGS:
+            output = vllm_model.model.chat(msg,
+                                           sampling_params=SAMPLING_PARAMS)
+
+            outputs.extend(output)
+
+    logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs)
+    check_logprobs_close(outputs_0_lst=EXPECTED_CHAT_LOGPROBS,
+                         outputs_1_lst=logprobs,
+                         name_0="h100_ref",
+                         name_1="output")
+
+
+@pytest.mark.skip(
+    reason=
+    "Model is too big, test passed on A100 locally but will OOM on CI machine."
+)
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+def test_model_engine(vllm_runner, model: str, dtype: str) -> None:
+    EXPECTED_ENGINE_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_ENGINE)
+    args = EngineArgs(
+        model=model,
+        tokenizer_mode="mistral",
+        enable_chunked_prefill=False,
+        limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
+        dtype=dtype,
+    )
+    engine = LLMEngine.from_engine_args(args)
+
+    engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[0], SAMPLING_PARAMS)
+    engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[1], SAMPLING_PARAMS)
+
+    outputs = []
+    count = 0
+    while True:
+        out = engine.step()
+        count += 1
+        for request_output in out:
+            if request_output.finished:
+                outputs.append(request_output)
+
+        if count == 2:
+            engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[2],
+                               SAMPLING_PARAMS)
+        if not engine.has_unfinished_requests():
+            break
+
+    logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs)
+    check_logprobs_close(outputs_0_lst=EXPECTED_ENGINE_LOGPROBS,
+                         outputs_1_lst=logprobs,
+                         name_0="h100_ref",
+                         name_1="output")
diff --git a/tests/models/decoder_only/vision_language/test_qwen.py b/tests/models/decoder_only/vision_language/test_qwen.py
new file mode 100644
index 0000000000000..e4f79092b7606
--- /dev/null
+++ b/tests/models/decoder_only/vision_language/test_qwen.py
@@ -0,0 +1,401 @@
+import pathlib
+from typing import Dict, List, Optional, Tuple, Type, Union
+
+import pytest
+import torch
+from PIL.Image import Image
+
+from vllm.config import ModelConfig
+from vllm.inputs import InputContext, LLMInputs
+from vllm.multimodal.base import MultiModalInputs
+from vllm.multimodal.utils import cached_get_tokenizer, rescale_image_size
+
+from ....conftest import (IMAGE_ASSETS, HfRunner, ImageAsset, PromptImageInput,
+                          VllmRunner, _ImageAssets)
+from ...utils import check_logprobs_close
+
+text_only_models = [
+    "Qwen/Qwen-7B-Chat"  # Has no visual component
+]
+
+multimodal_models = ["Qwen/Qwen-VL"]
+
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
+    "stop_sign":
+    "Picture 1: <img></img>\nWhat's the content of the image?: ",
+    "cherry_blossom":
+    "Picture 1: <img></img>\nWhat is the season?: ",
+})
+
+HF_MULTIIMAGE_IMAGE_PROMPT = "Picture 1: <img></img>\nPicture 2: <img></img>\nCan you compare these images?\n"  # noqa: E501
+HF_MULTIIMAGE_IMAGE_PROMPT = "Picture 1: <img></img>\nPicture 2: <img></img>\nDescribe the two images in detail.\n"  # noqa: E501
+### Multimodal preprocessing tests
+SAMPLE_IMAGE = IMAGE_ASSETS[0].pil_image
+# These values are specific to Qwen-VL/Chat; we can get these from the model
+# config also, but they are hardcoded here to keep the parameterize/fixtures
+# easy to read.
+IMG_START_ID = 151857
+IMG_END_ID = 151858
+IMG_PAD_ID = 151859
+TOKS_PER_IMG = 256
+VIS_ENC_DIM = 4096
+IMG_SIZE = 448
+
+
+def build_model_context(model_name: str,
+                        tokenizer_name: Optional[str] = None,
+                        trust_remote_code: bool = False):
+    """Creates an InputContext for a given model.
+    
+    Args:
+        model_name: Name of the model being considered.
+        tokenizer_name: Name of the tokenizer being considered.
+        trust_remote_code: Whether or not to allow loading remote code.
+
+    Returns:
+        InputContext for the model being considered.
+    """
+    if tokenizer_name is None:
+        tokenizer_name = model_name
+    model_config = ModelConfig(
+        model_name,
+        tokenizer_name,
+        tokenizer_mode="auto",
+        trust_remote_code=trust_remote_code,
+        dtype="float32",
+        seed=0,
+    )
+    return InputContext(model_config)
+
+
+@pytest.fixture()
+def input_mapper_for_qwen():
+    # Lazy import to avoid initializing CUDA during test collection
+    from vllm.model_executor.models.qwen import input_mapper_for_qwen
+    return input_mapper_for_qwen
+
+
+@pytest.fixture()
+def input_processor_for_qwen():
+    # Lazy import to avoid initializing CUDA during test collection
+    from vllm.model_executor.models.qwen import input_processor_for_qwen
+    return input_processor_for_qwen
+
+
+@pytest.fixture()
+def qwen_vl_context() -> InputContext:
+    """Get an InputContext for Qwen-VL."""
+    return build_model_context(model_name="Qwen/Qwen-VL",
+                               trust_remote_code=True)
+
+
+# Happy path tests for single/multi-image scenarios for the multimodal
+# input processor and mapper, respectively
+@pytest.mark.parametrize("num_images", [1, 2])
+def test_input_processor_valid_mm_data(input_processor_for_qwen,
+                                       qwen_vl_context: InputContext,
+                                       num_images: int):
+    """Happy cases for image inputs to Qwen's multimodal input processor."""
+    prompt = "".join(
+        [f"Picture {num}: <img></img>\n" for num in range(1, num_images + 1)])
+    inputs = LLMInputs(
+        prompt=prompt,
+        # When processing multimodal data for a multimodal model, the qwen
+        # input processor will overwrite the provided prompt_token_ids with
+        # the image prompts
+        prompt_token_ids=None,
+        multi_modal_data={"image": torch.rand(num_images, TOKS_PER_IMG, 4096)},
+    )
+    proc_inputs = input_processor_for_qwen(qwen_vl_context, inputs)
+    assert isinstance(proc_inputs, dict)
+
+    # Each image should have one start / stop and a fixed context of 256
+    proc_tokens = proc_inputs["prompt_token_ids"]
+    assert proc_tokens.count(IMG_START_ID) == num_images
+    assert proc_tokens.count(IMG_END_ID) == num_images
+    assert proc_tokens.count(IMG_PAD_ID) == num_images * TOKS_PER_IMG
+
+
+@pytest.mark.parametrize(
+    "img_data,expected_shape",
+    [
+        # single / multi-image
+        (SAMPLE_IMAGE, (1, 3, IMG_SIZE, IMG_SIZE)),
+        (2 * [SAMPLE_IMAGE], (2, 3, IMG_SIZE, IMG_SIZE)),
+        # single / multi-image embeddings
+        (torch.rand(
+            (TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)),
+        (torch.rand(
+            (1, TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)),
+        (torch.rand(
+            (2, TOKS_PER_IMG, VIS_ENC_DIM)), (2, TOKS_PER_IMG, VIS_ENC_DIM)),
+    ])
+def test_input_mapper_valid_mm_data(input_mapper_for_qwen,
+                                    qwen_vl_context: InputContext,
+                                    img_data: Union[torch.Tensor, List[Image],
+                                                    Image],
+                                    expected_shape: List[int]):
+    """Happy cases for image inputs to Qwen's multimodal input mapper."""
+    mapped_img_data = input_mapper_for_qwen(qwen_vl_context, img_data)
+    # Ensure that we get the appropriately shaped pixel_values
+    # for images and image embeddings, respectively.
+    assert isinstance(mapped_img_data, MultiModalInputs)
+    assert "pixel_values" in mapped_img_data
+    assert mapped_img_data["pixel_values"].shape == expected_shape
+
+
+# Sad path tests for the multimodal input processor and mapper, respectively
+@pytest.mark.parametrize("mm_data", [
+    {
+        "image": torch.rand((5))
+    },
+    {
+        "image": torch.rand((5, 5, 5, 5, 5))
+    },
+])
+def test_input_processor_invalid_mm_data(input_processor_for_qwen,
+                                         qwen_vl_context: InputContext,
+                                         mm_data: Dict[str, torch.Tensor]):
+    """Test sad cases validated in Qwen's multimodal input processor."""
+    tokenizer = cached_get_tokenizer(qwen_vl_context.model_config.tokenizer,
+                                     trust_remote_code=True)
+    prompt = "Picture 1: <img></img>\n"
+    prompt_token_ids = tokenizer.encode(prompt)
+    inputs = LLMInputs(prompt=prompt,
+                       prompt_token_ids=prompt_token_ids,
+                       multi_modal_data=mm_data)
+    # Should fail since we have too many or too few dimensions for embeddings
+    with pytest.raises(ValueError):
+        input_processor_for_qwen(qwen_vl_context, inputs)
+
+
+@pytest.mark.parametrize(
+    "img_data",
+    [
+        # Wrong context length
+        torch.rand((1, TOKS_PER_IMG + 10, VIS_ENC_DIM)),
+        # Wrong visual encoder output size
+        torch.rand((1, TOKS_PER_IMG, VIS_ENC_DIM + 10)),
+    ])
+def test_input_mapper_invalid_mm_data(
+    input_mapper_for_qwen,
+    qwen_vl_context: InputContext,
+    img_data: Union[torch.Tensor, List[Image], Image],
+):
+    """Sad cases validated in Qwen VL's multimodal input mapper."""
+    with pytest.raises(ValueError):
+        input_mapper_for_qwen(qwen_vl_context, img_data)
+
+
+### End-to-end generation tests
+def get_prompt_with_path(tmp_path: pathlib.PosixPath, prompt: str,
+                         assets: Union[_ImageAssets, List[ImageAsset]]) -> str:
+    """Given a temporary dir path, export one or more image assets into the
+    tempdir & replace its contents with the local path to the string so that
+    the HF version of Qwen-VL can resolve the path and load the image ni its
+    forward() call.
+
+    Args:
+        tmp_path: Tempdir for test under consideration.
+        prompt: Prompt with image placeholders.
+        assets: List of image assets whose len equals the num placeholders.
+    """
+    # Ensure that the number of placeholders matches the number of assets;
+    # If this is not true, the test is probably written incorrectly.
+    assert prompt.count("<img></img>") == len(assets)
+
+    # Replace the placeholders with local paths to the exported assets
+    for asset in assets:
+        image_tmp_path = tmp_path / f"{asset.name}.jpg"
+        asset.pil_image.save(image_tmp_path)
+        prompt = prompt.replace(
+            "<img></img>",
+            f"<img>{image_tmp_path}</img>",
+            1,
+        )
+    return prompt
+
+
+def run_test(
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    inputs: List[Tuple[List[str], PromptImageInput]],
+    model: str,
+    *,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    mm_limit: int,
+    tensor_parallel_size: int,
+    distributed_executor_backend: Optional[str] = None,
+):
+    """Inference result should be the same between hf and vllm.
+
+    All the image fixtures for the test is under tests/images.
+    For huggingface runner, we provide the PIL images as input.
+    For vllm runner, we provide MultiModalDataDict objects
+    and corresponding MultiModalConfig as input.
+    Note, the text input is also adjusted to abide by vllm contract.
+    The text output is sanitized to be able to compare with hf.
+    """
+
+    # NOTE: take care of the order. run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).
+
+    # max_model_len should be greater than image_feature_size
+    # Qwen encodes each image into a fixed content size of 256
+    with vllm_runner(model,
+                     max_model_len=1024,
+                     max_num_seqs=1,
+                     dtype=dtype,
+                     limit_mm_per_prompt={"image": mm_limit},
+                     tensor_parallel_size=tensor_parallel_size,
+                     distributed_executor_backend=distributed_executor_backend,
+                     enforce_eager=True) as vllm_model:
+        vllm_outputs_per_image = [
+            vllm_model.generate_greedy_logprobs(prompts,
+                                                max_tokens,
+                                                num_logprobs=num_logprobs,
+                                                images=images)
+            for prompts, images in inputs
+        ]
+
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs_per_image = [
+            hf_model.generate_greedy_logprobs_limit(prompts,
+                                                    max_tokens,
+                                                    num_logprobs=num_logprobs,
+                                                    images=images)
+            for prompts, images in inputs
+        ]
+
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
+                                        vllm_outputs_per_image):
+
+        check_logprobs_close(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=vllm_outputs,
+            name_0="hf",
+            name_1="vllm",
+        )
+
+
+@pytest.mark.parametrize("model", multimodal_models)
+@pytest.mark.parametrize(
+    "size_factors",
+    [
+        # No image
+        [],
+        # Single-scale
+        [1.0],
+        # Single-scale, batched
+        [1.0, 1.0, 1.0],
+        # Multi-scale
+        [0.25, 0.5, 1.0],
+    ],
+)
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("max_tokens", [8])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_multimodal_models_single_image(tmp_path: pathlib.PosixPath,
+                                        hf_runner: Type[HfRunner],
+                                        vllm_runner: Type[VllmRunner],
+                                        image_assets: _ImageAssets, model: str,
+                                        size_factors: List[float], dtype: str,
+                                        max_tokens: int,
+                                        num_logprobs: int) -> None:
+    """Tests multimodal models with single image prompts."""
+    images = [asset.pil_image for asset in image_assets]
+
+    prompts = [
+        get_prompt_with_path(tmp_path, prompt, [asset])
+        for prompt, asset in zip(HF_IMAGE_PROMPTS, image_assets)
+    ]
+
+    inputs = [(
+        [prompt for _ in size_factors],
+        [rescale_image_size(image, factor) for factor in size_factors],
+    ) for image, prompt in zip(images, prompts)]
+
+    run_test(
+        hf_runner,
+        vllm_runner,
+        inputs,
+        model,
+        dtype=dtype,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        mm_limit=1,
+        tensor_parallel_size=1,
+    )
+
+
+@pytest.mark.parametrize("model", multimodal_models)
+@pytest.mark.parametrize(
+    "size_factors",
+    [
+        # No image
+        [],
+        # Single-scale
+        [1.0],
+        # Single-scale, batched
+        [1.0, 1.0, 1.0],
+        # Multi-scale
+        [0.25, 0.5, 1.0],
+    ],
+)
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_multimodal_models_multi_image(tmp_path: pathlib.PosixPath,
+                                       hf_runner: Type[HfRunner],
+                                       vllm_runner: Type[VllmRunner],
+                                       image_assets: _ImageAssets, model: str,
+                                       size_factors: List[float], dtype: str,
+                                       max_tokens: int,
+                                       num_logprobs: int) -> None:
+    """Tests multimodal models with multi-image prompts."""
+    images = [asset.pil_image for asset in image_assets]
+    # Put all of the images into one prompt.
+    prompt = get_prompt_with_path(tmp_path, HF_MULTIIMAGE_IMAGE_PROMPT,
+                                  image_assets)
+    inputs = [([prompt for _ in size_factors],
+               [[rescale_image_size(image, factor) for image in images]
+                for factor in size_factors])]
+
+    run_test(
+        hf_runner,
+        vllm_runner,
+        inputs,
+        model,
+        dtype=dtype,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        mm_limit=2,
+        tensor_parallel_size=1,
+    )
+
+
+# Ensure that a text-only Qwen model can still be loaded and
+# used for inference in VLLM without throwing.
+@pytest.mark.parametrize("model", text_only_models)
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_text_only_qwen_model_can_be_loaded_and_run(
+    vllm_runner: Type[VllmRunner],
+    example_prompts: List[str],
+    model: str,
+    *,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+):
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        vllm_model.generate_greedy_logprobs(
+            example_prompts,
+            max_tokens,
+            num_logprobs=num_logprobs,
+        )
diff --git a/tests/models/embedding/__init__.py b/tests/models/embedding/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/models/embedding/language/__init__.py b/tests/models/embedding/language/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/models/test_embedding.py b/tests/models/embedding/language/test_embedding.py
similarity index 100%
rename from tests/models/test_embedding.py
rename to tests/models/embedding/language/test_embedding.py
diff --git a/tests/models/encoder_decoder/__init__.py b/tests/models/encoder_decoder/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/models/encoder_decoder/language/__init__.py b/tests/models/encoder_decoder/language/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/models/test_bart.py b/tests/models/encoder_decoder/language/test_bart.py
similarity index 69%
rename from tests/models/test_bart.py
rename to tests/models/encoder_decoder/language/test_bart.py
index 660b61d1a7ade..758a9b743b397 100644
--- a/tests/models/test_bart.py
+++ b/tests/models/encoder_decoder/language/test_bart.py
@@ -1,8 +1,8 @@
 """Compare the outputs of HF and vLLM for BART models using greedy sampling.
 
-Run `pytest tests/models/test_bart.py`.
+Run `pytest tests/models/encoder_decoder/language/test_bart.py`.
 """
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Type
 
 from vllm.utils import is_cpu
 
@@ -16,8 +16,10 @@
 
     from vllm.sequence import SampleLogprobs
 
-    from ..conftest import DecoderPromptType
-    from .utils import check_logprobs_close
+    from ....conftest import (DecoderPromptType, ExplicitEncoderDecoderPrompt,
+                              HfRunner, VllmRunner)
+    from ....utils import multi_gpu_test
+    from ...utils import check_logprobs_close
 
     MODELS = ["facebook/bart-base", "facebook/bart-large-cnn"]
 
@@ -34,20 +36,18 @@ def vllm_to_hf_output(
 
         return output_ids, hf_output_str, out_logprobs
 
-    @pytest.mark.parametrize("model", MODELS)
-    @pytest.mark.parametrize("dtype", ["float", "bfloat16"])
-    @pytest.mark.parametrize("max_tokens", [64])
-    @pytest.mark.parametrize("num_logprobs", [5])
-    @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
-    def test_models(
-        hf_runner,
-        vllm_runner,
-        example_encoder_decoder_prompts,
+    def run_test(
+        hf_runner: Type[HfRunner],
+        vllm_runner: Type[VllmRunner],
+        prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
+        decoder_prompt_type: DecoderPromptType,
         model: str,
+        *,
         dtype: str,
         max_tokens: int,
         num_logprobs: int,
-        decoder_prompt_type: DecoderPromptType,
+        tensor_parallel_size: int,
+        distributed_executor_backend: Optional[str] = None,
     ) -> None:
         '''
         Test the vLLM BART model for a variety of encoder/decoder input prompts,
@@ -116,8 +116,29 @@ def test_models(
         token during the process of validating the vLLM decoded output.
         '''
 
-        test_case_prompts = example_encoder_decoder_prompts[
-            decoder_prompt_type]
+        # NOTE: take care of the order. run vLLM first, and then run HF.
+        # vLLM needs a fresh new process without cuda initialization.
+        # if we run HF first, the cuda initialization will be done and it
+        # will hurt multiprocessing backend with fork method (the default).
+
+        # Note: currently encoder/decoder models are only compatible with
+        # enforce_eager=True. Normally this is not a problem because
+        # for encoder/decoder models vLLM will
+        # default to enforce_eager=True if enforce_eager
+        # is left unspecified. However, the
+        # VllmRunner test fixture (which wraps around the LLM class) defaults to
+        # enforce_eager=False (a behavior which a number of already-exisitng
+        # decoder-only unit tests expect), so when testing an encoder/decoder
+        # model we must explicitly specify enforce_eager=True in the VllmRunner
+        # constructor.
+        with vllm_runner(
+                model,
+                dtype=dtype,
+                tensor_parallel_size=tensor_parallel_size,
+                distributed_executor_backend=distributed_executor_backend,
+                enforce_eager=True) as vllm_model:
+            vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
+                prompts, max_tokens, num_logprobs)
 
         # Configuration settings for HF baseline
         hf_kwargs = {
@@ -135,26 +156,12 @@ def test_models(
                        auto_cls=AutoModelForSeq2SeqLM) as hf_model:
             hf_outputs = (
                 hf_model.generate_encoder_decoder_greedy_logprobs_limit(
-                    test_case_prompts,
+                    prompts,
                     max_tokens,
                     num_logprobs,
                     **hf_kwargs,
                 ))
 
-        # Note: currently encoder/decoder models are only compatible with
-        # enforce_eager=True. Normally this is not a problem because
-        # for encoder/decoder models vLLM will
-        # default to enforce_eager=True if enforce_eager
-        # is left unspecified. However, the
-        # VllmRunner test fixture (which wraps around the LLM class) defaults to
-        # enforce_eager=False (a behavior which a number of already-exisitng
-        # decoder-only unit tests expect), so when testing an encoder/decoder
-        # model we must explicitly specify enforce_eager=True in the VllmRunner
-        # constructor.
-        with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
-            vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
-                test_case_prompts, max_tokens, num_logprobs)
-
         hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE
                           else 0)
 
@@ -168,3 +175,49 @@ def test_models(
             name_1="vllm",
             num_outputs_0_skip_tokens=hf_skip_tokens,
         )
+
+    @pytest.mark.parametrize("model", MODELS)
+    @pytest.mark.parametrize("dtype", ["float", "bfloat16"])
+    @pytest.mark.parametrize("max_tokens", [64])
+    @pytest.mark.parametrize("num_logprobs", [5])
+    @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
+    def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts,
+                    model, dtype, max_tokens, num_logprobs,
+                    decoder_prompt_type) -> None:
+
+        run_test(
+            hf_runner,
+            vllm_runner,
+            example_encoder_decoder_prompts[decoder_prompt_type],
+            decoder_prompt_type,
+            model,
+            dtype=dtype,
+            max_tokens=max_tokens,
+            num_logprobs=num_logprobs,
+            tensor_parallel_size=1,
+        )
+
+    @multi_gpu_test(num_gpus=2)
+    @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
+    @pytest.mark.parametrize("model", ["facebook/bart-large-cnn"])
+    @pytest.mark.parametrize("dtype", ["float"])
+    @pytest.mark.parametrize("max_tokens", [64])
+    @pytest.mark.parametrize("num_logprobs", [5])
+    @pytest.mark.parametrize("decoder_prompt_type", [DecoderPromptType.CUSTOM])
+    def test_models_distributed(hf_runner, vllm_runner,
+                                example_encoder_decoder_prompts,
+                                distributed_executor_backend, model, dtype,
+                                max_tokens, num_logprobs,
+                                decoder_prompt_type) -> None:
+        run_test(
+            hf_runner,
+            vllm_runner,
+            example_encoder_decoder_prompts[decoder_prompt_type],
+            decoder_prompt_type,
+            model,
+            dtype=dtype,
+            max_tokens=max_tokens,
+            num_logprobs=num_logprobs,
+            tensor_parallel_size=2,
+            distributed_executor_backend=distributed_executor_backend,
+        )
diff --git a/tests/models/fixtures/pixtral_chat.json b/tests/models/fixtures/pixtral_chat.json
new file mode 100644
index 0000000000000..643afb83d29b8
--- /dev/null
+++ b/tests/models/fixtures/pixtral_chat.json
@@ -0,0 +1 @@
+[[[1784, 3937, 6122, 1261, 7244, 10575, 18970, 1408, 1261, 32656, 4691, 1046, 2], "The image shows a black dog sitting on a wooden surface.", [{"1784": {"logprob": -0.11687260121107101, "rank": 1, "decoded_token": "The"}, "4380": {"logprob": -2.366872549057007, "rank": 2, "decoded_token": "This"}, "1049": {"logprob": -4.741872787475586, "rank": 3, "decoded_token": "1"}, "117991": {"logprob": -5.991872787475586, "rank": 4, "decoded_token": "Certain"}, "1785": {"logprob": -5.991872787475586, "rank": 5, "decoded_token": "In"}}, {"3937": {"logprob": -0.28887900710105896, "rank": 1, "decoded_token": " image"}, "2158": {"logprob": -1.4138790369033813, "rank": 2, "decoded_token": " first"}, "3977": {"logprob": -5.788878917694092, "rank": 3, "decoded_token": " top"}, "7244": {"logprob": -6.163878917694092, "rank": 4, "decoded_token": " black"}, "8061": {"logprob": -6.788878917694092, "rank": 5, "decoded_token": " images"}}, {"6122": {"logprob": -0.9653709530830383, "rank": 1, "decoded_token": " shows"}, "51948": {"logprob": -1.4653708934783936, "rank": 2, "decoded_token": " depicts"}, "6971": {"logprob": -1.4653708934783936, "rank": 3, "decoded_token": " features"}, "25981": {"logprob": -2.8403708934783936, "rank": 4, "decoded_token": " displays"}, "8688": {"logprob": -2.8403708934783936, "rank": 5, "decoded_token": " contains"}}, {"1261": {"logprob": -0.003059827256947756, "rank": 1, "decoded_token": " a"}, "1420": {"logprob": -6.2530598640441895, "rank": 2, "decoded_token": " an"}, "2295": {"logprob": -7.8780598640441895, "rank": 3, "decoded_token": " two"}, "2342": {"logprob": -7.8780598640441895, "rank": 4, "decoded_token": " only"}, "1278": {"logprob": -8.628059387207031, "rank": 5, "decoded_token": " the"}}, {"7244": {"logprob": -0.17616479098796844, "rank": 1, "decoded_token": " black"}, "6231": {"logprob": -2.3011648654937744, "rank": 2, "decoded_token": " close"}, "4249": {"logprob": -3.4261648654937744, "rank": 3, "decoded_token": " single"}, "4329": {"logprob": -5.113664627075195, "rank": 4, "decoded_token": " large"}, "10575": {"logprob": -5.176164627075195, "rank": 5, "decoded_token": " dog"}}, {"10575": {"logprob": -0.10940006375312805, "rank": 1, "decoded_token": " dog"}, "116572": {"logprob": -2.4844000339508057, "rank": 2, "decoded_token": " puppy"}, "119075": {"logprob": -4.109400272369385, "rank": 3, "decoded_token": " Labrador"}, "15812": {"logprob": -7.296900272369385, "rank": 4, "decoded_token": " Lab"}, "7990": {"logprob": -7.421900272369385, "rank": 5, "decoded_token": " cat"}}, {"18970": {"logprob": -0.8322296738624573, "rank": 1, "decoded_token": " sitting"}, "1454": {"logprob": -1.5822296142578125, "rank": 2, "decoded_token": " with"}, "28528": {"logprob": -1.9572296142578125, "rank": 3, "decoded_token": " lying"}, "7283": {"logprob": -2.2072296142578125, "rank": 4, "decoded_token": " looking"}, "15866": {"logprob": -3.0197296142578125, "rank": 5, "decoded_token": " standing"}}, {"1408": {"logprob": -0.08769982308149338, "rank": 1, "decoded_token": " on"}, "1321": {"logprob": -3.7126998901367188, "rank": 2, "decoded_token": " and"}, "3675": {"logprob": -3.9626998901367188, "rank": 3, "decoded_token": " against"}, "41132": {"logprob": -4.587699890136719, "rank": 4, "decoded_token": " attent"}, "1454": {"logprob": -5.087699890136719, "rank": 5, "decoded_token": " with"}}, {"1261": {"logprob": -0.5400654673576355, "rank": 1, "decoded_token": " a"}, "32656": {"logprob": -0.9150654673576355, "rank": 2, "decoded_token": " wooden"}, "3977": {"logprob": -5.415065288543701, "rank": 3, "decoded_token": " top"}, "12603": {"logprob": -5.540065288543701, "rank": 4, "decoded_token": " wood"}, "44130": {"logprob": -6.290065288543701, "rank": 5, "decoded_token": " rust"}}, {"32656": {"logprob": -0.02516966126859188, "rank": 1, "decoded_token": " wooden"}, "44130": {"logprob": -4.400169849395752, "rank": 2, "decoded_token": " rust"}, "12603": {"logprob": -5.275169849395752, "rank": 3, "decoded_token": " wood"}, "3403": {"logprob": -5.525169849395752, "rank": 4, "decoded_token": " text"}, "17253": {"logprob": -6.962669849395752, "rank": 5, "decoded_token": " weather"}}, {"4691": {"logprob": -0.7264319658279419, "rank": 1, "decoded_token": " surface"}, "11237": {"logprob": -0.8514319658279419, "rank": 2, "decoded_token": " floor"}, "7042": {"logprob": -2.6014318466186523, "rank": 3, "decoded_token": " background"}, "28984": {"logprob": -5.226431846618652, "rank": 4, "decoded_token": " deck"}, "1615": {"logprob": -5.726431846618652, "rank": 5, "decoded_token": " pl"}}, {"1046": {"logprob": -0.4668232202529907, "rank": 1, "decoded_token": "."}, "1044": {"logprob": -1.9668232202529907, "rank": 2, "decoded_token": ","}, "1321": {"logprob": -2.466823101043701, "rank": 3, "decoded_token": " and"}, "7283": {"logprob": -2.716823101043701, "rank": 4, "decoded_token": " looking"}, "1454": {"logprob": -2.716823101043701, "rank": 5, "decoded_token": " with"}}, {"2": {"logprob": -0.002247072057798505, "rank": 1, "decoded_token": "</s>"}, "1531": {"logprob": -6.627246856689453, "rank": 2, "decoded_token": " The"}, "1032": {"logprob": -7.127246856689453, "rank": 3, "decoded_token": " "}, "3730": {"logprob": -9.877246856689453, "rank": 4, "decoded_token": " There"}, "1256": {"logprob": -11.127246856689453, "rank": 5, "decoded_token": "  "}}]], [[1049, 1046, 1349, 7244, 10575, 1454, 2327, 94766, 32961, 53048, 41132, 3923, 1408, 1261, 32656, 4691, 1626, 1050, 1046, 1349, 15375, 24361, 4521, 1454, 122203, 27469, 94973, 2425, 1261, 16152, 1121, 21283, 1046, 2], "1. A black dog with floppy ears sits attentively on a wooden surface.\n2. A vast mountain range with rugged peaks stretches under a cloudy sky.", [{"1049": {"logprob": -0.42824622988700867, "rank": 1, "decoded_token": "1"}, "1045": {"logprob": -1.553246259689331, "rank": 2, "decoded_token": "-"}, "1065": {"logprob": -2.428246259689331, "rank": 3, "decoded_token": "A"}, "1784": {"logprob": -4.053246021270752, "rank": 4, "decoded_token": "The"}, "69957": {"logprob": -4.428246021270752, "rank": 5, "decoded_token": "Sure"}}, {"1046": {"logprob": -1.9788545614574105e-05, "rank": 1, "decoded_token": "."}, "1058": {"logprob": -11.750020027160645, "rank": 2, "decoded_token": ":"}, "3590": {"logprob": -12.125020027160645, "rank": 3, "decoded_token": ".A"}, "1065": {"logprob": -13.062520027160645, "rank": 4, "decoded_token": "A"}, "1041": {"logprob": -13.750020027160645, "rank": 5, "decoded_token": ")"}}, {"1349": {"logprob": -0.14020134508609772, "rank": 1, "decoded_token": " A"}, "1429": {"logprob": -2.3902013301849365, "rank": 2, "decoded_token": " \""}, "1603": {"logprob": -3.7652013301849365, "rank": 3, "decoded_token": " **"}, "11967": {"logprob": -4.890201568603516, "rank": 4, "decoded_token": " Image"}, "1531": {"logprob": -5.015201568603516, "rank": 5, "decoded_token": " The"}}, {"7244": {"logprob": -0.2003599852323532, "rank": 1, "decoded_token": " black"}, "38462": {"logprob": -3.075360059738159, "rank": 2, "decoded_token": " curious"}, "68076": {"logprob": -3.575360059738159, "rank": 3, "decoded_token": " cute"}, "4329": {"logprob": -3.887860059738159, "rank": 4, "decoded_token": " large"}, "6231": {"logprob": -4.32535982131958, "rank": 5, "decoded_token": " close"}}, {"10575": {"logprob": -0.18818901479244232, "rank": 1, "decoded_token": " dog"}, "116572": {"logprob": -2.0631890296936035, "rank": 2, "decoded_token": " puppy"}, "119075": {"logprob": -3.1881890296936035, "rank": 3, "decoded_token": " Labrador"}, "15812": {"logprob": -6.9381890296936035, "rank": 4, "decoded_token": " Lab"}, "8636": {"logprob": -7.3131890296936035, "rank": 5, "decoded_token": " lab"}}, {"1454": {"logprob": -0.5699259042739868, "rank": 1, "decoded_token": " with"}, "53048": {"logprob": -1.2574259042739868, "rank": 2, "decoded_token": " sits"}, "1395": {"logprob": -3.0699257850646973, "rank": 3, "decoded_token": " is"}, "22524": {"logprob": -3.6324257850646973, "rank": 4, "decoded_token": " lies"}, "18970": {"logprob": -3.7574257850646973, "rank": 5, "decoded_token": " sitting"}}, {"2327": {"logprob": -1.2377738952636719, "rank": 1, "decoded_token": " fl"}, "1261": {"logprob": -1.3627738952636719, "rank": 2, "decoded_token": " a"}, "17300": {"logprob": -1.9252738952636719, "rank": 3, "decoded_token": " soul"}, "100089": {"logprob": -2.675273895263672, "rank": 4, "decoded_token": " expressive"}, "6444": {"logprob": -3.237773895263672, "rank": 5, "decoded_token": " soft"}}, {"94766": {"logprob": -0.0025601964443922043, "rank": 1, "decoded_token": "oppy"}, "124603": {"logprob": -6.315060138702393, "rank": 2, "decoded_token": "uffy"}, "1484": {"logprob": -7.877560138702393, "rank": 3, "decoded_token": "op"}, "24897": {"logprob": -8.81506061553955, "rank": 4, "decoded_token": "appy"}, "102477": {"logprob": -9.69006061553955, "rank": 5, "decoded_token": "opping"}}, {"32961": {"logprob": -5.113947918289341e-05, "rank": 1, "decoded_token": " ears"}, "16962": {"logprob": -11.250051498413086, "rank": 2, "decoded_token": " ear"}, "5731": {"logprob": -11.812551498413086, "rank": 3, "decoded_token": " eyes"}, "3351": {"logprob": -12.000051498413086, "rank": 4, "decoded_token": " years"}, "42071": {"logprob": -13.062551498413086, "rank": 5, "decoded_token": " cheeks"}}, {"53048": {"logprob": -0.6179640889167786, "rank": 1, "decoded_token": " sits"}, "10637": {"logprob": -1.9929640293121338, "rank": 2, "decoded_token": " looks"}, "1321": {"logprob": -2.430464029312134, "rank": 3, "decoded_token": " and"}, "1395": {"logprob": -2.617964029312134, "rank": 4, "decoded_token": " is"}, "18970": {"logprob": -3.055464029312134, "rank": 5, "decoded_token": " sitting"}}, {"41132": {"logprob": -0.3746516704559326, "rank": 1, "decoded_token": " attent"}, "1408": {"logprob": -2.3121516704559326, "rank": 2, "decoded_token": " on"}, "106534": {"logprob": -2.3746516704559326, "rank": 3, "decoded_token": " calmly"}, "12276": {"logprob": -2.6246516704559326, "rank": 4, "decoded_token": " alert"}, "6482": {"logprob": -5.124651908874512, "rank": 5, "decoded_token": " patient"}}, {"3923": {"logprob": -8.463501580990851e-05, "rank": 1, "decoded_token": "ively"}, "1556": {"logprob": -9.50008487701416, "rank": 2, "decoded_token": "ive"}, "6655": {"logprob": -11.87508487701416, "rank": 3, "decoded_token": "atively"}, "3929": {"logprob": -14.00008487701416, "rank": 4, "decoded_token": "ently"}, "47885": {"logprob": -14.62508487701416, "rank": 5, "decoded_token": "edly"}}, {"1408": {"logprob": -0.06439964473247528, "rank": 1, "decoded_token": " on"}, "3675": {"logprob": -3.0643997192382812, "rank": 2, "decoded_token": " against"}, "1294": {"logprob": -4.939399719238281, "rank": 3, "decoded_token": " in"}, "7283": {"logprob": -5.689399719238281, "rank": 4, "decoded_token": " looking"}, "1044": {"logprob": -5.814399719238281, "rank": 5, "decoded_token": ","}}, {"1261": {"logprob": -0.2108541578054428, "rank": 1, "decoded_token": " a"}, "32656": {"logprob": -1.710854172706604, "rank": 2, "decoded_token": " wooden"}, "17253": {"logprob": -5.5858540534973145, "rank": 3, "decoded_token": " weather"}, "44130": {"logprob": -6.0858540534973145, "rank": 4, "decoded_token": " rust"}, "12603": {"logprob": -6.9608540534973145, "rank": 5, "decoded_token": " wood"}}, {"32656": {"logprob": -0.08556432276964188, "rank": 1, "decoded_token": " wooden"}, "44130": {"logprob": -2.710564374923706, "rank": 2, "decoded_token": " rust"}, "17253": {"logprob": -4.710564136505127, "rank": 3, "decoded_token": " weather"}, "12603": {"logprob": -5.960564136505127, "rank": 4, "decoded_token": " wood"}, "3403": {"logprob": -5.960564136505127, "rank": 5, "decoded_token": " text"}}, {"4691": {"logprob": -0.7751782536506653, "rank": 1, "decoded_token": " surface"}, "11237": {"logprob": -0.7751782536506653, "rank": 2, "decoded_token": " floor"}, "7042": {"logprob": -2.9001781940460205, "rank": 3, "decoded_token": " background"}, "28984": {"logprob": -4.1501784324646, "rank": 4, "decoded_token": " deck"}, "92504": {"logprob": -6.1501784324646, "rank": 5, "decoded_token": " backdrop"}}, {"1626": {"logprob": -0.12918435037136078, "rank": 1, "decoded_token": ".\n"}, "1044": {"logprob": -2.3791842460632324, "rank": 2, "decoded_token": ","}, "1046": {"logprob": -4.129184246063232, "rank": 3, "decoded_token": "."}, "1338": {"logprob": -5.129184246063232, "rank": 4, "decoded_token": ".\n\n"}, "7283": {"logprob": -5.629184246063232, "rank": 5, "decoded_token": " looking"}}, {"1050": {"logprob": -0.00017474555352237076, "rank": 1, "decoded_token": "2"}, "1256": {"logprob": -9.000174522399902, "rank": 2, "decoded_token": "  "}, "1032": {"logprob": -10.875174522399902, "rank": 3, "decoded_token": " "}, "1293": {"logprob": -11.625174522399902, "rank": 4, "decoded_token": "   "}, "1051": {"logprob": -12.125174522399902, "rank": 5, "decoded_token": "3"}}, {"1046": {"logprob": -7.629365427419543e-06, "rank": 1, "decoded_token": "."}, "3590": {"logprob": -12.875007629394531, "rank": 2, "decoded_token": ".A"}, "1626": {"logprob": -13.062507629394531, "rank": 3, "decoded_token": ".\n"}, "1338": {"logprob": -14.562507629394531, "rank": 4, "decoded_token": ".\n\n"}, "1058": {"logprob": -14.812507629394531, "rank": 5, "decoded_token": ":"}}, {"1349": {"logprob": -0.558266282081604, "rank": 1, "decoded_token": " A"}, "11826": {"logprob": -1.495766282081604, "rank": 2, "decoded_token": " Maj"}, "37159": {"logprob": -2.2457661628723145, "rank": 3, "decoded_token": " Snow"}, "113465": {"logprob": -3.9957661628723145, "rank": 4, "decoded_token": " Rug"}, "1531": {"logprob": -3.9957661628723145, "rank": 5, "decoded_token": " The"}}, {"15375": {"logprob": -0.6446555852890015, "rank": 1, "decoded_token": " vast"}, "37849": {"logprob": -2.019655704498291, "rank": 2, "decoded_token": " breat"}, "61082": {"logprob": -2.394655704498291, "rank": 3, "decoded_token": " panor"}, "10726": {"logprob": -3.082155704498291, "rank": 4, "decoded_token": " scen"}, "2169": {"logprob": -3.207155704498291, "rank": 5, "decoded_token": " ser"}}, {"24361": {"logprob": -0.7034653425216675, "rank": 1, "decoded_token": " mountain"}, "127945": {"logprob": -1.9534653425216675, "rank": 2, "decoded_token": " mountainous"}, "1044": {"logprob": -2.078465461730957, "rank": 3, "decoded_token": ","}, "4521": {"logprob": -2.328465461730957, "rank": 4, "decoded_token": " range"}, "28035": {"logprob": -2.453465461730957, "rank": 5, "decoded_token": " landscape"}}, {"4521": {"logprob": -0.07058106362819672, "rank": 1, "decoded_token": " range"}, "28035": {"logprob": -2.6955809593200684, "rank": 2, "decoded_token": " landscape"}, "37691": {"logprob": -8.320581436157227, "rank": 3, "decoded_token": " valley"}, "12248": {"logprob": -9.445581436157227, "rank": 4, "decoded_token": " peak"}, "13327": {"logprob": -9.695581436157227, "rank": 5, "decoded_token": " scene"}}, {"1454": {"logprob": -1.1448894739151, "rank": 1, "decoded_token": " with"}, "94973": {"logprob": -1.1448894739151, "rank": 2, "decoded_token": " stretches"}, "2425": {"logprob": -1.8948894739151, "rank": 3, "decoded_token": " under"}, "1395": {"logprob": -2.5198893547058105, "rank": 4, "decoded_token": " is"}, "13875": {"logprob": -3.0198893547058105, "rank": 5, "decoded_token": " covered"}}, {"122203": {"logprob": -1.0288245677947998, "rank": 1, "decoded_token": " rugged"}, "58127": {"logprob": -1.6538245677947998, "rank": 2, "decoded_token": " jag"}, "27469": {"logprob": -2.1538245677948, "rank": 3, "decoded_token": " peaks"}, "23745": {"logprob": -2.6538245677948, "rank": 4, "decoded_token": " snow"}, "95746": {"logprob": -2.8413245677948, "rank": 5, "decoded_token": " rocky"}}, {"27469": {"logprob": -0.20564845204353333, "rank": 1, "decoded_token": " peaks"}, "24765": {"logprob": -2.580648422241211, "rank": 2, "decoded_token": " terrain"}, "130655": {"logprob": -2.955648422241211, "rank": 3, "decoded_token": ""}, "1044": {"logprob": -3.580648422241211, "rank": 4, "decoded_token": ","}, "61263": {"logprob": -4.455648422241211, "rank": 5, "decoded_token": " slopes"}}, {"94973": {"logprob": -1.0839273929595947, "rank": 1, "decoded_token": " stretches"}, "1321": {"logprob": -1.1464273929595947, "rank": 2, "decoded_token": " and"}, "2425": {"logprob": -1.7714273929595947, "rank": 3, "decoded_token": " under"}, "13875": {"logprob": -3.0839273929595947, "rank": 4, "decoded_token": " covered"}, "1395": {"logprob": -3.2714273929595947, "rank": 5, "decoded_token": " is"}}, {"2425": {"logprob": -0.9016233682632446, "rank": 1, "decoded_token": " under"}, "5669": {"logprob": -1.0266233682632446, "rank": 2, "decoded_token": " across"}, "1848": {"logprob": -1.9016233682632446, "rank": 3, "decoded_token": " out"}, "2203": {"logprob": -3.151623249053955, "rank": 4, "decoded_token": " into"}, "8994": {"logprob": -4.026623249053955, "rank": 5, "decoded_token": " towards"}}, {"1261": {"logprob": -0.00555459875613451, "rank": 1, "decoded_token": " a"}, "1420": {"logprob": -5.380554676055908, "rank": 2, "decoded_token": " an"}, "1278": {"logprob": -7.630554676055908, "rank": 3, "decoded_token": " the"}, "2136": {"logprob": -9.31805419921875, "rank": 4, "decoded_token": " over"}, "16152": {"logprob": -9.38055419921875, "rank": 5, "decoded_token": " cloud"}}, {"16152": {"logprob": -0.6862213015556335, "rank": 1, "decoded_token": " cloud"}, "6133": {"logprob": -1.4362213611602783, "rank": 2, "decoded_token": " clear"}, "18416": {"logprob": -2.6862213611602783, "rank": 3, "decoded_token": " haz"}, "27254": {"logprob": -3.0612213611602783, "rank": 4, "decoded_token": " partly"}, "4391": {"logprob": -3.1862213611602783, "rank": 5, "decoded_token": " light"}}, {"1121": {"logprob": -0.10446903109550476, "rank": 1, "decoded_token": "y"}, "4527": {"logprob": -2.854469060897827, "rank": 2, "decoded_token": "less"}, "1286": {"logprob": -3.479469060897827, "rank": 3, "decoded_token": "ed"}, "114525": {"logprob": -5.479468822479248, "rank": 4, "decoded_token": "-covered"}, "77187": {"logprob": -5.479468822479248, "rank": 5, "decoded_token": "-filled"}}, {"21283": {"logprob": -0.003459066851064563, "rank": 1, "decoded_token": " sky"}, "10991": {"logprob": -6.3784589767456055, "rank": 2, "decoded_token": " blue"}, "1044": {"logprob": -6.8784589767456055, "rank": 3, "decoded_token": ","}, "26549": {"logprob": -7.8784589767456055, "rank": 4, "decoded_token": " gray"}, "34052": {"logprob": -8.503458976745605, "rank": 5, "decoded_token": " grey"}}, {"1046": {"logprob": -0.01103890035301447, "rank": 1, "decoded_token": "."}, "1044": {"logprob": -4.636038780212402, "rank": 2, "decoded_token": ","}, "1338": {"logprob": -7.261038780212402, "rank": 3, "decoded_token": ".\n\n"}, "1294": {"logprob": -8.136038780212402, "rank": 4, "decoded_token": " in"}, "1454": {"logprob": -8.761038780212402, "rank": 5, "decoded_token": " with"}}, {"2": {"logprob": -9.059865078597795e-06, "rank": 1, "decoded_token": "</s>"}, "1032": {"logprob": -11.625008583068848, "rank": 2, "decoded_token": " "}, "1256": {"logprob": -16.125009536743164, "rank": 3, "decoded_token": "  "}, "1319": {"logprob": -17.375009536743164, "rank": 4, "decoded_token": " ("}, "1766": {"logprob": -18.750009536743164, "rank": 5, "decoded_token": " ["}}]], [[1049, 1046, 1349, 7244, 10575, 53048, 41132, 3923, 1408, 1261, 32656, 11237, 1626, 1050, 1046, 1349, 15375, 24361, 4521, 94973, 5669, 1278, 48932, 2425, 1261, 16152, 1121, 21283, 1626, 1051, 1046, 8342, 71284, 7377, 1394, 22140, 1294, 1278, 27208, 1513, 97558, 1626, 1052, 1046, 1349, 53301, 59396, 3549, 13335, 2645, 1261, 1295, 3506, 11223, 12097, 1046, 2], "1. A black dog sits attentively on a wooden floor.\n2. A vast mountain range stretches across the horizon under a cloudy sky.\n3. Surfers wait for waves in the ocean at sunset.\n4. A winding gravel path leads through a lush green park.", [{"1049": {"logprob": -0.05001257359981537, "rank": 1, "decoded_token": "1"}, "1045": {"logprob": -3.1750125885009766, "rank": 2, "decoded_token": "-"}, "69957": {"logprob": -5.925012588500977, "rank": 3, "decoded_token": "Sure"}, "11745": {"logprob": -6.425012588500977, "rank": 4, "decoded_token": "Here"}, "1065": {"logprob": -6.425012588500977, "rank": 5, "decoded_token": "A"}}, {"1046": {"logprob": -9.536697689327411e-06, "rank": 1, "decoded_token": "."}, "1058": {"logprob": -11.875009536743164, "rank": 2, "decoded_token": ":"}, "3590": {"logprob": -13.375009536743164, "rank": 3, "decoded_token": ".A"}, "1041": {"logprob": -14.750009536743164, "rank": 4, "decoded_token": ")"}, "1065": {"logprob": -15.687509536743164, "rank": 5, "decoded_token": "A"}}, {"1349": {"logprob": -0.12580634653568268, "rank": 1, "decoded_token": " A"}, "1429": {"logprob": -2.3758063316345215, "rank": 2, "decoded_token": " \""}, "1531": {"logprob": -4.6258063316345215, "rank": 3, "decoded_token": " The"}, "11967": {"logprob": -4.6258063316345215, "rank": 4, "decoded_token": " Image"}, "1603": {"logprob": -5.6258063316345215, "rank": 5, "decoded_token": " **"}}, {"7244": {"logprob": -0.15412142872810364, "rank": 1, "decoded_token": " black"}, "68076": {"logprob": -3.3416213989257812, "rank": 2, "decoded_token": " cute"}, "6231": {"logprob": -3.9666213989257812, "rank": 3, "decoded_token": " close"}, "38462": {"logprob": -4.216621398925781, "rank": 4, "decoded_token": " curious"}, "4329": {"logprob": -4.404121398925781, "rank": 5, "decoded_token": " large"}}, {"10575": {"logprob": -0.12086891382932663, "rank": 1, "decoded_token": " dog"}, "116572": {"logprob": -2.3708689212799072, "rank": 2, "decoded_token": " puppy"}, "119075": {"logprob": -3.9958689212799072, "rank": 3, "decoded_token": " Labrador"}, "15812": {"logprob": -7.683368682861328, "rank": 4, "decoded_token": " Lab"}, "8636": {"logprob": -7.808368682861328, "rank": 5, "decoded_token": " lab"}}, {"53048": {"logprob": -0.8729249238967896, "rank": 1, "decoded_token": " sits"}, "1454": {"logprob": -1.1229249238967896, "rank": 2, "decoded_token": " with"}, "1395": {"logprob": -2.4354248046875, "rank": 3, "decoded_token": " is"}, "18970": {"logprob": -2.6854248046875, "rank": 4, "decoded_token": " sitting"}, "22524": {"logprob": -3.6854248046875, "rank": 5, "decoded_token": " lies"}}, {"41132": {"logprob": -0.5888903737068176, "rank": 1, "decoded_token": " attent"}, "106534": {"logprob": -1.2763903141021729, "rank": 2, "decoded_token": " calmly"}, "12276": {"logprob": -2.838890314102173, "rank": 3, "decoded_token": " alert"}, "1408": {"logprob": -2.901390314102173, "rank": 4, "decoded_token": " on"}, "6482": {"logprob": -5.026390552520752, "rank": 5, "decoded_token": " patient"}}, {"3923": {"logprob": -9.16677454370074e-05, "rank": 1, "decoded_token": "ively"}, "1556": {"logprob": -9.625091552734375, "rank": 2, "decoded_token": "ive"}, "6655": {"logprob": -10.875091552734375, "rank": 3, "decoded_token": "atively"}, "3929": {"logprob": -13.125091552734375, "rank": 4, "decoded_token": "ently"}, "47885": {"logprob": -13.750091552734375, "rank": 5, "decoded_token": "edly"}}, {"1408": {"logprob": -0.052677519619464874, "rank": 1, "decoded_token": " on"}, "3675": {"logprob": -3.802677631378174, "rank": 2, "decoded_token": " against"}, "1454": {"logprob": -4.302677631378174, "rank": 3, "decoded_token": " with"}, "1294": {"logprob": -5.177677631378174, "rank": 4, "decoded_token": " in"}, "7283": {"logprob": -5.427677631378174, "rank": 5, "decoded_token": " looking"}}, {"1261": {"logprob": -0.36706605553627014, "rank": 1, "decoded_token": " a"}, "32656": {"logprob": -1.2420660257339478, "rank": 2, "decoded_token": " wooden"}, "17253": {"logprob": -4.617065906524658, "rank": 3, "decoded_token": " weather"}, "44130": {"logprob": -5.742065906524658, "rank": 4, "decoded_token": " rust"}, "12603": {"logprob": -6.617065906524658, "rank": 5, "decoded_token": " wood"}}, {"32656": {"logprob": -0.07824385166168213, "rank": 1, "decoded_token": " wooden"}, "44130": {"logprob": -2.8282437324523926, "rank": 2, "decoded_token": " rust"}, "17253": {"logprob": -4.703243732452393, "rank": 3, "decoded_token": " weather"}, "12603": {"logprob": -5.828243732452393, "rank": 4, "decoded_token": " wood"}, "3403": {"logprob": -5.953243732452393, "rank": 5, "decoded_token": " text"}}, {"11237": {"logprob": -0.5853750705718994, "rank": 1, "decoded_token": " floor"}, "4691": {"logprob": -1.0853750705718994, "rank": 2, "decoded_token": " surface"}, "7042": {"logprob": -2.7103750705718994, "rank": 3, "decoded_token": " background"}, "28984": {"logprob": -3.5853750705718994, "rank": 4, "decoded_token": " deck"}, "92504": {"logprob": -6.08537483215332, "rank": 5, "decoded_token": " backdrop"}}, {"1626": {"logprob": -0.7340722680091858, "rank": 1, "decoded_token": ".\n"}, "1044": {"logprob": -0.8590722680091858, "rank": 2, "decoded_token": ","}, "1454": {"logprob": -3.359072208404541, "rank": 3, "decoded_token": " with"}, "7283": {"logprob": -3.609072208404541, "rank": 4, "decoded_token": " looking"}, "1321": {"logprob": -4.109072208404541, "rank": 5, "decoded_token": " and"}}, {"1050": {"logprob": -1.1324817933200393e-05, "rank": 1, "decoded_token": "2"}, "1051": {"logprob": -11.625011444091797, "rank": 2, "decoded_token": "3"}, "1256": {"logprob": -14.000011444091797, "rank": 3, "decoded_token": "  "}, "1049": {"logprob": -14.625011444091797, "rank": 4, "decoded_token": "1"}, "1032": {"logprob": -14.625011444091797, "rank": 5, "decoded_token": " "}}, {"1046": {"logprob": -2.50339189733495e-06, "rank": 1, "decoded_token": "."}, "3590": {"logprob": -13.56250286102295, "rank": 2, "decoded_token": ".A"}, "1626": {"logprob": -15.43750286102295, "rank": 3, "decoded_token": ".\n"}, "4700": {"logprob": -15.50000286102295, "rank": 4, "decoded_token": ".M"}, "3051": {"logprob": -16.000001907348633, "rank": 5, "decoded_token": ".S"}}, {"1349": {"logprob": -0.6769706010818481, "rank": 1, "decoded_token": " A"}, "11826": {"logprob": -1.9269706010818481, "rank": 2, "decoded_token": " Maj"}, "37159": {"logprob": -2.1144704818725586, "rank": 3, "decoded_token": " Snow"}, "27260": {"logprob": -2.6144704818725586, "rank": 4, "decoded_token": " Mountain"}, "113465": {"logprob": -2.8644704818725586, "rank": 5, "decoded_token": " Rug"}}, {"15375": {"logprob": -0.9251430034637451, "rank": 1, "decoded_token": " vast"}, "10726": {"logprob": -2.300143003463745, "rank": 2, "decoded_token": " scen"}, "4521": {"logprob": -2.362643003463745, "rank": 3, "decoded_token": " range"}, "122203": {"logprob": -2.425143003463745, "rank": 4, "decoded_token": " rugged"}, "61082": {"logprob": -2.800143003463745, "rank": 5, "decoded_token": " panor"}}, {"24361": {"logprob": -0.5277582406997681, "rank": 1, "decoded_token": " mountain"}, "127945": {"logprob": -1.902758240699768, "rank": 2, "decoded_token": " mountainous"}, "28035": {"logprob": -2.5277581214904785, "rank": 3, "decoded_token": " landscape"}, "4521": {"logprob": -2.5277581214904785, "rank": 4, "decoded_token": " range"}, "1044": {"logprob": -2.7777581214904785, "rank": 5, "decoded_token": ","}}, {"4521": {"logprob": -0.055658817291259766, "rank": 1, "decoded_token": " range"}, "28035": {"logprob": -2.9306588172912598, "rank": 2, "decoded_token": " landscape"}, "37691": {"logprob": -8.430658340454102, "rank": 3, "decoded_token": " valley"}, "13327": {"logprob": -9.055658340454102, "rank": 4, "decoded_token": " scene"}, "3719": {"logprob": -9.805658340454102, "rank": 5, "decoded_token": " view"}}, {"94973": {"logprob": -0.6880245208740234, "rank": 1, "decoded_token": " stretches"}, "2425": {"logprob": -1.7505245208740234, "rank": 2, "decoded_token": " under"}, "1395": {"logprob": -2.3130245208740234, "rank": 3, "decoded_token": " is"}, "1454": {"logprob": -2.6880245208740234, "rank": 4, "decoded_token": " with"}, "7038": {"logprob": -3.2505245208740234, "rank": 5, "decoded_token": " extends"}}, {"5669": {"logprob": -0.4545598328113556, "rank": 1, "decoded_token": " across"}, "2425": {"logprob": -1.4545598030090332, "rank": 2, "decoded_token": " under"}, "1848": {"logprob": -2.454559803009033, "rank": 3, "decoded_token": " out"}, "2203": {"logprob": -4.204559803009033, "rank": 4, "decoded_token": " into"}, "25136": {"logprob": -4.642059803009033, "rank": 5, "decoded_token": " beneath"}}, {"1278": {"logprob": -0.23015151917934418, "rank": 1, "decoded_token": " the"}, "1261": {"logprob": -1.6051515340805054, "rank": 2, "decoded_token": " a"}, "1420": {"logprob": -5.605151653289795, "rank": 3, "decoded_token": " an"}, "2425": {"logprob": -7.167651653289795, "rank": 4, "decoded_token": " under"}, "1454": {"logprob": -10.167651176452637, "rank": 5, "decoded_token": " with"}}, {"48932": {"logprob": -0.2797861397266388, "rank": 1, "decoded_token": " horizon"}, "21283": {"logprob": -2.0297861099243164, "rank": 2, "decoded_token": " sky"}, "3937": {"logprob": -3.2797861099243164, "rank": 3, "decoded_token": " image"}, "28035": {"logprob": -3.6547861099243164, "rank": 4, "decoded_token": " landscape"}, "3044": {"logprob": -3.7797861099243164, "rank": 5, "decoded_token": " sk"}}, {"2425": {"logprob": -0.28862035274505615, "rank": 1, "decoded_token": " under"}, "1044": {"logprob": -2.4136204719543457, "rank": 2, "decoded_token": ","}, "1454": {"logprob": -2.5386204719543457, "rank": 3, "decoded_token": " with"}, "1626": {"logprob": -3.7886204719543457, "rank": 4, "decoded_token": ".\n"}, "1408": {"logprob": -3.9136204719543457, "rank": 5, "decoded_token": " on"}}, {"1261": {"logprob": -0.04524127021431923, "rank": 1, "decoded_token": " a"}, "16152": {"logprob": -4.045241355895996, "rank": 2, "decoded_token": " cloud"}, "1420": {"logprob": -4.045241355895996, "rank": 3, "decoded_token": " an"}, "2136": {"logprob": -6.107741355895996, "rank": 4, "decoded_token": " over"}, "6133": {"logprob": -6.357741355895996, "rank": 5, "decoded_token": " clear"}}, {"16152": {"logprob": -0.19613930583000183, "rank": 1, "decoded_token": " cloud"}, "6133": {"logprob": -2.883639335632324, "rank": 2, "decoded_token": " clear"}, "27254": {"logprob": -3.508639335632324, "rank": 3, "decoded_token": " partly"}, "18416": {"logprob": -3.883639335632324, "rank": 4, "decoded_token": " haz"}, "4391": {"logprob": -4.321139335632324, "rank": 5, "decoded_token": " light"}}, {"1121": {"logprob": -0.05146069824695587, "rank": 1, "decoded_token": "y"}, "1286": {"logprob": -3.8014607429504395, "rank": 2, "decoded_token": "ed"}, "77187": {"logprob": -4.5514607429504395, "rank": 3, "decoded_token": "-filled"}, "114525": {"logprob": -4.9264607429504395, "rank": 4, "decoded_token": "-covered"}, "4527": {"logprob": -4.9264607429504395, "rank": 5, "decoded_token": "less"}}, {"21283": {"logprob": -0.00033122775494121015, "rank": 1, "decoded_token": " sky"}, "10991": {"logprob": -8.875330924987793, "rank": 2, "decoded_token": " blue"}, "1044": {"logprob": -9.500330924987793, "rank": 3, "decoded_token": ","}, "26549": {"logprob": -10.500330924987793, "rank": 4, "decoded_token": " gray"}, "34052": {"logprob": -11.375330924987793, "rank": 5, "decoded_token": " grey"}}, {"1626": {"logprob": -0.00012683063687290996, "rank": 1, "decoded_token": ".\n"}, "1044": {"logprob": -9.500126838684082, "rank": 2, "decoded_token": ","}, "1046": {"logprob": -10.500126838684082, "rank": 3, "decoded_token": "."}, "1454": {"logprob": -10.875126838684082, "rank": 4, "decoded_token": " with"}, "1294": {"logprob": -13.375126838684082, "rank": 5, "decoded_token": " in"}}, {"1051": {"logprob": -3.2186455882765586e-06, "rank": 1, "decoded_token": "3"}, "1052": {"logprob": -12.75000286102295, "rank": 2, "decoded_token": "4"}, "1050": {"logprob": -15.00000286102295, "rank": 3, "decoded_token": "2"}, "1049": {"logprob": -17.000003814697266, "rank": 4, "decoded_token": "1"}, "1032": {"logprob": -17.937503814697266, "rank": 5, "decoded_token": " "}}, {"1046": {"logprob": -1.9073468138230965e-06, "rank": 1, "decoded_token": "."}, "3590": {"logprob": -14.625001907348633, "rank": 2, "decoded_token": ".A"}, "5226": {"logprob": -15.625001907348633, "rank": 3, "decoded_token": ".D"}, "6847": {"logprob": -15.750001907348633, "rank": 4, "decoded_token": ".T"}, "4700": {"logprob": -16.750001907348633, "rank": 5, "decoded_token": ".M"}}, {"8342": {"logprob": -0.5928499102592468, "rank": 1, "decoded_token": " Sur"}, "1349": {"logprob": -1.6553499698638916, "rank": 2, "decoded_token": " A"}, "22468": {"logprob": -2.5303499698638916, "rank": 3, "decoded_token": " Several"}, "1488": {"logprob": -2.7178499698638916, "rank": 4, "decoded_token": " W"}, "15035": {"logprob": -3.2178499698638916, "rank": 5, "decoded_token": " People"}}, {"71284": {"logprob": -0.003268140833824873, "rank": 1, "decoded_token": "fers"}, "1102": {"logprob": -5.878268241882324, "rank": 2, "decoded_token": "f"}, "1726": {"logprob": -7.753268241882324, "rank": 3, "decoded_token": "fer"}, "61888": {"logprob": -12.315768241882324, "rank": 4, "decoded_token": "fline"}, "2119": {"logprob": -13.065768241882324, "rank": 5, "decoded_token": "fter"}}, {"7377": {"logprob": -1.4883846044540405, "rank": 1, "decoded_token": " wait"}, "1584": {"logprob": -1.7383846044540405, "rank": 2, "decoded_token": " are"}, "88014": {"logprob": -1.9258846044540405, "rank": 3, "decoded_token": " paddle"}, "1294": {"logprob": -1.9258846044540405, "rank": 4, "decoded_token": " in"}, "24434": {"logprob": -2.23838472366333, "rank": 5, "decoded_token": " ride"}}, {"1394": {"logprob": -0.6120346188545227, "rank": 1, "decoded_token": " for"}, "1294": {"logprob": -0.9870346188545227, "rank": 2, "decoded_token": " in"}, "1408": {"logprob": -2.737034559249878, "rank": 3, "decoded_token": " on"}, "6482": {"logprob": -4.487034797668457, "rank": 4, "decoded_token": " patient"}, "1321": {"logprob": -5.612034797668457, "rank": 5, "decoded_token": " and"}}, {"22140": {"logprob": -0.008224429562687874, "rank": 1, "decoded_token": " waves"}, "1278": {"logprob": -5.5082244873046875, "rank": 2, "decoded_token": " the"}, "1261": {"logprob": -5.6332244873046875, "rank": 3, "decoded_token": " a"}, "39460": {"logprob": -8.133224487304688, "rank": 4, "decoded_token": " incoming"}, "1321": {"logprob": -9.758224487304688, "rank": 5, "decoded_token": " and"}}, {"1294": {"logprob": -0.3204176723957062, "rank": 1, "decoded_token": " in"}, "1408": {"logprob": -2.195417642593384, "rank": 2, "decoded_token": " on"}, "1513": {"logprob": -2.320417642593384, "rank": 3, "decoded_token": " at"}, "3016": {"logprob": -3.695417642593384, "rank": 4, "decoded_token": " while"}, "1435": {"logprob": -3.820417642593384, "rank": 5, "decoded_token": " as"}}, {"1278": {"logprob": -0.004615250043570995, "rank": 1, "decoded_token": " the"}, "1261": {"logprob": -6.192115306854248, "rank": 2, "decoded_token": " a"}, "1420": {"logprob": -6.942115306854248, "rank": 3, "decoded_token": " an"}, "40466": {"logprob": -7.317115306854248, "rank": 4, "decoded_token": " shallow"}, "26517": {"logprob": -7.879615306854248, "rank": 5, "decoded_token": " calm"}}, {"27208": {"logprob": -0.06491076946258545, "rank": 1, "decoded_token": " ocean"}, "7786": {"logprob": -3.439910888671875, "rank": 2, "decoded_token": " distance"}, "5124": {"logprob": -5.314910888671875, "rank": 3, "decoded_token": " early"}, "26517": {"logprob": -5.377410888671875, "rank": 4, "decoded_token": " calm"}, "11196": {"logprob": -5.377410888671875, "rank": 5, "decoded_token": " sea"}}, {"1513": {"logprob": -1.144903540611267, "rank": 1, "decoded_token": " at"}, "1435": {"logprob": -1.269903540611267, "rank": 2, "decoded_token": " as"}, "3184": {"logprob": -1.394903540611267, "rank": 3, "decoded_token": " during"}, "3016": {"logprob": -3.0199036598205566, "rank": 4, "decoded_token": " while"}, "6117": {"logprob": -3.1449036598205566, "rank": 5, "decoded_token": " near"}}, {"97558": {"logprob": -0.12556149065494537, "rank": 1, "decoded_token": " sunset"}, "11729": {"logprob": -2.875561475753784, "rank": 2, "decoded_token": " sun"}, "1266": {"logprob": -3.375561475753784, "rank": 3, "decoded_token": " d"}, "54507": {"logprob": -4.000561714172363, "rank": 4, "decoded_token": " dawn"}, "1261": {"logprob": -5.125561714172363, "rank": 5, "decoded_token": " a"}}, {"1626": {"logprob": -0.26737067103385925, "rank": 1, "decoded_token": ".\n"}, "1044": {"logprob": -2.2673707008361816, "rank": 2, "decoded_token": ","}, "3016": {"logprob": -2.7673707008361816, "rank": 3, "decoded_token": " while"}, "1454": {"logprob": -3.5173707008361816, "rank": 4, "decoded_token": " with"}, "6117": {"logprob": -4.142370700836182, "rank": 5, "decoded_token": " near"}}, {"1052": {"logprob": -2.9802276912960224e-06, "rank": 1, "decoded_token": "4"}, "1051": {"logprob": -13.37500286102295, "rank": 2, "decoded_token": "3"}, "1049": {"logprob": -14.00000286102295, "rank": 3, "decoded_token": "1"}, "1053": {"logprob": -14.56250286102295, "rank": 4, "decoded_token": "5"}, "1032": {"logprob": -16.750003814697266, "rank": 5, "decoded_token": " "}}, {"1046": {"logprob": -1.6689286894688848e-06, "rank": 1, "decoded_token": "."}, "3590": {"logprob": -13.500001907348633, "rank": 2, "decoded_token": ".A"}, "6847": {"logprob": -16.562501907348633, "rank": 3, "decoded_token": ".T"}, "1044": {"logprob": -17.312501907348633, "rank": 4, "decoded_token": ","}, "1349": {"logprob": -17.500001907348633, "rank": 5, "decoded_token": " A"}}, {"1349": {"logprob": -0.004883386194705963, "rank": 1, "decoded_token": " A"}, "2048": {"logprob": -5.504883289337158, "rank": 2, "decoded_token": " An"}, "10638": {"logprob": -7.754883289337158, "rank": 3, "decoded_token": " Two"}, "111463": {"logprob": -9.754883766174316, "rank": 4, "decoded_token": " Trees"}, "1531": {"logprob": -10.692383766174316, "rank": 5, "decoded_token": " The"}}, {"53301": {"logprob": -1.5612412691116333, "rank": 1, "decoded_token": " winding"}, "15192": {"logprob": -1.7487412691116333, "rank": 2, "decoded_token": " narrow"}, "47945": {"logprob": -2.1237411499023438, "rank": 3, "decoded_token": " dirt"}, "2169": {"logprob": -2.5612411499023438, "rank": 4, "decoded_token": " ser"}, "59396": {"logprob": -2.6862411499023438, "rank": 5, "decoded_token": " gravel"}}, {"59396": {"logprob": -0.9024254083633423, "rank": 1, "decoded_token": " gravel"}, "3549": {"logprob": -1.1524254083633423, "rank": 2, "decoded_token": " path"}, "47945": {"logprob": -1.6524254083633423, "rank": 3, "decoded_token": " dirt"}, "14801": {"logprob": -3.1524252891540527, "rank": 4, "decoded_token": " pathway"}, "15551": {"logprob": -4.277425289154053, "rank": 5, "decoded_token": " stone"}}, {"3549": {"logprob": -0.021290099248290062, "rank": 1, "decoded_token": " path"}, "14801": {"logprob": -3.8962900638580322, "rank": 2, "decoded_token": " pathway"}, "33659": {"logprob": -7.896290302276611, "rank": 3, "decoded_token": " trail"}, "9480": {"logprob": -9.521289825439453, "rank": 4, "decoded_token": " road"}, "7368": {"logprob": -9.646289825439453, "rank": 5, "decoded_token": "path"}}, {"13335": {"logprob": -0.16593234241008759, "rank": 1, "decoded_token": " leads"}, "39985": {"logprob": -2.8534324169158936, "rank": 2, "decoded_token": " cuts"}, "1639": {"logprob": -3.9784324169158936, "rank": 3, "decoded_token": " me"}, "11500": {"logprob": -4.1034321784973145, "rank": 4, "decoded_token": " runs"}, "2645": {"logprob": -4.2909321784973145, "rank": 5, "decoded_token": " through"}}, {"2645": {"logprob": -0.05767015367746353, "rank": 1, "decoded_token": " through"}, "8994": {"logprob": -4.0576701164245605, "rank": 2, "decoded_token": " towards"}, "2396": {"logprob": -4.1826701164245605, "rank": 3, "decoded_token": " between"}, "2203": {"logprob": -4.5576701164245605, "rank": 4, "decoded_token": " into"}, "1317": {"logprob": -5.5576701164245605, "rank": 5, "decoded_token": " to"}}, {"1261": {"logprob": -0.017209367826581, "rank": 1, "decoded_token": " a"}, "11223": {"logprob": -4.892209529876709, "rank": 2, "decoded_token": " green"}, "1295": {"logprob": -5.017209529876709, "rank": 3, "decoded_token": " l"}, "23170": {"logprob": -6.767209529876709, "rank": 4, "decoded_token": " grass"}, "1420": {"logprob": -7.267209529876709, "rank": 5, "decoded_token": " an"}}, {"1295": {"logprob": -0.9430665969848633, "rank": 1, "decoded_token": " l"}, "11223": {"logprob": -1.3180665969848633, "rank": 2, "decoded_token": " green"}, "23170": {"logprob": -1.9430665969848633, "rank": 3, "decoded_token": " grass"}, "12097": {"logprob": -2.4430665969848633, "rank": 4, "decoded_token": " park"}, "26428": {"logprob": -3.3180665969848633, "rank": 5, "decoded_token": " garden"}}, {"3506": {"logprob": -6.556489552167477e-06, "rank": 1, "decoded_token": "ush"}, "1374": {"logprob": -12.000006675720215, "rank": 2, "decoded_token": "us"}, "90716": {"logprob": -15.625006675720215, "rank": 3, "decoded_token": "USH"}, "16938": {"logprob": -15.875006675720215, "rank": 4, "decoded_token": "usher"}, "13326": {"logprob": -17.1875057220459, "rank": 5, "decoded_token": "inden"}}, {"11223": {"logprob": -0.36697858572006226, "rank": 1, "decoded_token": " green"}, "1044": {"logprob": -1.366978645324707, "rank": 2, "decoded_token": ","}, "26428": {"logprob": -3.491978645324707, "rank": 3, "decoded_token": " garden"}, "12097": {"logprob": -4.116978645324707, "rank": 4, "decoded_token": " park"}, "23170": {"logprob": -5.866978645324707, "rank": 5, "decoded_token": " grass"}}, {"12097": {"logprob": -0.5570574402809143, "rank": 1, "decoded_token": " park"}, "3727": {"logprob": -1.9320573806762695, "rank": 2, "decoded_token": " field"}, "28035": {"logprob": -2.1820573806762695, "rank": 3, "decoded_token": " landscape"}, "26428": {"logprob": -2.4320573806762695, "rank": 4, "decoded_token": " garden"}, "4457": {"logprob": -2.8070573806762695, "rank": 5, "decoded_token": " area"}}, {"1046": {"logprob": -0.7940837144851685, "rank": 1, "decoded_token": "."}, "1454": {"logprob": -1.2940837144851685, "rank": 2, "decoded_token": " with"}, "8994": {"logprob": -2.794083595275879, "rank": 3, "decoded_token": " towards"}, "54410": {"logprob": -3.544083595275879, "rank": 4, "decoded_token": " lined"}, "2425": {"logprob": -3.544083595275879, "rank": 5, "decoded_token": " under"}}, {"2": {"logprob": -2.145764938177308e-06, "rank": 1, "decoded_token": "</s>"}, "1032": {"logprob": -13.125001907348633, "rank": 2, "decoded_token": " "}, "1256": {"logprob": -16.000001907348633, "rank": 3, "decoded_token": "  "}, "1293": {"logprob": -18.750001907348633, "rank": 4, "decoded_token": "   "}, "1319": {"logprob": -19.687501907348633, "rank": 5, "decoded_token": " ("}}]]]
\ No newline at end of file
diff --git a/tests/models/fixtures/pixtral_chat_engine.json b/tests/models/fixtures/pixtral_chat_engine.json
new file mode 100644
index 0000000000000..60e4ae6cebf59
--- /dev/null
+++ b/tests/models/fixtures/pixtral_chat_engine.json
@@ -0,0 +1 @@
+[[[1784, 3937, 6122, 1261, 7244, 10575, 18970, 1408, 1261, 32656, 4691, 1046, 2], "The image shows a black dog sitting on a wooden surface.", [{"1784": {"logprob": -0.11685245484113693, "rank": 1, "decoded_token": "The"}, "4380": {"logprob": -2.3668525218963623, "rank": 2, "decoded_token": "This"}, "1049": {"logprob": -4.741852283477783, "rank": 3, "decoded_token": "1"}, "117991": {"logprob": -5.991852283477783, "rank": 4, "decoded_token": "Certain"}, "1785": {"logprob": -5.991852283477783, "rank": 5, "decoded_token": "In"}}, {"3937": {"logprob": -0.2591013014316559, "rank": 1, "decoded_token": " image"}, "2158": {"logprob": -1.5091012716293335, "rank": 2, "decoded_token": " first"}, "3977": {"logprob": -5.884101390838623, "rank": 3, "decoded_token": " top"}, "7244": {"logprob": -6.259101390838623, "rank": 4, "decoded_token": " black"}, "8061": {"logprob": -6.759101390838623, "rank": 5, "decoded_token": " images"}}, {"6122": {"logprob": -0.9660423994064331, "rank": 1, "decoded_token": " shows"}, "51948": {"logprob": -1.466042399406433, "rank": 2, "decoded_token": " depicts"}, "6971": {"logprob": -1.466042399406433, "rank": 3, "decoded_token": " features"}, "25981": {"logprob": -2.8410425186157227, "rank": 4, "decoded_token": " displays"}, "8688": {"logprob": -2.8410425186157227, "rank": 5, "decoded_token": " contains"}}, {"1261": {"logprob": -0.0030613720882683992, "rank": 1, "decoded_token": " a"}, "1420": {"logprob": -6.253061294555664, "rank": 2, "decoded_token": " an"}, "2295": {"logprob": -7.878061294555664, "rank": 3, "decoded_token": " two"}, "2342": {"logprob": -7.878061294555664, "rank": 4, "decoded_token": " only"}, "1278": {"logprob": -8.628061294555664, "rank": 5, "decoded_token": " the"}}, {"7244": {"logprob": -0.17649099230766296, "rank": 1, "decoded_token": " black"}, "6231": {"logprob": -2.3014910221099854, "rank": 2, "decoded_token": " close"}, "4249": {"logprob": -3.4264910221099854, "rank": 3, "decoded_token": " single"}, "4329": {"logprob": -5.113990783691406, "rank": 4, "decoded_token": " large"}, "10575": {"logprob": -5.176490783691406, "rank": 5, "decoded_token": " dog"}}, {"10575": {"logprob": -0.10929587483406067, "rank": 1, "decoded_token": " dog"}, "116572": {"logprob": -2.4842958450317383, "rank": 2, "decoded_token": " puppy"}, "119075": {"logprob": -4.109295845031738, "rank": 3, "decoded_token": " Labrador"}, "15812": {"logprob": -7.296795845031738, "rank": 4, "decoded_token": " Lab"}, "7990": {"logprob": -7.484295845031738, "rank": 5, "decoded_token": " cat"}}, {"18970": {"logprob": -0.830376148223877, "rank": 1, "decoded_token": " sitting"}, "1454": {"logprob": -1.580376148223877, "rank": 2, "decoded_token": " with"}, "28528": {"logprob": -1.955376148223877, "rank": 3, "decoded_token": " lying"}, "7283": {"logprob": -2.205376148223877, "rank": 4, "decoded_token": " looking"}, "15866": {"logprob": -3.017876148223877, "rank": 5, "decoded_token": " standing"}}, {"1408": {"logprob": -0.08554735779762268, "rank": 1, "decoded_token": " on"}, "1321": {"logprob": -3.71054744720459, "rank": 2, "decoded_token": " and"}, "3675": {"logprob": -3.96054744720459, "rank": 3, "decoded_token": " against"}, "41132": {"logprob": -4.71054744720459, "rank": 4, "decoded_token": " attent"}, "1454": {"logprob": -5.08554744720459, "rank": 5, "decoded_token": " with"}}, {"1261": {"logprob": -0.540847897529602, "rank": 1, "decoded_token": " a"}, "32656": {"logprob": -0.915847897529602, "rank": 2, "decoded_token": " wooden"}, "12603": {"logprob": -5.4158477783203125, "rank": 3, "decoded_token": " wood"}, "3977": {"logprob": -5.4158477783203125, "rank": 4, "decoded_token": " top"}, "17253": {"logprob": -6.2908477783203125, "rank": 5, "decoded_token": " weather"}}, {"32656": {"logprob": -0.025753861293196678, "rank": 1, "decoded_token": " wooden"}, "44130": {"logprob": -4.400753974914551, "rank": 2, "decoded_token": " rust"}, "12603": {"logprob": -5.275753974914551, "rank": 3, "decoded_token": " wood"}, "3403": {"logprob": -5.400753974914551, "rank": 4, "decoded_token": " text"}, "17253": {"logprob": -6.963253974914551, "rank": 5, "decoded_token": " weather"}}, {"4691": {"logprob": -0.7265751957893372, "rank": 1, "decoded_token": " surface"}, "11237": {"logprob": -0.8515751957893372, "rank": 2, "decoded_token": " floor"}, "7042": {"logprob": -2.6015751361846924, "rank": 3, "decoded_token": " background"}, "28984": {"logprob": -5.2265753746032715, "rank": 4, "decoded_token": " deck"}, "1615": {"logprob": -5.7265753746032715, "rank": 5, "decoded_token": " pl"}}, {"1046": {"logprob": -0.4868825674057007, "rank": 1, "decoded_token": "."}, "1044": {"logprob": -1.9868825674057007, "rank": 2, "decoded_token": ","}, "1321": {"logprob": -2.3618826866149902, "rank": 3, "decoded_token": " and"}, "1454": {"logprob": -2.6118826866149902, "rank": 4, "decoded_token": " with"}, "7283": {"logprob": -2.7368826866149902, "rank": 5, "decoded_token": " looking"}}, {"2": {"logprob": -0.0026643513701856136, "rank": 1, "decoded_token": "</s>"}, "1531": {"logprob": -6.502664566040039, "rank": 2, "decoded_token": " The"}, "1032": {"logprob": -6.877664566040039, "rank": 3, "decoded_token": " "}, "3730": {"logprob": -9.752664566040039, "rank": 4, "decoded_token": " There"}, "1256": {"logprob": -11.002664566040039, "rank": 5, "decoded_token": "  "}}]], [[1049, 1046, 1349, 7244, 10575, 1454, 2327, 94766, 32961, 53048, 41132, 3923, 1408, 1261, 32656, 4691, 1626, 1050, 1046, 1349, 15375, 24361, 4521, 94973, 5669, 1278, 48932, 2425, 1261, 16152, 1121, 21283, 1046, 2], "1. A black dog with floppy ears sits attentively on a wooden surface.\n2. A vast mountain range stretches across the horizon under a cloudy sky.", [{"1049": {"logprob": -0.42824622988700867, "rank": 1, "decoded_token": "1"}, "1045": {"logprob": -1.553246259689331, "rank": 2, "decoded_token": "-"}, "1065": {"logprob": -2.428246259689331, "rank": 3, "decoded_token": "A"}, "1784": {"logprob": -4.053246021270752, "rank": 4, "decoded_token": "The"}, "69957": {"logprob": -4.428246021270752, "rank": 5, "decoded_token": "Sure"}}, {"1046": {"logprob": -1.811964830267243e-05, "rank": 1, "decoded_token": "."}, "1058": {"logprob": -11.875018119812012, "rank": 2, "decoded_token": ":"}, "3590": {"logprob": -12.250018119812012, "rank": 3, "decoded_token": ".A"}, "1065": {"logprob": -13.062518119812012, "rank": 4, "decoded_token": "A"}, "1041": {"logprob": -13.750018119812012, "rank": 5, "decoded_token": ")"}}, {"1349": {"logprob": -0.13647246360778809, "rank": 1, "decoded_token": " A"}, "1429": {"logprob": -2.386472463607788, "rank": 2, "decoded_token": " \""}, "1603": {"logprob": -3.886472463607788, "rank": 3, "decoded_token": " **"}, "11967": {"logprob": -5.011472702026367, "rank": 4, "decoded_token": " Image"}, "1531": {"logprob": -5.011472702026367, "rank": 5, "decoded_token": " The"}}, {"7244": {"logprob": -0.18561004102230072, "rank": 1, "decoded_token": " black"}, "38462": {"logprob": -3.185610055923462, "rank": 2, "decoded_token": " curious"}, "68076": {"logprob": -3.623110055923462, "rank": 3, "decoded_token": " cute"}, "4329": {"logprob": -3.935610055923462, "rank": 4, "decoded_token": " large"}, "74168": {"logprob": -4.373109817504883, "rank": 5, "decoded_token": " gloss"}}, {"10575": {"logprob": -0.17297746241092682, "rank": 1, "decoded_token": " dog"}, "116572": {"logprob": -2.1729774475097656, "rank": 2, "decoded_token": " puppy"}, "119075": {"logprob": -3.1729774475097656, "rank": 3, "decoded_token": " Labrador"}, "15812": {"logprob": -6.985477447509766, "rank": 4, "decoded_token": " Lab"}, "8636": {"logprob": -7.360477447509766, "rank": 5, "decoded_token": " lab"}}, {"1454": {"logprob": -0.5785807967185974, "rank": 1, "decoded_token": " with"}, "53048": {"logprob": -1.2660808563232422, "rank": 2, "decoded_token": " sits"}, "1395": {"logprob": -3.016080856323242, "rank": 3, "decoded_token": " is"}, "22524": {"logprob": -3.578580856323242, "rank": 4, "decoded_token": " lies"}, "18970": {"logprob": -3.703580856323242, "rank": 5, "decoded_token": " sitting"}}, {"2327": {"logprob": -1.2709298133850098, "rank": 1, "decoded_token": " fl"}, "1261": {"logprob": -1.3959298133850098, "rank": 2, "decoded_token": " a"}, "17300": {"logprob": -1.8959298133850098, "rank": 3, "decoded_token": " soul"}, "100089": {"logprob": -2.6459298133850098, "rank": 4, "decoded_token": " expressive"}, "6444": {"logprob": -3.1459298133850098, "rank": 5, "decoded_token": " soft"}}, {"94766": {"logprob": -0.002432247158139944, "rank": 1, "decoded_token": "oppy"}, "124603": {"logprob": -6.377432346343994, "rank": 2, "decoded_token": "uffy"}, "1484": {"logprob": -7.877432346343994, "rank": 3, "decoded_token": "op"}, "24897": {"logprob": -8.877431869506836, "rank": 4, "decoded_token": "appy"}, "102477": {"logprob": -9.752431869506836, "rank": 5, "decoded_token": "opping"}}, {"32961": {"logprob": -5.113947918289341e-05, "rank": 1, "decoded_token": " ears"}, "16962": {"logprob": -11.312551498413086, "rank": 2, "decoded_token": " ear"}, "5731": {"logprob": -11.750051498413086, "rank": 3, "decoded_token": " eyes"}, "3351": {"logprob": -12.000051498413086, "rank": 4, "decoded_token": " years"}, "42071": {"logprob": -13.000051498413086, "rank": 5, "decoded_token": " cheeks"}}, {"53048": {"logprob": -0.6131591200828552, "rank": 1, "decoded_token": " sits"}, "10637": {"logprob": -1.9881591796875, "rank": 2, "decoded_token": " looks"}, "1321": {"logprob": -2.4256591796875, "rank": 3, "decoded_token": " and"}, "1395": {"logprob": -2.6756591796875, "rank": 4, "decoded_token": " is"}, "18970": {"logprob": -3.0506591796875, "rank": 5, "decoded_token": " sitting"}}, {"41132": {"logprob": -0.36187249422073364, "rank": 1, "decoded_token": " attent"}, "1408": {"logprob": -2.361872434616089, "rank": 2, "decoded_token": " on"}, "106534": {"logprob": -2.424372434616089, "rank": 3, "decoded_token": " calmly"}, "12276": {"logprob": -2.611872434616089, "rank": 4, "decoded_token": " alert"}, "6482": {"logprob": -5.174372673034668, "rank": 5, "decoded_token": " patient"}}, {"3923": {"logprob": -8.451581379631534e-05, "rank": 1, "decoded_token": "ively"}, "1556": {"logprob": -9.50008487701416, "rank": 2, "decoded_token": "ive"}, "6655": {"logprob": -11.87508487701416, "rank": 3, "decoded_token": "atively"}, "3929": {"logprob": -14.00008487701416, "rank": 4, "decoded_token": "ently"}, "47885": {"logprob": -14.75008487701416, "rank": 5, "decoded_token": "edly"}}, {"1408": {"logprob": -0.058125678449869156, "rank": 1, "decoded_token": " on"}, "3675": {"logprob": -3.1831257343292236, "rank": 2, "decoded_token": " against"}, "1294": {"logprob": -4.9331254959106445, "rank": 3, "decoded_token": " in"}, "7283": {"logprob": -5.8081254959106445, "rank": 4, "decoded_token": " looking"}, "1044": {"logprob": -5.9331254959106445, "rank": 5, "decoded_token": ","}}, {"1261": {"logprob": -0.21029606461524963, "rank": 1, "decoded_token": " a"}, "32656": {"logprob": -1.7102960348129272, "rank": 2, "decoded_token": " wooden"}, "17253": {"logprob": -5.710296154022217, "rank": 3, "decoded_token": " weather"}, "44130": {"logprob": -6.085296154022217, "rank": 4, "decoded_token": " rust"}, "12603": {"logprob": -6.960296154022217, "rank": 5, "decoded_token": " wood"}}, {"32656": {"logprob": -0.08548421412706375, "rank": 1, "decoded_token": " wooden"}, "44130": {"logprob": -2.710484266281128, "rank": 2, "decoded_token": " rust"}, "17253": {"logprob": -4.710484027862549, "rank": 3, "decoded_token": " weather"}, "12603": {"logprob": -5.960484027862549, "rank": 4, "decoded_token": " wood"}, "3403": {"logprob": -5.960484027862549, "rank": 5, "decoded_token": " text"}}, {"4691": {"logprob": -0.7172377109527588, "rank": 1, "decoded_token": " surface"}, "11237": {"logprob": -0.8422377109527588, "rank": 2, "decoded_token": " floor"}, "7042": {"logprob": -2.842237710952759, "rank": 3, "decoded_token": " background"}, "28984": {"logprob": -4.21723747253418, "rank": 4, "decoded_token": " deck"}, "92504": {"logprob": -6.21723747253418, "rank": 5, "decoded_token": " backdrop"}}, {"1626": {"logprob": -0.12971943616867065, "rank": 1, "decoded_token": ".\n"}, "1044": {"logprob": -2.3797194957733154, "rank": 2, "decoded_token": ","}, "1046": {"logprob": -4.129719257354736, "rank": 3, "decoded_token": "."}, "1338": {"logprob": -5.129719257354736, "rank": 4, "decoded_token": ".\n\n"}, "7283": {"logprob": -5.504719257354736, "rank": 5, "decoded_token": " looking"}}, {"1050": {"logprob": -0.00015698630886618048, "rank": 1, "decoded_token": "2"}, "1256": {"logprob": -9.125157356262207, "rank": 2, "decoded_token": "  "}, "1032": {"logprob": -10.875157356262207, "rank": 3, "decoded_token": " "}, "1293": {"logprob": -11.750157356262207, "rank": 4, "decoded_token": "   "}, "1051": {"logprob": -12.125157356262207, "rank": 5, "decoded_token": "3"}}, {"1046": {"logprob": -6.6756979322235566e-06, "rank": 1, "decoded_token": "."}, "3590": {"logprob": -13.062506675720215, "rank": 2, "decoded_token": ".A"}, "1626": {"logprob": -13.187506675720215, "rank": 3, "decoded_token": ".\n"}, "1338": {"logprob": -14.750006675720215, "rank": 4, "decoded_token": ".\n\n"}, "1058": {"logprob": -14.937506675720215, "rank": 5, "decoded_token": ":"}}, {"1349": {"logprob": -0.5863217115402222, "rank": 1, "decoded_token": " A"}, "11826": {"logprob": -1.4613217115402222, "rank": 2, "decoded_token": " Maj"}, "37159": {"logprob": -2.2113218307495117, "rank": 3, "decoded_token": " Snow"}, "113465": {"logprob": -3.8988218307495117, "rank": 4, "decoded_token": " Rug"}, "1531": {"logprob": -3.9613218307495117, "rank": 5, "decoded_token": " The"}}, {"15375": {"logprob": -0.639299213886261, "rank": 1, "decoded_token": " vast"}, "37849": {"logprob": -2.014299154281616, "rank": 2, "decoded_token": " breat"}, "61082": {"logprob": -2.389299154281616, "rank": 3, "decoded_token": " panor"}, "10726": {"logprob": -3.139299154281616, "rank": 4, "decoded_token": " scen"}, "2169": {"logprob": -3.201799154281616, "rank": 5, "decoded_token": " ser"}}, {"24361": {"logprob": -0.702845573425293, "rank": 1, "decoded_token": " mountain"}, "127945": {"logprob": -1.952845573425293, "rank": 2, "decoded_token": " mountainous"}, "1044": {"logprob": -2.077845573425293, "rank": 3, "decoded_token": ","}, "4521": {"logprob": -2.327845573425293, "rank": 4, "decoded_token": " range"}, "28035": {"logprob": -2.452845573425293, "rank": 5, "decoded_token": " landscape"}}, {"4521": {"logprob": -0.07058162242174149, "rank": 1, "decoded_token": " range"}, "28035": {"logprob": -2.6955816745758057, "rank": 2, "decoded_token": " landscape"}, "37691": {"logprob": -8.320581436157227, "rank": 3, "decoded_token": " valley"}, "12248": {"logprob": -9.445581436157227, "rank": 4, "decoded_token": " peak"}, "13327": {"logprob": -9.695581436157227, "rank": 5, "decoded_token": " scene"}}, {"94973": {"logprob": -1.1164050102233887, "rank": 1, "decoded_token": " stretches"}, "1454": {"logprob": -1.1789050102233887, "rank": 2, "decoded_token": " with"}, "2425": {"logprob": -1.8664050102233887, "rank": 3, "decoded_token": " under"}, "1395": {"logprob": -2.5539050102233887, "rank": 4, "decoded_token": " is"}, "13875": {"logprob": -2.9914050102233887, "rank": 5, "decoded_token": " covered"}}, {"5669": {"logprob": -0.3286789357662201, "rank": 1, "decoded_token": " across"}, "1848": {"logprob": -2.078678846359253, "rank": 2, "decoded_token": " out"}, "2425": {"logprob": -2.328678846359253, "rank": 3, "decoded_token": " under"}, "2203": {"logprob": -3.328678846359253, "rank": 4, "decoded_token": " into"}, "8994": {"logprob": -4.766179084777832, "rank": 5, "decoded_token": " towards"}}, {"1278": {"logprob": -0.039004355669021606, "rank": 1, "decoded_token": " the"}, "1261": {"logprob": -3.289004325866699, "rank": 2, "decoded_token": " a"}, "1420": {"logprob": -7.414004325866699, "rank": 3, "decoded_token": " an"}, "2425": {"logprob": -9.0390043258667, "rank": 4, "decoded_token": " under"}, "1454": {"logprob": -9.2265043258667, "rank": 5, "decoded_token": " with"}}, {"48932": {"logprob": -0.2659883201122284, "rank": 1, "decoded_token": " horizon"}, "21283": {"logprob": -2.140988349914551, "rank": 2, "decoded_token": " sky"}, "3937": {"logprob": -3.015988349914551, "rank": 3, "decoded_token": " image"}, "28035": {"logprob": -3.515988349914551, "rank": 4, "decoded_token": " landscape"}, "3044": {"logprob": -4.265988349914551, "rank": 5, "decoded_token": " sk"}}, {"2425": {"logprob": -0.5356141328811646, "rank": 1, "decoded_token": " under"}, "1044": {"logprob": -1.5356141328811646, "rank": 2, "decoded_token": ","}, "1454": {"logprob": -1.7856141328811646, "rank": 3, "decoded_token": " with"}, "25136": {"logprob": -3.785614013671875, "rank": 4, "decoded_token": " beneath"}, "1408": {"logprob": -5.785614013671875, "rank": 5, "decoded_token": " on"}}, {"1261": {"logprob": -0.006081883795559406, "rank": 1, "decoded_token": " a"}, "1420": {"logprob": -5.506082057952881, "rank": 2, "decoded_token": " an"}, "16152": {"logprob": -7.631082057952881, "rank": 3, "decoded_token": " cloud"}, "6133": {"logprob": -7.881082057952881, "rank": 4, "decoded_token": " clear"}, "2136": {"logprob": -8.006081581115723, "rank": 5, "decoded_token": " over"}}, {"16152": {"logprob": -0.6749536991119385, "rank": 1, "decoded_token": " cloud"}, "6133": {"logprob": -1.4249536991119385, "rank": 2, "decoded_token": " clear"}, "18416": {"logprob": -2.8624536991119385, "rank": 3, "decoded_token": " haz"}, "27254": {"logprob": -2.9874536991119385, "rank": 4, "decoded_token": " partly"}, "4391": {"logprob": -3.2374536991119385, "rank": 5, "decoded_token": " light"}}, {"1121": {"logprob": -0.10860869288444519, "rank": 1, "decoded_token": "y"}, "4527": {"logprob": -2.9836087226867676, "rank": 2, "decoded_token": "less"}, "1286": {"logprob": -3.4836087226867676, "rank": 3, "decoded_token": "ed"}, "77187": {"logprob": -4.608608722686768, "rank": 4, "decoded_token": "-filled"}, "114525": {"logprob": -4.858608722686768, "rank": 5, "decoded_token": "-covered"}}, {"21283": {"logprob": -0.002785732736811042, "rank": 1, "decoded_token": " sky"}, "10991": {"logprob": -6.252785682678223, "rank": 2, "decoded_token": " blue"}, "1044": {"logprob": -7.627785682678223, "rank": 3, "decoded_token": ","}, "26549": {"logprob": -8.627785682678223, "rank": 4, "decoded_token": " gray"}, "34052": {"logprob": -9.377785682678223, "rank": 5, "decoded_token": " grey"}}, {"1046": {"logprob": -0.047878943383693695, "rank": 1, "decoded_token": "."}, "1044": {"logprob": -3.1728789806365967, "rank": 2, "decoded_token": ","}, "1454": {"logprob": -5.547878742218018, "rank": 3, "decoded_token": " with"}, "1338": {"logprob": -7.172878742218018, "rank": 4, "decoded_token": ".\n\n"}, "1294": {"logprob": -9.172879219055176, "rank": 5, "decoded_token": " in"}}, {"2": {"logprob": -1.3351351299206726e-05, "rank": 1, "decoded_token": "</s>"}, "1032": {"logprob": -11.25001335144043, "rank": 2, "decoded_token": " "}, "1256": {"logprob": -16.00001335144043, "rank": 3, "decoded_token": "  "}, "1319": {"logprob": -17.25001335144043, "rank": 4, "decoded_token": " ("}, "1766": {"logprob": -18.50001335144043, "rank": 5, "decoded_token": " ["}}]], [[1049, 1046, 1349, 7244, 10575, 53048, 41132, 3923, 1408, 1261, 32656, 11237, 1626, 1050, 1046, 1349, 15375, 24361, 4521, 94973, 5669, 1278, 48932, 2425, 1261, 16152, 1121, 21283, 1626, 1051, 1046, 8342, 71284, 7377, 1394, 22140, 1294, 1278, 27208, 1513, 97558, 1626, 1052, 1046, 1349, 53301, 59396, 3549, 13335, 2645, 1261, 1295, 3506, 11223, 12097, 1046, 2], "1. A black dog sits attentively on a wooden floor.\n2. A vast mountain range stretches across the horizon under a cloudy sky.\n3. Surfers wait for waves in the ocean at sunset.\n4. A winding gravel path leads through a lush green park.", [{"1049": {"logprob": -0.05001257359981537, "rank": 1, "decoded_token": "1"}, "1045": {"logprob": -3.1750125885009766, "rank": 2, "decoded_token": "-"}, "69957": {"logprob": -5.925012588500977, "rank": 3, "decoded_token": "Sure"}, "11745": {"logprob": -6.425012588500977, "rank": 4, "decoded_token": "Here"}, "1065": {"logprob": -6.425012588500977, "rank": 5, "decoded_token": "A"}}, {"1046": {"logprob": -8.702239938429557e-06, "rank": 1, "decoded_token": "."}, "1058": {"logprob": -12.000008583068848, "rank": 2, "decoded_token": ":"}, "3590": {"logprob": -13.375008583068848, "rank": 3, "decoded_token": ".A"}, "1041": {"logprob": -14.750008583068848, "rank": 4, "decoded_token": ")"}, "1065": {"logprob": -15.687508583068848, "rank": 5, "decoded_token": "A"}}, {"1349": {"logprob": -0.14196155965328217, "rank": 1, "decoded_token": " A"}, "1429": {"logprob": -2.2669615745544434, "rank": 2, "decoded_token": " \""}, "1531": {"logprob": -4.516961574554443, "rank": 3, "decoded_token": " The"}, "11967": {"logprob": -4.516961574554443, "rank": 4, "decoded_token": " Image"}, "1603": {"logprob": -5.391961574554443, "rank": 5, "decoded_token": " **"}}, {"7244": {"logprob": -0.14889711141586304, "rank": 1, "decoded_token": " black"}, "68076": {"logprob": -3.398897171020508, "rank": 2, "decoded_token": " cute"}, "6231": {"logprob": -3.961397171020508, "rank": 3, "decoded_token": " close"}, "38462": {"logprob": -4.273897171020508, "rank": 4, "decoded_token": " curious"}, "4329": {"logprob": -4.398897171020508, "rank": 5, "decoded_token": " large"}}, {"10575": {"logprob": -0.12091328203678131, "rank": 1, "decoded_token": " dog"}, "116572": {"logprob": -2.37091326713562, "rank": 2, "decoded_token": " puppy"}, "119075": {"logprob": -3.99591326713562, "rank": 3, "decoded_token": " Labrador"}, "15812": {"logprob": -7.683413505554199, "rank": 4, "decoded_token": " Lab"}, "8636": {"logprob": -7.808413505554199, "rank": 5, "decoded_token": " lab"}}, {"53048": {"logprob": -0.8691943287849426, "rank": 1, "decoded_token": " sits"}, "1454": {"logprob": -1.1191942691802979, "rank": 2, "decoded_token": " with"}, "1395": {"logprob": -2.431694269180298, "rank": 3, "decoded_token": " is"}, "18970": {"logprob": -2.744194269180298, "rank": 4, "decoded_token": " sitting"}, "22524": {"logprob": -3.681694269180298, "rank": 5, "decoded_token": " lies"}}, {"41132": {"logprob": -0.5939557552337646, "rank": 1, "decoded_token": " attent"}, "106534": {"logprob": -1.2814557552337646, "rank": 2, "decoded_token": " calmly"}, "12276": {"logprob": -2.8439557552337646, "rank": 3, "decoded_token": " alert"}, "1408": {"logprob": -2.8439557552337646, "rank": 4, "decoded_token": " on"}, "6482": {"logprob": -4.968955993652344, "rank": 5, "decoded_token": " patient"}}, {"3923": {"logprob": -0.00010084597306558862, "rank": 1, "decoded_token": "ively"}, "1556": {"logprob": -9.500101089477539, "rank": 2, "decoded_token": "ive"}, "6655": {"logprob": -10.875101089477539, "rank": 3, "decoded_token": "atively"}, "3929": {"logprob": -13.000101089477539, "rank": 4, "decoded_token": "ently"}, "47885": {"logprob": -13.750101089477539, "rank": 5, "decoded_token": "edly"}}, {"1408": {"logprob": -0.056158196181058884, "rank": 1, "decoded_token": " on"}, "3675": {"logprob": -3.6811583042144775, "rank": 2, "decoded_token": " against"}, "1454": {"logprob": -4.306158065795898, "rank": 3, "decoded_token": " with"}, "1294": {"logprob": -5.181158065795898, "rank": 4, "decoded_token": " in"}, "7283": {"logprob": -5.431158065795898, "rank": 5, "decoded_token": " looking"}}, {"1261": {"logprob": -0.33056098222732544, "rank": 1, "decoded_token": " a"}, "32656": {"logprob": -1.3305609226226807, "rank": 2, "decoded_token": " wooden"}, "17253": {"logprob": -4.70556116104126, "rank": 3, "decoded_token": " weather"}, "44130": {"logprob": -5.83056116104126, "rank": 4, "decoded_token": " rust"}, "12603": {"logprob": -6.58056116104126, "rank": 5, "decoded_token": " wood"}}, {"32656": {"logprob": -0.07081110030412674, "rank": 1, "decoded_token": " wooden"}, "44130": {"logprob": -2.9458110332489014, "rank": 2, "decoded_token": " rust"}, "17253": {"logprob": -4.6958112716674805, "rank": 3, "decoded_token": " weather"}, "12603": {"logprob": -5.8208112716674805, "rank": 4, "decoded_token": " wood"}, "3403": {"logprob": -6.0708112716674805, "rank": 5, "decoded_token": " text"}}, {"11237": {"logprob": -0.6428436636924744, "rank": 1, "decoded_token": " floor"}, "4691": {"logprob": -1.0178437232971191, "rank": 2, "decoded_token": " surface"}, "7042": {"logprob": -2.642843723297119, "rank": 3, "decoded_token": " background"}, "28984": {"logprob": -3.517843723297119, "rank": 4, "decoded_token": " deck"}, "92504": {"logprob": -6.017843723297119, "rank": 5, "decoded_token": " backdrop"}}, {"1626": {"logprob": -0.7337945103645325, "rank": 1, "decoded_token": ".\n"}, "1044": {"logprob": -0.8587945103645325, "rank": 2, "decoded_token": ","}, "1454": {"logprob": -3.3587944507598877, "rank": 3, "decoded_token": " with"}, "7283": {"logprob": -3.6087944507598877, "rank": 4, "decoded_token": " looking"}, "1321": {"logprob": -4.108794689178467, "rank": 5, "decoded_token": " and"}}, {"1050": {"logprob": -1.0132738680113107e-05, "rank": 1, "decoded_token": "2"}, "1051": {"logprob": -11.75001049041748, "rank": 2, "decoded_token": "3"}, "1256": {"logprob": -14.00001049041748, "rank": 3, "decoded_token": "  "}, "1049": {"logprob": -14.62501049041748, "rank": 4, "decoded_token": "1"}, "1032": {"logprob": -14.62501049041748, "rank": 5, "decoded_token": " "}}, {"1046": {"logprob": -2.861018856492592e-06, "rank": 1, "decoded_token": "."}, "3590": {"logprob": -13.43750286102295, "rank": 2, "decoded_token": ".A"}, "4700": {"logprob": -15.37500286102295, "rank": 3, "decoded_token": ".M"}, "1626": {"logprob": -15.37500286102295, "rank": 4, "decoded_token": ".\n"}, "3051": {"logprob": -15.87500286102295, "rank": 5, "decoded_token": ".S"}}, {"1349": {"logprob": -0.6794427633285522, "rank": 1, "decoded_token": " A"}, "11826": {"logprob": -1.9294427633285522, "rank": 2, "decoded_token": " Maj"}, "37159": {"logprob": -2.116942882537842, "rank": 3, "decoded_token": " Snow"}, "27260": {"logprob": -2.616942882537842, "rank": 4, "decoded_token": " Mountain"}, "113465": {"logprob": -2.866942882537842, "rank": 5, "decoded_token": " Rug"}}, {"15375": {"logprob": -0.9194075465202332, "rank": 1, "decoded_token": " vast"}, "10726": {"logprob": -2.294407606124878, "rank": 2, "decoded_token": " scen"}, "4521": {"logprob": -2.356907606124878, "rank": 3, "decoded_token": " range"}, "122203": {"logprob": -2.419407606124878, "rank": 4, "decoded_token": " rugged"}, "61082": {"logprob": -2.856907606124878, "rank": 5, "decoded_token": " panor"}}, {"24361": {"logprob": -0.5804797410964966, "rank": 1, "decoded_token": " mountain"}, "127945": {"logprob": -1.8304797410964966, "rank": 2, "decoded_token": " mountainous"}, "28035": {"logprob": -2.455479621887207, "rank": 3, "decoded_token": " landscape"}, "4521": {"logprob": -2.455479621887207, "rank": 4, "decoded_token": " range"}, "1044": {"logprob": -2.705479621887207, "rank": 5, "decoded_token": ","}}, {"4521": {"logprob": -0.0493546724319458, "rank": 1, "decoded_token": " range"}, "28035": {"logprob": -3.0493545532226562, "rank": 2, "decoded_token": " landscape"}, "37691": {"logprob": -8.424354553222656, "rank": 3, "decoded_token": " valley"}, "13327": {"logprob": -9.049354553222656, "rank": 4, "decoded_token": " scene"}, "3719": {"logprob": -9.799354553222656, "rank": 5, "decoded_token": " view"}}, {"94973": {"logprob": -0.6676871180534363, "rank": 1, "decoded_token": " stretches"}, "2425": {"logprob": -1.792687177658081, "rank": 2, "decoded_token": " under"}, "1395": {"logprob": -2.292687177658081, "rank": 3, "decoded_token": " is"}, "1454": {"logprob": -2.730187177658081, "rank": 4, "decoded_token": " with"}, "7038": {"logprob": -3.292687177658081, "rank": 5, "decoded_token": " extends"}}, {"5669": {"logprob": -0.4542117118835449, "rank": 1, "decoded_token": " across"}, "2425": {"logprob": -1.454211711883545, "rank": 2, "decoded_token": " under"}, "1848": {"logprob": -2.454211711883545, "rank": 3, "decoded_token": " out"}, "2203": {"logprob": -4.204211711883545, "rank": 4, "decoded_token": " into"}, "25136": {"logprob": -4.641711711883545, "rank": 5, "decoded_token": " beneath"}}, {"1278": {"logprob": -0.23009441792964935, "rank": 1, "decoded_token": " the"}, "1261": {"logprob": -1.6050944328308105, "rank": 2, "decoded_token": " a"}, "1420": {"logprob": -5.6050944328308105, "rank": 3, "decoded_token": " an"}, "2425": {"logprob": -7.2300944328308105, "rank": 4, "decoded_token": " under"}, "1454": {"logprob": -10.167593955993652, "rank": 5, "decoded_token": " with"}}, {"48932": {"logprob": -0.3072167932987213, "rank": 1, "decoded_token": " horizon"}, "21283": {"logprob": -1.932216763496399, "rank": 2, "decoded_token": " sky"}, "3937": {"logprob": -3.1822168827056885, "rank": 3, "decoded_token": " image"}, "28035": {"logprob": -3.6822168827056885, "rank": 4, "decoded_token": " landscape"}, "3044": {"logprob": -3.6822168827056885, "rank": 5, "decoded_token": " sk"}}, {"2425": {"logprob": -0.2914469838142395, "rank": 1, "decoded_token": " under"}, "1044": {"logprob": -2.4164469242095947, "rank": 2, "decoded_token": ","}, "1454": {"logprob": -2.5414469242095947, "rank": 3, "decoded_token": " with"}, "1626": {"logprob": -3.7914469242095947, "rank": 4, "decoded_token": ".\n"}, "1408": {"logprob": -3.7914469242095947, "rank": 5, "decoded_token": " on"}}, {"1261": {"logprob": -0.0460360012948513, "rank": 1, "decoded_token": " a"}, "1420": {"logprob": -3.9210360050201416, "rank": 2, "decoded_token": " an"}, "16152": {"logprob": -4.1085357666015625, "rank": 3, "decoded_token": " cloud"}, "2136": {"logprob": -6.1710357666015625, "rank": 4, "decoded_token": " over"}, "6133": {"logprob": -6.4210357666015625, "rank": 5, "decoded_token": " clear"}}, {"16152": {"logprob": -0.20367540419101715, "rank": 1, "decoded_token": " cloud"}, "6133": {"logprob": -2.8286755084991455, "rank": 2, "decoded_token": " clear"}, "27254": {"logprob": -3.5161755084991455, "rank": 3, "decoded_token": " partly"}, "18416": {"logprob": -3.8286755084991455, "rank": 4, "decoded_token": " haz"}, "4391": {"logprob": -4.328675270080566, "rank": 5, "decoded_token": " light"}}, {"1121": {"logprob": -0.05241352692246437, "rank": 1, "decoded_token": "y"}, "1286": {"logprob": -3.8024134635925293, "rank": 2, "decoded_token": "ed"}, "77187": {"logprob": -4.552413463592529, "rank": 3, "decoded_token": "-filled"}, "4527": {"logprob": -4.802413463592529, "rank": 4, "decoded_token": "less"}, "114525": {"logprob": -4.927413463592529, "rank": 5, "decoded_token": "-covered"}}, {"21283": {"logprob": -0.0003716255014296621, "rank": 1, "decoded_token": " sky"}, "10991": {"logprob": -8.750371932983398, "rank": 2, "decoded_token": " blue"}, "1044": {"logprob": -9.375371932983398, "rank": 3, "decoded_token": ","}, "26549": {"logprob": -10.375371932983398, "rank": 4, "decoded_token": " gray"}, "34052": {"logprob": -11.250371932983398, "rank": 5, "decoded_token": " grey"}}, {"1626": {"logprob": -0.00012730741582345217, "rank": 1, "decoded_token": ".\n"}, "1044": {"logprob": -9.500126838684082, "rank": 2, "decoded_token": ","}, "1046": {"logprob": -10.500126838684082, "rank": 3, "decoded_token": "."}, "1454": {"logprob": -10.875126838684082, "rank": 4, "decoded_token": " with"}, "1294": {"logprob": -13.250126838684082, "rank": 5, "decoded_token": " in"}}, {"1051": {"logprob": -3.2186455882765586e-06, "rank": 1, "decoded_token": "3"}, "1052": {"logprob": -12.75000286102295, "rank": 2, "decoded_token": "4"}, "1050": {"logprob": -15.00000286102295, "rank": 3, "decoded_token": "2"}, "1049": {"logprob": -16.937503814697266, "rank": 4, "decoded_token": "1"}, "1032": {"logprob": -17.875003814697266, "rank": 5, "decoded_token": " "}}, {"1046": {"logprob": -1.6689286894688848e-06, "rank": 1, "decoded_token": "."}, "3590": {"logprob": -14.687501907348633, "rank": 2, "decoded_token": ".A"}, "5226": {"logprob": -15.687501907348633, "rank": 3, "decoded_token": ".D"}, "6847": {"logprob": -15.812501907348633, "rank": 4, "decoded_token": ".T"}, "48426": {"logprob": -16.812501907348633, "rank": 5, "decoded_token": ".The"}}, {"8342": {"logprob": -0.5730464458465576, "rank": 1, "decoded_token": " Sur"}, "1349": {"logprob": -1.6980464458465576, "rank": 2, "decoded_token": " A"}, "22468": {"logprob": -2.5730464458465576, "rank": 3, "decoded_token": " Several"}, "1488": {"logprob": -2.6980464458465576, "rank": 4, "decoded_token": " W"}, "15035": {"logprob": -3.1980464458465576, "rank": 5, "decoded_token": " People"}}, {"71284": {"logprob": -0.0033258858602494, "rank": 1, "decoded_token": "fers"}, "1102": {"logprob": -5.878325939178467, "rank": 2, "decoded_token": "f"}, "1726": {"logprob": -7.628325939178467, "rank": 3, "decoded_token": "fer"}, "61888": {"logprob": -12.253325462341309, "rank": 4, "decoded_token": "fline"}, "2119": {"logprob": -13.003325462341309, "rank": 5, "decoded_token": "fter"}}, {"7377": {"logprob": -1.4996429681777954, "rank": 1, "decoded_token": " wait"}, "1584": {"logprob": -1.7496429681777954, "rank": 2, "decoded_token": " are"}, "88014": {"logprob": -1.9371429681777954, "rank": 3, "decoded_token": " paddle"}, "1294": {"logprob": -1.9371429681777954, "rank": 4, "decoded_token": " in"}, "24434": {"logprob": -2.187142848968506, "rank": 5, "decoded_token": " ride"}}, {"1394": {"logprob": -0.6126739382743835, "rank": 1, "decoded_token": " for"}, "1294": {"logprob": -0.9876739382743835, "rank": 2, "decoded_token": " in"}, "1408": {"logprob": -2.7376739978790283, "rank": 3, "decoded_token": " on"}, "6482": {"logprob": -4.425173759460449, "rank": 4, "decoded_token": " patient"}, "1321": {"logprob": -5.612673759460449, "rank": 5, "decoded_token": " and"}}, {"22140": {"logprob": -0.00729279313236475, "rank": 1, "decoded_token": " waves"}, "1278": {"logprob": -5.632292747497559, "rank": 2, "decoded_token": " the"}, "1261": {"logprob": -5.757292747497559, "rank": 3, "decoded_token": " a"}, "39460": {"logprob": -8.257292747497559, "rank": 4, "decoded_token": " incoming"}, "1321": {"logprob": -9.757292747497559, "rank": 5, "decoded_token": " and"}}, {"1294": {"logprob": -0.3071398138999939, "rank": 1, "decoded_token": " in"}, "1408": {"logprob": -2.1821398735046387, "rank": 2, "decoded_token": " on"}, "1513": {"logprob": -2.4321398735046387, "rank": 3, "decoded_token": " at"}, "3016": {"logprob": -3.6821398735046387, "rank": 4, "decoded_token": " while"}, "1435": {"logprob": -3.8071398735046387, "rank": 5, "decoded_token": " as"}}, {"1278": {"logprob": -0.004646694287657738, "rank": 1, "decoded_token": " the"}, "1261": {"logprob": -6.1921467781066895, "rank": 2, "decoded_token": " a"}, "1420": {"logprob": -6.9421467781066895, "rank": 3, "decoded_token": " an"}, "40466": {"logprob": -7.2546467781066895, "rank": 4, "decoded_token": " shallow"}, "26517": {"logprob": -7.8796467781066895, "rank": 5, "decoded_token": " calm"}}, {"27208": {"logprob": -0.0658877044916153, "rank": 1, "decoded_token": " ocean"}, "7786": {"logprob": -3.440887689590454, "rank": 2, "decoded_token": " distance"}, "5124": {"logprob": -5.253387928009033, "rank": 3, "decoded_token": " early"}, "26517": {"logprob": -5.315887928009033, "rank": 4, "decoded_token": " calm"}, "11196": {"logprob": -5.378387928009033, "rank": 5, "decoded_token": " sea"}}, {"1513": {"logprob": -1.1504861116409302, "rank": 1, "decoded_token": " at"}, "1435": {"logprob": -1.2754861116409302, "rank": 2, "decoded_token": " as"}, "3184": {"logprob": -1.4004861116409302, "rank": 3, "decoded_token": " during"}, "3016": {"logprob": -2.9004859924316406, "rank": 4, "decoded_token": " while"}, "6117": {"logprob": -3.1504859924316406, "rank": 5, "decoded_token": " near"}}, {"97558": {"logprob": -0.12151996046304703, "rank": 1, "decoded_token": " sunset"}, "11729": {"logprob": -2.8715200424194336, "rank": 2, "decoded_token": " sun"}, "1266": {"logprob": -3.4965200424194336, "rank": 3, "decoded_token": " d"}, "54507": {"logprob": -3.9965200424194336, "rank": 4, "decoded_token": " dawn"}, "1261": {"logprob": -5.121520042419434, "rank": 5, "decoded_token": " a"}}, {"1626": {"logprob": -0.3073118329048157, "rank": 1, "decoded_token": ".\n"}, "1044": {"logprob": -2.182311773300171, "rank": 2, "decoded_token": ","}, "3016": {"logprob": -2.557311773300171, "rank": 3, "decoded_token": " while"}, "1454": {"logprob": -3.432311773300171, "rank": 4, "decoded_token": " with"}, "6117": {"logprob": -4.05731201171875, "rank": 5, "decoded_token": " near"}}, {"1052": {"logprob": -3.3378546504536644e-06, "rank": 1, "decoded_token": "4"}, "1051": {"logprob": -13.25000286102295, "rank": 2, "decoded_token": "3"}, "1049": {"logprob": -13.93750286102295, "rank": 3, "decoded_token": "1"}, "1053": {"logprob": -14.43750286102295, "rank": 4, "decoded_token": "5"}, "1032": {"logprob": -16.687503814697266, "rank": 5, "decoded_token": " "}}, {"1046": {"logprob": -1.6689286894688848e-06, "rank": 1, "decoded_token": "."}, "3590": {"logprob": -13.500001907348633, "rank": 2, "decoded_token": ".A"}, "6847": {"logprob": -16.437501907348633, "rank": 3, "decoded_token": ".T"}, "1044": {"logprob": -17.312501907348633, "rank": 4, "decoded_token": ","}, "1349": {"logprob": -17.375001907348633, "rank": 5, "decoded_token": " A"}}, {"1349": {"logprob": -0.004292916506528854, "rank": 1, "decoded_token": " A"}, "2048": {"logprob": -5.629292964935303, "rank": 2, "decoded_token": " An"}, "10638": {"logprob": -7.879292964935303, "rank": 3, "decoded_token": " Two"}, "111463": {"logprob": -10.004292488098145, "rank": 4, "decoded_token": " Trees"}, "1531": {"logprob": -10.879292488098145, "rank": 5, "decoded_token": " The"}}, {"53301": {"logprob": -1.5473321676254272, "rank": 1, "decoded_token": " winding"}, "15192": {"logprob": -1.7348321676254272, "rank": 2, "decoded_token": " narrow"}, "47945": {"logprob": -2.109832286834717, "rank": 3, "decoded_token": " dirt"}, "2169": {"logprob": -2.609832286834717, "rank": 4, "decoded_token": " ser"}, "59396": {"logprob": -2.672332286834717, "rank": 5, "decoded_token": " gravel"}}, {"59396": {"logprob": -0.8954829573631287, "rank": 1, "decoded_token": " gravel"}, "3549": {"logprob": -1.1454830169677734, "rank": 2, "decoded_token": " path"}, "47945": {"logprob": -1.6454830169677734, "rank": 3, "decoded_token": " dirt"}, "14801": {"logprob": -3.2704830169677734, "rank": 4, "decoded_token": " pathway"}, "15551": {"logprob": -4.270483016967773, "rank": 5, "decoded_token": " stone"}}, {"3549": {"logprob": -0.02117946185171604, "rank": 1, "decoded_token": " path"}, "14801": {"logprob": -3.896179437637329, "rank": 2, "decoded_token": " pathway"}, "33659": {"logprob": -8.14617919921875, "rank": 3, "decoded_token": " trail"}, "9480": {"logprob": -9.64617919921875, "rank": 4, "decoded_token": " road"}, "7368": {"logprob": -9.64617919921875, "rank": 5, "decoded_token": "path"}}, {"13335": {"logprob": -0.18962937593460083, "rank": 1, "decoded_token": " leads"}, "39985": {"logprob": -2.752129316329956, "rank": 2, "decoded_token": " cuts"}, "1639": {"logprob": -3.877129316329956, "rank": 3, "decoded_token": " me"}, "11500": {"logprob": -3.939629316329956, "rank": 4, "decoded_token": " runs"}, "2645": {"logprob": -4.189629554748535, "rank": 5, "decoded_token": " through"}}, {"2645": {"logprob": -0.05349981039762497, "rank": 1, "decoded_token": " through"}, "8994": {"logprob": -4.053499698638916, "rank": 2, "decoded_token": " towards"}, "2396": {"logprob": -4.303499698638916, "rank": 3, "decoded_token": " between"}, "2203": {"logprob": -4.678499698638916, "rank": 4, "decoded_token": " into"}, "1317": {"logprob": -5.678499698638916, "rank": 5, "decoded_token": " to"}}, {"1261": {"logprob": -0.017386287450790405, "rank": 1, "decoded_token": " a"}, "11223": {"logprob": -4.892386436462402, "rank": 2, "decoded_token": " green"}, "1295": {"logprob": -5.017386436462402, "rank": 3, "decoded_token": " l"}, "23170": {"logprob": -6.642386436462402, "rank": 4, "decoded_token": " grass"}, "1420": {"logprob": -7.267386436462402, "rank": 5, "decoded_token": " an"}}, {"1295": {"logprob": -0.9453322887420654, "rank": 1, "decoded_token": " l"}, "11223": {"logprob": -1.3203322887420654, "rank": 2, "decoded_token": " green"}, "23170": {"logprob": -1.9453322887420654, "rank": 3, "decoded_token": " grass"}, "12097": {"logprob": -2.4453322887420654, "rank": 4, "decoded_token": " park"}, "26428": {"logprob": -3.3203322887420654, "rank": 5, "decoded_token": " garden"}}, {"3506": {"logprob": -6.556489552167477e-06, "rank": 1, "decoded_token": "ush"}, "1374": {"logprob": -12.000006675720215, "rank": 2, "decoded_token": "us"}, "90716": {"logprob": -15.625006675720215, "rank": 3, "decoded_token": "USH"}, "16938": {"logprob": -15.875006675720215, "rank": 4, "decoded_token": "usher"}, "13326": {"logprob": -17.1875057220459, "rank": 5, "decoded_token": "inden"}}, {"11223": {"logprob": -0.3668670654296875, "rank": 1, "decoded_token": " green"}, "1044": {"logprob": -1.3668670654296875, "rank": 2, "decoded_token": ","}, "26428": {"logprob": -3.4918670654296875, "rank": 3, "decoded_token": " garden"}, "12097": {"logprob": -4.1168670654296875, "rank": 4, "decoded_token": " park"}, "23170": {"logprob": -5.8668670654296875, "rank": 5, "decoded_token": " grass"}}, {"12097": {"logprob": -0.5530153512954712, "rank": 1, "decoded_token": " park"}, "3727": {"logprob": -2.0530152320861816, "rank": 2, "decoded_token": " field"}, "28035": {"logprob": -2.1780152320861816, "rank": 3, "decoded_token": " landscape"}, "26428": {"logprob": -2.3030152320861816, "rank": 4, "decoded_token": " garden"}, "4457": {"logprob": -2.8030152320861816, "rank": 5, "decoded_token": " area"}}, {"1046": {"logprob": -0.7924000024795532, "rank": 1, "decoded_token": "."}, "1454": {"logprob": -1.2924000024795532, "rank": 2, "decoded_token": " with"}, "8994": {"logprob": -2.7923998832702637, "rank": 3, "decoded_token": " towards"}, "54410": {"logprob": -3.5423998832702637, "rank": 4, "decoded_token": " lined"}, "2425": {"logprob": -3.5423998832702637, "rank": 5, "decoded_token": " under"}}, {"2": {"logprob": -1.9073468138230965e-06, "rank": 1, "decoded_token": "</s>"}, "1032": {"logprob": -13.250001907348633, "rank": 2, "decoded_token": " "}, "1256": {"logprob": -16.250001907348633, "rank": 3, "decoded_token": "  "}, "1293": {"logprob": -19.000001907348633, "rank": 4, "decoded_token": "   "}, "1319": {"logprob": -20.000001907348633, "rank": 5, "decoded_token": " ("}}]]]
\ No newline at end of file
diff --git a/tests/models/test_qwen.py b/tests/models/test_qwen.py
deleted file mode 100644
index 05f5cbf8c3435..0000000000000
--- a/tests/models/test_qwen.py
+++ /dev/null
@@ -1,165 +0,0 @@
-import pathlib
-from typing import List, Optional, Type
-
-import pytest
-
-from vllm.multimodal.utils import rescale_image_size
-
-from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
-from .utils import check_logprobs_close
-
-pytestmark = pytest.mark.vlm
-
-text_only_models = [
-    "Qwen/Qwen-7B-Chat"  # Has no visual component
-]
-
-multimodal_models = ["Qwen/Qwen-VL"]
-
-HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
-    "stop_sign":
-    "Picture 1: <img></img>\nWhat's the content of the image?: ",
-    "cherry_blossom":
-    "Picture 1: <img></img>\nWhat is the season?: ",
-})
-
-
-### Tests for multimodal Qwen models
-def run_test(
-    tmp_path: pathlib.PosixPath,
-    hf_runner: Type[HfRunner],
-    vllm_runner: Type[VllmRunner],
-    image_assets: _ImageAssets,
-    model: str,
-    *,
-    size_factors: List[float],
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-    tensor_parallel_size: int,
-    distributed_executor_backend: Optional[str] = None,
-):
-    """Inference result should be the same between hf and vllm.
-
-    All the image fixtures for the test is under tests/images.
-    For huggingface runner, we provide the PIL images as input.
-    For vllm runner, we provide MultiModalDataDict objects
-    and corresponding MultiModalConfig as input.
-    Note, the text input is also adjusted to abide by vllm contract.
-    The text output is sanitized to be able to compare with hf.
-    """
-    images = [asset.pil_image for asset in image_assets]
-
-    # Export the images to a tempdir and substitute it into the hf prompt;
-    # the contents between <img>/</img> will be ignored by VLLM, but the
-    # transformers implementation for the visual transformer parses this to
-    # reload it in the forward call; the contents are treated as a URL or a
-    # local path.
-    for idx, asset in enumerate(image_assets):
-        image_tmp_path = tmp_path / f"{asset.name}.jpg"
-        asset.pil_image.save(image_tmp_path)
-        HF_IMAGE_PROMPTS[idx] = HF_IMAGE_PROMPTS[idx].replace(
-            "<img></img>", f"<img>{image_tmp_path}</img>")
-
-    inputs_per_image = [(
-        [prompt for _ in size_factors],
-        [rescale_image_size(image, factor) for factor in size_factors],
-    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
-
-    # NOTE: take care of the order. run vLLM first, and then run HF.
-    # vLLM needs a fresh new process without cuda initialization.
-    # if we run HF first, the cuda initialization will be done and it
-    # will hurt multiprocessing backend with fork method (the default method).
-
-    # max_model_len should be greater than image_feature_size
-    # Qwen encodes images into a fixed content size of 256
-    with vllm_runner(model,
-                     max_model_len=300,
-                     max_num_seqs=1,
-                     dtype=dtype,
-                     tensor_parallel_size=tensor_parallel_size,
-                     distributed_executor_backend=distributed_executor_backend,
-                     enforce_eager=True) as vllm_model:
-        vllm_outputs_per_image = [
-            vllm_model.generate_greedy_logprobs(prompts,
-                                                max_tokens,
-                                                num_logprobs=num_logprobs,
-                                                images=images)
-            for prompts, images in inputs_per_image
-        ]
-
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs_per_image = [
-            hf_model.generate_greedy_logprobs_limit(prompts,
-                                                    max_tokens,
-                                                    num_logprobs=num_logprobs,
-                                                    images=images)
-            for prompts, images in inputs_per_image
-        ]
-
-    for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
-                                        vllm_outputs_per_image):
-
-        check_logprobs_close(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_outputs,
-            name_0="hf",
-            name_1="vllm",
-        )
-
-
-@pytest.mark.parametrize("model", multimodal_models)
-@pytest.mark.parametrize(
-    "size_factors",
-    [
-        # No image
-        [],
-        # Single-scale
-        [1.0],
-        # Single-scale, batched
-        [1.0, 1.0, 1.0],
-        # Multi-scale
-        [0.25, 0.5, 1.0],
-    ],
-)
-@pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("max_tokens", [8])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_multimodal_models(tmp_path, hf_runner, vllm_runner, image_assets,
-                           model, size_factors, dtype, max_tokens,
-                           num_logprobs) -> None:
-    run_test(
-        tmp_path,
-        hf_runner,
-        vllm_runner,
-        image_assets,
-        model,
-        size_factors=size_factors,
-        dtype=dtype,
-        max_tokens=max_tokens,
-        num_logprobs=num_logprobs,
-        tensor_parallel_size=1,
-    )
-
-
-# Ensure that a text-only Qwen model can still be loaded and
-# used for inference in VLLM without throwing.
-@pytest.mark.parametrize("model", text_only_models)
-@pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("max_tokens", [32])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_text_only_qwen_model_can_be_loaded_and_run(
-    vllm_runner: Type[VllmRunner],
-    example_prompts,
-    model: str,
-    *,
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-):
-    with vllm_runner(model, dtype=dtype) as vllm_model:
-        vllm_model.generate_greedy_logprobs(
-            example_prompts,
-            max_tokens,
-            num_logprobs=num_logprobs,
-        )
diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py
index b058e2755c245..3930a5f465f70 100644
--- a/tests/models/test_registry.py
+++ b/tests/models/test_registry.py
@@ -1,9 +1,14 @@
 import pytest
+import transformers
 
 from vllm.model_executor.models import _MODELS, ModelRegistry
 
 
 @pytest.mark.parametrize("model_cls", _MODELS)
 def test_registry_imports(model_cls):
+    if (model_cls == "Qwen2VLForConditionalGeneration"
+            and transformers.__version__ < "4.45"):
+        pytest.skip("Waiting for next transformers release")
+
     # Ensure all model classes can be imported successfully
     ModelRegistry.resolve_model_cls([model_cls])
diff --git a/tests/multi_step/test_correctness_async_llm.py b/tests/multi_step/test_correctness_async_llm.py
index 0cbe8371e235a..a75a671e57f74 100644
--- a/tests/multi_step/test_correctness_async_llm.py
+++ b/tests/multi_step/test_correctness_async_llm.py
@@ -1,9 +1,10 @@
 # Test the AsyncLLMEngine with multi-step-decoding
-
 from typing import List, Optional
 
 import pytest
 
+from tests.kernels.utils import override_backend_env_variable
+
 from ..models.utils import check_logprobs_close
 from ..utils import (completions_with_server_args, get_client_text_generations,
                      get_client_text_logprob_generations)
@@ -33,8 +34,9 @@
 @pytest.mark.parametrize("eager_mode", [False, True])
 @pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS)
 @pytest.mark.parametrize("num_prompts", NUM_PROMPTS)
-@pytest.mark.parametrize("num_logprobs", [None, 5])
-@pytest.mark.parametrize("is_async", [False, True])
+@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("is_async", [True])
+@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
 @pytest.mark.asyncio
 async def test_multi_step(
     example_prompts,
@@ -46,6 +48,8 @@ async def test_multi_step(
     num_prompts: int,
     is_async: bool,
     num_logprobs: Optional[int],
+    attention_backend: str,
+    monkeypatch,
 ) -> None:
     """Test vLLM engine with multi-step scheduling in an OpenAI-protocol
     client/server environment.
@@ -71,6 +75,8 @@ async def test_multi_step(
                     completions endpoint; `None` -> no logprobs
     """
 
+    override_backend_env_variable(monkeypatch, attention_backend)
+
     prompts = example_prompts
     if len(prompts) < num_prompts:
         prompts = prompts * ((num_prompts // len(prompts)) + 1)
diff --git a/tests/quantization/test_bitsandbytes.py b/tests/quantization/test_bitsandbytes.py
index 3f0c6cbc051a7..87200b1dcc534 100644
--- a/tests/quantization/test_bitsandbytes.py
+++ b/tests/quantization/test_bitsandbytes.py
@@ -10,6 +10,8 @@
 
 from tests.quantization.utils import is_quant_method_supported
 
+from ..utils import fork_new_process_for_each_test
+
 models_4bit_to_test = [
     ('huggyllama/llama-7b', 'quantize model inflight'),
 ]
@@ -29,6 +31,7 @@
 @pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
                     reason='bitsandbytes is not supported on this GPU type.')
 @pytest.mark.parametrize("model_name, description", models_4bit_to_test)
+@fork_new_process_for_each_test
 def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                              model_name, description) -> None:
 
@@ -41,6 +44,7 @@ def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                     reason='bitsandbytes is not supported on this GPU type.')
 @pytest.mark.parametrize("model_name, description",
                          models_pre_qaunt_4bit_to_test)
+@fork_new_process_for_each_test
 def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                                        model_name, description) -> None:
 
@@ -52,6 +56,7 @@ def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                     reason='bitsandbytes is not supported on this GPU type.')
 @pytest.mark.parametrize("model_name, description",
                          models_pre_quant_8bit_to_test)
+@fork_new_process_for_each_test
 def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                              model_name, description) -> None:
 
@@ -77,18 +82,8 @@ def validate_generated_texts(hf_runner,
                              model_name,
                              hf_model_kwargs=None):
 
-    if hf_model_kwargs is None:
-        hf_model_kwargs = {}
-
-    # Run with HF runner
-    with hf_runner(model_name, model_kwargs=hf_model_kwargs) as llm:
-        hf_outputs = llm.generate_greedy(prompts, 8)
-        hf_logs = log_generated_texts(prompts, hf_outputs, "HfRunner")
-
-    # Clean up the GPU memory for the next test
-    torch.cuda.synchronize()
-    gc.collect()
-    torch.cuda.empty_cache()
+    # NOTE: run vLLM first, as it requires a clean process
+    # when using distributed inference
 
     #Run with vLLM runner
     with vllm_runner(model_name,
@@ -104,6 +99,19 @@ def validate_generated_texts(hf_runner,
     gc.collect()
     torch.cuda.empty_cache()
 
+    if hf_model_kwargs is None:
+        hf_model_kwargs = {}
+
+    # Run with HF runner
+    with hf_runner(model_name, model_kwargs=hf_model_kwargs) as llm:
+        hf_outputs = llm.generate_greedy(prompts, 8)
+        hf_logs = log_generated_texts(prompts, hf_outputs, "HfRunner")
+
+    # Clean up the GPU memory for the next test
+    torch.cuda.synchronize()
+    gc.collect()
+    torch.cuda.empty_cache()
+
     # Compare the generated strings
     for hf_log, vllm_log in zip(hf_logs, vllm_logs):
         hf_str = hf_log["generated_text"]
diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index 7dd20636c892f..627b2abaabcf9 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -56,7 +56,7 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
         assert qkv_proj.weight_scale.dtype is torch.float32
         assert qkv_proj.input_scale.dtype is torch.float32
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy(["Hello my name is"], max_tokens=20)
         assert output
 
 
@@ -85,7 +85,7 @@ def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner, model_args):
         assert qkv_proj.scheme.strategy == strategy
         assert qkv_proj.weight.dtype is torch.int8
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy(["Hello my name is"], max_tokens=20)
         assert output
 
 
diff --git a/tests/quantization/utils.py b/tests/quantization/utils.py
index 65bb80ed70c6a..5fad06878f4a3 100644
--- a/tests/quantization/utils.py
+++ b/tests/quantization/utils.py
@@ -1,12 +1,10 @@
-import torch
-
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.platforms import current_platform
 
 
 def is_quant_method_supported(quant_method: str) -> bool:
     # Currently, all quantization methods require Nvidia or AMD GPUs
-    if not torch.cuda.is_available():
+    if not (current_platform.is_cuda() or current_platform.is_rocm()):
         return False
 
     capability = current_platform.get_device_capability()
diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py
index a701f482b4ffb..3d93f4a23b68a 100644
--- a/tests/spec_decode/e2e/conftest.py
+++ b/tests/spec_decode/e2e/conftest.py
@@ -1,224 +1,54 @@
-import asyncio
-import os
 from itertools import cycle
-from typing import Dict, List, Optional, Sequence, Tuple, Union
+from typing import List, Optional, Tuple
 
 import pytest
-import ray
-import torch
 
-from vllm import LLM
-from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.engine.async_llm_engine import AsyncLLMEngine
-from vllm.lora.request import LoRARequest
+from vllm import LLM, SamplingParams
 from vllm.model_executor.utils import set_random_seed
-from vllm.multimodal import MultiModalDataDict
-from vllm.outputs import RequestOutput
-from vllm.prompt_adapter.request import PromptAdapterRequest
-from vllm.sampling_params import SamplingParams
-from vllm.sequence import Logprob
-from vllm.usage.usage_lib import UsageContext
-from vllm.utils import Counter, random_uuid
 
 from ...conftest import cleanup
-from ...utils import wait_for_gpu_memory_to_clear
+from ...models.utils import check_logprobs_close, check_outputs_equal
+from ...utils import RemoteOpenAIServer
 
-
-class AsyncLLM:
-    """AsyncLLM
-
-    Note: Current LLM class in vllm don't support async mode, for test purpose,
-    we implement async one in here. Maybe we could move to
-    vllm/entrypoints/llm.py in future.
-
-    Below AsyncLLM is directly borrow from vllm/entrypoints/llm.py with changes
-    to make to work in async mode.
-    """
-
-    def __init__(
-        self,
-        model: str,
-        tokenizer: Optional[str] = None,
-        tokenizer_mode: str = "auto",
-        skip_tokenizer_init: bool = False,
-        trust_remote_code: bool = False,
-        tensor_parallel_size: int = 1,
-        dtype: str = "auto",
-        quantization: Optional[str] = None,
-        revision: Optional[str] = None,
-        tokenizer_revision: Optional[str] = None,
-        seed: int = 0,
-        gpu_memory_utilization: float = 0.9,
-        swap_space: int = 4,
-        enforce_eager: bool = False,
-        max_seq_len_to_capture: int = 8192,
-        disable_custom_all_reduce: bool = False,
-        **kwargs,
-    ) -> None:
-        if "disable_log_stats" not in kwargs:
-            kwargs["disable_log_stats"] = True
-
-        # Needed to engine_use_ray works as a deprecated feature,
-        # otherwise the following constructor will raise an exception
-        os.environ["VLLM_ALLOW_ENGINE_USE_RAY"] = "1"
-
-        engine_args = AsyncEngineArgs(
-            model=model,
-            tokenizer=tokenizer,
-            tokenizer_mode=tokenizer_mode,
-            skip_tokenizer_init=skip_tokenizer_init,
-            trust_remote_code=trust_remote_code,
-            tensor_parallel_size=tensor_parallel_size,
-            dtype=dtype,
-            quantization=quantization,
-            revision=revision,
-            tokenizer_revision=tokenizer_revision,
-            seed=seed,
-            gpu_memory_utilization=gpu_memory_utilization,
-            swap_space=swap_space,
-            enforce_eager=enforce_eager,
-            max_seq_len_to_capture=max_seq_len_to_capture,
-            # For now use ray for the distributed back-end, since
-            # we rely on the use of engine_use_ray=True to avoid
-            # reinitializing CUDA in the same process (driver worker)
-            engine_use_ray=True,
-            distributed_executor_backend="ray",
-            disable_custom_all_reduce=disable_custom_all_reduce,
-            **kwargs,
-        )
-        self.request_counter = Counter()
-        self.llm_engine = AsyncLLMEngine.from_engine_args(
-            engine_args, usage_context=UsageContext.LLM_CLASS)
-
-    def generate(
-        self,
-        prompts: Optional[Union[str, List[str]]] = None,
-        sampling_params: Optional[Union[SamplingParams,
-                                        List[SamplingParams]]] = None,
-        prompt_token_ids: Optional[List[List[int]]] = None,
-        use_tqdm: bool = True,
-        lora_request: Optional[LoRARequest] = None,
-        multi_modal_data: Optional[MultiModalDataDict] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None
-    ) -> List[RequestOutput]:
-
-        if prompts is None:
-            raise ValueError("prompts must be provided.")
-        if isinstance(prompts, str):
-            # Convert a single prompt to a list.
-            prompts = [prompts]
-
-        if prompts is not None:
-            num_requests = len(prompts)
-
-        if sampling_params is None:
-            # Use default sampling params.
-            sampling_params = SamplingParams()
-
-        elif isinstance(sampling_params,
-                        list) and len(sampling_params) != num_requests:
-            raise ValueError("The lengths of prompts and "
-                             "sampling_params must be the same.")
-
-        async def get_output(prompt, sampling_param) -> RequestOutput:
-            request_id = random_uuid()
-            results_generator = self.llm_engine.generate(
-                prompt, sampling_param, request_id)
-            final_output = None
-            async for request_output in results_generator:
-                final_output = request_output
-            assert final_output is not None
-            return final_output
-
-        outputs: List[RequestOutput] = []
-        try:
-            for i in range(num_requests):
-                prompt = prompts[i] if prompts is not None else None
-                params = sampling_params[i] if isinstance(
-                    sampling_params, Sequence) else sampling_params
-                res = asyncio.run(get_output(prompt, params))
-                outputs.append(res)
-        finally:
-            ray.shutdown()
-        return outputs
+PROMPTS = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+    "San Francisco is know for its",
+    "Facebook was created in 2004 by",
+    "Curious George is a",
+    "Python 3.11 brings improvements to its",
+]
 
 
 @pytest.fixture
-def baseline_llm_generator(request, common_llm_kwargs,
-                           per_test_common_llm_kwargs, baseline_llm_kwargs,
-                           seed):
-    return create_llm_generator("baseline", request, common_llm_kwargs,
-                                per_test_common_llm_kwargs,
-                                baseline_llm_kwargs, seed)
-
-
-@pytest.fixture
-def test_llm_generator(request, common_llm_kwargs, per_test_common_llm_kwargs,
+def test_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
                        test_llm_kwargs, seed):
-    return create_llm_generator("test", request, common_llm_kwargs,
-                                per_test_common_llm_kwargs, test_llm_kwargs,
-                                seed)
 
+    def generate():
+        kwargs = {
+            **common_llm_kwargs,
+            **per_test_common_llm_kwargs,
+            **test_llm_kwargs,
+        }
+
+        llm = LLM(**kwargs)
 
-def create_llm_generator(baseline_or_test, request, common_llm_kwargs,
-                         per_test_common_llm_kwargs, distinct_llm_kwargs,
-                         seed):
-    kwargs = {
-        **common_llm_kwargs,
-        **per_test_common_llm_kwargs,
-        **distinct_llm_kwargs,
-    }
-    test_name = request.node.name
-
-    model = kwargs["model"]
-    draft_model = kwargs.get("speculative_model", None)
-    same_draft_target_model = (draft_model is not None
-                               and draft_model == model)
-
-    def generator_inner():
-
-        wait_for_gpu_memory_to_clear(
-            devices=list(range(torch.cuda.device_count())),
-            threshold_bytes=2 * 2**30,
-            timeout_s=60,
-        )
-
-        use_async = False
-        if "use_async" in kwargs:
-            use_async = kwargs.pop("use_async")
-        print(f'{use_async=}')
-
-        print(f'Creating {baseline_or_test=} LLM for {test_name=}. {kwargs=}')
-        llm = AsyncLLM(**kwargs) if use_async else LLM(**kwargs)
-
-        # Override logging interval to 0 for spec decode test run to
-        # log all metrics in time.
-        if (baseline_or_test == "test" and not use_async
-                and llm.llm_engine.log_stats):
-            for sate_logger in llm.llm_engine.stat_loggers.values():
-                sate_logger.local_interval = 0
         if seed is not None:
             set_random_seed(seed)
 
         yield llm
+
         del llm
         cleanup()
 
-    def generator_outer():
-        for llm in generator_inner():
-            yield llm
-            del llm
-
-    # Set an attribute to the generator_outer function to allow us to
-    # determine whether to further check the acceptance rate in tests.
-    generator_outer.same_draft_target_model = same_draft_target_model  # type: ignore
-    return generator_outer
+    return generate
 
 
 def maybe_assert_ngram_worker(llm):
     # Verify the proposer worker is ngram if ngram is specified.
-    if (not isinstance(llm, AsyncLLM)
-            and llm.llm_engine.speculative_config is not None
+    if (llm.llm_engine.speculative_config is not None
             and llm.llm_engine.speculative_config.ngram_prompt_lookup_max > 0):
         from vllm.spec_decode.ngram_worker import NGramWorker
         assert isinstance(
@@ -251,118 +81,165 @@ def get_output_from_llm_generator(
     return tokens, token_ids, acceptance_rate
 
 
-def get_logprobs_from_llm_generator(
-        llm_generator, prompts,
-        sampling_params) -> List[List[Dict[int, Logprob]]]:
-    """Returns a dict of (token_id: Logprob) for each generated position, for
-    each sequence in the batch.
-    """
-    for llm in llm_generator():
-        outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
-        logprobs = [output.outputs[0].logprobs[:] for output in outputs]
-        del llm
+def run_logprob_correctness_test(vllm_runner,
+                                 common_llm_kwargs,
+                                 per_test_common_llm_kwargs,
+                                 baseline_llm_kwargs,
+                                 test_llm_kwargs,
+                                 batch_size: int,
+                                 max_output_len: int,
+                                 seed: Optional[int] = 0,
+                                 temperature: float = 0.0,
+                                 logprobs: int = 1):
+    org_args = {
+        **common_llm_kwargs,
+        **per_test_common_llm_kwargs,
+        **baseline_llm_kwargs,
+    }
 
-    return logprobs
+    sd_args = {
+        **common_llm_kwargs,
+        **per_test_common_llm_kwargs,
+        **test_llm_kwargs,
+    }
 
+    prompts = [prompt for prompt, _ in zip(cycle(PROMPTS), range(batch_size))]
 
-def run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len,
-                                         force_output_len: bool,
-                                         print_tokens: bool = False,
-                                         ensure_all_accepted: bool = False):
-    """Helper method that compares the outputs of both the baseline LLM and
-    the test LLM. It asserts greedy equality, e.g. that the outputs are exactly
-    the same when temperature is zero.
-    """
+    sampling_params = SamplingParams(temperature=temperature,
+                                     max_tokens=max_output_len,
+                                     seed=seed,
+                                     logprobs=logprobs)
+
+    with vllm_runner(**org_args) as vllm_model:
+        org_outputs = vllm_model.generate_w_logprobs(prompts, sampling_params)
 
-    run_equality_correctness_test(baseline_llm_generator,
-                                  test_llm_generator,
-                                  batch_size,
-                                  max_output_len,
-                                  force_output_len,
-                                  temperature=0.0,
-                                  seeded=False,
-                                  print_tokens=print_tokens,
-                                  ensure_all_accepted=ensure_all_accepted)
+    with vllm_runner(**sd_args) as vllm_model:
+        sd_outputs = vllm_model.generate_w_logprobs(prompts, sampling_params)
+
+    check_logprobs_close(outputs_0_lst=org_outputs,
+                         outputs_1_lst=sd_outputs,
+                         name_0="org",
+                         name_1="sd")
 
 
 def run_equality_correctness_test(
-        baseline_llm_generator,
-        test_llm_generator,
-        batch_size,
-        max_output_len,
-        force_output_len: bool,
-        temperature: float,
-        seeded: bool,
-        print_tokens: bool = False,
+        vllm_runner,
+        common_llm_kwargs,
+        per_test_common_llm_kwargs,
+        baseline_llm_kwargs,
+        test_llm_kwargs,
+        batch_size: int,
+        max_output_len: int,
+        seed: Optional[int] = 0,
+        temperature: float = 0.0,
+        disable_seed: bool = False,
+        ignore_eos: bool = True,
         ensure_all_accepted: bool = False,
         expected_acceptance_rate: Optional[float] = None):
+
+    org_args = {
+        **common_llm_kwargs,
+        **per_test_common_llm_kwargs,
+        **baseline_llm_kwargs,
+    }
+
+    sd_args = {
+        **common_llm_kwargs,
+        **per_test_common_llm_kwargs,
+        **test_llm_kwargs,
+    }
+
+    prompts = [prompt for prompt, _ in zip(cycle(PROMPTS), range(batch_size))]
+
+    if disable_seed:
+        seed = None
+
+    sampling_params = SamplingParams(temperature=temperature,
+                                     max_tokens=max_output_len,
+                                     seed=seed,
+                                     ignore_eos=ignore_eos)
+
+    with vllm_runner(**org_args) as vllm_model:
+        org_outputs = vllm_model.generate(prompts, sampling_params)
+
+    with vllm_runner(**sd_args) as vllm_model:
+        if ensure_all_accepted or expected_acceptance_rate is not None:
+            # Force log interval to be 0 to catch all metrics.
+            stat_logger = vllm_model.model.llm_engine.stat_loggers[
+                'prometheus']
+            stat_logger.local_interval = -100
+
+        sd_outputs = vllm_model.generate(prompts, sampling_params)
+
+        if ensure_all_accepted or expected_acceptance_rate is not None:
+            acceptance_rate = (stat_logger.metrics.
+                               gauge_spec_decode_draft_acceptance_rate.labels(
+                                   **stat_logger.labels)._value.get())
+
+            if ensure_all_accepted:
+                assert True
+                # FIXME: ci fails to log acceptance rate.
+                # It works locally.
+                # assert acceptance_rate == 1.0
+
+            if expected_acceptance_rate is not None:
+                assert acceptance_rate >= expected_acceptance_rate - 1e-2
+
+    check_outputs_equal(outputs_0_lst=org_outputs,
+                        outputs_1_lst=sd_outputs,
+                        name_0="org",
+                        name_1="sd")
+
+
+def run_equality_correctness_test_tp(model,
+                                     common_llm_kwargs,
+                                     per_test_common_llm_kwargs,
+                                     baseline_llm_kwargs,
+                                     test_llm_kwargs,
+                                     batch_size: int,
+                                     max_output_len: int,
+                                     seed: int = 0,
+                                     temperature: float = 0.0):
     """Helper method that compares the outputs of both the baseline LLM and
     the test LLM. It asserts greedy equality, e.g. that the outputs are exactly
-    the same when temperature is zero (or when temperature is > 0 and seeded).
+    the same when temperature is zero.
     """
-
-    prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-        "San Francisco is know for its",
-        "Facebook was created in 2004 by",
-        "Curious George is a",
-        "Python 3.11 brings improvements to its",
-    ]
-
-    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
-
-    # If the test requires that we generated max_output_len tokens, then set the
-    # sampling params to ignore eos token.
-    ignore_eos = force_output_len
-
-    if seeded:
-        sampling_params = [
-            SamplingParams(
-                max_tokens=max_output_len,
-                ignore_eos=ignore_eos,
-                temperature=temperature,
-                seed=i,
-            ) for i in range(len(prompts))
-        ]
-    else:
-        sampling_params = SamplingParams(
-            max_tokens=max_output_len,
-            ignore_eos=ignore_eos,
-            temperature=temperature,
-        )
-
-    (spec_batch_tokens, spec_batch_token_ids,
-     acceptance_rate) = get_output_from_llm_generator(test_llm_generator,
-                                                      prompts, sampling_params)
-
-    (baseline_batch_tokens, baseline_batch_token_ids,
-     _) = get_output_from_llm_generator(baseline_llm_generator, prompts,
-                                        sampling_params)
-
-    assert len(baseline_batch_token_ids) == len(prompts)
-    assert len(spec_batch_token_ids) == len(prompts)
-
-    for i, (baseline_token_ids, baseline_tokens, spec_token_ids,
-            spec_tokens) in enumerate(
-                zip(baseline_batch_token_ids, baseline_batch_tokens,
-                    spec_batch_token_ids, spec_batch_tokens)):
-        if print_tokens:
-            print(f'{i=} {baseline_tokens=}')
-            print(f'{i=}     {spec_tokens=}')
-        print(f'{i=} {baseline_token_ids=}')
-        print(f'{i=}     {spec_token_ids=}')
-        assert baseline_token_ids == spec_token_ids
-
-    print(f'{acceptance_rate=}')
-
-    if ensure_all_accepted:
-        assert acceptance_rate == 1.0
-
-    if expected_acceptance_rate is not None:
-        assert acceptance_rate >= expected_acceptance_rate - 1e-2
+    arg1 = common_llm_kwargs + per_test_common_llm_kwargs + baseline_llm_kwargs
+    arg2 = common_llm_kwargs + per_test_common_llm_kwargs + test_llm_kwargs
+    env1 = env2 = None
+
+    max_wait_seconds = 240
+    results = []
+
+    prompts = [prompt for prompt, _ in zip(cycle(PROMPTS), range(batch_size))]
+
+    for args, env in ((arg1, env1), (arg2, env2)):
+        with RemoteOpenAIServer(model,
+                                args,
+                                env_dict=env,
+                                max_wait_seconds=max_wait_seconds) as server:
+            client = server.get_client()
+
+            completion = client.completions.create(model=model,
+                                                   prompt=prompts,
+                                                   max_tokens=max_output_len,
+                                                   seed=seed,
+                                                   temperature=temperature)
+
+            results.append({
+                "test":
+                "seeded_sampling",
+                "text": [choice.text for choice in completion.choices],
+                "finish_reason":
+                [choice.finish_reason for choice in completion.choices],
+                "usage":
+                completion.usage,
+            })
+
+    n = len(results) // 2
+    arg1_results = results[:n]
+    arg2_results = results[n:]
+    for arg1_result, arg2_result in zip(arg1_results, arg2_results):
+        assert arg1_result == arg2_result, (
+            f"Results for {model=} are not the same with {arg1=} and {arg2=}. "
+            f"{arg1_result=} != {arg2_result=}")
diff --git a/tests/spec_decode/e2e/test_eagle_correctness.py b/tests/spec_decode/e2e/test_eagle_correctness.py
index 6a1819e990f44..f2af2c2bedb12 100644
--- a/tests/spec_decode/e2e/test_eagle_correctness.py
+++ b/tests/spec_decode/e2e/test_eagle_correctness.py
@@ -21,7 +21,7 @@
 
 import pytest
 
-from .conftest import run_greedy_equality_correctness_test
+from .conftest import run_equality_correctness_test
 
 # main model
 MAIN_MODEL = "JackFram/llama-68m"
@@ -53,7 +53,7 @@
         "dtype": PRECISION,
 
         # Main model
-        "model": MAIN_MODEL,
+        "model_name": MAIN_MODEL,
     }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -68,15 +68,16 @@
 ])
 @pytest.mark.parametrize("batch_size", [1, 32])
 @pytest.mark.parametrize("seed", [1])
-def test_eagle_e2e_greedy_correctness(baseline_llm_generator,
-                                      test_llm_generator, batch_size: int,
-                                      output_len: int):
-    """Verify greedy equality with different batch size."""
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+def test_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
+                                      per_test_common_llm_kwargs,
+                                      baseline_llm_kwargs, test_llm_kwargs,
+                                      batch_size: int, output_len: int,
+                                      seed: int):
+
+    run_equality_correctness_test(vllm_runner, common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs, test_llm_kwargs,
+                                  batch_size, output_len, seed)
 
 
 @pytest.mark.parametrize(
@@ -94,7 +95,7 @@ def test_eagle_e2e_greedy_correctness(baseline_llm_generator,
         "dtype": PRECISION,
 
         # Main model
-        "model": MAIN_MODEL,
+        "model_name": MAIN_MODEL,
     }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -109,17 +110,16 @@ def test_eagle_e2e_greedy_correctness(baseline_llm_generator,
 ])
 @pytest.mark.parametrize("batch_size", [1, 32])
 @pytest.mark.parametrize("seed", [1])
-def test_eagle_e2e_greedy_correctness_cuda_graph(baseline_llm_generator,
-                                                 test_llm_generator,
-                                                 batch_size: int,
-                                                 output_len: int):
-    """Verify greedy equality with cuda graph enabled and different 
+def test_eagle_e2e_greedy_correctness_cuda_graph(
+        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
+        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
+        seed: int):
+    """Verify greedy equality with cuda graph enabled and different
     batch sizes."""
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_equality_correctness_test(vllm_runner, common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs, test_llm_kwargs,
+                                  batch_size, output_len, seed)
 
 
 @pytest.mark.parametrize(
@@ -140,7 +140,7 @@ def test_eagle_e2e_greedy_correctness_cuda_graph(baseline_llm_generator,
         "dtype": PRECISION,
 
         # Main model
-        "model": MAIN_MODEL,
+        "model_name": MAIN_MODEL,
     }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -158,18 +158,17 @@ def test_eagle_e2e_greedy_correctness_cuda_graph(baseline_llm_generator,
     ])
 @pytest.mark.parametrize("batch_size", [4])
 @pytest.mark.parametrize("seed", [1])
-def test_eagle_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
-                                                      test_llm_generator,
-                                                      batch_size: int,
-                                                      output_len: int):
+def test_eagle_e2e_greedy_correctness_with_preemption(
+        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
+        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
+        seed: int):
     """Verify greedy equality, even when some sequences are preempted mid-
     generation.
     """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_equality_correctness_test(vllm_runner, common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs, test_llm_kwargs,
+                                  batch_size, output_len, seed)
 
 
 @pytest.mark.parametrize(
@@ -185,7 +184,7 @@ def test_eagle_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
         "dtype": PRECISION,
 
         # Main model
-        "model": MAIN_MODEL,
+        "model_name": MAIN_MODEL,
     }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -207,16 +206,17 @@ def test_eagle_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
         32,
     ])
 @pytest.mark.parametrize("seed", [1])
-def test_eagle_different_k(baseline_llm_generator, test_llm_generator,
-                           batch_size: int, output_len: int):
+def test_eagle_different_k(vllm_runner, common_llm_kwargs,
+                           per_test_common_llm_kwargs, baseline_llm_kwargs,
+                           test_llm_kwargs, batch_size: int, output_len: int,
+                           seed: int):
     """Verify that eagle speculative decoding produces exact equality
     to without spec decode with different values of num_speculative_tokens.
     """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_equality_correctness_test(vllm_runner, common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs, test_llm_kwargs,
+                                  batch_size, output_len, seed)
 
 
 @pytest.mark.parametrize(
@@ -232,7 +232,7 @@ def test_eagle_different_k(baseline_llm_generator, test_llm_generator,
         "dtype": PRECISION,
 
         # Main model
-        "model": MAIN_MODEL,
+        "model_name": MAIN_MODEL,
     }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -250,17 +250,18 @@ def test_eagle_different_k(baseline_llm_generator, test_llm_generator,
         32,
     ])
 @pytest.mark.parametrize("seed", [1])
-def test_eagle_disable_queue(baseline_llm_generator, test_llm_generator,
-                             batch_size: int, output_len: int):
+def test_eagle_disable_queue(vllm_runner, common_llm_kwargs,
+                             per_test_common_llm_kwargs, baseline_llm_kwargs,
+                             test_llm_kwargs, batch_size: int, output_len: int,
+                             seed: int):
     """Verify that eagle speculative decoding produces exact equality
     to without spec decode when speculation is disabled for large
     batch sizes.
     """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_equality_correctness_test(vllm_runner, common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs, test_llm_kwargs,
+                                  batch_size, output_len, seed)
 
 
 if __name__ == "__main__":
diff --git a/tests/spec_decode/e2e/test_integration.py b/tests/spec_decode/e2e/test_integration.py
index b44d269fa7382..4a427d4c3e287 100644
--- a/tests/spec_decode/e2e/test_integration.py
+++ b/tests/spec_decode/e2e/test_integration.py
@@ -4,7 +4,9 @@
 
 import pytest
 
-from .conftest import run_greedy_equality_correctness_test
+from .conftest import run_equality_correctness_test
+
+MAIN_MODEL = "JackFram/llama-68m"
 
 
 @pytest.mark.parametrize(
@@ -15,7 +17,7 @@
 
         # Verify equality when cuda graphs allowed.
         "enforce_eager": False,
-        "model": "JackFram/llama-68m",
+        "model_name": "JackFram/llama-68m",
     }])
 @pytest.mark.parametrize(
     "per_test_common_llm_kwargs",
@@ -31,23 +33,27 @@
 @pytest.mark.parametrize("batch_size", [8])
 @pytest.mark.parametrize("output_len", [32])
 @pytest.mark.parametrize("seed", [1])
-def test_spec_decode_cuda_graph(baseline_llm_generator, test_llm_generator,
-                                batch_size, output_len):
+def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
+                                per_test_common_llm_kwargs,
+                                baseline_llm_kwargs, test_llm_kwargs,
+                                batch_size: int, output_len: int, seed: int):
     """Verify spec decode equality when cuda graphs are enabled.
     """
-    run_greedy_equality_correctness_test(
-        baseline_llm_generator,
-        test_llm_generator,
-        batch_size,
-        max_output_len=output_len,
-        force_output_len=True,
-    )
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)
 
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
     [{
-        "model": "JackFram/llama-160m",
+        "model_name": "JackFram/llama-160m",
 
         # Skip cuda graph recording for fast test.
         "enforce_eager": True,
@@ -80,13 +86,19 @@ def test_spec_decode_cuda_graph(baseline_llm_generator, test_llm_generator,
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
 @pytest.mark.parametrize("batch_size", [2])
 @pytest.mark.parametrize("seed", [1])
-def test_speculative_model_quantization_config(baseline_llm_generator,
-                                               test_llm_generator,
-                                               batch_size: int):
+def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs,
+                                               per_test_common_llm_kwargs,
+                                               baseline_llm_kwargs,
+                                               test_llm_kwargs,
+                                               batch_size: int, seed: int):
     """Verify spec decode works well with draft model quantization configs.
     """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=32,
-                                         force_output_len=True)
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=32,
+                                  seed=seed,
+                                  temperature=0.0)
diff --git a/tests/spec_decode/e2e/test_integration_dist_tp2.py b/tests/spec_decode/e2e/test_integration_dist_tp2.py
index 944b28a2d14fa..679a6ded9ee79 100644
--- a/tests/spec_decode/e2e/test_integration_dist_tp2.py
+++ b/tests/spec_decode/e2e/test_integration_dist_tp2.py
@@ -7,42 +7,39 @@
 
 from vllm.utils import is_hip
 
-from .conftest import run_greedy_equality_correctness_test
+from .conftest import run_equality_correctness_test_tp
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2,
                     reason="Need at least 2 GPUs to run the test.")
 @pytest.mark.parametrize(
     "common_llm_kwargs",
-    [{
-        "model": "JackFram/llama-68m",
-
+    [[
         # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
+        "--enforce-eager",
 
         # Required for spec decode.
-        "use_v2_block_manager": True,
-        "tensor_parallel_size": 2,
-
-        # Use AsyncLLM engine, so that the engine runs in its own process.
-        # Otherwise, since vLLM does not follow true SPMD, the test runner
-        # process will have both the engine and the rank0 worker. NCCL is not
-        # cleaned up properly, and its server host thread leaks, causing the
-        # second run of the test to fail with internal NCCL error.
-        "use_async": True,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+        "--use-v2-block-manager",
+        "--tensor-parallel-size",
+        "2"
+    ]])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [[]])
+@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
 @pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_model": "JackFram/llama-68m",
-        "num_speculative_tokens": 3,
-    },
-    {
-        "speculative_model": "[ngram]",
-        "num_speculative_tokens": 5,
-        "ngram_prompt_lookup_max": 3,
-    },
+    [
+        "--speculative-model",
+        "JackFram/llama-68m",
+        "--num-speculative-tokens",
+        "3",
+    ],
+    [
+        "--speculative-model",
+        "[ngram]",
+        "--num-speculative-tokens",
+        "5",
+        "--ngram-prompt-lookup-max",
+        "3",
+    ],
 ])
 @pytest.mark.parametrize("batch_size", [2])
 @pytest.mark.parametrize(
@@ -52,75 +49,75 @@
         32,
     ])
 @pytest.mark.parametrize("seed", [1])
-def test_target_model_tp_gt_1(baseline_llm_generator, test_llm_generator,
-                              batch_size: int, output_len: int):
+def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
+                              baseline_llm_kwargs, test_llm_kwargs,
+                              batch_size: int, output_len: int, seed: int):
     """Verify greedy equality when tensor parallelism is used.
     """
     if is_hip():
         pytest.skip("hip is not well-supported yet")
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_equality_correctness_test_tp("JackFram/llama-68m",
+                                     common_llm_kwargs,
+                                     per_test_common_llm_kwargs,
+                                     baseline_llm_kwargs,
+                                     test_llm_kwargs,
+                                     batch_size,
+                                     output_len,
+                                     seed,
+                                     temperature=0.0)
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2,
                     reason="Need at least 2 GPUs to run the test.")
 @pytest.mark.parametrize(
     "common_llm_kwargs",
-    [{
+    [[
         # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
+        "--enforce-eager",
 
         # Required for spec decode.
-        "use_v2_block_manager": True,
-        "tensor_parallel_size": 2,
-
-        # Use AsyncLLM engine, so that the engine runs in its own process.
-        # Otherwise, since vLLM does not follow true SPMD, the test runner
-        # process will have both the engine and the rank0 worker. NCCL is not
-        # cleaned up properly, and its server host thread leaks, causing the
-        # second run of the test to fail with internal NCCL error.
-        "use_async": True,
+        "--use_v2_block_manager",
+        "--tensor_parallel_size",
+        "2",
 
         # precision
-        "dtype": "float32",
-    }])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize(
-    "per_test_common_llm_kwargs, test_llm_kwargs",
-    [
-        (
-            {
-                # Use a small model for a fast test.
-                # Note this is repeated in the test body; to initialize a
-                # tokenizer.
-                "model": "JackFram/llama-68m",
-            },
-            {
-                "speculative_model": "JackFram/llama-68m",
-                "num_speculative_tokens": 5,
-                "speculative_draft_tensor_parallel_size": 1,
-            }),
-        ({
-            "model": "ibm-granite/granite-3b-code-instruct",
-        }, {
-            "speculative_model":
-            "ibm-granite/granite-3b-code-instruct-accelerator",
-            "num_speculative_tokens": 5,
-            "speculative_draft_tensor_parallel_size": 1,
-        })
-    ])
+        "--dtype",
+        "bfloat16",
+    ]])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [[]])
+@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
+@pytest.mark.parametrize("model, test_llm_kwargs",
+                         [("JackFram/llama-68m", [
+                             "--speculative-model",
+                             "JackFram/llama-68m",
+                             "--num_speculative-tokens",
+                             "5",
+                             "--speculative-draft-tensor-parallel-size",
+                             "1",
+                         ]),
+                          ("ibm-granite/granite-3b-code-instruct", [
+                              "--speculative-model",
+                              "ibm-granite/granite-3b-code-instruct",
+                              "--num_speculative-tokens",
+                              "5",
+                              "--speculative-draft-tensor-parallel-size",
+                              "1",
+                          ])])
 @pytest.mark.parametrize("batch_size", [2])
 @pytest.mark.parametrize("seed", [1])
-def test_draft_model_tp_lt_target_model_tp2(test_llm_generator,
-                                            baseline_llm_generator,
-                                            batch_size: int):
+def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs,
+                                            per_test_common_llm_kwargs,
+                                            baseline_llm_kwargs,
+                                            test_llm_kwargs, batch_size: int,
+                                            seed: int):
     """Verify spec decode works well with smaller tp for draft models.
     """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=32,
-                                         force_output_len=True)
+    run_equality_correctness_test_tp(model,
+                                     common_llm_kwargs,
+                                     per_test_common_llm_kwargs,
+                                     baseline_llm_kwargs,
+                                     test_llm_kwargs,
+                                     batch_size,
+                                     max_output_len=32,
+                                     seed=seed,
+                                     temperature=0.0)
diff --git a/tests/spec_decode/e2e/test_integration_dist_tp4.py b/tests/spec_decode/e2e/test_integration_dist_tp4.py
index 49e4a5f8150b5..3f7c5d749e4f9 100644
--- a/tests/spec_decode/e2e/test_integration_dist_tp4.py
+++ b/tests/spec_decode/e2e/test_integration_dist_tp4.py
@@ -2,98 +2,97 @@
 tensor parallelism.
 """
 
+import openai
 import pytest
 import torch
 
-from .conftest import run_greedy_equality_correctness_test
+from .conftest import run_equality_correctness_test_tp
+
+MAIN_MODEL = "JackFram/llama-68m"
+SPEC_MODEL = "JackFram/llama-68m"
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 4,
                     reason="Need at least 4 GPUs to run the test.")
 @pytest.mark.parametrize(
     "common_llm_kwargs",
-    [{
-        # Use a small model for a fast test.
-        # Note this is repeated in the test body; to initialize a tokenizer.
-        "model": "JackFram/llama-68m",
-
+    [[
         # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
+        "--enforce_eager",
 
         # Required for spec decode.
-        "use_v2_block_manager": True,
-        "tensor_parallel_size": 4,
-
-        # Use AsyncLLM engine, so that the engine runs in its own process.
-        # Otherwise, since vLLM does not follow true SPMD, the test runner
-        # process will have both the engine and the rank0 worker. NCCL is not
-        # cleaned up properly, and its server host thread leaks, causing the
-        # second run of the test to fail with internal NCCL error.
-        "use_async": True,
-    }])
+        "--use-v2-block-manager",
+        "--tensor-parallel-size",
+        "4",
+    ]])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [
-    {
-        "speculative_model": "JackFram/llama-68m",
-        "num_speculative_tokens": 5,
-    },
+    [
+        "--speculative-model",
+        f"{SPEC_MODEL}",
+        "--num-speculative-tokens",
+        "5",
+    ],
 ])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
 @pytest.mark.parametrize(
     "test_llm_kwargs",
     [
         #TODO(wooyeon): add spec_draft_dp=2 case
-        {
-            "speculative_draft_tensor_parallel_size": 1,
-        },
+        [
+            "--speculative-draft-tensor-parallel-size",
+            "1",
+        ],
     ])
 @pytest.mark.parametrize("batch_size", [2])
 @pytest.mark.parametrize("seed", [1])
-def test_draft_model_tp_lt_target_model_tp4(test_llm_generator,
-                                            baseline_llm_generator,
-                                            batch_size: int):
+def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs,
+                                            per_test_common_llm_kwargs,
+                                            baseline_llm_kwargs,
+                                            test_llm_kwargs, batch_size: int,
+                                            seed: int):
     """Verify spec decode works well with smaller tp for draft models.
     """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=32,
-                                         force_output_len=True)
+    run_equality_correctness_test_tp(MAIN_MODEL,
+                                     common_llm_kwargs,
+                                     per_test_common_llm_kwargs,
+                                     baseline_llm_kwargs,
+                                     test_llm_kwargs,
+                                     batch_size,
+                                     max_output_len=32,
+                                     seed=seed,
+                                     temperature=0.0)
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 4,
                     reason="Need at least 4 GPUs to run the test.")
 @pytest.mark.parametrize(
     "common_llm_kwargs",
-    [{
-        "model": "JackFram/llama-160m",
+    [[
 
         # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
+        "--enforce-eager",
 
         # Required for spec decode.
-        "use_v2_block_manager": True,
-        "tensor_parallel_size": 4,
-
-        # Use AsyncLLM engine, so that the engine runs in its own process.
-        # Otherwise, since vLLM does not follow true SPMD, the test runner
-        # process will have both the engine and the rank0 worker. NCCL is not
-        # cleaned up properly, and its server host thread leaks, causing the
-        # second run of the test to fail with internal NCCL error.
-        "use_async": True,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+        "--use-v2-block-manager",
+        "--tensor-parallel-size",
+        "4",
+    ]])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [[]])
+@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
 @pytest.mark.parametrize(
     "test_llm_kwargs",
     [
-        {
-            "speculative_model": "JackFram/llama-68m",
-            "num_speculative_tokens": 5,
+        [
+            "--speculative-model",
+            f"{SPEC_MODEL}",
+            "--num-speculative-tokens",
+            "5",
 
             # Artificially limit the draft model max model len; this forces vLLM
             # to skip speculation once the sequences grow beyond 32-k tokens.
-            "speculative_max_model_len": 32,
-        },
+            "--speculative-max-model-len",
+            "32",
+        ],
     ])
 @pytest.mark.parametrize("batch_size", [8])
 @pytest.mark.parametrize(
@@ -105,8 +104,9 @@ def test_draft_model_tp_lt_target_model_tp4(test_llm_generator,
         64,
     ])
 @pytest.mark.parametrize("seed", [1])
-def test_skip_speculation(baseline_llm_generator, test_llm_generator,
-                          batch_size: int, output_len: int):
+def test_skip_speculation(common_llm_kwargs, per_test_common_llm_kwargs,
+                          baseline_llm_kwargs, test_llm_kwargs,
+                          batch_size: int, output_len: int, seed: int):
     """Verify job failure with RuntimeError when all sequences skip speculation.
     We do this by setting the max model len of the draft model to an
     artificially low value, such that when the sequences grow beyond it, they
@@ -114,9 +114,13 @@ def test_skip_speculation(baseline_llm_generator, test_llm_generator,
 
     TODO: fix it to pass without raising Error. (#5814)
     """
-    with pytest.raises(RuntimeError):
-        run_greedy_equality_correctness_test(baseline_llm_generator,
-                                             test_llm_generator,
-                                             batch_size,
-                                             max_output_len=output_len,
-                                             force_output_len=True)
+    with pytest.raises(openai.APIConnectionError):
+        run_equality_correctness_test_tp(MAIN_MODEL,
+                                         common_llm_kwargs,
+                                         per_test_common_llm_kwargs,
+                                         baseline_llm_kwargs,
+                                         test_llm_kwargs,
+                                         batch_size,
+                                         output_len,
+                                         seed,
+                                         temperature=0.0)
diff --git a/tests/spec_decode/e2e/test_logprobs.py b/tests/spec_decode/e2e/test_logprobs.py
index 4c6012ec49237..03c1733f104ff 100644
--- a/tests/spec_decode/e2e/test_logprobs.py
+++ b/tests/spec_decode/e2e/test_logprobs.py
@@ -1,24 +1,22 @@
-import math
 from itertools import cycle
 
 import pytest
 
 from vllm import SamplingParams
 
-from .conftest import get_logprobs_from_llm_generator
+from .conftest import run_logprob_correctness_test
 
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
     [{
-        "model": "JackFram/llama-68m",
+        "model_name": "JackFram/llama-68m",
 
         # Skip cuda graph recording for fast test.
         "enforce_eager": True,
 
         # Required for spec decode.
         "use_v2_block_manager": True,
-        "max_logprobs": 6,
     }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -36,64 +34,29 @@
         7,
     ])
 @pytest.mark.parametrize("seed", [1])
-def test_logprobs_equality(baseline_llm_generator, test_llm_generator,
-                           batch_size: int, output_len: int):
+@pytest.mark.parametrize("logprobs", [1, 6])
+def test_logprobs_equality(vllm_runner, common_llm_kwargs,
+                           per_test_common_llm_kwargs, baseline_llm_kwargs,
+                           test_llm_kwargs, batch_size: int, output_len: int,
+                           seed: int, logprobs: int):
     """Verify output logprobs are equal with and without speculative decoding.
     """
-    run_greedy_logprobs_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_logprob_correctness_test(vllm_runner,
+                                 common_llm_kwargs,
+                                 per_test_common_llm_kwargs,
+                                 baseline_llm_kwargs,
+                                 test_llm_kwargs,
+                                 batch_size,
+                                 output_len,
+                                 seed,
+                                 temperature=0.0,
+                                 logprobs=logprobs)
 
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
     [{
-        "model": "JackFram/llama-68m",
-
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Required for spec decode.
-        "use_v2_block_manager": True,
-        "max_logprobs": 6,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs",
-                         [{
-                             "speculative_model": "JackFram/llama-160m",
-                             "num_speculative_tokens": 3,
-                             "disable_logprobs_during_spec_decoding": False,
-                         }])
-@pytest.mark.parametrize("batch_size", [1])
-@pytest.mark.parametrize("num_logprobs", [6])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        7,
-    ])
-@pytest.mark.parametrize("seed", [1])
-def test_diff_num_logprobs(baseline_llm_generator, test_llm_generator,
-                           batch_size: int, output_len: int,
-                           num_logprobs: int):
-    """Verify output logprobs are equal with and without spec decode.
-    This specifies a number of logprobs >1.
-    """
-    run_greedy_logprobs_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True,
-                                         logprob_rank=num_logprobs)
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "model": "JackFram/llama-68m",
+        "model_name": "JackFram/llama-68m",
 
         # Skip cuda graph recording for fast test.
         "enforce_eager": True,
@@ -121,21 +84,29 @@ def test_diff_num_logprobs(baseline_llm_generator, test_llm_generator,
         32,
     ])
 @pytest.mark.parametrize("seed", [1])
-def test_logprobs_different_k(baseline_llm_generator, test_llm_generator,
-                              batch_size: int, output_len: int):
+@pytest.mark.parametrize("logprobs", [1, 6])
+def test_logprobs_different_k(vllm_runner, common_llm_kwargs,
+                              per_test_common_llm_kwargs, baseline_llm_kwargs,
+                              test_llm_kwargs, batch_size: int,
+                              output_len: int, seed: int, logprobs: int):
     """Veriy logprob greedy equality with different speculation lens.
     """
-    run_greedy_logprobs_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_logprob_correctness_test(vllm_runner,
+                                 common_llm_kwargs,
+                                 per_test_common_llm_kwargs,
+                                 baseline_llm_kwargs,
+                                 test_llm_kwargs,
+                                 batch_size,
+                                 output_len,
+                                 seed,
+                                 temperature=0.0,
+                                 logprobs=logprobs)
 
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
     [{
-        "model": "JackFram/llama-68m",
+        "model_name": "JackFram/llama-68m",
 
         # Skip cuda graph recording for fast test.
         "enforce_eager": True,
@@ -164,22 +135,30 @@ def test_logprobs_different_k(baseline_llm_generator, test_llm_generator,
         32,
     ])
 @pytest.mark.parametrize("seed", [1])
-def test_logprobs_when_skip_speculation(baseline_llm_generator,
-                                        test_llm_generator, batch_size: int,
-                                        output_len: int):
+@pytest.mark.parametrize("logprobs", [1])
+def test_logprobs_when_skip_speculation(vllm_runner, common_llm_kwargs,
+                                        per_test_common_llm_kwargs,
+                                        baseline_llm_kwargs, test_llm_kwargs,
+                                        batch_size: int, output_len: int,
+                                        seed: int, logprobs: int):
     """Verify logprobs greedy equality when some sequences skip speculation.
     """
-    run_greedy_logprobs_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_logprob_correctness_test(vllm_runner,
+                                 common_llm_kwargs,
+                                 per_test_common_llm_kwargs,
+                                 baseline_llm_kwargs,
+                                 test_llm_kwargs,
+                                 batch_size,
+                                 output_len,
+                                 seed,
+                                 temperature=0.0,
+                                 logprobs=logprobs)
 
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
     [{
-        "model": "JackFram/llama-68m",
+        "model_name": "JackFram/llama-68m",
 
         # Skip cuda graph recording for fast test.
         "enforce_eager": True,
@@ -203,19 +182,17 @@ def test_logprobs_when_skip_speculation(baseline_llm_generator,
         32,
     ])
 @pytest.mark.parametrize("seed", [1])
-def test_logprobs_temp_1(baseline_llm_generator, test_llm_generator,
-                         batch_size: int, output_len: int):
+@pytest.mark.parametrize("logprobs", [6])
+def test_logprobs_temp_1(vllm_runner, common_llm_kwargs,
+                         per_test_common_llm_kwargs, baseline_llm_kwargs,
+                         test_llm_kwargs, batch_size: int, output_len: int,
+                         seed: int, logprobs: int):
     """Verify at least one logprob result has num_logprobs+1, which tests the
     case where the sampled token is not in top-k logprobs.
 
     Ideally, this test should validate equality with non-spec by getting
     logprobs. This is left as future improvement.
     """
-    batch_size = 8
-    max_output_len = output_len
-    force_output_len = True
-    logprob_rank = 5
-
     temperature = 1.0
 
     prompts = [
@@ -231,129 +208,40 @@ def test_logprobs_temp_1(baseline_llm_generator, test_llm_generator,
 
     prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
 
-    # If the test requires that we generated max_output_len tokens, then set the
-    # sampling params to ignore eos token.
-    ignore_eos = force_output_len
-
     sampling_params = SamplingParams(
-        max_tokens=max_output_len,
-        ignore_eos=ignore_eos,
+        max_tokens=output_len,
+        ignore_eos=True,
         temperature=temperature,
-        logprobs=logprob_rank,
+        logprobs=logprobs,
     )
 
-    spec_batch_logprobs = get_logprobs_from_llm_generator(
-        test_llm_generator, prompts, sampling_params)
+    sd_args = {
+        **common_llm_kwargs,
+        **per_test_common_llm_kwargs,
+        **test_llm_kwargs,
+    }
+
+    with vllm_runner(**sd_args) as vllm_model:
+        sd_outputs = vllm_model.generate_w_logprobs(prompts, sampling_params)
 
     num_returned_logprobs = [
-        len(logprob_dict) for seq_logprobs in spec_batch_logprobs
-        for logprob_dict in seq_logprobs
+        len(seq_logprobs) for seq_logprobs in sd_outputs[-1]
     ]
 
     # Assert one of the returned logprobs has > num_logprobs (indicating the
     # sampled token is not in top-k).
-    assert any([
-        num_returned > logprob_rank for num_returned in num_returned_logprobs
-    ])
-
-
-def run_greedy_logprobs_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len,
-                                         force_output_len: bool,
-                                         logprob_rank: int = 1):
-    """Helper method that compares the logprobs outputs of both the baseline LLM
-    and the test LLM. It asserts greedy equality of the logprobs when the
-    temperature is zero.
-    """
-    temperature = 0.0
-
-    prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-        "San Francisco is know for its",
-        "Facebook was created in 2004 by",
-        "Curious George is a",
-        "Python 3.11 brings improvements to its",
-    ]
-
-    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
-
-    # If the test requires that we generated max_output_len tokens, then set the
-    # sampling params to ignore eos token.
-    ignore_eos = force_output_len
-
-    sampling_params = SamplingParams(
-        max_tokens=max_output_len,
-        ignore_eos=ignore_eos,
-        temperature=temperature,
-        logprobs=logprob_rank,
-    )
-
-    spec_batch_logprobs = get_logprobs_from_llm_generator(
-        test_llm_generator, prompts, sampling_params)
-    baseline_batch_logprobs = get_logprobs_from_llm_generator(
-        baseline_llm_generator, prompts, sampling_params)
-
-    assert len(baseline_batch_logprobs) == len(prompts)
-    assert len(spec_batch_logprobs) == len(prompts)
-
-    # For each sequence in the batch.
-    for i, (baseline_logprobs, spec_logprobs) in enumerate(
-            zip(baseline_batch_logprobs, spec_batch_logprobs)):
-        assert len(spec_logprobs) == len(baseline_logprobs)
-
-        # For each generated position of the sequence.
-        for pos, (spec_pos_logprobs, baseline_pos_logprobs) in enumerate(
-                zip(spec_logprobs, baseline_logprobs)):
-
-            # Map rank to token/logprob in spec output.
-            spec_rank_to_token_id = {
-                value.rank: key
-                for key, value in spec_pos_logprobs.items()
-            }
-            spec_rank_to_logprob = {
-                value.rank: value.logprob
-                for key, value in spec_pos_logprobs.items()
-            }
-
-            # Map rank to token/logprob in baseline output.
-            baseline_rank_to_token_id = {
-                value.rank: key
-                for key, value in baseline_pos_logprobs.items()
-            }
-            baseline_rank_to_logprob = {
-                value.rank: value.logprob
-                for key, value in baseline_pos_logprobs.items()
-            }
-
-            # Assert set of ranks returned is equal.
-            assert set(spec_rank_to_token_id.keys()) == set(
-                baseline_rank_to_token_id.keys())
-
-            # Assert each logprob/token id is correct, keyed by rank.
-            for rank in sorted(set(spec_rank_to_token_id.keys())):
-                assert spec_rank_to_token_id[
-                    rank] == baseline_rank_to_token_id[rank], f"{rank}"
-                assert math.isclose(
-                    a=spec_rank_to_logprob[rank],
-                    b=baseline_rank_to_logprob[rank],
-                    abs_tol=1e-1,
-                )
+    assert any(
+        [num_returned > logprobs for num_returned in num_returned_logprobs])
 
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
     [{
-        "model": "JackFram/llama-160m",
+        "model_name": "JackFram/llama-160m",
         # Skip cuda graph recording for fast test.
         "enforce_eager": True,
         # Required for spec decode.
         "use_v2_block_manager": True,
-        "max_logprobs": 6,
     }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -364,57 +252,28 @@ def run_greedy_logprobs_correctness_test(baseline_llm_generator,
                              "disable_logprobs_during_spec_decoding": True,
                          }])
 @pytest.mark.parametrize("seed", [1])
-def test_logprobs_disabled(baseline_llm_generator, test_llm_generator):
+@pytest.mark.parametrize("batch_size", [4])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use smaller output len for fast test.
+        32,
+    ])
+@pytest.mark.parametrize("logprobs", [0])
+def test_logprobs_disabled(vllm_runner, common_llm_kwargs,
+                           per_test_common_llm_kwargs, baseline_llm_kwargs,
+                           test_llm_kwargs, batch_size: int, output_len: int,
+                           seed: int, logprobs: int):
     """Check the behavior when logprobs are disabled.
     Token choices should match with the base model.
     """
-    prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-        "San Francisco is know for its",
-        "Facebook was created in 2004 by",
-        "Curious George is a",
-        "Python 3.11 brings improvements to its",
-    ]
-
-    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(4))]
-
-    sampling_params = SamplingParams(
-        # Use smaller output len for fast test
-        max_tokens=7,
-        ignore_eos=True,
-        temperature=0.0,
-        logprobs=2,
-    )
-
-    spec_batch_logprobs = get_logprobs_from_llm_generator(
-        test_llm_generator, prompts, sampling_params)
-    baseline_batch_logprobs = get_logprobs_from_llm_generator(
-        baseline_llm_generator, prompts, sampling_params)
-
-    assert len(baseline_batch_logprobs) == len(prompts)
-    assert len(spec_batch_logprobs) == len(prompts)
-
-    # For each sequence in the batch.
-    for _, (baseline_logprobs, spec_logprobs) in enumerate(
-            zip(baseline_batch_logprobs, spec_batch_logprobs)):
-        assert len(spec_logprobs) == len(baseline_logprobs)
-
-        # For each generated position of the sequence.
-        for _, (spec_pos_logprobs, baseline_pos_logprobs) in enumerate(
-                zip(spec_logprobs, baseline_logprobs)):
-
-            assert len(spec_pos_logprobs) == 1
-            spec_top_token_id = list(spec_pos_logprobs)[0]
-
-            spec_top_logprob = spec_pos_logprobs[spec_top_token_id]
-            assert spec_top_logprob.logprob == 0.0
-            assert spec_top_logprob.rank == -1
-
-            # check that the chosen token matches the base model
-            baseline_logprob = baseline_pos_logprobs[spec_top_token_id]
-            assert baseline_logprob.rank == 1
-            assert spec_top_logprob.decoded_token \
-                == baseline_logprob.decoded_token
+    run_logprob_correctness_test(vllm_runner,
+                                 common_llm_kwargs,
+                                 per_test_common_llm_kwargs,
+                                 baseline_llm_kwargs,
+                                 test_llm_kwargs,
+                                 batch_size,
+                                 output_len,
+                                 seed,
+                                 temperature=0.0,
+                                 logprobs=logprobs)
diff --git a/tests/spec_decode/e2e/test_medusa_correctness.py b/tests/spec_decode/e2e/test_medusa_correctness.py
index de4b2ab796a3c..568c2d65fca59 100644
--- a/tests/spec_decode/e2e/test_medusa_correctness.py
+++ b/tests/spec_decode/e2e/test_medusa_correctness.py
@@ -21,7 +21,7 @@
 
 import pytest
 
-from .conftest import run_greedy_equality_correctness_test
+from .conftest import run_equality_correctness_test
 
 # main model
 # lmsys/vicuna-7b-v1.3 was to be used but it's causing
@@ -55,7 +55,7 @@
         "dtype": PRECISION,
 
         # Main model
-        "model": MAIN_MODEL,
+        "model_name": MAIN_MODEL,
     }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -70,15 +70,21 @@
 ])
 @pytest.mark.parametrize("batch_size", [1, 32])
 @pytest.mark.parametrize("seed", [1])
-def test_medusa_e2e_greedy_correctness(baseline_llm_generator,
-                                       test_llm_generator, batch_size: int,
-                                       output_len: int):
+def test_medusa_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
+                                       per_test_common_llm_kwargs,
+                                       baseline_llm_kwargs, test_llm_kwargs,
+                                       batch_size: int, output_len: int,
+                                       seed: int):
     """Verify greedy equality with different batch size."""
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)
 
 
 @pytest.mark.parametrize(
@@ -96,7 +102,7 @@ def test_medusa_e2e_greedy_correctness(baseline_llm_generator,
         "dtype": PRECISION,
 
         # Main model
-        "model": MAIN_MODEL,
+        "model_name": MAIN_MODEL,
     }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -111,17 +117,21 @@ def test_medusa_e2e_greedy_correctness(baseline_llm_generator,
 ])
 @pytest.mark.parametrize("batch_size", [1, 32])
 @pytest.mark.parametrize("seed", [1])
-def test_medusa_e2e_greedy_correctness_cuda_graph(baseline_llm_generator,
-                                                  test_llm_generator,
-                                                  batch_size: int,
-                                                  output_len: int):
+def test_medusa_e2e_greedy_correctness_cuda_graph(
+        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
+        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
+        seed: int):
     """Verify greedy equality with cuda graph enabled and different 
     batch sizes."""
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)
 
 
 @pytest.mark.parametrize(
@@ -142,7 +152,7 @@ def test_medusa_e2e_greedy_correctness_cuda_graph(baseline_llm_generator,
         "dtype": PRECISION,
 
         # Main model
-        "model": MAIN_MODEL,
+        "model_name": MAIN_MODEL,
     }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -160,18 +170,22 @@ def test_medusa_e2e_greedy_correctness_cuda_graph(baseline_llm_generator,
     ])
 @pytest.mark.parametrize("batch_size", [4])
 @pytest.mark.parametrize("seed", [1])
-def test_medusa_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
-                                                       test_llm_generator,
-                                                       batch_size: int,
-                                                       output_len: int):
+def test_medusa_e2e_greedy_correctness_with_preemption(
+        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
+        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
+        seed: int):
     """Verify greedy equality, even when some sequences are preempted mid-
     generation.
     """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)
 
 
 @pytest.mark.parametrize(
@@ -187,7 +201,7 @@ def test_medusa_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
         "dtype": PRECISION,
 
         # Main model
-        "model": MAIN_MODEL,
+        "model_name": MAIN_MODEL,
     }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -209,16 +223,22 @@ def test_medusa_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
         32,
     ])
 @pytest.mark.parametrize("seed", [1])
-def test_medusa_different_k(baseline_llm_generator, test_llm_generator,
-                            batch_size: int, output_len: int):
+def test_medusa_different_k(vllm_runner, common_llm_kwargs,
+                            per_test_common_llm_kwargs, baseline_llm_kwargs,
+                            test_llm_kwargs, batch_size: int, output_len: int,
+                            seed: int):
     """Verify that medusa speculative decoding produces exact equality
     to without spec decode with different values of num_speculative_tokens.
     """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)
 
 
 @pytest.mark.parametrize(
@@ -234,7 +254,7 @@ def test_medusa_different_k(baseline_llm_generator, test_llm_generator,
         "dtype": PRECISION,
 
         # Main model
-        "model": MAIN_MODEL,
+        "model_name": MAIN_MODEL,
     }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -252,17 +272,23 @@ def test_medusa_different_k(baseline_llm_generator, test_llm_generator,
         32,
     ])
 @pytest.mark.parametrize("seed", [1])
-def test_medusa_disable_queue(baseline_llm_generator, test_llm_generator,
-                              batch_size: int, output_len: int):
+def test_medusa_disable_queue(vllm_runner, common_llm_kwargs,
+                              per_test_common_llm_kwargs, baseline_llm_kwargs,
+                              test_llm_kwargs, batch_size: int,
+                              output_len: int, seed: int):
     """Verify that medusa speculative decoding produces exact equality
     to without spec decode when speculation is disabled for large
     batch sizes.
     """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)
 
 
 if __name__ == "__main__":
diff --git a/tests/spec_decode/e2e/test_mlp_correctness.py b/tests/spec_decode/e2e/test_mlp_correctness.py
index c72e4595fd335..2d0d6fb923ad1 100644
--- a/tests/spec_decode/e2e/test_mlp_correctness.py
+++ b/tests/spec_decode/e2e/test_mlp_correctness.py
@@ -25,8 +25,7 @@
 
 from vllm.model_executor.layers.vocab_parallel_embedding import pad_vocab_size
 
-from .conftest import (run_equality_correctness_test,
-                       run_greedy_equality_correctness_test)
+from .conftest import run_equality_correctness_test
 
 # main model
 MAIN_MODEL = "JackFram/llama-160m"
@@ -58,7 +57,7 @@
         "dtype": PRECISION,
 
         # Main model
-        "model": MAIN_MODEL,
+        "model_name": MAIN_MODEL,
     }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -72,14 +71,21 @@
 ])
 @pytest.mark.parametrize("batch_size", [1, 32])
 @pytest.mark.parametrize("seed", [1])
-def test_mlp_e2e_greedy_correctness(baseline_llm_generator, test_llm_generator,
-                                    batch_size: int, output_len: int):
+def test_mlp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
+                                    per_test_common_llm_kwargs,
+                                    baseline_llm_kwargs, test_llm_kwargs,
+                                    batch_size: int, output_len: int,
+                                    seed: int):
     """Verify greedy equality with different batch size."""
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)
 
 
 @pytest.mark.parametrize(
@@ -98,7 +104,7 @@ def test_mlp_e2e_greedy_correctness(baseline_llm_generator, test_llm_generator,
         "dtype": PRECISION,
 
         # Main model
-        "model": MAIN_MODEL,
+        "model_name": MAIN_MODEL,
     }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -110,17 +116,21 @@ def test_mlp_e2e_greedy_correctness(baseline_llm_generator, test_llm_generator,
 @pytest.mark.parametrize("output_len", [2048])
 @pytest.mark.parametrize("batch_size", [1, 32])
 @pytest.mark.parametrize("seed", [1])
-def test_mlp_e2e_acceptance_rate(baseline_llm_generator, test_llm_generator,
-                                 batch_size: int, output_len: int):
+def test_mlp_e2e_acceptance_rate(vllm_runner, common_llm_kwargs,
+                                 per_test_common_llm_kwargs,
+                                 baseline_llm_kwargs, test_llm_kwargs,
+                                 batch_size: int, output_len: int, seed: int):
     """Verify acceptance rate with different batch size and large output 
     length."""
-    run_equality_correctness_test(baseline_llm_generator,
-                                  test_llm_generator,
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
                                   batch_size,
                                   max_output_len=output_len,
                                   temperature=0.0,
-                                  seeded=True,
-                                  force_output_len=True,
+                                  seed=seed,
                                   expected_acceptance_rate=0.48)
 
 
@@ -140,7 +150,7 @@ def test_mlp_e2e_acceptance_rate(baseline_llm_generator, test_llm_generator,
         "dtype": PRECISION,
 
         # Main model
-        "model": MAIN_MODEL,
+        "model_name": MAIN_MODEL,
 
         # Speculative model
         "speculative_model": SPEC_MODEL,
@@ -151,28 +161,35 @@ def test_mlp_e2e_acceptance_rate(baseline_llm_generator, test_llm_generator,
 @pytest.mark.parametrize("output_len", [64])
 @pytest.mark.parametrize("batch_size", [1, 32])
 @pytest.mark.parametrize("temperature", [0.1, 1.0])
-@pytest.mark.parametrize("seed", [None])
-def test_mlp_e2e_seeded_correctness(baseline_llm_generator, test_llm_generator,
+@pytest.mark.parametrize("seed", [1])
+def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs,
+                                    per_test_common_llm_kwargs,
+                                    baseline_llm_kwargs, test_llm_kwargs,
                                     batch_size: int, output_len: int,
-                                    temperature: float):
+                                    temperature: float, seed: int):
     """Verify seeded runs produce the same output."""
-    run_equality_correctness_test(baseline_llm_generator,
-                                  test_llm_generator,
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
                                   batch_size,
                                   max_output_len=output_len,
                                   temperature=temperature,
-                                  seeded=True,
-                                  force_output_len=True)
+                                  seed=seed)
 
     # Ensure this same test does fail if we _don't_ include per-request seeds
     with pytest.raises(AssertionError):
-        run_equality_correctness_test(baseline_llm_generator,
-                                      test_llm_generator,
+        run_equality_correctness_test(vllm_runner,
+                                      common_llm_kwargs,
+                                      per_test_common_llm_kwargs,
+                                      baseline_llm_kwargs,
+                                      test_llm_kwargs,
                                       batch_size,
                                       max_output_len=output_len,
                                       temperature=temperature,
-                                      seeded=False,
-                                      force_output_len=True)
+                                      seed=seed,
+                                      disable_seed=True)
 
 
 @pytest.mark.parametrize(
@@ -193,7 +210,7 @@ def test_mlp_e2e_seeded_correctness(baseline_llm_generator, test_llm_generator,
         "dtype": PRECISION,
 
         # Main model
-        "model": MAIN_MODEL,
+        "model_name": MAIN_MODEL,
     }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -210,18 +227,22 @@ def test_mlp_e2e_seeded_correctness(baseline_llm_generator, test_llm_generator,
     ])
 @pytest.mark.parametrize("batch_size", [4])
 @pytest.mark.parametrize("seed", [1])
-def test_mlp_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
-                                                    test_llm_generator,
-                                                    batch_size: int,
-                                                    output_len: int):
+def test_mlp_e2e_greedy_correctness_with_preemption(
+        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
+        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
+        seed: int):
     """Verify greedy equality, even when some sequences are preempted mid-
     generation.
     """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)
 
 
 @pytest.mark.parametrize(
@@ -242,7 +263,7 @@ def test_mlp_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
         "dtype": PRECISION,
 
         # Main model
-        "model": MAIN_MODEL,
+        "model_name": MAIN_MODEL,
     }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -259,10 +280,10 @@ def test_mlp_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
     ])
 @pytest.mark.parametrize("batch_size", [4])
 @pytest.mark.parametrize("seed", [1])
-def test_mlp_e2e_greedy_correctness_with_padding(baseline_llm_generator,
-                                                 test_llm_generator,
-                                                 batch_size: int,
-                                                 output_len: int):
+def test_mlp_e2e_greedy_correctness_with_padding(
+        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
+        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
+        seed: int):
     """Verify greedy equality when the vocab dimension is padded
     """
 
@@ -273,11 +294,15 @@ def patched_pad_vocab_size(vocab_size, pad_to=None):
     with patch(
             "vllm.model_executor.layers.vocab_parallel_embedding.pad_vocab_size",
             patched_pad_vocab_size):
-        run_greedy_equality_correctness_test(baseline_llm_generator,
-                                             test_llm_generator,
-                                             batch_size,
-                                             max_output_len=output_len,
-                                             force_output_len=True)
+        run_equality_correctness_test(vllm_runner,
+                                      common_llm_kwargs,
+                                      per_test_common_llm_kwargs,
+                                      baseline_llm_kwargs,
+                                      test_llm_kwargs,
+                                      batch_size,
+                                      max_output_len=output_len,
+                                      seed=seed,
+                                      temperature=0.0)
 
 
 @pytest.mark.parametrize(
@@ -293,7 +318,7 @@ def patched_pad_vocab_size(vocab_size, pad_to=None):
         "dtype": PRECISION,
 
         # Main model
-        "model": MAIN_MODEL,
+        "model_name": MAIN_MODEL,
     }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -315,16 +340,22 @@ def patched_pad_vocab_size(vocab_size, pad_to=None):
         32,
     ])
 @pytest.mark.parametrize("seed", [1])
-def test_mlp_different_k(baseline_llm_generator, test_llm_generator,
-                         batch_size: int, output_len: int):
+def test_mlp_different_k(vllm_runner, common_llm_kwargs,
+                         per_test_common_llm_kwargs, baseline_llm_kwargs,
+                         test_llm_kwargs, batch_size: int, seed: int,
+                         output_len: int):
     """Verify that mlp speculative decoding produces exact equality
     to without spec decode with different values of num_speculative_tokens.
     """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)
 
 
 @pytest.mark.parametrize(
@@ -340,7 +371,7 @@ def test_mlp_different_k(baseline_llm_generator, test_llm_generator,
         "dtype": PRECISION,
 
         # Main model
-        "model": MAIN_MODEL,
+        "model_name": MAIN_MODEL,
     }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -357,14 +388,20 @@ def test_mlp_different_k(baseline_llm_generator, test_llm_generator,
         32,
     ])
 @pytest.mark.parametrize("seed", [1])
-def test_mlp_disable_queue(baseline_llm_generator, test_llm_generator,
-                           batch_size: int, output_len: int):
+def test_mlp_disable_queue(vllm_runner, common_llm_kwargs,
+                           per_test_common_llm_kwargs, baseline_llm_kwargs,
+                           test_llm_kwargs, batch_size: int, seed: int,
+                           output_len: int):
     """Verify that mlp speculative decoding produces exact equality
     to without spec decode when speculation is disabled for large
     batch sizes.
     """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)
diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py
index 86cab7aba2380..df6f12d57b400 100644
--- a/tests/spec_decode/e2e/test_multistep_correctness.py
+++ b/tests/spec_decode/e2e/test_multistep_correctness.py
@@ -41,8 +41,9 @@
 
 from vllm import SamplingParams
 
+from ...utils import fork_new_process_for_each_test
 from .conftest import (get_output_from_llm_generator,
-                       run_greedy_equality_correctness_test)
+                       run_equality_correctness_test)
 
 
 @pytest.mark.parametrize(
@@ -73,6 +74,7 @@
 @pytest.mark.parametrize("test_llm_kwargs", [{}])
 @pytest.mark.parametrize("batch_size", [1, 32])
 @pytest.mark.parametrize("seed", [1])
+@fork_new_process_for_each_test
 def test_spec_decode_e2e_with_detokenization(test_llm_generator,
                                              batch_size: int):
     """Run generation with speculative decoding on a batch. Verify the engine
@@ -116,44 +118,6 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator,
         assert actual_tokens.strip() == expected_tokens.strip()
 
 
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Use a small model for a fast test.
-        # Note this is repeated in the test body; to initialize a tokenizer.
-        "model": "JackFram/llama-68m",
-
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Required for spec decode.
-        "use_v2_block_manager": True,
-
-        # Use AsyncLLM engine
-        "use_async": True,
-    }])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [
-    {
-        "speculative_model": "JackFram/llama-68m",
-        "num_speculative_tokens": 5,
-    },
-])
-@pytest.mark.parametrize("test_llm_kwargs", [{}])
-@pytest.mark.parametrize("batch_size", [2])
-@pytest.mark.parametrize("seed", [1])
-def test_spec_decode_e2e_with_async_engine(test_llm_generator,
-                                           baseline_llm_generator,
-                                           batch_size: int):
-    """Verify spec decode works well with async LLM engine.
-    """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=32,
-                                         force_output_len=True)
-
-
 @pytest.mark.parametrize(
     "common_llm_kwargs",
     [{
@@ -172,10 +136,10 @@ def test_spec_decode_e2e_with_async_engine(test_llm_generator,
         # Try two different tiny base models.
         # Note that one is equal to the draft model, another isn't.
         {
-            "model": "JackFram/llama-68m",
+            "model_name": "JackFram/llama-68m",
         },
         {
-            "model": "JackFram/llama-160m",
+            "model_name": "JackFram/llama-160m",
         },
     ])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -189,13 +153,15 @@ def test_spec_decode_e2e_with_async_engine(test_llm_generator,
     "output_len",
     [
         # Use long output len for the small model test.
-        1536,
+        10,
     ])
 @pytest.mark.parametrize("batch_size", [1])
 @pytest.mark.parametrize("seed", [1])
+@fork_new_process_for_each_test
 def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
-        baseline_llm_generator, test_llm_generator, batch_size: int,
-        output_len: int):
+        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
+        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
+        seed: int):
     """Verify greedy equality on a tiny model with batch size of one.
 
     Since this test is cheaper than other e2e correctness tests, we generate
@@ -204,14 +170,18 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
     When the draft model is the same as the target model, we further check
     whether all speculative tokens are accepted.
     """
-    ensure_all_accepted = test_llm_generator.same_draft_target_model
-    run_greedy_equality_correctness_test(
-        baseline_llm_generator,
-        test_llm_generator,
-        batch_size,
-        max_output_len=output_len,
-        force_output_len=True,
-        ensure_all_accepted=ensure_all_accepted)
+    ensure_all_accepted = per_test_common_llm_kwargs.get(
+        "model_name") == test_llm_kwargs.get("speculative_model")
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0,
+                                  ensure_all_accepted=ensure_all_accepted)
 
 
 @pytest.mark.parametrize(
@@ -232,10 +202,10 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
         # Try two different tiny base models.
         # Note that one is equal to the draft model, another isn't.
         {
-            "model": "JackFram/llama-68m",
+            "model_name": "JackFram/llama-68m",
         },
         {
-            "model": "JackFram/llama-160m",
+            "model_name": "JackFram/llama-160m",
         },
     ])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -253,16 +223,22 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
     ])
 @pytest.mark.parametrize("batch_size", [64])
 @pytest.mark.parametrize("seed", [1])
+@fork_new_process_for_each_test
 def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
-        baseline_llm_generator, test_llm_generator, batch_size: int,
-        output_len: int):
+        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
+        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
+        seed: int):
     """Verify greedy equality on a tiny model and large batch size.
     """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)
 
 
 @pytest.mark.parametrize(
@@ -280,10 +256,10 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
         # Try two different tiny base models.
         # Note that one is equal to the draft model, another isn't.
         {
-            "model": "JackFram/llama-68m",
+            "model_name": "JackFram/llama-68m",
         },
         {
-            "model": "JackFram/llama-160m",
+            "model_name": "JackFram/llama-160m",
         },
     ])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -298,24 +274,31 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
 ])
 @pytest.mark.parametrize("batch_size", [32])
 @pytest.mark.parametrize("seed", [1])
+@fork_new_process_for_each_test
 def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
-        baseline_llm_generator, test_llm_generator, batch_size: int,
-        max_output_len: int):
+        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
+        baseline_llm_kwargs, test_llm_kwargs, batch_size: int,
+        max_output_len: int, seed: int):
     """Verify greedy equality on a tiny model, with a large batch size, and when
     sampling respects the EOS token.
     """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len,
-                                         force_output_len=False)
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len,
+                                  seed=seed,
+                                  temperature=0.0,
+                                  ignore_eos=False)
 
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
     [{
         # A "real" model (not tiny).
-        "model": "meta-llama/Llama-2-7b-chat-hf",
+        "model_name": "meta-llama/Llama-2-7b-chat-hf",
 
         # Skip cuda graph recording for fast test.
         "enforce_eager": True,
@@ -342,24 +325,30 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
         256,
     ])
 @pytest.mark.parametrize("seed", [1])
+@fork_new_process_for_each_test
 def test_spec_decode_e2e_greedy_correctness_real_model_bs1(
-        baseline_llm_generator, test_llm_generator, batch_size: int,
-        output_len: int):
+        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
+        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
+        seed: int):
     """Verify greedy equality on a "real" model and batch size of 1. This is
     separate from large BS tests to make identifying the source of bugs easier.
     """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)
 
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
     [{
         # A "real" model (not tiny).
-        "model": "meta-llama/Llama-2-7b-chat-hf",
+        "model_name": "meta-llama/Llama-2-7b-chat-hf",
 
         # Skip cuda graph recording for fast test.
         "enforce_eager": True,
@@ -386,17 +375,23 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1(
         64,
     ])
 @pytest.mark.parametrize("seed", [1])
+@fork_new_process_for_each_test
 def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
-        baseline_llm_generator, test_llm_generator, batch_size: int,
-        output_len: int):
+        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
+        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
+        seed: int):
     """Verify greedy equality with a "real" model on a nontrivial batch size.
     This is the closest test to a real production workload.
     """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)
 
 
 @pytest.mark.parametrize(
@@ -415,7 +410,7 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
     }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [
     {
-        "model": "JackFram/llama-160m",
+        "model_name": "JackFram/llama-160m",
     },
 ])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -433,23 +428,29 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
     ])
 @pytest.mark.parametrize("batch_size", [4])
 @pytest.mark.parametrize("seed", [1])
+@fork_new_process_for_each_test
 def test_spec_decode_e2e_greedy_correctness_with_preemption(
-        baseline_llm_generator, test_llm_generator, batch_size: int,
-        output_len: int):
+        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
+        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
+        seed: int):
     """Verify greedy equality, even when some sequences are preempted mid-
     generation.
     """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)
 
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
     [{
-        "model": "JackFram/llama-160m",
+        "model_name": "JackFram/llama-160m",
 
         # Skip cuda graph recording for fast test.
         "enforce_eager": True,
@@ -487,22 +488,29 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
         32,
     ])
 @pytest.mark.parametrize("seed", [1])
-def test_spec_decode_different_block_size(baseline_llm_generator,
-                                          test_llm_generator, batch_size: int,
-                                          output_len: int):
+@fork_new_process_for_each_test
+def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs,
+                                          per_test_common_llm_kwargs,
+                                          baseline_llm_kwargs, test_llm_kwargs,
+                                          batch_size: int, output_len: int,
+                                          seed: int):
     """Verify greedy equality over different block sizes.
     """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)
 
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
     [{
-        "model": "JackFram/llama-160m",
+        "model_name": "JackFram/llama-160m",
 
         # Skip cuda graph recording for fast test.
         "enforce_eager": True,
@@ -534,24 +542,31 @@ def test_spec_decode_different_block_size(baseline_llm_generator,
         64,
     ])
 @pytest.mark.parametrize("seed", [1])
-def test_skip_speculation(baseline_llm_generator, test_llm_generator,
-                          batch_size: int, output_len: int):
+@fork_new_process_for_each_test
+def test_skip_speculation(vllm_runner, common_llm_kwargs,
+                          per_test_common_llm_kwargs, baseline_llm_kwargs,
+                          test_llm_kwargs, batch_size: int, output_len: int,
+                          seed: int):
     """Verify greedy equality when some (or all) sequences skip speculation.
     We do this by setting the max model len of the draft model to an
     artificially low value, such that when the sequences grow beyond it, they
     are skipped in speculative decoding.
     """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)
 
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
     [{
-        "model": "JackFram/llama-160m",
+        "model_name": "JackFram/llama-160m",
 
         # Skip cuda graph recording for fast test.
         "enforce_eager": True,
@@ -571,21 +586,28 @@ def test_skip_speculation(baseline_llm_generator, test_llm_generator,
 @pytest.mark.parametrize("batch_size", [8])
 @pytest.mark.parametrize("output_len", [10])
 @pytest.mark.parametrize("seed", [1])
-def test_disable_speculation(baseline_llm_generator, test_llm_generator,
-                             batch_size: int, output_len: int):
+@fork_new_process_for_each_test
+def test_disable_speculation(vllm_runner, common_llm_kwargs,
+                             per_test_common_llm_kwargs, baseline_llm_kwargs,
+                             test_llm_kwargs, batch_size: int, output_len: int,
+                             seed: int):
     """Verify greedy equality when all sequences disable speculation.
     """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)
 
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
     [{
-        "model": "JackFram/llama-68m",
+        "model_name": "JackFram/llama-68m",
 
         # Skip cuda graph recording for fast test.
         "enforce_eager": True,
@@ -613,22 +635,28 @@ def test_disable_speculation(baseline_llm_generator, test_llm_generator,
         32,
     ])
 @pytest.mark.parametrize("seed", [1])
-def test_many_k(baseline_llm_generator, test_llm_generator, batch_size: int,
-                output_len: int):
+@fork_new_process_for_each_test
+def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
+                baseline_llm_kwargs, test_llm_kwargs, batch_size: int,
+                output_len: int, seed: int):
     """Verify that speculative decoding produces exact equality to without spec
     decode with many different values of k.
     """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)
 
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
     [{
-        "model": "JackFram/llama-160m",
+        "model_name": "JackFram/llama-160m",
 
         # Skip cuda graph recording for fast test.
         "enforce_eager": True,
@@ -657,15 +685,22 @@ def test_many_k(baseline_llm_generator, test_llm_generator, batch_size: int,
         32,
     ])
 @pytest.mark.parametrize("seed", [1])
-def test_typical_acceptance_sampling(baseline_llm_generator,
-                                     test_llm_generator, batch_size: int,
-                                     output_len: int):
+@fork_new_process_for_each_test
+def test_typical_acceptance_sampling(vllm_runner, common_llm_kwargs,
+                                     per_test_common_llm_kwargs,
+                                     baseline_llm_kwargs, test_llm_kwargs,
+                                     batch_size: int, output_len: int,
+                                     seed: int):
     """Verify that speculative decoding produces exact equality to without spec
     decode with TypicalAcceptanceSampler as the draft token acceptance
     sampling method.
     """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)
diff --git a/tests/spec_decode/e2e/test_ngram_correctness.py b/tests/spec_decode/e2e/test_ngram_correctness.py
index d475d37af6425..89301f24e1159 100644
--- a/tests/spec_decode/e2e/test_ngram_correctness.py
+++ b/tests/spec_decode/e2e/test_ngram_correctness.py
@@ -26,7 +26,7 @@
 
 import pytest
 
-from .conftest import run_greedy_equality_correctness_test
+from .conftest import run_equality_correctness_test
 
 
 @pytest.mark.parametrize(
@@ -43,7 +43,7 @@
     }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [
     {
-        "model": "JackFram/llama-68m",
+        "model_name": "JackFram/llama-68m",
     },
 ])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -59,15 +59,21 @@
 ])
 @pytest.mark.parametrize("batch_size", [1, 32])
 @pytest.mark.parametrize("seed", [1])
-def test_ngram_e2e_greedy_correctness(baseline_llm_generator,
-                                      test_llm_generator, batch_size: int,
-                                      output_len: int):
+def test_ngram_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
+                                      per_test_common_llm_kwargs,
+                                      baseline_llm_kwargs, test_llm_kwargs,
+                                      batch_size: int, output_len: int,
+                                      seed: int):
     """Verify greedy equality on a tiny model with different batch size."""
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)
 
 
 @pytest.mark.parametrize(
@@ -86,7 +92,7 @@ def test_ngram_e2e_greedy_correctness(baseline_llm_generator,
     }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [
     {
-        "model": "JackFram/llama-160m",
+        "model_name": "JackFram/llama-160m",
     },
 ])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -105,24 +111,28 @@ def test_ngram_e2e_greedy_correctness(baseline_llm_generator,
     ])
 @pytest.mark.parametrize("batch_size", [4])
 @pytest.mark.parametrize("seed", [1])
-def test_ngram_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
-                                                      test_llm_generator,
-                                                      batch_size: int,
-                                                      output_len: int):
+def test_ngram_e2e_greedy_correctness_with_preemption(
+        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
+        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
+        seed: int):
     """Verify greedy equality, even when some sequences are preempted mid-
     generation.
     """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  temperature=0,
+                                  seed=seed)
 
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
     [{
-        "model": "JackFram/llama-68m",
+        "model_name": "JackFram/llama-68m",
 
         # Skip cuda graph recording for fast test.
         "enforce_eager": True,
@@ -159,23 +169,29 @@ def test_ngram_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
         32,
     ])
 @pytest.mark.parametrize("seed", [1])
-def test_ngram_different_k(baseline_llm_generator, test_llm_generator,
-                           batch_size: int, output_len: int):
+def test_ngram_different_k(vllm_runner, common_llm_kwargs,
+                           per_test_common_llm_kwargs, baseline_llm_kwargs,
+                           test_llm_kwargs, batch_size: int, output_len: int,
+                           seed: int):
     """Verify that ngram speculative decoding produces exact equality
     to without spec decode with many different values of k and
     different ngram_prompt_lookup_max.
     """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)
 
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
     [{
-        "model": "JackFram/llama-68m",
+        "model_name": "JackFram/llama-68m",
 
         # Skip cuda graph recording for fast test.
         "enforce_eager": True,
@@ -200,14 +216,20 @@ def test_ngram_different_k(baseline_llm_generator, test_llm_generator,
         32,
     ])
 @pytest.mark.parametrize("seed", [1])
-def test_ngram_disable_queue(baseline_llm_generator, test_llm_generator,
-                             batch_size: int, output_len: int):
+def test_ngram_disable_queue(vllm_runner, common_llm_kwargs,
+                             per_test_common_llm_kwargs, baseline_llm_kwargs,
+                             test_llm_kwargs, batch_size: int, output_len: int,
+                             seed: int):
     """Verify that ngram speculative decoding produces exact equality
     to without spec decode with many different values of k and
     different ngram_prompt_lookup_max.
     """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)
diff --git a/tests/spec_decode/e2e/test_seed.py b/tests/spec_decode/e2e/test_seed.py
index f84c346c1d315..b17013216ae23 100644
--- a/tests/spec_decode/e2e/test_seed.py
+++ b/tests/spec_decode/e2e/test_seed.py
@@ -2,11 +2,17 @@
 
 from .conftest import run_equality_correctness_test
 
+# main model
+MAIN_MODEL = "JackFram/llama-68m"
+
+# speculative model
+SPEC_MODEL = "JackFram/llama-160m"
+
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
     [{
-        "model": "JackFram/llama-68m",
+        "model_name": "JackFram/llama-68m",
 
         # Skip cuda graph recording for fast test.
         "enforce_eager": True,
@@ -31,26 +37,34 @@
         # Use smaller output len for fast test.
         20,
     ])
-@pytest.mark.parametrize("seed", [None])
-def test_seeded_consistency(baseline_llm_generator, test_llm_generator,
-                            batch_size: int, temperature: float,
-                            output_len: int):
+def test_seeded_consistency(vllm_runner, common_llm_kwargs,
+                            per_test_common_llm_kwargs, baseline_llm_kwargs,
+                            test_llm_kwargs, batch_size: int,
+                            temperature: float, output_len: int):
     """Verify outputs are consistent across multiple runs with same seed
     """
-    run_equality_correctness_test(baseline_llm_generator,
-                                  test_llm_generator,
-                                  batch_size,
-                                  max_output_len=output_len,
-                                  temperature=temperature,
-                                  seeded=True,
-                                  force_output_len=True)
+    run_equality_correctness_test(
+        vllm_runner,
+        common_llm_kwargs,
+        per_test_common_llm_kwargs,
+        baseline_llm_kwargs,
+        test_llm_kwargs,
+        batch_size,
+        max_output_len=output_len,
+        temperature=temperature,
+        disable_seed=False,
+    )
 
     # Ensure this same test does fail if we _don't_ include per-request seeds
     with pytest.raises(AssertionError):
-        run_equality_correctness_test(baseline_llm_generator,
-                                      test_llm_generator,
-                                      batch_size,
-                                      max_output_len=output_len,
-                                      temperature=temperature,
-                                      seeded=False,
-                                      force_output_len=True)
+        run_equality_correctness_test(
+            vllm_runner,
+            common_llm_kwargs,
+            per_test_common_llm_kwargs,
+            baseline_llm_kwargs,
+            test_llm_kwargs,
+            batch_size,
+            max_output_len=output_len,
+            temperature=temperature,
+            disable_seed=True,
+        )
diff --git a/tests/utils.py b/tests/utils.py
index 6e5bc05b3901a..f6c2be17ebdcf 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -10,6 +10,7 @@
 from typing import Any, Callable, Dict, List, Optional
 
 import openai
+import pytest
 import requests
 from openai.types.completion import Completion
 from transformers import AutoTokenizer
@@ -22,7 +23,8 @@
 from vllm.entrypoints.openai.cli_args import make_arg_parser
 from vllm.model_executor.model_loader.loader import get_model_loader
 from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser, get_open_port, is_hip
+from vllm.utils import (FlexibleArgumentParser, cuda_device_count_stateless,
+                        get_open_port, is_hip)
 
 if current_platform.is_rocm():
     from amdsmi import (amdsmi_get_gpu_vram_usage,
@@ -356,12 +358,23 @@ def error_on_warning():
         yield
 
 
+def get_physical_device_indices(devices):
+    visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
+    if visible_devices is None:
+        return devices
+
+    visible_indices = [int(x) for x in visible_devices.split(",")]
+    index_mapping = {i: physical for i, physical in enumerate(visible_indices)}
+    return [index_mapping[i] for i in devices if i in index_mapping]
+
+
 @_nvml()
 def wait_for_gpu_memory_to_clear(devices: List[int],
                                  threshold_bytes: int,
                                  timeout_s: float = 120) -> None:
     # Use nvml instead of pytorch to reduce measurement error from torch cuda
     # context.
+    devices = get_physical_device_indices(devices)
     start_time = time.time()
     while True:
         output: Dict[int, str] = {}
@@ -441,6 +454,22 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None:
     return wrapper
 
 
+def multi_gpu_test(*, num_gpus: int):
+    """
+    Decorate a test to be run only when multiple GPUs are available.
+    """
+    test_selector = getattr(pytest.mark, f"distributed_{num_gpus}_gpus")
+    test_skipif = pytest.mark.skipif(
+        cuda_device_count_stateless() < num_gpus,
+        reason=f"Need at least {num_gpus} GPUs to run the test.",
+    )
+
+    def wrapper(f: Callable[_P, None]) -> Callable[_P, None]:
+        return test_selector(test_skipif(fork_new_process_for_each_test(f)))
+
+    return wrapper
+
+
 async def completions_with_server_args(
     prompts: List[str],
     model_name: str,
diff --git a/tests/weight_loading/models-large.txt b/tests/weight_loading/models-large.txt
new file mode 100644
index 0000000000000..fe76705746766
--- /dev/null
+++ b/tests/weight_loading/models-large.txt
@@ -0,0 +1,3 @@
+compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-quantized, main
+compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-channel-quantized, main
+gptq_marlin, TheBloke/Mixtral-8x7B-v0.1-GPTQ, main
\ No newline at end of file
diff --git a/tests/weight_loading/models.txt b/tests/weight_loading/models.txt
index 1dc529037a98e..a90b352a39bca 100644
--- a/tests/weight_loading/models.txt
+++ b/tests/weight_loading/models.txt
@@ -19,8 +19,7 @@ compressed-tensors, nm-testing/tinyllama-oneshot-w8a16-per-channel, main
 compressed-tensors, nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test, main
 compressed-tensors, nm-testing/Phi-3-mini-128k-instruct-FP8, main
 compressed-tensors, neuralmagic/Phi-3-medium-128k-instruct-quantized.w4a16, main
-compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-quantized, main
-compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-channel-quantized, main
+compressed-tensors, nm-testing/TinyLlama-1.1B-Chat-v1.0-actorder-group, main
 awq, casperhansen/mixtral-instruct-awq, main
 awq_marlin, casperhansen/mixtral-instruct-awq, main
 fp8, neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV, main
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 9699e12a53543..c0e72cfd55b6f 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -18,6 +18,9 @@
     except ImportError as e:
         logger.warning("Failed to import from vllm._C with %r", e)
 
+if current_platform.is_rocm():
+    import vllm._rocm_C  # noqa: F401
+
 with contextlib.suppress(ImportError):
     import vllm._moe_C  # noqa: F401
 
@@ -183,16 +186,36 @@ def scaled_fused_add_rms_norm(out: torch.Tensor, input: torch.Tensor,
                                            epsilon)
 
 
-def advance_step(num_seqs: int, num_queries: int, block_size: int,
-                 input_tokens: torch.Tensor, sampled_token_ids: torch.Tensor,
-                 input_positions: torch.Tensor, seq_lens: torch.Tensor,
-                 slot_mapping: torch.Tensor,
-                 block_tables: torch.Tensor) -> None:
+def advance_step_flashattn(num_seqs: int, num_queries: int, block_size: int,
+                           input_tokens: torch.Tensor,
+                           sampled_token_ids: torch.Tensor,
+                           input_positions: torch.Tensor,
+                           seq_lens: torch.Tensor, slot_mapping: torch.Tensor,
+                           block_tables: torch.Tensor) -> None:
     """Advance a step on GPU for existing inputs for a multi-step runner"""
-    return torch.ops._C.advance_step(num_seqs, num_queries, block_size,
-                                     input_tokens, sampled_token_ids,
-                                     input_positions, seq_lens, slot_mapping,
-                                     block_tables)
+    return torch.ops._C.advance_step_flashattn(num_seqs, num_queries,
+                                               block_size, input_tokens,
+                                               sampled_token_ids,
+                                               input_positions, seq_lens,
+                                               slot_mapping, block_tables)
+
+
+def advance_step_flashinfer(num_seqs: int, num_queries: int, block_size: int,
+                            input_tokens: torch.Tensor,
+                            sampled_token_ids: torch.Tensor,
+                            input_positions: torch.Tensor,
+                            seq_lens: torch.Tensor, slot_mapping: torch.Tensor,
+                            block_tables: torch.Tensor,
+                            paged_kv_indices: torch.Tensor,
+                            paged_kv_indptr: torch.Tensor,
+                            paged_kv_last_page_len: torch.Tensor,
+                            block_table_bound: torch.Tensor) -> None:
+
+    return torch.ops._C.advance_step_flashinfer(
+        num_seqs, num_queries, block_size, input_tokens, sampled_token_ids,
+        input_positions, seq_lens, slot_mapping, block_tables,
+        paged_kv_indices, paged_kv_indptr, paged_kv_last_page_len,
+        block_table_bound)
 
 
 # quantization ops
@@ -226,6 +249,22 @@ def gptq_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
                                   b_g_idx, use_exllama, bit)
 
 
+# TODO: has to be a better way to do this
+try:
+    torch.ops._C.gptq_gemm  # noqa B018
+
+    @torch.library.register_fake("_C::gptq_gemm")
+    def _gptq_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor,
+                        b_gptq_qzeros: torch.Tensor,
+                        b_gptq_scales: torch.Tensor, b_g_idx: torch.Tensor,
+                        use_exllama: bool, bit: int) -> torch.Tensor:
+        return torch.empty((a.size(0), b_q_weight.size(1)),
+                           dtype=a.dtype,
+                           device=a.device)
+except Exception:
+    pass
+
+
 def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor,
                  bit: int) -> None:
     torch.ops._C.gptq_shuffle(q_weight, q_perm, bit)
@@ -249,6 +288,194 @@ def gptq_marlin_24_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
                                             size_n, size_k)
 
 
+# TODO: has to be a better way to do this
+try:
+    torch.ops._C.gptq_marlin_24_gemm  # noqa B018
+
+    @torch.library.register_fake("_C::gptq_marlin_24_gemm")
+    def _gptq_marlin_24_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor,
+                                  b_meta: torch.Tensor, b_scales: torch.Tensor,
+                                  workspace: torch.Tensor,
+                                  b_q_type: ScalarType, size_m: int,
+                                  size_n: int, size_k: int) -> torch.Tensor:
+        return torch.empty((size_m, size_n), device=a.device, dtype=a.dtype)
+
+    @torch.library.register_fake("_C::gptq_marlin_gemm")
+    def _gptq_marlin_gemm_fake(a: torch.Tensor,
+                               b_q_weight: torch.Tensor,
+                               b_scales: torch.Tensor,
+                               b_zeros: torch.Tensor,
+                               g_idx: torch.Tensor,
+                               perm: torch.Tensor,
+                               workspace: torch.Tensor,
+                               b_q_type: ScalarType,
+                               size_m: int,
+                               size_n: int,
+                               size_k: int,
+                               is_k_full: bool,
+                               has_zp: bool = False,
+                               use_fp32_reduce: bool = False) -> torch.Tensor:
+        return torch.empty((size_m, size_n), device=a.device, dtype=a.dtype)
+
+    @torch.library.register_fake("_C::ggml_dequantize")
+    def _ggml_dequantize_fake(W: torch.Tensor, quant_type: int, m: int,
+                              n: int) -> torch.Tensor:
+        return torch.empty((m, n), dtype=torch.float16, device=W.device)
+
+    @torch.library.register_fake("_C::ggml_mul_mat_vec_a8")
+    def _ggml_mul_mat_vec_a8_fake(
+        W: torch.Tensor,
+        X: torch.Tensor,
+        quant_type: int,
+        row: int,
+    ) -> torch.Tensor:
+        return torch.empty((1, row), dtype=torch.float16, device=W.device)
+
+    @torch.library.register_fake("_C::ggml_mul_mat_a8")
+    def _ggml_mul_mat_a8_fake(
+        W: torch.Tensor,
+        X: torch.Tensor,
+        quant_type: int,
+        row: int,
+    ) -> torch.Tensor:
+        batch = X.size(0)
+        return torch.empty((batch, row), dtype=torch.float16, device=W.device)
+
+    @torch.library.register_fake("_C::marlin_qqq_gemm")
+    def _marlin_qqq_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor,
+                              s_tok: torch.Tensor, s_ch: torch.Tensor,
+                              s_group: torch.Tensor, workspace: torch.Tensor,
+                              size_m: int, size_n: int,
+                              size_k: int) -> torch.Tensor:
+        return torch.empty((size_m, size_n),
+                           dtype=torch.float16,
+                           device=a.device)
+
+    @torch.library.register_fake("_C::marlin_gemm")
+    def _marlin_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor,
+                          b_scales: torch.Tensor, workspace: torch.Tensor,
+                          size_m: int, size_n: int,
+                          size_k: int) -> torch.Tensor:
+        return torch.empty((size_m, size_n),
+                           dtype=torch.float16,
+                           device=a.device)
+
+    @torch.library.register_fake("_C::awq_dequantize")
+    def _awq_dequantize_fake(qweight: torch.Tensor, scales: torch.Tensor,
+                             zeros: torch.Tensor, split_k_iters: int, thx: int,
+                             thy: int) -> torch.Tensor:
+        in_c = qweight.size(0)
+        qout_c = qweight.size(1)
+        out_c = qout_c * 8
+        return torch.empty((in_c, out_c),
+                           dtype=scales.dtype,
+                           device=scales.device)
+
+    @torch.library.register_fake("_C::awq_gemm")
+    def _awq_gemm_fake(input: torch.Tensor, qweight: torch.Tensor,
+                       qzeros: torch.Tensor, scales: torch.Tensor,
+                       split_k_iters: int) -> torch.Tensor:
+        num_in_feats = input.size(0)
+        return torch.empty((split_k_iters, num_in_feats, qweight.size(1) * 8),
+                           dtype=input.dtype,
+                           device=input.device).sum(0)
+
+    @torch.library.register_fake("_C::aqlm_gemm")
+    def _aqlm_gemm_fake(input: torch.Tensor, codes: torch.Tensor,
+                        codebooks: torch.Tensor, scales: torch.Tensor,
+                        codebook_partition_sizes: List[int],
+                        bias: Optional[torch.Tensor]) -> torch.Tensor:
+        out_features = codes.size(0) * codebooks.size(2)
+        flat_input = input.reshape((-1, input.size(-1)))
+        flat_output = torch.empty((flat_input.size(0), out_features),
+                                  dtype=input.dtype,
+                                  device=input.device)
+
+        output_sizes = list(input.shape)
+        output_sizes.pop()
+        output_sizes.append(-1)
+        return flat_output.reshape(tuple(output_sizes))
+
+    @torch.library.register_fake("_C::aqlm_dequant")
+    def _aqlm_dequant_fake(
+            codes: torch.Tensor, codebooks: torch.Tensor,
+            codebook_partition_sizes: List[int]) -> torch.Tensor:
+        in_features = codes.size(1) * 8
+        out_features = codes.size(0)
+        return torch.empty((out_features, in_features),
+                           dtype=codebooks.dtype,
+                           device=codebooks.device)
+
+    @torch.library.register_fake("_C::fp8_marlin_gemm")
+    def _fp8_marlin_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor,
+                              b_scales: torch.Tensor, workspace: torch.Tensor,
+                              num_bits: int, size_m: int, size_n: int,
+                              size_k: int) -> torch.Tensor:
+        return torch.empty((size_m, size_n), dtype=a.dtype, device=a.device)
+
+    @torch.library.register_fake("_C::machete_gemm")
+    def machete_gemm_fake(
+        a: torch.Tensor,
+        b_q: torch.
+        Tensor,  # Should be the tensor returned by machete_prepack_B
+        b_type: ScalarType,
+        b_scales: Optional[torch.Tensor] = None,
+        b_zeros: Optional[torch.Tensor] = None,
+        b_group_size: Optional[int] = None,
+        c: Optional[torch.Tensor] = None,
+        alpha: Optional[float] = None,
+        beta: Optional[float] = None,
+        schedule: Optional[str] = None,
+    ) -> torch.Tensor:
+        m = a.size(0)
+        n = b_q.size(1)
+        return torch.empty((m, n), device=a.device, dtype=a.dtype)
+
+    @torch.library.register_fake("_C::machete_prepack_B")
+    def machete_prepack_B_fake(b_q_weight: torch.Tensor,
+                               b_type: ScalarType) -> torch.Tensor:
+        return torch.empty_like(b_q_weight)
+
+    @torch.library.register_fake("_C::causal_conv1d_fwd")
+    def causal_conv1d_fwd_fake(x: torch.Tensor, weight: torch.Tensor,
+                               bias_: Optional[torch.Tensor],
+                               seq_idx_: Optional[torch.Tensor],
+                               initial_states_: Optional[torch.Tensor],
+                               final_states_out_: Optional[torch.Tensor],
+                               silu_activation: bool) -> torch.Tensor:
+        return torch.empty_like(x)
+
+    @torch.library.register_fake("_C::causal_conv1d_update")
+    def causal_conv1d_update_fake(x: torch.Tensor, conv_state: torch.Tensor,
+                                  weight: torch.Tensor,
+                                  bias_: Optional[torch.Tensor],
+                                  silu_activation: bool) -> torch.Tensor:
+        return torch.empty_like(x)
+
+    @torch.library.register_fake("_C::selective_scan_fwd")
+    def selective_scan_fwd_fake(
+            u: torch.Tensor, delta: torch.Tensor, A: torch.Tensor,
+            B: torch.Tensor, C: torch.Tensor, D_: Optional[torch.Tensor],
+            z_: Optional[torch.Tensor], delta_bias_: Optional[torch.Tensor],
+            delta_softplus: bool, index_: Optional[torch.Tensor],
+            x: Optional[torch.Tensor]) -> List[torch.Tensor]:
+        a = torch.empty_like(u)
+        if x is not None:
+            b = x
+        else:
+            b = torch.empty((u.size(0), u.size(1), A.size(1)),
+                            dtype=u.dtype,
+                            device=u.device)
+        if z_ is not None:
+            c = torch.empty_like(z_)
+            return [a, b, c]
+        else:
+            return [a, b]
+
+except Exception:
+    pass
+
+
 # cutlass
 def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool:
     return torch.ops._C.cutlass_scaled_mm_supports_fp8(cuda_device_capability)
@@ -678,15 +905,15 @@ def get_meta_buffer_ipc_handle(inp: torch.Tensor) -> torch.Tensor:
 # ROCm custom
 def LLMM1(a: torch.Tensor, b: torch.Tensor, out: torch.Tensor,
           rows_per_block: int) -> None:
-    torch.ops._custom_C.LLMM1(a, b, out, rows_per_block)
+    torch.ops._rocm_C.LLMM1(a, b, out, rows_per_block)
 
 
 def LLMM_Silu(a: torch.Tensor, b: torch.Tensor, out: torch.Tensor,
               rows_per_block: int) -> None:
-    torch.ops._custom_C.LLMM_Silu(a, b, out, rows_per_block)
+    torch.ops._rocm_C.LLMM_Silu(a, b, out, rows_per_block)
 
 
-def paged_attention_custom(
+def paged_attention_rocm(
     out: torch.Tensor,
     exp_sum: torch.Tensor,
     max_logits: torch.Tensor,
@@ -705,15 +932,16 @@ def paged_attention_custom(
     k_scale: int,
     v_scale: int,
 ) -> None:
-    torch.ops._custom_C.paged_attention_custom(
-        out, exp_sum, max_logits, tmp_out, query, key_cache, value_cache,
-        num_kv_heads, scale, block_tables, seq_lens, block_size, max_seq_len,
-        alibi_slopes, kv_cache_dtype, k_scale, v_scale)
+    torch.ops._rocm_C.paged_attention(out, exp_sum, max_logits, tmp_out, query,
+                                      key_cache, value_cache, num_kv_heads,
+                                      scale, block_tables, seq_lens,
+                                      block_size, max_seq_len, alibi_slopes,
+                                      kv_cache_dtype, k_scale, v_scale)
 
 
 def wvSpltK(a: torch.Tensor, b: torch.Tensor, out: torch.Tensor, N: int,
             cu_count: int) -> None:
-    torch.ops._custom_C.wvSpltK(a, b, out, N, cu_count)
+    torch.ops._rocm_C.wvSpltK(a, b, out, N, cu_count)
 
 
 # temporary fix for https://github.com/vllm-project/vllm/issues/5456
diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py
index 2156f6b18adb6..31fcc4c3256a8 100644
--- a/vllm/_ipex_ops.py
+++ b/vllm/_ipex_ops.py
@@ -27,29 +27,27 @@ def _reshape_activation_tensor(
 
     @staticmethod
     def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-        x1, x2 = ipex_ops._reshape_activation_tensor(x)
-        ipex.llm.functional.silu_mul(x1, x2, out)
+        ipex.llm.functional.silu_and_mul(x, out)
 
     @staticmethod
     def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-        x1, x2 = ipex_ops._reshape_activation_tensor(x)
-        ipex.llm.functional.gelu_mul(x1, x2, out, "none")
+        ipex.llm.functional.gelu_and_mul(x, out)
 
     @staticmethod
     def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-        x1, x2 = ipex_ops._reshape_activation_tensor(x)
-        ipex.llm.functional.gelu_mul(x1, x2, out, "tanh")
+        ipex.llm.functional.gelu_and_mul(x, out)
 
     @staticmethod
-    def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None:
-        out.copy_(torch.nn.functional.gelu(x))
+    def gelu_fast(x: torch.Tensor) -> torch.Tensor:
+        return torch.nn.functional.gelu(x)
 
     @staticmethod
-    def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None:
-        out.copy_(torch.nn.functional.gelu(x))
+    def gelu_new(x: torch.Tensor) -> torch.Tensor:
+        return torch.nn.functional.gelu(x)
 
-    # TODO add implementation of gelu_quick here
-    # def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
+    @staticmethod
+    def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
+        ipex.llm.functional.gelu_quick(x, out)
 
     @staticmethod
     def paged_attention_v1(
@@ -160,29 +158,10 @@ def rotary_embedding(
         cos_sin_cache: torch.Tensor,  # [cos_sin_dim, rot_dim]
         is_neox: bool,
     ) -> None:
-        if positions.dim() == 1:
-            positions = positions.unsqueeze(0)
-            query = query.unsqueeze(0)
-            key = key.unsqueeze(0)
-
-        rotary_dim = cos_sin_cache.size(1)
-        query = query.view(*query.shape[:-1], -1, head_size)
-        key = key.view(*key.shape[:-1], -1, head_size)
-
-        query_rot = query[..., :rotary_dim]
-        key_rot = key[..., :rotary_dim]
-
-        cos_sin = cos_sin_cache[positions.long()]
-        cos, sin = cos_sin.chunk(2, dim=-1)
-
-        if is_neox:
-            cos = cos.repeat(1, 1, 2).unsqueeze(-2)
-            sin = sin.repeat(1, 1, 2).unsqueeze(-2)
-        else:
-            cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2)
-            sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2)
-        ipex.llm.functional.rotary_embedding(query_rot, key_rot, sin, cos,
-                                             rotary_dim, is_neox, positions)
+        rot_dim = cos_sin_cache.size(1)
+        ipex.llm.functional.rotary_embedding_batched(positions, query, key,
+                                                     head_size, cos_sin_cache,
+                                                     is_neox, rot_dim)
 
     @staticmethod
     def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor,
@@ -190,37 +169,15 @@ def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor,
                                  cos_sin_cache: torch.Tensor, is_neox: bool,
                                  rot_dim: int,
                                  cos_sin_cache_offsets: torch.Tensor) -> None:
-        if positions.dim() == 1:
-            positions = positions.unsqueeze(0)
-            query = query.unsqueeze(0)
-            key = key.unsqueeze(0)
-        cos_sin_cache_offsets = cos_sin_cache_offsets.view_as(positions)
-        rotary_dim = cos_sin_cache.size(1)
-        query = query.view(*query.shape[:-1], -1, head_size)
-        key = key.view(*key.shape[:-1], -1, head_size)
-
-        query_rot = query[..., :rotary_dim]
-        key_rot = key[..., :rotary_dim]
-
-        cos_sin = cos_sin_cache[torch.add(positions,
-                                          cos_sin_cache_offsets).long()]
-        cos, sin = cos_sin.chunk(2, dim=-1)
-
-        if is_neox:
-            cos = cos.repeat(1, 1, 2).unsqueeze(-2)
-            sin = sin.repeat(1, 1, 2).unsqueeze(-2)
-        else:
-            cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2)
-            sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2)
-
-        ipex.llm.functional.rotary_embedding(query_rot, key_rot, sin, cos,
-                                             rotary_dim, is_neox, positions)
+        ipex.llm.functional.rotary_embedding_batched(positions, query, key,
+                                                     head_size, cos_sin_cache,
+                                                     is_neox, rot_dim,
+                                                     cos_sin_cache_offsets)
 
     @staticmethod
-    def rms_norm(out: torch.Tensor, input: torch.Tensor, weight: torch.Tensor,
-                 epsilon: float) -> None:
-        tmp = ipex.llm.functional.rms_norm(input, weight, epsilon)
-        out.copy_(tmp)
+    def rms_norm(input: torch.Tensor, weight: torch.Tensor,
+                 epsilon: float) -> torch.Tensor:
+        return ipex.llm.functional.rms_norm(input, weight, epsilon)
 
     @staticmethod
     def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor,
@@ -246,11 +203,14 @@ def varlen_attention(
         return_softmax: bool,
         gen_: torch.Generator,
     ) -> None:
-        ipex.llm.functional.varlen_attention(query, key, value, out, seqlen_q,
-                                             seqlen_k, max_seqlen_q,
-                                             max_seqlen_k, pdropout,
-                                             softmax_scale, zero_tensors,
-                                             is_causal, return_softmax, gen_)
+        ipex.llm.functional.varlen_attention(query.contiguous(),
+                                             key.contiguous(),
+                                             value.contiguous(), out,
+                                             seqlen_q.int(), seqlen_k.int(),
+                                             max_seqlen_q, max_seqlen_k,
+                                             pdropout, softmax_scale,
+                                             zero_tensors, is_causal,
+                                             return_softmax, gen_)
 
     @staticmethod
     def reshape_and_cache(
diff --git a/vllm/assets/video.py b/vllm/assets/video.py
new file mode 100644
index 0000000000000..e71011f5769e7
--- /dev/null
+++ b/vllm/assets/video.py
@@ -0,0 +1,85 @@
+from dataclasses import dataclass
+from functools import lru_cache
+from typing import List, Literal
+
+import numpy as np
+import numpy.typing as npt
+from huggingface_hub import hf_hub_download
+from PIL import Image
+
+from vllm.multimodal.utils import (sample_frames_from_video,
+                                   try_import_video_packages)
+
+from .base import get_cache_dir
+
+
+@lru_cache
+def download_video_asset(filename: str) -> str:
+    """
+    Download and open an image from huggingface
+    repo: raushan-testing-hf/videos-test
+    """
+    video_directory = get_cache_dir() / "video-eample-data"
+    video_directory.mkdir(parents=True, exist_ok=True)
+
+    video_path = video_directory / filename
+    video_path_str = str(video_path)
+    if not video_path.exists():
+        video_path_str = hf_hub_download(
+            repo_id="raushan-testing-hf/videos-test",
+            filename=filename,
+            repo_type="dataset",
+            cache_dir=video_directory,
+        )
+    return video_path_str
+
+
+def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray:
+    cv2 = try_import_video_packages()
+
+    cap = cv2.VideoCapture(path)
+    if not cap.isOpened():
+        raise ValueError(f"Could not open video file {path}")
+
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    frames = []
+    for i in range(total_frames):
+        ret, frame = cap.read()
+        if ret:
+            frames.append(frame)
+    cap.release()
+
+    frames = np.stack(frames)
+    frames = sample_frames_from_video(frames, num_frames)
+    if len(frames) < num_frames:
+        raise ValueError(f"Could not read enough frames from video file {path}"
+                         f" (expected {num_frames} frames, got {len(frames)})")
+    return frames
+
+
+def video_to_pil_images_list(path: str,
+                             num_frames: int = -1) -> List[Image.Image]:
+    cv2 = try_import_video_packages()
+    frames = video_to_ndarrays(path, num_frames)
+    return [
+        Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+        for frame in frames
+    ]
+
+
+@dataclass(frozen=True)
+class VideoAsset:
+    name: Literal["sample_demo_1.mp4"]
+    num_frames: int = -1
+
+    @property
+    def pil_images(self) -> List[Image.Image]:
+        video_path = download_video_asset(self.name)
+        ret = video_to_pil_images_list(video_path, self.num_frames)
+        return ret
+
+    @property
+    def np_ndarrays(self) -> List[npt.NDArray]:
+        video_path = download_video_asset(self.name)
+        ret = video_to_ndarrays(video_path, self.num_frames)
+        return ret
diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py
index ccfc6b254c1e7..adc8390e6f9ec 100644
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@@ -83,7 +83,9 @@ def copy_blocks(
     ) -> None:
         raise NotImplementedError
 
-    def advance_step(self, num_seqs: int, num_queries: int):
+    def advance_step(self, model_input: "ModelRunnerInputBase",
+                     sampled_token_ids: Optional[torch.Tensor],
+                     block_size: int, num_seqs: int, num_queries: int) -> None:
         raise NotImplementedError
 
 
diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index 30ce715d5d05a..bf883987bd80b 100644
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -16,7 +16,8 @@
 from vllm.utils import async_tensor_h2d, make_tensor_with_pad
 
 if TYPE_CHECKING:
-    from vllm.worker.model_runner import ModelInputForGPUBuilder
+    from vllm.worker.model_runner import (ModelInputForGPUBuilder,
+                                          ModelInputForGPUWithSamplingMetadata)
 
 from vllm_flash_attn import flash_attn_varlen_func as _flash_attn_varlen_func
 from vllm_flash_attn import flash_attn_with_kvcache as _flash_attn_with_kvcache
@@ -121,6 +122,40 @@ def _(
     return torch.empty_like(decode_query)
 
 
+@torch.library.custom_op("vllm::reshape_and_cache_flash",
+                         mutates_args=["kv_cache"])
+def reshape_and_cache_flash(
+    key: torch.Tensor,
+    value: torch.Tensor,
+    kv_cache: torch.Tensor,
+    slot_mapping: torch.Tensor,
+    kv_cache_dtype: str,
+    k_scale: float,
+    v_scale: float,
+) -> None:
+    """Inductor cannot deal with inplace operations on views.
+    See https://github.com/pytorch/pytorch/issues/131192
+    and https://github.com/pytorch/pytorch/issues/130174
+    This is a workaround to hide the view operation from the inductor.
+    """
+    return torch.ops._C_cache_ops.reshape_and_cache_flash(
+        key, value, kv_cache[0], kv_cache[1], slot_mapping, kv_cache_dtype,
+        k_scale, v_scale)
+
+
+@reshape_and_cache_flash.register_fake  # type: ignore
+def _(
+    key: torch.Tensor,
+    value: torch.Tensor,
+    kv_cache: torch.Tensor,
+    slot_mapping: torch.Tensor,
+    kv_cache_dtype: str,
+    k_scale: float,
+    v_scale: float,
+) -> None:
+    pass
+
+
 class FlashAttentionBackend(AttentionBackend):
 
     @staticmethod
@@ -302,14 +337,12 @@ def decode_metadata(self) -> Optional["FlashAttentionMetadata"]:
         )
         return self._cached_decode_metadata
 
-    def advance_step(self, num_seqs: int, num_queries: int):
+    def advance_step(self, model_input: "ModelInputForGPUWithSamplingMetadata",
+                     sampled_token_ids: Optional[torch.Tensor],
+                     block_size: int, num_seqs: int, num_queries: int):
         """
         Update metadata in-place to advance one decode step.
         """
-        # GPU in-place update is currently called separately through
-        # custom_ops.advance_step(). See draft_model_runner. TODO(will): Move
-        # this logic to the backend.
-
         # When using cudagraph, the num_seqs is padded to the next captured
         # batch sized, but num_queries tracks the actual number of requests in
         # the batch. For --enforce-eager mode, num_seqs == num_queries
@@ -347,6 +380,16 @@ def advance_step(self, num_seqs: int, num_queries: int):
             self.seq_lens[i] += 1
         self.max_decode_seq_len = max(self.seq_lens)
 
+        ops.advance_step_flashattn(num_seqs=num_seqs,
+                                   num_queries=num_queries,
+                                   block_size=block_size,
+                                   input_tokens=model_input.input_tokens,
+                                   sampled_token_ids=sampled_token_ids,
+                                   input_positions=model_input.input_positions,
+                                   seq_lens=self.seq_lens_tensor,
+                                   slot_mapping=self.slot_mapping,
+                                   block_tables=self.block_tables)
+
 
 class FlashAttentionMetadataBuilder(
         AttentionMetadataBuilder[FlashAttentionMetadata]):
@@ -462,9 +505,19 @@ def build(self, seq_lens: List[int], query_lens: List[int],
             # The shape of graph_block_tables is
             # [max batch size, max context len // block size].
             input_block_tables = self.runner.graph_block_tables[:batch_size]
+            max_blocks = input_block_tables.shape[1]
             for i, block_table in enumerate(self.block_tables):
                 if block_table:
-                    input_block_tables[i, :len(block_table)] = block_table
+                    num_blocks = len(block_table)
+                    if num_blocks <= max_blocks:
+                        input_block_tables[i, :num_blocks] = block_table
+                    else:
+                        # It may be possible to have more blocks allocated due
+                        # to lookahead slots of multi-step, however, they are
+                        # not used anyway, so can be safely ignored.
+                        input_block_tables[
+                            i, :max_blocks] = block_table[:max_blocks]
+
             block_tables = torch.from_numpy(input_block_tables).to(
                 device=device, non_blocking=True)
         else:
@@ -634,11 +687,10 @@ def forward(
             # Reshape the input keys and values and store them in the cache.
             # If kv_cache is not provided, the new key and value tensors are
             # not cached. This happens during the initial memory profiling run.
-            ops.reshape_and_cache_flash(
+            torch.ops.vllm.reshape_and_cache_flash(
                 key,
                 value,
-                key_cache,
-                value_cache,
+                kv_cache,
                 attn_metadata.slot_mapping.flatten(),
                 self.kv_cache_dtype,
                 k_scale,
@@ -650,7 +702,6 @@ def forward(
         assert key.shape[0] == num_prefill_tokens + num_decode_tokens
         assert value.shape[0] == num_prefill_tokens + num_decode_tokens
 
-        output = torch.empty_like(query)
         # Query for decode. KV is not needed because it is already cached.
         decode_query = query[num_prefill_tokens:]
         # QKV for prefill.
@@ -661,6 +712,9 @@ def forward(
         assert query.shape[0] == num_prefill_tokens
         assert decode_query.shape[0] == num_decode_tokens
 
+        prefill_output: Optional[torch.Tensor] = None
+        decode_output: Optional[torch.Tensor] = None
+
         if prefill_meta := attn_metadata.prefill_metadata:
             # Prompt run.
             if (kv_cache is None or prefill_meta.block_tables is None
@@ -668,7 +722,7 @@ def forward(
                 # normal attention
                 # When block_tables are not filled, it means q and k are the
                 # prompt, and they have the same length.
-                out = torch.ops.vllm.flash_attn_varlen_func(
+                prefill_output = torch.ops.vllm.flash_attn_varlen_func(
                     q=query,
                     k=key,
                     v=value,
@@ -682,42 +736,44 @@ def forward(
                     alibi_slopes=self.alibi_slopes,
                     softcap=self.logits_soft_cap,
                 )
-                assert output[:num_prefill_tokens].shape == out.shape
-                output[:num_prefill_tokens] = out
             else:
                 # prefix-enabled attention
                 assert prefill_meta.seq_lens is not None
                 max_seq_len = max(prefill_meta.seq_lens)
-                output[:
-                       num_prefill_tokens] = torch.ops.vllm.flash_attn_varlen_func(  # noqa
-                           q=query,
-                           k=key_cache,
-                           v=value_cache,
-                           cu_seqlens_q=prefill_meta.query_start_loc,
-                           max_seqlen_q=prefill_meta.max_query_len,
-                           cu_seqlens_k=prefill_meta.seq_start_loc,
-                           max_seqlen_k=max_seq_len,
-                           softmax_scale=self.scale,
-                           causal=True,
-                           alibi_slopes=self.alibi_slopes,
-                           block_table=prefill_meta.block_tables,
-                           softcap=self.logits_soft_cap,
-                       )
-
-        if decode_meta := attn_metadata.decode_metadata:
-            # Decoding run.
-            output[
-                num_prefill_tokens:] = torch.ops.vllm.flash_attn_with_kvcache(
-                    decode_query.unsqueeze(1),
-                    key_cache,
-                    value_cache,
-                    block_table=decode_meta.block_tables,
-                    cache_seqlens=decode_meta.seq_lens_tensor,
+                prefill_output = torch.ops.vllm.flash_attn_varlen_func(  # noqa
+                    q=query,
+                    k=key_cache,
+                    v=value_cache,
+                    cu_seqlens_q=prefill_meta.query_start_loc,
+                    max_seqlen_q=prefill_meta.max_query_len,
+                    cu_seqlens_k=prefill_meta.seq_start_loc,
+                    max_seqlen_k=max_seq_len,
                     softmax_scale=self.scale,
                     causal=True,
                     alibi_slopes=self.alibi_slopes,
+                    block_table=prefill_meta.block_tables,
                     softcap=self.logits_soft_cap,
-                ).squeeze(1)
+                )
 
-        # Reshape the output tensor.
+        if decode_meta := attn_metadata.decode_metadata:
+            # Decoding run.
+            decode_output = torch.ops.vllm.flash_attn_with_kvcache(
+                decode_query.unsqueeze(1),
+                key_cache,
+                value_cache,
+                block_table=decode_meta.block_tables,
+                cache_seqlens=decode_meta.seq_lens_tensor,
+                softmax_scale=self.scale,
+                causal=True,
+                alibi_slopes=self.alibi_slopes,
+                softcap=self.logits_soft_cap,
+            ).squeeze(1)
+
+        if prefill_output is None:
+            assert decode_output is not None
+            return decode_output.view(num_decode_tokens, hidden_size)
+        if decode_output is None:
+            assert prefill_output is not None
+            return prefill_output.view(num_prefill_tokens, hidden_size)
+        output = torch.cat([prefill_output, decode_output], dim=0)
         return output.view(num_tokens, hidden_size)
diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py
index 7aec8203eb1e5..4054d337316fe 100644
--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -30,7 +30,8 @@
                         make_tensor_with_pad)
 
 if TYPE_CHECKING:
-    from vllm.worker.model_runner import ModelInputForGPUBuilder
+    from vllm.worker.model_runner import (ModelInputForGPUBuilder,
+                                          ModelInputForGPUWithSamplingMetadata)
 
 
 class FlashInferBackend(AttentionBackend):
@@ -268,6 +269,10 @@ class FlashInferMetadata(AttentionMetadata):
     query_start_loc: Optional[torch.Tensor] = None
     block_tables: Optional[torch.Tensor] = None
 
+    # used for GPU in-place advance_step
+    seq_lens_tensor: Optional[torch.Tensor] = None
+    block_table_bound: Optional[torch.Tensor] = None
+
     # An example for paged_kv_indices, paged_kv_indptr:
     # request 1, page indices [0, 5, 8]
     # request 2, page indices [1, 6, 7]
@@ -318,6 +323,8 @@ def begin_forward(self):
             assert self.paged_kv_indices is not None
             assert self.paged_kv_indptr is not None
             assert self.paged_kv_last_page_len is not None
+            assert self.block_table_bound is not None
+            assert self.seq_lens_tensor is not None
             batch_size = self.query_start_loc.shape[0] - 1
             assert batch_size >= 0
             # We will use flash attention for profiling to
@@ -327,6 +334,8 @@ def begin_forward(self):
                 self.paged_kv_indptr = self.paged_kv_indptr.to(self.device)
                 self.paged_kv_last_page_len = self.paged_kv_last_page_len.to(
                     self.device)
+                self.block_table_bound = self.block_table_bound.to(self.device)
+                self.seq_lens_tensor = self.seq_lens_tensor.to(self.device)
                 self.paged_kv_indices = self.paged_kv_indices.to(self.device)
                 self.prefill_wrapper.end_forward()
                 self.prefill_wrapper.begin_forward(
@@ -335,14 +344,18 @@ def begin_forward(self):
                     self.num_qo_heads, self.num_kv_heads, self.head_dim,
                     self.page_size)
         else:
-            if not self.use_cuda_graph:
-                assert self.paged_kv_indices is not None
-                assert self.paged_kv_indptr is not None
-                assert self.paged_kv_last_page_len is not None
-                self.paged_kv_indices = self.paged_kv_indices.to(self.device)
-                self.paged_kv_indptr = self.paged_kv_indptr.to(self.device)
-                self.paged_kv_last_page_len = self.paged_kv_last_page_len.to(
-                    self.device)
+            assert self.paged_kv_indices is not None
+            assert self.paged_kv_indptr is not None
+            assert self.paged_kv_last_page_len is not None
+            self.paged_kv_indices = self.paged_kv_indices.to(self.device)
+            self.paged_kv_indptr = self.paged_kv_indptr.to(self.device)
+            self.paged_kv_last_page_len = self.paged_kv_last_page_len.to(
+                self.device)
+            # handle model warmup path
+            if self.block_table_bound is not None:
+                self.block_table_bound = self.block_table_bound.to(self.device)
+            if self.seq_lens_tensor is not None:
+                self.seq_lens_tensor = self.seq_lens_tensor.to(self.device)
 
             assert self.decode_wrapper is not None
             self.decode_wrapper.end_forward()
@@ -391,6 +404,48 @@ def decode_metadata(self) -> Optional["FlashInferMetadata"]:
 
         return self
 
+    def advance_step(
+        self,
+        model_input: "ModelInputForGPUWithSamplingMetadata",
+        sampled_token_ids: Optional[torch.Tensor],
+        block_size: int,
+        num_seqs: int,
+        num_queries: int,
+    ):
+        """
+        Update metadata in-place to advance one decode step.
+        """
+
+        assert num_seqs > 0
+        assert num_queries > 0
+        assert model_input.attn_metadata is not None
+        assert sampled_token_ids is not None
+
+        # When using cudagraph, the num_seqs is padded to the next captured
+        # batch sized, but num_queries tracks the actual number of requests in
+        # the batch. For --enforce-eager mode, num_seqs == num_queries
+        if num_seqs != num_queries:
+            assert num_seqs > num_queries
+            assert self.use_cuda_graph
+
+        model_input.input_tokens[:num_queries] = sampled_token_ids.flatten()
+
+        # Update GPU tensors
+        ops.advance_step_flashinfer(
+            num_seqs=num_seqs,
+            num_queries=num_queries,
+            block_size=block_size,
+            input_tokens=model_input.input_tokens,
+            sampled_token_ids=model_input.input_tokens,
+            input_positions=model_input.input_positions,
+            seq_lens=self.seq_lens_tensor,
+            slot_mapping=self.slot_mapping,
+            block_tables=self.block_tables,
+            paged_kv_indices=self.paged_kv_indices,
+            paged_kv_indptr=self.paged_kv_indptr,
+            paged_kv_last_page_len=self.paged_kv_last_page_len,
+            block_table_bound=self.block_table_bound)
+
 
 class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
 
@@ -428,7 +483,7 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"):
         self.paged_kv_indptr: List[int] = [0]
         # paged_kv_last_page_len is the length of the last page of each request
         self.paged_kv_last_page_len: List[int] = []
-
+        self.total_blocks = 0
         self.is_profile_run: bool = False
 
     def _add_seq_group(
@@ -499,6 +554,7 @@ def _update_paged_kv_tensors(self, block_table: List[int], seq_len: int):
         # block_table_bound is 1 with 1 valid block.
         # If seq_len = 15, block_size = 16,
         # block_table_bound is 0 + 1 with 1 valid block.
+        self.total_blocks += len(block_table)
         block_table_bound = seq_len // self.block_size + 1 \
                             if seq_len % self.block_size != 0 \
                             else seq_len // self.block_size
@@ -541,9 +597,19 @@ def build(self, seq_lens: List[int], query_lens: List[int],
             # The shape of graph_block_tables is
             # [max batch size, max context len // block size].
             input_block_tables = self.runner.graph_block_tables[:batch_size]
+            max_blocks = input_block_tables.shape[1]
             for i, block_table in enumerate(self.block_tables):
                 if block_table:
-                    input_block_tables[i, :len(block_table)] = block_table
+                    num_blocks = len(block_table)
+                    if num_blocks <= max_blocks:
+                        input_block_tables[i, :num_blocks] = block_table
+                    else:
+                        # It may be possible to have more blocks allocated due
+                        # to lookahead slots of multi-step, however, they are
+                        # not used anyway, so can be safely ignored.
+                        input_block_tables[
+                            i, :max_blocks] = block_table[:max_blocks]
+
             block_tables = torch.from_numpy(input_block_tables).to(
                 device, non_blocking=True)
 
@@ -583,6 +649,10 @@ def build(self, seq_lens: List[int], query_lens: List[int],
                      out=query_start_loc[1:])
 
         if len(self.paged_kv_indptr) > 0:
+            # extend to the maximum number of blocks as returned by the
+            # scheduler
+            self.paged_kv_indices.extend(
+                [0] * (self.total_blocks - len(self.paged_kv_indices)))
             paged_kv_indices_tensor = torch.tensor(self.paged_kv_indices,
                                                    device="cpu",
                                                    dtype=torch.int)
@@ -591,10 +661,15 @@ def build(self, seq_lens: List[int], query_lens: List[int],
                                                   dtype=torch.int)
             paged_kv_last_page_len_tensor = torch.tensor(
                 self.paged_kv_last_page_len, device="cpu", dtype=torch.int)
+            block_table_bound_tensor = torch.zeros(len(self.paged_kv_indptr) -
+                                                   1,
+                                                   device="cpu",
+                                                   dtype=torch.int)
         else:
             paged_kv_indices_tensor = None
             paged_kv_indptr_tensor = None
             paged_kv_last_page_len_tensor = None
+            block_table_bound_tensor = None
 
         if self.runner.kv_cache_dtype.startswith("fp8"):
             kv_cache_dtype = FlashInferBackend.get_fp8_dtype_for_flashinfer(
@@ -613,6 +688,8 @@ def build(self, seq_lens: List[int], query_lens: List[int],
             paged_kv_indptr=paged_kv_indptr_tensor,
             paged_kv_indices=paged_kv_indices_tensor,
             paged_kv_last_page_len=paged_kv_last_page_len_tensor,
+            block_table_bound=block_table_bound_tensor,
+            seq_lens_tensor=seq_lens_tensor,
             num_qo_heads=self.runner.model_config.get_num_attention_heads(
                 self.runner.parallel_config),
             num_kv_heads=self.runner.model_config.get_num_kv_heads(
diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py
index 64d60e4e47e48..113a2788eacd3 100644
--- a/vllm/attention/backends/ipex_attn.py
+++ b/vllm/attention/backends/ipex_attn.py
@@ -49,14 +49,18 @@ def swap_blocks(
         dst_kv_cache: torch.Tensor,
         src_to_dst: torch.Tensor,
     ) -> None:
-        PagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst)
+        from vllm._ipex_ops import ipex_ops as ops
+        ops.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst)
 
     @staticmethod
     def copy_blocks(
         kv_caches: List[torch.Tensor],
         src_to_dists: torch.Tensor,
     ) -> None:
-        PagedAttention.copy_blocks(kv_caches, src_to_dists)
+        from vllm._ipex_ops import ipex_ops as ops
+        key_caches = [kv_cache[0] for kv_cache in kv_caches]
+        value_caches = [kv_cache[1] for kv_cache in kv_caches]
+        ops.copy_blocks(key_caches, value_caches, src_to_dists)
 
 
 @dataclass
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index 412171296839d..9dd74081390a7 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -1,10 +1,11 @@
 """Attention layer ROCm GPUs."""
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Tuple, Type
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type
 
 import torch
 
 import vllm.envs as envs
+from vllm import _custom_ops as ops
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionMetadata, AttentionType)
 from vllm.attention.backends.utils import (CommonAttentionState,
@@ -13,8 +14,14 @@
                                            PagedAttentionMetadata)
 from vllm.logger import init_logger
 
+if TYPE_CHECKING:
+    from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata
+
 logger = init_logger(__name__)
 
+_PARTITION_SIZE = 256
+ON_NAVI = "gfx1" in torch.cuda.get_device_properties("cuda").gcnArchName
+
 
 class ROCmFlashAttentionBackend(AttentionBackend):
 
@@ -175,6 +182,59 @@ def decode_metadata(self) -> Optional["ROCmFlashAttentionMetadata"]:
         )
         return self._cached_decode_metadata
 
+    def advance_step(self, model_input: "ModelInputForGPUWithSamplingMetadata",
+                     sampled_token_ids: Optional[torch.Tensor],
+                     block_size: int, num_seqs: int, num_queries: int):
+        """
+        Update metadata in-place to advance one decode step.
+        """
+        # When using cudagraph, the num_seqs is padded to the next captured
+        # batch sized, but num_queries tracks the actual number of requests in
+        # the batch. For --enforce-eager mode, num_seqs == num_queries
+        if num_seqs != num_queries:
+            assert num_seqs > num_queries
+            assert self.use_cuda_graph
+
+        assert self.num_prefills == 0
+        assert self.num_prefill_tokens == 0
+        assert self.num_decode_tokens == num_seqs
+        assert self.slot_mapping.shape == (num_seqs, )
+
+        assert self.seq_lens is not None
+        assert len(self.seq_lens) == num_seqs
+        assert self.seq_lens_tensor is not None
+        assert self.seq_lens_tensor.shape == (num_seqs, )
+        assert self.max_query_len == 1
+        assert self.max_prefill_seq_len == 0
+        assert self.max_decode_seq_len == max(self.seq_lens)
+
+        assert self.query_start_loc is not None
+        assert self.query_start_loc.shape == (num_queries + 1, )
+        assert self.seq_start_loc is not None
+        assert self.seq_start_loc.shape == (num_seqs + 1, )
+
+        assert self.context_lens_tensor is not None
+        assert self.context_lens_tensor.shape == (num_queries, )
+
+        assert self.block_tables is not None
+        assert self.block_tables.shape[0] == num_seqs
+
+        # Update query lengths. Note that we update only queries and not seqs,
+        # since tensors may be padded due to captured cuda graph batch size
+        for i in range(num_queries):
+            self.seq_lens[i] += 1
+        self.max_decode_seq_len = max(self.seq_lens)
+
+        ops.advance_step_flashattn(num_seqs=num_seqs,
+                                   num_queries=num_queries,
+                                   block_size=block_size,
+                                   input_tokens=model_input.input_tokens,
+                                   sampled_token_ids=sampled_token_ids,
+                                   input_positions=model_input.input_positions,
+                                   seq_lens=self.seq_lens_tensor,
+                                   slot_mapping=self.slot_mapping,
+                                   block_tables=self.block_tables)
+
 
 class ROCmFlashAttentionMetadataBuilder(
         CommonMetadataBuilder[ROCmFlashAttentionMetadata]):
@@ -480,20 +540,50 @@ def forward(
 
         if decode_meta := attn_metadata.decode_metadata:
             # Decoding run.
-            output[num_prefill_tokens:] = PagedAttention.forward_decode(
-                decode_query,
-                key_cache,
-                value_cache,
-                decode_meta.block_tables,
-                decode_meta.seq_lens_tensor,
-                decode_meta.max_decode_seq_len,
-                self.kv_cache_dtype,
-                self.num_kv_heads,
-                self.scale,
-                self.alibi_slopes,
-                k_scale,
-                v_scale,
-            )
+            # Whether to use rocm custom paged attention or not
+            num_seqs, num_heads, head_size = decode_query.shape
+            block_size = value_cache.shape[3]
+            gqa_ratio = num_heads // self.num_kv_heads
+            use_custom = use_rocm_custom_paged_attention(
+                decode_query.dtype, head_size, block_size, gqa_ratio,
+                decode_meta.max_decode_seq_len)
+            if use_custom:
+                max_seq_len = decode_meta.max_decode_seq_len
+                max_num_partitions = ((max_seq_len + _PARTITION_SIZE - 1) //
+                                      _PARTITION_SIZE)
+                assert _PARTITION_SIZE % block_size == 0
+                tmp_output = torch.empty(
+                    size=(num_seqs, num_heads, max_num_partitions, head_size),
+                    dtype=output.dtype,
+                    device=output.device,
+                )
+                exp_sums = torch.empty(
+                    size=(num_seqs, num_heads, max_num_partitions),
+                    dtype=torch.float32,
+                    device=output.device,
+                )
+                max_logits = torch.empty_like(exp_sums)
+                ops.paged_attention_rocm(
+                    output[num_prefill_tokens:], exp_sums, max_logits,
+                    tmp_output, decode_query, key_cache, value_cache,
+                    self.num_kv_heads, self.scale, decode_meta.block_tables,
+                    decode_meta.seq_lens_tensor, block_size, max_seq_len,
+                    self.alibi_slopes, self.kv_cache_dtype, k_scale, v_scale)
+            else:
+                output[num_prefill_tokens:] = PagedAttention.forward_decode(
+                    decode_query,
+                    key_cache,
+                    value_cache,
+                    decode_meta.block_tables,
+                    decode_meta.seq_lens_tensor,
+                    decode_meta.max_decode_seq_len,
+                    self.kv_cache_dtype,
+                    self.num_kv_heads,
+                    self.scale,
+                    self.alibi_slopes,
+                    k_scale,
+                    v_scale,
+                )
 
         # Reshape the output tensor.
         return output.view(num_tokens, hidden_size)
@@ -532,3 +622,14 @@ def _sdpa_attention(
             start = end
 
     return output
+
+
+def use_rocm_custom_paged_attention(qtype: torch.dtype, head_size: int,
+                                    block_size: int, gqa_ratio: int,
+                                    max_seq_len: int) -> bool:
+    # rocm custom page attention not support on navi (gfx1*)
+    return (envs.VLLM_USE_ROCM_CUSTOM_PAGED_ATTN and not ON_NAVI
+            and (qtype == torch.half or qtype == torch.bfloat16)
+            and (head_size == 64 or head_size == 128)
+            and (block_size == 16 or block_size == 32)
+            and (gqa_ratio >= 1 and gqa_ratio <= 16) and max_seq_len <= 32768)
diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py
index b055319c8a5ac..92023d5b75f5a 100644
--- a/vllm/attention/ops/paged_attn.py
+++ b/vllm/attention/ops/paged_attn.py
@@ -8,15 +8,9 @@
 
 if HAS_TRITON:
     from vllm.attention.ops.prefix_prefill import context_attention_fwd
-from vllm.envs import VLLM_USE_ROCM_CUSTOM_PAGED_ATTN
-from vllm.utils import is_hip
-
-custom_attn_available = is_hip() and VLLM_USE_ROCM_CUSTOM_PAGED_ATTN and \
-    "gfx1" not in torch.cuda.get_device_properties('cuda').gcnArchName
 
 # Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`.
-_PARTITION_SIZE_V1V2 = 512
-_PARTITION_SIZE_CUSTOM = 512
+_PARTITION_SIZE = 512
 
 
 @dataclass
@@ -120,16 +114,6 @@ def forward_decode(
         output = torch.empty_like(query)
         block_size = value_cache.shape[3]
         num_seqs, num_heads, head_size = query.shape
-        gqa_ratio = num_heads // num_kv_heads
-        use_custom = (custom_attn_available
-                      and query.dtype in (torch.half, torch.bfloat16)
-                      and head_size in (64, 128) and block_size in (16, 32)
-                      and (gqa_ratio >= 1 and gqa_ratio <= 16)
-                      and max_seq_len <= 32768)
-        if not use_custom:
-            _PARTITION_SIZE = _PARTITION_SIZE_V1V2
-        else:
-            _PARTITION_SIZE = _PARTITION_SIZE_CUSTOM
         max_num_partitions = ((max_seq_len + _PARTITION_SIZE - 1) //
                               _PARTITION_SIZE)
         # NOTE(woosuk): We use a simple heuristic to decide whether to use
@@ -140,8 +124,8 @@ def forward_decode(
         # TODO(woosuk): Tune this heuristic.
         # For context len > 8192, use V2 kernel to avoid shared memory shortage.
         use_v1 = (max_seq_len <= 8192
-                  and (max_num_partitions == 1 or num_seqs * num_heads > 512)
-                  and not use_custom)
+                  and (max_num_partitions == 1 or num_seqs * num_heads > 512))
+
         if use_v1:
             # Run PagedAttention V1.
             ops.paged_attention_v1(
@@ -166,7 +150,7 @@ def forward_decode(
                 blocksparse_head_sliding_step,
             )
         else:
-            # Run PagedAttention V2 or PagedAttention Custom.
+            # Run PagedAttention V2.
             assert _PARTITION_SIZE % block_size == 0
             tmp_output = torch.empty(
                 size=(num_seqs, num_heads, max_num_partitions, head_size),
@@ -179,38 +163,30 @@ def forward_decode(
                 device=output.device,
             )
             max_logits = torch.empty_like(exp_sums)
-            if not use_custom:
-                ops.paged_attention_v2(
-                    output,
-                    exp_sums,
-                    max_logits,
-                    tmp_output,
-                    query,
-                    key_cache,
-                    value_cache,
-                    num_kv_heads,
-                    scale,
-                    block_tables,
-                    seq_lens,
-                    block_size,
-                    max_seq_len,
-                    alibi_slopes,
-                    kv_cache_dtype,
-                    k_scale,
-                    v_scale,
-                    tp_rank,
-                    blocksparse_local_blocks,
-                    blocksparse_vert_stride,
-                    blocksparse_block_size,
-                    blocksparse_head_sliding_step,
-                )
-            else:
-                ops.paged_attention_custom(output, exp_sums, max_logits,
-                                           tmp_output, query, key_cache,
-                                           value_cache, num_kv_heads, scale,
-                                           block_tables, seq_lens, block_size,
-                                           max_seq_len, alibi_slopes,
-                                           kv_cache_dtype, k_scale, v_scale)
+            ops.paged_attention_v2(
+                output,
+                exp_sums,
+                max_logits,
+                tmp_output,
+                query,
+                key_cache,
+                value_cache,
+                num_kv_heads,
+                scale,
+                block_tables,
+                seq_lens,
+                block_size,
+                max_seq_len,
+                alibi_slopes,
+                kv_cache_dtype,
+                k_scale,
+                v_scale,
+                tp_rank,
+                blocksparse_local_blocks,
+                blocksparse_vert_stride,
+                blocksparse_block_size,
+                blocksparse_head_sliding_step,
+            )
         return output
 
     @staticmethod
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
new file mode 100644
index 0000000000000..de0b1d8a75757
--- /dev/null
+++ b/vllm/compilation/backends.py
@@ -0,0 +1,156 @@
+import operator
+
+import torch
+import torch.fx as fx
+
+
+def fix_functionalization(graph: fx.Graph):
+    """
+    Rewrite the graph module to replace the pattern involving
+    torch._higher_order_ops.auto_functionalize.auto_functionalized
+    with a direct call to the inplace custom op.
+
+    # TODO: check if PyTorch nightly has fixed this issue
+    """
+
+    # debug code, if we want to see the graph before the transformation
+    # with open("before.py", "w") as f:
+    #     print(graph.python_code(root_module="self", verbose=True).src, file=f)
+
+    nodes_to_remove = []
+
+    for node in graph.nodes:
+        # Identify the auto_functionalized node
+        if node.op == 'call_function' and node.target == torch._higher_order_ops.auto_functionalize.auto_functionalized:  # noqa
+            if node.args[0] == torch.ops._C.rotary_embedding.default:
+                # manual replace for rotary_embedding
+
+                # Now, collect the arguments
+                kwargs = node.kwargs
+
+                query = kwargs['query']
+                mm_node = query.args[0].args[0]
+
+                # Create a new call to torch.ops._C.rotary_embedding.default
+                with graph.inserting_before(node):
+                    # just insert the call to the custom op
+                    # NOTE: don't run dead code elimination,
+                    # otherwise this op will be removed
+                    graph.call_function(torch.ops._C.rotary_embedding.default,
+                                        kwargs=kwargs)
+
+                # Remove the auto_functionalized node
+                # Since the node may have outputs, we need to handle its users
+                # Replace uses of the outputs (getitem nodes) with mm_node
+                for user in list(node.users):
+                    if user.op == 'call_function' and user.target == operator.getitem:  # noqa
+                        # Remove the getitem node
+                        for getitem_user in list(user.users):
+                            if (getitem_user.op == 'call_function'
+                                    and getitem_user.target
+                                    == torch.ops.aten.slice_scatter.default):
+                                # Replace the uses of slice_scatter node
+                                # with mm_node
+                                getitem_user.replace_all_uses_with(mm_node)
+                                nodes_to_remove.append(getitem_user)
+                        nodes_to_remove.append(user)
+                nodes_to_remove.append(node)
+
+            elif node.args[0] == torch.ops._C.fused_add_rms_norm.default:
+                # manual replace for fused_add_rms_norm
+                # this is the most effective optimization for llama
+                # failing to do this will result in many unnecessary copies
+
+                kwargs = node.kwargs
+
+                input = kwargs['input']
+                residual = kwargs['residual']
+
+                # Create a new call to torch.ops._C.rotary_embedding.default
+                with graph.inserting_before(node):
+                    # just insert the call to the custom op
+                    # NOTE: don't run dead code elimination,
+                    # otherwise this op will be removed
+                    graph.call_function(
+                        torch.ops._C.fused_add_rms_norm.default, kwargs=kwargs)
+
+                for user in list(node.users):
+                    if user.op == 'call_function' and user.target == operator.getitem:  # noqa
+                        # Remove the getitem node
+                        if user.args[1] == 1:
+                            replace_node = input
+                        elif user.args[1] == 2:
+                            replace_node = residual
+                        user.replace_all_uses_with(replace_node)
+                        nodes_to_remove.append(user)
+                nodes_to_remove.append(node)
+
+            elif node.args[0] == torch.ops._C.rms_norm.default:
+                # manual replace for rms_norm
+
+                kwargs = node.kwargs
+
+                input = kwargs['input']
+                out = kwargs['out']
+                weight = kwargs['weight']
+                epsilon = kwargs['epsilon']
+                # Create a new call to torch.ops._C.rotary_embedding.default
+                # cannot use kwargs, because we have an `out`, see https://github.com/pytorch/pytorch/blob/a00faf440888ffb724bad413f329a49e2b6388e7/torch/_inductor/lowering.py#L351 # noqa
+                with graph.inserting_before(node):
+                    # just insert the call to the custom op
+                    # NOTE: don't run dead code elimination,
+                    # otherwise this op will be removed
+                    graph.call_function(
+                        torch.ops._C.rms_norm.default,
+                        args=(out, input, weight, epsilon),
+                    )
+
+                replace_node = out
+
+                for user in list(node.users):
+                    if user.op == 'call_function' and user.target == operator.getitem:  # noqa
+                        user.replace_all_uses_with(replace_node)
+                        nodes_to_remove.append(user)
+                nodes_to_remove.append(node)
+
+            elif node.args[0] == torch.ops._C.silu_and_mul.default:
+                # manual replace for silu_and_mul
+
+                kwargs = node.kwargs
+
+                input = kwargs['input']
+                out = kwargs['out']
+
+                # Create a new call to torch.ops._C.rotary_embedding.default
+                # cannot use kwargs, because we have an `out`, see https://github.com/pytorch/pytorch/blob/a00faf440888ffb724bad413f329a49e2b6388e7/torch/_inductor/lowering.py#L351 # noqa
+                with graph.inserting_before(node):
+                    # just insert the call to the custom op
+                    # NOTE: don't run dead code elimination,
+                    # otherwise this op will be removed
+                    graph.call_function(
+                        torch.ops._C.silu_and_mul.default,
+                        args=(out, input),
+                    )
+                replace_node = out
+
+                for user in list(node.users):
+                    if user.op == 'call_function' and user.target == operator.getitem:  # noqa
+                        user.replace_all_uses_with(replace_node)
+                        nodes_to_remove.append(user)
+                nodes_to_remove.append(node)
+
+    # Remove the nodes all at once
+    for node in nodes_to_remove:
+        graph.erase_node(node)
+
+    # debug code, if we want to see the graph after the transformation
+    # with open("after.py", "w") as f:
+    #     print(graph.python_code(root_module="self", verbose=True).src, file=f)
+
+
+def vllm_backend(graph, example_inputs):
+    from torch._inductor import config
+    current_config = config.shallow_copy_dict()
+    from torch._inductor.compile_fx import compile_fx
+    current_config['post_grad_custom_post_pass'] = fix_functionalization
+    return compile_fx(graph, example_inputs, config_patches=current_config)
diff --git a/vllm/config.py b/vllm/config.py
index 46c89a222e6db..1644bc618cb7a 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -281,12 +281,13 @@ def _parse_quant_hf_config(self):
     def _verify_quantization(self) -> None:
         supported_quantization = [*QUANTIZATION_METHODS]
         rocm_supported_quantization = [
-            "awq", "gptq", "fp8", "compressed-tensors", "fbgemm_fp8"
+            "awq", "gptq", "fp8", "compressed_tensors", "compressed-tensors",
+            "fbgemm_fp8"
         ]
         optimized_quantization_methods = [
-            "fp8", "marlin", "gptq_marlin_24", "gptq_marlin", "awq_marlin",
-            "fbgemm_fp8", "compressed_tensors", "compressed-tensors",
-            "experts_int8"
+            "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin",
+            "awq_marlin", "fbgemm_fp8", "compressed_tensors",
+            "compressed-tensors", "experts_int8"
         ]
         tpu_supported_quantization = ["tpu_int8"]
         neuron_supported_quantization = ["neuron_quant"]
@@ -381,7 +382,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config,
             self.use_async_output_proc = False
             return
 
-        if self.enforce_eager:
+        if device_config.device_type == "cuda" and self.enforce_eager:
             logger.warning(
                 "To see benefits of async output processing, enable CUDA "
                 "graph. Since, enforce-eager is enabled, async output "
@@ -775,7 +776,7 @@ class LoadConfig:
         ignore_patterns: The list of patterns to ignore when loading the model.
             Default to "original/**/*" to avoid repeated loading of llama's 
             checkpoints.
-            
+
     """
 
     load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO
@@ -871,6 +872,13 @@ def __init__(
                                  f"distributed executor backend "
                                  f"'{self.distributed_executor_backend}'.")
 
+        if current_platform.is_tpu() and self.world_size > 1:
+            if self.distributed_executor_backend is None:
+                self.distributed_executor_backend = "ray"
+            if self.distributed_executor_backend != "ray":
+                raise ValueError(
+                    "TPU backend only supports Ray for distributed inference.")
+
         if self.distributed_executor_backend is None and self.world_size > 1:
             # We use multiprocessing by default if world_size fits on the
             # current node and we aren't in a ray placement group.
@@ -878,7 +886,8 @@ def __init__(
             from vllm.executor import ray_utils
             backend = "mp"
             ray_found = ray_utils.ray_is_available()
-            if cuda_device_count_stateless() < self.world_size:
+            if (current_platform.is_cuda()
+                    and cuda_device_count_stateless() < self.world_size):
                 if not ray_found:
                     raise ValueError("Unable to load Ray which is "
                                      "required for multi-node inference, "
@@ -1561,14 +1570,6 @@ class PromptAdapterConfig:
     prompt_adapter_dtype: Optional[torch.dtype] = None
 
     def __post_init__(self):
-        library_name = 'peft'
-        try:
-            __import__(library_name)
-        except ImportError as e:
-            raise ImportError(
-                f"'{library_name}' is not installed for prompt adapter support."
-                f"Please install it using 'pip install {library_name}'."
-            ) from e
 
         if self.max_prompt_adapters < 1:
             raise ValueError(f"max_prompt_adapters "
@@ -1744,8 +1745,11 @@ def _get_and_verify_max_len(
                     "with rope_scaling. Please raise an issue so we can "
                     "investigate.")
 
-            assert "factor" in rope_scaling
-            scaling_factor = rope_scaling["factor"]
+            if rope_type == "mrope":
+                scaling_factor = 1
+            else:
+                assert "factor" in rope_scaling
+                scaling_factor = rope_scaling["factor"]
             if rope_type == "yarn":
                 derived_max_model_len = rope_scaling[
                     "original_max_position_embeddings"]
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 5fe48e11ca9e3..0d831657bcad9 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -26,6 +26,16 @@
 
 ALLOWED_DETAILED_TRACE_MODULES = ["model", "worker", "all"]
 
+DEVICE_OPTIONS = [
+    "auto",
+    "cuda",
+    "neuron",
+    "cpu",
+    "openvino",
+    "tpu",
+    "xpu",
+]
+
 
 def nullable_str(val: str):
     if not val or val == "None":
@@ -553,10 +563,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         parser.add_argument("--device",
                             type=str,
                             default=EngineArgs.device,
-                            choices=[
-                                "auto", "cuda", "neuron", "cpu", "openvino",
-                                "tpu", "xpu"
-                            ],
+                            choices=DEVICE_OPTIONS,
                             help='Device type for vLLM execution.')
         parser.add_argument('--num-scheduler-steps',
                             type=int,
@@ -836,6 +843,13 @@ def create_engine_config(self) -> EngineConfig:
         device_config = DeviceConfig(device=self.device)
         model_config = self.create_model_config()
 
+        if model_config.is_multimodal_model:
+            if self.enable_prefix_caching:
+                logger.warning(
+                    "--enable-prefix-caching is currently not "
+                    "supported for multimodal models and has been disabled.")
+            self.enable_prefix_caching = False
+
         cache_config = CacheConfig(
             block_size=self.block_size if self.device != "neuron" else
             self.max_model_len,  # neuron needs block_size = max_model_len
@@ -867,7 +881,10 @@ def create_engine_config(self) -> EngineConfig:
             # If not explicitly set, enable chunked prefill by default for
             # long context (> 32K) models. This is to avoid OOM errors in the
             # initial memory profiling phase.
-            if use_long_context:
+
+            # Chunked prefill is currently disabled for multimodal models by
+            # default.
+            if use_long_context and not model_config.is_multimodal_model:
                 is_gpu = device_config.device_type == "cuda"
                 use_sliding_window = (model_config.get_sliding_window()
                                       is not None)
@@ -878,7 +895,6 @@ def create_engine_config(self) -> EngineConfig:
                 if (is_gpu and not use_sliding_window and not use_spec_decode
                         and not self.enable_lora
                         and not self.enable_prompt_adapter
-                        and not self.enable_prefix_caching
                         and not has_seqlen_agnostic_layers):
                     self.enable_chunked_prefill = True
                     logger.warning(
@@ -1029,7 +1045,6 @@ def create_engine_config(self) -> EngineConfig:
 @dataclass
 class AsyncEngineArgs(EngineArgs):
     """Arguments for asynchronous vLLM engine."""
-    engine_use_ray: bool = False
     disable_log_requests: bool = False
 
     @staticmethod
@@ -1037,16 +1052,6 @@ def add_cli_args(parser: FlexibleArgumentParser,
                      async_args_only: bool = False) -> FlexibleArgumentParser:
         if not async_args_only:
             parser = EngineArgs.add_cli_args(parser)
-        parser.add_argument('--engine-use-ray',
-                            action='store_true',
-                            help='Use Ray to start the LLM engine in a '
-                            'separate process as the server process.'
-                            '(DEPRECATED. This argument is deprecated '
-                            'and will be removed in a future update. '
-                            'Set `VLLM_ALLOW_ENGINE_USE_RAY=1` to force '
-                            'use it. See '
-                            'https://github.com/vllm-project/vllm/issues/7045.'
-                            ')')
         parser.add_argument('--disable-log-requests',
                             action='store_true',
                             help='Disable logging requests.')
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 6ed1a6bba08ea..8a07ce1c965e1 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -4,22 +4,18 @@
 from typing import (Any, AsyncGenerator, Callable, Dict, Iterable, List,
                     Mapping, Optional, Set, Tuple, Type, Union)
 
-from typing_extensions import assert_never
-
 import vllm.envs as envs
 from vllm.config import (DecodingConfig, EngineConfig, LoRAConfig, ModelConfig,
                          ParallelConfig, SchedulerConfig)
 from vllm.core.scheduler import SchedulerOutputs
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_timeout import asyncio_timeout
-from vllm.engine.llm_engine import (DecoderPromptComponents, LLMEngine,
-                                    PromptComponents, SchedulerOutputState)
+from vllm.engine.llm_engine import LLMEngine, SchedulerOutputState
 from vllm.engine.metrics_types import StatLoggerBase
 from vllm.executor.executor_base import ExecutorAsyncBase
-from vllm.executor.ray_utils import initialize_ray_cluster, ray
-from vllm.inputs import (EncoderDecoderLLMInputs, LLMInputs, PromptInputs,
-                         SingletonPromptInputs)
-from vllm.inputs.parse import is_explicit_encoder_decoder_prompt
+from vllm.executor.gpu_executor import GPUExecutorAsync
+from vllm.executor.ray_utils import initialize_ray_cluster
+from vllm.inputs import PromptInputs
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.model_executor.layers.sampler import SamplerOutput
@@ -30,7 +26,6 @@
 from vllm.sequence import ExecuteModelRequest
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import print_warning_once
 
 logger = init_logger(__name__)
 ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S
@@ -404,139 +399,6 @@ async def stop_remote_worker_execution_loop_async(self) -> None:
         """Stop the remote worker execution loop."""
         await self.model_executor.stop_remote_worker_execution_loop_async()
 
-    async def _tokenize_prompt_async(
-        self,
-        prompt: str,
-        request_id: str,
-        lora_request: Optional[LoRARequest],
-    ) -> List[int]:
-        """Async version of :meth:`_tokenize_prompt`."""
-        tokenizer = self.get_tokenizer_group(
-            missing_msg="prompts must be None if skip_tokenizer_init is True")
-
-        return await tokenizer.encode_async(request_id=request_id,
-                                            prompt=prompt,
-                                            lora_request=lora_request)
-
-    async def _extract_prompt_components_async(
-        self,
-        inputs: SingletonPromptInputs,
-        request_id: str,
-        lora_request: Optional[LoRARequest] = None,
-    ) -> PromptComponents:
-        """Async version of :meth:`_extract_prompt_components`."""
-        if isinstance(inputs, str):
-            prompt = inputs
-            prompt_token_ids = await self._tokenize_prompt_async(
-                prompt,
-                request_id=request_id,
-                lora_request=lora_request,
-            )
-            multi_modal_data = None
-        elif isinstance(inputs, dict):
-            if "prompt_token_ids" in inputs:
-                prompt = None
-                prompt_token_ids = inputs["prompt_token_ids"]
-            else:
-                # NOTE: This extra assignment is required to pass mypy
-                prompt = parsed_prompt = inputs["prompt"]
-                prompt_token_ids = await self._tokenize_prompt_async(
-                    parsed_prompt,
-                    request_id=request_id,
-                    lora_request=lora_request,
-                )
-
-            multi_modal_data = inputs.get("multi_modal_data")
-        else:
-            assert_never(inputs)
-
-        return prompt, prompt_token_ids, multi_modal_data
-
-    async def _process_encoder_decoder_prompt_async(
-        self,
-        inputs: PromptInputs,
-        request_id: str,
-    ) -> EncoderDecoderLLMInputs:
-        """Async version of :meth:`_process_encoder_decoder_prompt`."""
-        encoder_comps: PromptComponents
-        decoder_comps: DecoderPromptComponents
-
-        if is_explicit_encoder_decoder_prompt(inputs):
-            encoder_task = self._extract_prompt_components_async(
-                inputs["encoder_prompt"],
-                request_id=request_id,
-            )
-
-            if (decoder_input := inputs["decoder_prompt"]) is None:
-                encoder_comps = await encoder_task
-                decoder_comps = None, None, None
-            else:
-                decoder_task = self._extract_prompt_components_async(
-                    decoder_input,
-                    request_id=request_id,
-                )
-
-                encoder_comps, decoder_comps = await asyncio.gather(
-                    encoder_task, decoder_task)
-        else:
-            encoder_comps = await self._extract_prompt_components_async(
-                inputs,
-                request_id=request_id,
-            )
-
-            decoder_comps = None, None, None
-
-        return self._build_enc_dec_llm_inputs(encoder_comps, decoder_comps)
-
-    async def _process_decoder_only_prompt_async(
-        self,
-        inputs: SingletonPromptInputs,
-        request_id: str,
-        lora_request: Optional[LoRARequest] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-    ) -> LLMInputs:
-        """Async version of :meth:`_process_decoder_only_prompt`."""
-        prompt_comps = await self._extract_prompt_components_async(
-            inputs,
-            request_id=request_id,
-            lora_request=lora_request,
-        )
-
-        return self._build_decoder_only_llm_inputs(
-            prompt_comps,
-            prompt_adapter_request=prompt_adapter_request,
-        )
-
-    async def process_model_inputs_async(
-        self,
-        inputs: PromptInputs,
-        request_id: str,
-        lora_request: Optional[LoRARequest] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-    ) -> Union[LLMInputs, EncoderDecoderLLMInputs]:
-        """Async version of :meth:`process_model_inputs`."""
-        if self.is_encoder_decoder_model():
-            # Encoder-decoder model requires special mapping of
-            # input prompts to encoder & decoder
-            model_inputs = await self._process_encoder_decoder_prompt_async(
-                inputs,
-                request_id=request_id,
-            )
-        else:
-            if is_explicit_encoder_decoder_prompt(inputs):
-                raise ValueError("Cannot pass encoder-decoder prompt "
-                                 "to decoder-only models")
-
-            # Decoder-only operation
-            model_inputs = await self._process_decoder_only_prompt_async(
-                inputs,
-                request_id=request_id,
-                lora_request=lora_request,
-                prompt_adapter_request=prompt_adapter_request,
-            )
-
-        return self.input_processor(model_inputs)
-
     async def add_request_async(
         self,
         request_id: str,
@@ -554,12 +416,13 @@ async def add_request_async(
         if arrival_time is None:
             arrival_time = time.time()
 
-        processed_inputs = await self.process_model_inputs_async(
+        preprocessed_inputs = await self.input_preprocessor.preprocess_async(
             inputs,
             request_id=request_id,
             lora_request=lora_request,
             prompt_adapter_request=prompt_adapter_request,
         )
+        processed_inputs = self.input_processor(preprocessed_inputs)
 
         self._add_processed_request(
             request_id=request_id,
@@ -590,9 +453,6 @@ class AsyncLLMEngine:
         worker_use_ray: Whether to use Ray for model workers. Required for
             distributed execution. Should be the same as
             `parallel_config.worker_use_ray`.
-        engine_use_ray: Whether to make LLMEngine a Ray actor. If so, the
-            async frontend will be executed in a separate process as the
-            model workers.
         log_requests: Whether to log the requests.
         start_engine_loop: If True, the background task to run the engine
             will be automatically started in the generate call.
@@ -604,41 +464,23 @@ class AsyncLLMEngine:
 
     def __init__(self,
                  worker_use_ray: bool,
-                 engine_use_ray: bool,
                  *args,
                  log_requests: bool = True,
                  start_engine_loop: bool = True,
                  **kwargs) -> None:
         self.worker_use_ray = worker_use_ray
-        self.engine_use_ray = engine_use_ray
         self.log_requests = log_requests
-        self.engine = self._init_engine(*args, **kwargs)
+        self.engine = self._engine_class(*args, **kwargs)
 
         # This ensures quick processing of request outputs
         # so the append to asyncio queues is not delayed,
         # especially for multi-step.
         #
-        # TODO: Currently, disabled for engine_use_ray, ask
-        # Cody/Will/Woosuk about this case.
-        self.use_process_request_outputs_callback = not self.engine_use_ray
+        self.use_process_request_outputs_callback = True
         if self.use_process_request_outputs_callback:
             self.engine.process_request_outputs_callback = \
                 self.process_request_outputs
 
-        if self.engine_use_ray:
-            print_warning_once(
-                "DEPRECATED. `--engine-use-ray` is deprecated and will "
-                "be removed in a future update. "
-                "See https://github.com/vllm-project/vllm/issues/7045.")
-
-            if envs.VLLM_ALLOW_ENGINE_USE_RAY:
-                print_warning_once(
-                    "VLLM_ALLOW_ENGINE_USE_RAY is set, force engine use Ray")
-            else:
-                raise ValueError("`--engine-use-ray` is deprecated. "
-                                 "Set `VLLM_ALLOW_ENGINE_USE_RAY=1` to "
-                                 "force use it")
-
         self.background_loop: Optional[asyncio.Future] = None
         # We need to keep a reference to unshielded
         # task as well to prevent it from being garbage
@@ -725,16 +567,11 @@ def from_engine_args(
         # Create the engine configs.
         engine_config = engine_args.create_engine_config()
 
-        if engine_args.engine_use_ray:
-            from vllm.executor import ray_utils
-            ray_utils.assert_ray_available()
-
         executor_class = cls._get_executor_cls(engine_config)
 
         # Create the async LLM engine.
         engine = cls(
             executor_class.uses_ray,
-            engine_args.engine_use_ray,
             **engine_config.to_dict(),
             executor_class=executor_class,
             log_requests=not engine_args.disable_log_requests,
@@ -777,10 +614,6 @@ async def get_tokenizer(
         self,
         lora_request: Optional[LoRARequest] = None,
     ) -> AnyTokenizer:
-        if self.engine_use_ray:
-            return await self.engine.get_tokenizer.remote(  # type: ignore
-                lora_request)
-
         return await (self.engine.get_tokenizer_group().
                       get_lora_tokenizer_async(lora_request))
 
@@ -814,26 +647,6 @@ def shutdown_background_loop(self) -> None:
             self._background_loop_unshielded = None
         self.background_loop = None
 
-    def _init_engine(self, *args,
-                     **kwargs) -> Union[_AsyncLLMEngine, "ray.ObjectRef"]:
-        if not self.engine_use_ray:
-            engine_class = self._engine_class
-        elif self.worker_use_ray:
-            engine_class = ray.remote(num_cpus=0)(self._engine_class).remote
-        else:
-            # FIXME(woosuk): This is a bit hacky. Be careful when changing the
-            # order of the arguments.
-            cache_config = kwargs["cache_config"]
-            parallel_config = kwargs["parallel_config"]
-            if (parallel_config.tensor_parallel_size == 1
-                    and parallel_config.pipeline_parallel_size == 1):
-                num_gpus = cache_config.gpu_memory_utilization
-            else:
-                num_gpus = 1
-            engine_class = ray.remote(num_gpus=num_gpus)(
-                self._engine_class).remote
-        return engine_class(*args, **kwargs)
-
     async def engine_step(self, virtual_engine: int) -> bool:
         """Kick the engine to process the waiting requests.
 
@@ -844,13 +657,8 @@ async def engine_step(self, virtual_engine: int) -> bool:
 
         for new_request in new_requests:
             # Add the request into the vLLM engine's waiting queue.
-            # TODO: Maybe add add_request_batch to reduce Ray overhead
             try:
-                if self.engine_use_ray:
-                    await self.engine.add_request.remote(  # type: ignore
-                        **new_request)
-                else:
-                    await self.engine.add_request_async(**new_request)
+                await self.engine.add_request_async(**new_request)
             except ValueError as e:
                 # TODO: use a vLLM specific error for failed validation
                 self._request_tracker.process_exception(
@@ -862,10 +670,7 @@ async def engine_step(self, virtual_engine: int) -> bool:
         if aborted_requests:
             await self._engine_abort(aborted_requests)
 
-        if self.engine_use_ray:
-            request_outputs = await self.engine.step.remote()  # type: ignore
-        else:
-            request_outputs = await self.engine.step_async(virtual_engine)
+        request_outputs = await self.engine.step_async(virtual_engine)
 
         # Put the outputs into the corresponding streams.
         # If used as a callback, then already invoked inside
@@ -891,16 +696,10 @@ def process_request_outputs(self, request_outputs) -> bool:
         return all_finished
 
     async def _engine_abort(self, request_ids: Iterable[str]):
-        if self.engine_use_ray:
-            await self.engine.abort_request.remote(request_ids)  # type: ignore
-        else:
-            self.engine.abort_request(request_ids)
+        self.engine.abort_request(request_ids)
 
     async def run_engine_loop(self):
-        if self.engine_use_ray:
-            pipeline_parallel_size = 1  # type: ignore
-        else:
-            pipeline_parallel_size = \
+        pipeline_parallel_size = \
                 self.engine.parallel_config.pipeline_parallel_size
         has_requests_in_progress = [False] * pipeline_parallel_size
         while True:
@@ -912,12 +711,7 @@ async def run_engine_loop(self):
                 # timeout, and unblocks the RPC thread in the workers so that
                 # they can process any other queued control plane messages,
                 # such as add/remove lora adapters.
-                if self.engine_use_ray:
-                    await (self.engine.stop_remote_worker_execution_loop.
-                           remote()  # type: ignore
-                           )
-                else:
-                    await self.engine.stop_remote_worker_execution_loop_async()
+                await self.engine.stop_remote_worker_execution_loop_async()
                 await self._request_tracker.wait_for_new_requests()
                 logger.debug("Got new requests!")
                 requests_in_progress = [
@@ -938,17 +732,9 @@ async def run_engine_loop(self):
                 for task in done:
                     result = task.result()
                     virtual_engine = requests_in_progress.index(task)
-                    if self.engine_use_ray:
-                        has_unfinished_requests = (
-                            await (self.engine.
-                                   has_unfinished_requests_for_virtual_engine.
-                                   remote(  # type: ignore
-                                       virtual_engine)))
-                    else:
-                        has_unfinished_requests = (
-                            self.engine.
-                            has_unfinished_requests_for_virtual_engine(
-                                virtual_engine))
+                    has_unfinished_requests = (
+                        self.engine.has_unfinished_requests_for_virtual_engine(
+                            virtual_engine))
                     if result or has_unfinished_requests:
                         requests_in_progress[virtual_engine] = (
                             asyncio.create_task(
@@ -1190,52 +976,29 @@ def _abort(self, request_id: str) -> None:
 
     async def get_model_config(self) -> ModelConfig:
         """Get the model configuration of the vLLM engine."""
-        if self.engine_use_ray:
-            return await self.engine.get_model_config.remote()  # type: ignore
-        else:
-            return self.engine.get_model_config()
+        return self.engine.get_model_config()
 
     async def get_parallel_config(self) -> ParallelConfig:
         """Get the parallel configuration of the vLLM engine."""
-        if self.engine_use_ray:
-            return await self.engine.get_parallel_config.remote(  # type: ignore
-            )
-        else:
-            return self.engine.get_parallel_config()
+        return self.engine.get_parallel_config()
 
     async def get_decoding_config(self) -> DecodingConfig:
         """Get the decoding configuration of the vLLM engine."""
-        if self.engine_use_ray:
-            return await self.engine.get_decoding_config.remote(  # type: ignore
-            )
-        else:
-            return self.engine.get_decoding_config()
+        return self.engine.get_decoding_config()
 
     async def get_scheduler_config(self) -> SchedulerConfig:
         """Get the scheduling configuration of the vLLM engine."""
-        if self.engine_use_ray:
-            return await self.engine.get_scheduler_config.remote(  # type: ignore
-            )
-        else:
-            return self.engine.get_scheduler_config()
+        return self.engine.get_scheduler_config()
 
     async def get_lora_config(self) -> LoRAConfig:
         """Get the lora configuration of the vLLM engine."""
-        if self.engine_use_ray:
-            return await self.engine.get_lora_config.remote(  # type: ignore
-            )
-        else:
-            return self.engine.get_lora_config()
+        return self.engine.get_lora_config()
 
     async def do_log_stats(
             self,
             scheduler_outputs: Optional[SchedulerOutputs] = None,
             model_output: Optional[List[SamplerOutput]] = None) -> None:
-        if self.engine_use_ray:
-            await self.engine.do_log_stats.remote(  # type: ignore
-                scheduler_outputs, model_output)
-        else:
-            self.engine.do_log_stats()
+        self.engine.do_log_stats()
 
     async def check_health(self) -> None:
         """Raises an error if engine is unhealthy."""
@@ -1244,40 +1007,30 @@ async def check_health(self) -> None:
         if self.is_stopped:
             raise AsyncEngineDeadError("Background loop is stopped.")
 
-        if self.engine_use_ray:
-            try:
-                await self.engine.check_health.remote()  # type: ignore
-            except ray.exceptions.RayActorError as e:
-                raise RuntimeError("Engine is dead.") from e
-        else:
-            await self.engine.check_health_async()
+        await self.engine.check_health_async()
         logger.debug("Health check took %fs", time.perf_counter() - t)
 
     async def is_tracing_enabled(self) -> bool:
-        if self.engine_use_ray:
-            return await self.engine.is_tracing_enabled.remote(  # type: ignore
-            )
-        else:
-            return self.engine.is_tracing_enabled()
+        return self.engine.is_tracing_enabled()
 
     def add_logger(self, logger_name: str, logger: StatLoggerBase) -> None:
-        if self.engine_use_ray:
-            ray.get(
-                self.engine.add_logger.remote(  # type: ignore
-                    logger_name=logger_name, logger=logger))
-        else:
-            self.engine.add_logger(logger_name=logger_name, logger=logger)
+        self.engine.add_logger(logger_name=logger_name, logger=logger)
 
     def remove_logger(self, logger_name: str) -> None:
-        if self.engine_use_ray:
-            ray.get(
-                self.engine.remove_logger.remote(  # type: ignore
-                    logger_name=logger_name))
-        else:
-            self.engine.remove_logger(logger_name=logger_name)
+        self.engine.remove_logger(logger_name=logger_name)
 
     async def start_profile(self) -> None:
-        self.engine.model_executor._run_workers("start_profile")
+        # using type instead of isinstance to check to avoid capturing
+        # inherited classes
+        if type(self.engine.model_executor) == GPUExecutorAsync:
+            self.engine.model_executor.start_profile()
+        else:
+            self.engine.model_executor._run_workers("start_profile")
 
     async def stop_profile(self) -> None:
-        self.engine.model_executor._run_workers("stop_profile")
+        # using type instead of isinstance to check to avoid capturing
+        # inherited classes
+        if type(self.engine.model_executor) == GPUExecutorAsync:
+            self.engine.model_executor.stop_profile()
+        else:
+            self.engine.model_executor._run_workers("stop_profile")
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 49ef3fcbc036a..374bbf6287bc6 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -3,13 +3,13 @@
 from collections import deque
 from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import (TYPE_CHECKING, Any, ClassVar, Deque, Dict, Iterable, List,
-                    Mapping, NamedTuple, Optional)
+from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Deque, Dict,
+                    Iterable, List, Mapping, NamedTuple, Optional)
 from typing import Sequence as GenericSequence
-from typing import Set, Tuple, Type, Union
+from typing import Set, Type, Union
 
 import torch
-from typing_extensions import TypeVar, assert_never
+from typing_extensions import TypeVar
 
 import vllm.envs as envs
 from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig,
@@ -26,20 +26,19 @@
 from vllm.engine.output_processor.stop_checker import StopChecker
 from vllm.engine.output_processor.util import create_output_by_sequence_group
 from vllm.executor.executor_base import ExecutorBase
+from vllm.executor.gpu_executor import GPUExecutor
 from vllm.executor.ray_utils import initialize_ray_cluster
 from vllm.inputs import (INPUT_REGISTRY, EncoderDecoderLLMInputs,
-                         InputRegistry, LLMInputs, PromptInputs,
-                         SingletonPromptInputs)
-from vllm.inputs.parse import is_explicit_encoder_decoder_prompt
+                         InputRegistry, LLMInputs, PromptInputs)
+from vllm.inputs.preprocess import InputPreprocessor
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.multimodal import MultiModalDataDict
 from vllm.outputs import (EmbeddingRequestOutput, RequestOutput,
                           RequestOutputFactory)
 from vllm.pooling_params import PoolingParams
 from vllm.prompt_adapter.request import PromptAdapterRequest
-from vllm.sampling_params import SamplingParams
+from vllm.sampling_params import RequestOutputKind, SamplingParams
 from vllm.sequence import (EmbeddingSequenceGroupOutput, ExecuteModelRequest,
                            Sequence, SequenceGroup, SequenceGroupMetadata,
                            SequenceStatus)
@@ -75,11 +74,6 @@ def _load_generation_config_dict(model_config: ModelConfig) -> Dict[str, Any]:
 _G = TypeVar("_G", bound=BaseTokenizerGroup, default=BaseTokenizerGroup)
 _O = TypeVar("_O", RequestOutput, EmbeddingRequestOutput)
 
-PromptComponents = Tuple[Optional[str], List[int],
-                         Optional[MultiModalDataDict]]
-DecoderPromptComponents = Tuple[Optional[str], Optional[List[int]],
-                                Optional[MultiModalDataDict]]
-
 
 @dataclass
 class SchedulerOutputState:
@@ -225,9 +219,6 @@ def __init__(
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
         stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
         input_registry: InputRegistry = INPUT_REGISTRY,
-        # To improve performance, only final requests outputs may be required.
-        # If this set to true, then no intermediate outputs will be returned.
-        step_return_finished_only: bool = False,
     ) -> None:
         logger.info(
             "Initializing an LLM engine (v%s) with config: "
@@ -295,7 +286,6 @@ def __init__(
         self.observability_config = observability_config or ObservabilityConfig(
         )
         self.log_stats = log_stats
-        self.step_return_finished_only = step_return_finished_only
 
         if not self.model_config.skip_tokenizer_init:
             self.tokenizer = self._init_tokenizer()
@@ -317,6 +307,9 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer:
         self.generation_config_fields = _load_generation_config_dict(
             model_config)
 
+        self.input_preprocessor = InputPreprocessor(model_config,
+                                                    self.tokenizer)
+
         self.input_registry = input_registry
         self.input_processor = input_registry.create_input_processor(
             model_config)
@@ -397,7 +390,7 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer:
 
         # Currently used by AsyncLLMEngine to ensure quick append
         # of request outputs to asyncio queues
-        self.process_request_outputs_callback = None
+        self.process_request_outputs_callback: Optional[Callable] = None
 
         # Create the scheduler.
         # NOTE: the cache_config here have been updated with the numbers of
@@ -575,19 +568,15 @@ def __del__(self):
         if model_executor := getattr(self, "model_executor", None):
             model_executor.shutdown()
 
-    MISSING_TOKENIZER_GROUP_MSG = ("Unable to get tokenizer because "
-                                   "skip_tokenizer_init is True")
-
     def get_tokenizer_group(
         self,
         group_type: Type[_G] = BaseTokenizerGroup,
-        *,
-        missing_msg: str = MISSING_TOKENIZER_GROUP_MSG,
     ) -> _G:
         tokenizer_group = self.tokenizer
 
         if tokenizer_group is None:
-            raise ValueError(missing_msg)
+            raise ValueError("Unable to get tokenizer because "
+                             "skip_tokenizer_init is True")
         if not isinstance(tokenizer_group, group_type):
             raise TypeError("Invalid type of tokenizer group. "
                             f"Expected type: {group_type}, but "
@@ -619,52 +608,6 @@ def _verify_args(self) -> None:
             self.prompt_adapter_config.verify_with_model_config(
                 self.model_config)
 
-    def _get_bos_token_id(self,
-                          lora_request: Optional[LoRARequest] = None
-                          ) -> Optional[int]:
-        if self.tokenizer is None:
-            logger.warning("Using None for BOS token id because tokenizer "
-                           "is not initialized")
-            return None
-
-        return self.tokenizer.get_lora_tokenizer(lora_request).bos_token_id
-
-    def _get_eos_token_id(self,
-                          lora_request: Optional[LoRARequest] = None
-                          ) -> Optional[int]:
-        if self.tokenizer is None:
-            logger.warning("Using None for EOS token id because tokenizer "
-                           "is not initialized")
-            return None
-
-        return self.tokenizer.get_lora_tokenizer(lora_request).eos_token_id
-
-    def _get_decoder_start_token_id(self) -> Optional[int]:
-        '''
-        Obtain the decoder start token id employed by an encoder/decoder
-        model. Returns None for non-encoder/decoder models or if the
-        model config is unavailable.
-        '''
-
-        if not self.is_encoder_decoder_model():
-            logger.warning("Using None for decoder start token id because "
-                           "this is not an encoder/decoder model.")
-            return None
-
-        if (self.model_config is None or self.model_config.hf_config is None):
-            logger.warning("Using None for decoder start token id because "
-                           "model config is not available.")
-            return None
-
-        dec_start_token_id = getattr(self.model_config.hf_config,
-                                     'decoder_start_token_id', None)
-        if dec_start_token_id is None:
-            logger.warning("Falling back on <BOS> for decoder start token id "
-                           "because decoder start token id is not available.")
-            dec_start_token_id = self._get_bos_token_id()
-
-        return dec_start_token_id
-
     def _add_processed_request(
         self,
         request_id: str,
@@ -679,7 +622,7 @@ def _add_processed_request(
         # Create the sequences.
         block_size = self.cache_config.block_size
         seq_id = next(self.seq_counter)
-        eos_token_id = self._get_eos_token_id(lora_request)
+        eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request)
 
         seq = Sequence(seq_id, processed_inputs, block_size, eos_token_id,
                        lora_request, prompt_adapter_request)
@@ -729,334 +672,6 @@ def _add_processed_request(
     def stop_remote_worker_execution_loop(self) -> None:
         self.model_executor.stop_remote_worker_execution_loop()
 
-    _LLMInputComponentsType = Tuple[str, List[int]]
-
-    def _prepare_decoder_input_ids_for_generation(
-        self,
-        decoder_input_ids: Optional[List[int]],
-    ) -> List[int]:
-        """
-        Prepares `decoder_input_ids` for generation with encoder-decoder models.
-
-        Based on
-
-        https://github.com/huggingface/transformers/blob/
-        4037a2b5b1278736e566aec12e169100275545ea/
-        src/transformers/generation/utils.py
-
-        specifically GenerationMixin._prepare_decoder_input_ids_for_generation()
-
-        Arguments:
-
-        * decoder_input_ids: input token ids to preprocess
-
-        Returns:
-
-        * Processed token list
-        """
-
-        decoder_start_token_id = self._get_decoder_start_token_id()
-        assert decoder_start_token_id is not None
-
-        if decoder_input_ids is None:
-            # no decoder prompt input ->
-            # use decoder_start_token_id as decoder_input_ids
-            decoder_input_ids = self._get_default_enc_dec_decoder_prompt()
-
-        if (len(decoder_input_ids) == 0
-                or decoder_input_ids[0] != decoder_start_token_id):
-            decoder_input_ids = [decoder_start_token_id] + decoder_input_ids
-
-        return decoder_input_ids
-
-    def _tokenize_prompt(
-        self,
-        prompt: str,
-        request_id: str,
-        lora_request: Optional[LoRARequest],
-    ) -> List[int]:
-        '''
-        Wrapper around application of the model's tokenizer.
-
-        Arguments:
-
-        * prompt
-        * request_id
-        * lora_request
-
-        Returns:
-
-        * prompt token ids
-        '''
-
-        tokenizer = self.get_tokenizer_group(
-            missing_msg="prompts must be None if skip_tokenizer_init is True")
-
-        return tokenizer.encode(request_id=request_id,
-                                prompt=prompt,
-                                lora_request=lora_request)
-
-    def _extract_prompt_components(
-        self,
-        inputs: SingletonPromptInputs,
-        request_id: str,
-        lora_request: Optional[LoRARequest] = None,
-    ) -> PromptComponents:
-        '''
-        Extract the components of any single encoder or decoder input prompt.
-
-        Arguments:
-
-        * request_id
-        * inputs: single encoder or decoder input prompt
-        * lora_request: this is only valid for decoder prompts
-
-        Returns:
-
-        * prompt
-        * prompt_token_ids
-        * multi_modal_data
-        '''
-
-        if isinstance(inputs, str):
-            prompt = inputs
-            prompt_token_ids = self._tokenize_prompt(
-                prompt,
-                request_id=request_id,
-                lora_request=lora_request,
-            )
-            multi_modal_data = None
-        elif isinstance(inputs, dict):
-            if "prompt_token_ids" in inputs:
-                prompt = None
-                prompt_token_ids = inputs["prompt_token_ids"]
-            else:
-                # NOTE: This extra assignment is required to pass mypy
-                prompt = parsed_prompt = inputs["prompt"]
-                prompt_token_ids = self._tokenize_prompt(
-                    parsed_prompt,
-                    request_id=request_id,
-                    lora_request=lora_request,
-                )
-
-            multi_modal_data = inputs.get("multi_modal_data")
-        else:
-            assert_never(inputs)
-
-        return prompt, prompt_token_ids, multi_modal_data
-
-    def _apply_prompt_adapter(
-        self,
-        prompt_token_ids: List[int],
-        prompt_adapter_request: Optional[PromptAdapterRequest],
-    ) -> List[int]:
-        if prompt_adapter_request:
-            prompt_token_ids = (
-                [0] * prompt_adapter_request.prompt_adapter_num_virtual_tokens
-                + prompt_token_ids)
-
-        return prompt_token_ids
-
-    def _get_default_enc_dec_decoder_prompt(self) -> List[int]:
-        '''
-        Specifically for encoder/decoder models:
-        generate a default decoder prompt for when
-        the user specifies only the encoder prompt.
-
-        Encoder/decoder models utilize the decoder
-        prompt in different ways; as new models are
-        added, it is intended that this function
-        will be extended to produce differing
-        default decoder prompts, depending on the
-        model variety.
-
-        Absent a special case, the default behavior
-        of this method is to mirror the behavior of
-        the HuggingFace (HF) GenerationMixin for a None
-        decoder prompt, which is to employ a logit processor
-        setting to force the first decoded token to be <BOS>.
-        Here, this behavior is approximated by having the
-        "default" decoder prompt be <BOS>.
-
-        However, it is possible that in the future
-        other models may have different or more 
-        complex logic for the default decoder prompt.
-        This motivates having a special helper method
-        for default decoder prompts.
-
-        Returns:
-
-        * prompt_token_ids
-        '''
-
-        bos_token_id = self._get_bos_token_id()
-        assert bos_token_id is not None
-        return [bos_token_id]
-
-    def _build_enc_dec_llm_inputs(
-        self,
-        encoder_comps: PromptComponents,
-        decoder_comps: DecoderPromptComponents,
-    ) -> EncoderDecoderLLMInputs:
-        encoder_prompt, encoder_prompt_ids, encoder_mm_data = encoder_comps
-        decoder_prompt, decoder_prompt_ids, decoder_mm_data = decoder_comps
-
-        if encoder_mm_data is not None or decoder_mm_data is not None:
-            raise ValueError("Multi-modal encoder-decoder models are "
-                             "not supported yet")
-
-        decoder_prompt_ids = (
-            self._prepare_decoder_input_ids_for_generation(decoder_prompt_ids))
-
-        return EncoderDecoderLLMInputs(
-            prompt_token_ids=decoder_prompt_ids,
-            prompt=decoder_prompt,
-            encoder_prompt_token_ids=encoder_prompt_ids,
-            encoder_prompt=encoder_prompt,
-        )
-
-    def _process_encoder_decoder_prompt(
-        self,
-        inputs: PromptInputs,
-        request_id: str,
-    ) -> EncoderDecoderLLMInputs:
-        '''
-        For encoder/decoder models only:
-        Process an input prompt into an
-        :class:`EncoderDecoderLLMInputs` instance.
-
-        There are two types of input prompts:
-        singleton prompts which carry only the
-        encoder prompt, and explicit encoder/decoder
-        prompts which carry both the encoder and the
-        decoder prompts as member variables.
-
-        This function handles the following scenarios:
-        * Singleton encoder prompt: extract encoder prompt
-          token ids & infer default decoder prompt token ids
-        * Explicit encoder/decoder prompt: extract encoder
-          and decoder prompt token ids
-
-        Note that for Explicit encoder/decoder prompts,
-        each sub-prompt (encoder or decoder prompt) can
-        have any possible singleton type; thus this
-        method relies on helper functions to obtain
-        token ids for the sub-prompts.
-        
-        Arguments:
-
-        * inputs: an input prompt
-        * request_id
-
-        Returns:
-
-        * :class:`EncoderDecoderLLMInputs` instance
-        '''
-
-        encoder_comps: PromptComponents
-        decoder_comps: DecoderPromptComponents
-
-        if is_explicit_encoder_decoder_prompt(inputs):
-            encoder_comps = self._extract_prompt_components(
-                inputs["encoder_prompt"],
-                request_id=request_id,
-            )
-
-            if (decoder_input := inputs["decoder_prompt"]) is None:
-                decoder_comps = None, None, None
-            else:
-                decoder_comps = self._extract_prompt_components(
-                    decoder_input,
-                    request_id=request_id,
-                )
-        else:
-            encoder_comps = self._extract_prompt_components(
-                inputs,
-                request_id=request_id,
-            )
-
-            decoder_comps = None, None, None
-
-        return self._build_enc_dec_llm_inputs(encoder_comps, decoder_comps)
-
-    def _build_decoder_only_llm_inputs(
-        self,
-        prompt_comps: PromptComponents,
-        prompt_adapter_request: Optional[PromptAdapterRequest],
-    ) -> LLMInputs:
-        prompt, prompt_token_ids, multi_modal_data = prompt_comps
-
-        prompt_token_ids = self._apply_prompt_adapter(
-            prompt_token_ids, prompt_adapter_request=prompt_adapter_request)
-
-        return LLMInputs(prompt_token_ids=prompt_token_ids,
-                         prompt=prompt,
-                         multi_modal_data=multi_modal_data)
-
-    def _process_decoder_only_prompt(
-        self,
-        inputs: SingletonPromptInputs,
-        request_id: str,
-        lora_request: Optional[LoRARequest] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-    ) -> LLMInputs:
-        '''
-        For decoder-only models:
-        Process an input prompt into an :class:`LLMInputs` instance.
-
-        Arguments:
-
-        * inputs: input prompt
-        * request_id
-        * lora_request
-        * prompt_adapter_request
-
-        Returns:
-
-        * :class:`LLMInputs` instance
-        '''
-
-        prompt_comps = self._extract_prompt_components(
-            inputs,
-            request_id=request_id,
-            lora_request=lora_request,
-        )
-
-        return self._build_decoder_only_llm_inputs(
-            prompt_comps,
-            prompt_adapter_request=prompt_adapter_request,
-        )
-
-    def process_model_inputs(
-        self,
-        inputs: PromptInputs,
-        request_id: str,
-        lora_request: Optional[LoRARequest] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-    ) -> Union[LLMInputs, EncoderDecoderLLMInputs]:
-
-        if self.is_encoder_decoder_model():
-            # Encoder-decoder model requires special mapping of
-            # input prompts to encoder & decoder
-            model_inputs = self._process_encoder_decoder_prompt(
-                inputs,
-                request_id=request_id,
-            )
-        else:
-            if is_explicit_encoder_decoder_prompt(inputs):
-                raise ValueError("Cannot pass encoder-decoder prompt "
-                                 "to decoder-only models")
-
-            # Decoder-only operation
-            model_inputs = self._process_decoder_only_prompt(
-                inputs,
-                request_id=request_id,
-                lora_request=lora_request,
-                prompt_adapter_request=prompt_adapter_request,
-            )
-
-        return self.input_processor(model_inputs)
-
     def add_request(
         self,
         request_id: str,
@@ -1115,12 +730,13 @@ def add_request(
         if arrival_time is None:
             arrival_time = time.time()
 
-        processed_inputs = self.process_model_inputs(
+        preprocessed_inputs = self.input_preprocessor.preprocess(
             inputs,
             request_id=request_id,
             lora_request=lora_request,
             prompt_adapter_request=prompt_adapter_request,
         )
+        processed_inputs = self.input_processor(preprocessed_inputs)
 
         self._add_processed_request(
             request_id=request_id,
@@ -1273,7 +889,7 @@ def _process_model_outputs(self,
 
         ctx: The virtual engine context to work on
         request_id: If provided, then only this request is going to be processed
-        
+
         """
         now = time.time()
 
@@ -1378,7 +994,8 @@ def _process_model_outputs(self,
             seq_group = scheduled_seq_group.seq_group
             seq_group.maybe_set_first_token_time(now)
             request_output = RequestOutputFactory.create(seq_group)
-            ctx.request_outputs.append(request_output)
+            if request_output:
+                ctx.request_outputs.append(request_output)
 
         # When we process a single request, we skip it for the next time,
         # and invoke the request output callback (if there was final output)
@@ -1415,14 +1032,19 @@ def _process_model_outputs(self,
 
             seq_group = scheduled_seq_group.seq_group
             seq_group.maybe_set_first_token_time(now)
-            if (seq_group.is_finished()
-                    if self.step_return_finished_only else True):
-                request_output = RequestOutputFactory.create(seq_group)
+            request_output = RequestOutputFactory.create(seq_group)
+            if request_output:
                 ctx.request_outputs.append(request_output)
 
         for seq_group in scheduler_outputs.ignored_seq_groups:
+            params = seq_group.sampling_params
+            if params is not None and params.output_kind == (
+                    RequestOutputKind.DELTA) and not seq_group.is_finished():
+                continue
+
             request_output = RequestOutputFactory.create(seq_group)
-            ctx.request_outputs.append(request_output)
+            if request_output:
+                ctx.request_outputs.append(request_output)
 
         # Immediately process request outputs here (if callback is given)
         if (ctx.request_outputs
@@ -1435,7 +1057,8 @@ def _process_model_outputs(self,
         # LLMEngine/AsyncLLMEngine directly
         if is_async:
             # Log stats.
-            self.do_log_stats(scheduler_outputs, outputs, finished_before)
+            self.do_log_stats(scheduler_outputs, outputs, finished_before,
+                              skip)
 
             # Tracing
             self.do_tracing(scheduler_outputs)
@@ -1742,18 +1365,20 @@ def remove_logger(self, logger_name: str) -> None:
     def do_log_stats(self,
                      scheduler_outputs: Optional[SchedulerOutputs] = None,
                      model_output: Optional[List[SamplerOutput]] = None,
-                     finished_before: Optional[List[int]] = None) -> None:
+                     finished_before: Optional[List[int]] = None,
+                     skip: Optional[List[int]] = None) -> None:
         """Forced log when no requests active."""
         if self.log_stats:
             stats = self._get_stats(scheduler_outputs, model_output,
-                                    finished_before)
+                                    finished_before, skip)
             for logger in self.stat_loggers.values():
                 logger.log(stats)
 
     def _get_stats(self,
                    scheduler_outputs: Optional[SchedulerOutputs],
                    model_output: Optional[List[SamplerOutput]] = None,
-                   finished_before: Optional[List[int]] = None) -> Stats:
+                   finished_before: Optional[List[int]] = None,
+                   skip: Optional[List[int]] = None) -> Stats:
         """Get Stats to be Logged to Prometheus.
 
         Args:
@@ -1761,6 +1386,10 @@ def _get_stats(self,
                 the scheduled batch,
             model_output: Optional, used to emit speculative decoding metrics
                 which are created by the workers.
+            finished_before: Optional, indices of sequences that were finished
+                before. These sequences will be ignored.
+            skip: Optional, indices of sequences that were preempted. These
+                sequences will be ignored.
         """
         now = time.time()
 
@@ -1835,6 +1464,11 @@ def _get_stats(self,
                     actual_num_batched_tokens -= 1
                     continue
 
+                # Currently, skip == preempted sequences, so we need to skip
+                # their log stats
+                if skip and idx in skip:
+                    continue
+
                 group_was_prefill = idx < scheduler_outputs.num_prefill_groups
                 seq_group = scheduled_seq_group.seq_group
 
@@ -1964,10 +1598,20 @@ def check_health(self) -> None:
         self.model_executor.check_health()
 
     def start_profile(self) -> None:
-        self.model_executor.start_profile()
+        # using type instead of isinstance to check to avoid capturing
+        # inherited classes (MultiprocessingGPUExecutor)
+        if type(self.model_executor) == GPUExecutor:
+            self.model_executor.start_profile()
+        else:
+            self.model_executor._run_workers("start_profile")
 
     def stop_profile(self) -> None:
-        self.model_executor.stop_profile()
+        # using type instead of isinstance to check to avoid capturing
+        # inherited classes (MultiprocessingGPUExecutor)
+        if type(self.model_executor) == GPUExecutor:
+            self.model_executor.stop_profile()
+        else:
+            self.model_executor._run_workers("stop_profile")
 
     def is_tracing_enabled(self) -> bool:
         return self.tracer is not None
@@ -2041,7 +1685,7 @@ def create_trace_span(self, seq_group: SequenceGroup) -> None:
                     metrics.model_execute_time)
 
     def is_encoder_decoder_model(self):
-        return self.model_config.is_encoder_decoder_model
+        return self.input_preprocessor.is_encoder_decoder_model()
 
     def is_embedding_model(self):
         return self.model_config.is_embedding_model
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index f9f9536a7c160..f1ce2c36fcceb 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -23,6 +23,7 @@
 # yapf: enable
 # pydantic needs the TypedDict from typing_extensions
 from pydantic import ConfigDict
+from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
 from typing_extensions import Required, TypeAlias, TypedDict
 
 from vllm.config import ModelConfig
@@ -31,7 +32,7 @@
 from vllm.multimodal.utils import (async_get_and_parse_audio,
                                    async_get_and_parse_image,
                                    get_and_parse_audio, get_and_parse_image)
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
 
 logger = init_logger(__name__)
 
@@ -107,7 +108,7 @@ class ConversationMessage(TypedDict, total=False):
     """The tool calls generated by the model, such as function calls."""
 
 
-ModalityStr = Literal["image", "audio"]
+ModalityStr = Literal["image", "audio", "video"]
 _T = TypeVar("_T")
 
 
@@ -147,7 +148,8 @@ def _placeholder_str(self, modality: ModalityStr,
                 return f"<|image_{current_count}|>"
             if model_type == "minicpmv":
                 return "(<image>./</image>)"
-            if model_type in ("blip-2", "chatglm", "fuyu", "paligemma"):
+            if model_type in ("blip-2", "chatglm", "fuyu", "paligemma",
+                              "pixtral"):
                 # These models do not use image tokens in the prompt
                 return None
             if model_type == "qwen":
@@ -157,12 +159,18 @@ def _placeholder_str(self, modality: ModalityStr,
                                               hf_config.image_token_index)
             if model_type in ("chameleon", "internvl_chat"):
                 return "<image>"
+            if model_type == "qwen2_vl":
+                return "<|vision_start|><|image_pad|><|vision_end|>"
 
             raise TypeError(f"Unknown model type: {model_type}")
         elif modality == "audio":
             if model_type == "ultravox":
                 return "<|reserved_special_token_0|>"
             raise TypeError(f"Unknown model type: {model_type}")
+        elif modality == "video":
+            if model_type == "qwen2_vl":
+                return "<|vision_start|><|video_pad|><|vision_end|>"
+            raise TypeError(f"Unknown model type: {model_type}")
         else:
             raise TypeError(f"Unknown modality: {modality}")
 
@@ -379,6 +387,9 @@ def _parse_chat_message_content_parts(
             audio_url = _AudioParser(part)["audio_url"]
 
             mm_parser.parse_audio(audio_url["url"])
+        elif part_type == "refusal":
+            text = _RefusalParser(part)["refusal"]
+            texts.append(text)
         else:
             raise NotImplementedError(f"Unknown part type: {part_type}")
 
@@ -433,6 +444,21 @@ def _parse_chat_message_content(
     return result
 
 
+def _postprocess_messages(messages: List[ConversationMessage]) -> None:
+    # per the Transformers docs & maintainers, tool call arguments in
+    # assistant-role messages with tool_calls need to be dicts not JSON str -
+    # this is how tool-use chat templates will expect them moving forwards
+    # so, for messages that have tool_calls, parse the string (which we get
+    # from openAI format) to dict
+    for message in messages:
+        if (message["role"] == "assistant" and "tool_calls" in message
+                and isinstance(message["tool_calls"], list)):
+
+            for item in message["tool_calls"]:
+                item["function"]["arguments"] = json.loads(
+                    item["function"]["arguments"])
+
+
 def parse_chat_messages(
     messages: List[ChatCompletionMessageParam],
     model_config: ModelConfig,
@@ -446,6 +472,8 @@ def parse_chat_messages(
 
         conversation.extend(sub_messages)
 
+    _postprocess_messages(conversation)
+
     return conversation, mm_tracker.all_mm_data()
 
 
@@ -462,41 +490,44 @@ def parse_chat_messages_futures(
 
         conversation.extend(sub_messages)
 
+    _postprocess_messages(conversation)
+
     return conversation, mm_tracker.all_mm_data()
 
 
-def apply_chat_template(
-    tokenizer: AnyTokenizer,
+def apply_hf_chat_template(
+    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
     conversation: List[ConversationMessage],
     chat_template: Optional[str],
     *,
     tokenize: bool = False,  # Different from HF's default
     **kwargs: Any,
-) -> Union[str, List[int]]:
+) -> str:
     if chat_template is None and tokenizer.chat_template is None:
         raise ValueError(
             "As of transformers v4.44, default chat template is no longer "
             "allowed, so you must provide a chat template if the tokenizer "
             "does not define one.")
 
-    # per the Transformers docs & maintainers, tool call arguments in
-    # assistant-role messages with tool_calls need to be dicts not JSON str -
-    # this is how tool-use chat templates will expect them moving forwards
-    # so, for messages that have tool_calls, parse the string (which we get
-    # from openAI format) to dict
-    for message in conversation:
-        if (message["role"] == "assistant" and "tool_calls" in message
-                and isinstance(message["tool_calls"], list)):
-
-            for i in range(len(message["tool_calls"])):
-                args: str = message["tool_calls"][i]["function"]["arguments"]
-                parsed_args: Dict = json.loads(args)
-                message["tool_calls"][i]["function"]["arguments"] = parsed_args
-
-    prompt = tokenizer.apply_chat_template(
-        conversation=conversation,
+    return tokenizer.apply_chat_template(
+        conversation=conversation,  # type: ignore[arg-type]
         chat_template=chat_template,
         tokenize=tokenize,
         **kwargs,
     )
-    return prompt
+
+
+def apply_mistral_chat_template(
+    tokenizer: MistralTokenizer,
+    messages: List[ChatCompletionMessageParam],
+    chat_template: Optional[str] = None,
+    **kwargs: Any,
+) -> List[int]:
+    if chat_template is not None:
+        logger.warning(
+            "'chat_template' cannot be overridden for mistral tokenizer.")
+
+    return tokenizer.apply_chat_template(
+        messages=messages,
+        **kwargs,
+    )
diff --git a/vllm/entrypoints/fast_sync_llm.py b/vllm/entrypoints/fast_sync_llm.py
index 982748a84f0a5..c948fc97feeb9 100644
--- a/vllm/entrypoints/fast_sync_llm.py
+++ b/vllm/entrypoints/fast_sync_llm.py
@@ -77,7 +77,8 @@ def run_engine(self):
             RayGPUExecutor), "Ray is not supported in sync openai mode"
 
         self.result_queue.put(("Ready", None, None))
-        request_stats = {}
+        prompt_lens = {}
+        tokens = {}  # type: ignore
         log_interval = 100
         poll_interval = envs.VLLM_SYNC_SERVER_ENGINE_STEPS_BETWEEN_POLLS
         try:
@@ -103,25 +104,22 @@ def run_engine(self):
                 for output in step_outputs:
                     assert len(output.outputs) == 1  # type: ignore
                     first_out = output.outputs[0]  # type: ignore
-                    output_len = len(first_out.text)
                     stats = None
-                    if output_len >= 0 and (output.request_id
-                                            not in request_stats):
-                        request_stats[output.request_id] = output_len
-                        result = first_out.text
-                    else:
-                        result = first_out.text[
-                            request_stats[output.request_id]:output_len]
+                    result = first_out.text
+                    tokens[output.request_id] = tokens.get(
+                        output.request_id, 0) + len(first_out.token_ids)
+                    if output.prompt_token_ids is not None:
+                        prompt_lens[output.request_id] = len(
+                            output.prompt_token_ids)
                     if output.finished:
+                        assert output.request_id in prompt_lens
                         stats = {
-                            "prompt": len(output.prompt_token_ids),
-                            "tokens": len(first_out.token_ids),
+                            "prompt": prompt_lens[output.request_id],
+                            "tokens": tokens[output.request_id],
                             "finish_reason": first_out.finish_reason,
                             "stop_reason": first_out.stop_reason,
                         }
-                        del request_stats[output.request_id]
-                    else:
-                        request_stats[output.request_id] = output_len
+                        del prompt_lens[output.request_id]
                     self.result_queue.put_nowait(
                         (output.request_id, result, stats))
         except Exception as e:
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 1e4432eaaa665..c01bffeb4289d 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -6,7 +6,8 @@
 from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.llm_engine import LLMEngine
 from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
-                                         apply_chat_template,
+                                         apply_hf_chat_template,
+                                         apply_mistral_chat_template,
                                          parse_chat_messages)
 from vllm.inputs import PromptInputs, TextPrompt, TokensPrompt
 from vllm.inputs.parse import parse_and_batch_prompt
@@ -18,8 +19,8 @@
 from vllm.outputs import EmbeddingRequestOutput, RequestOutput
 from vllm.pooling_params import PoolingParams
 from vllm.prompt_adapter.request import PromptAdapterRequest
-from vllm.sampling_params import SamplingParams
-from vllm.transformers_utils.tokenizer import (AnyTokenizer,
+from vllm.sampling_params import RequestOutputKind, SamplingParams
+from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
                                                get_cached_tokenizer)
 from vllm.transformers_utils.tokenizer_group import TokenizerGroup
 from vllm.usage.usage_lib import UsageContext
@@ -393,12 +394,21 @@ def chat(
         conversation, mm_data = parse_chat_messages(messages, model_config,
                                                     tokenizer)
 
-        prompt = apply_chat_template(
-            tokenizer,
-            conversation,
-            chat_template=chat_template,
-            add_generation_prompt=add_generation_prompt,
-        )
+        prompt: Union[str, List[int]]
+        if isinstance(tokenizer, MistralTokenizer):
+            prompt = apply_mistral_chat_template(
+                tokenizer,
+                messages=messages,
+                chat_template=chat_template,
+                add_generation_prompt=add_generation_prompt,
+            )
+        else:
+            prompt = apply_hf_chat_template(
+                tokenizer,
+                conversation=conversation,
+                chat_template=chat_template,
+                add_generation_prompt=add_generation_prompt,
+            )
 
         inputs: PromptInputs
         if is_list_of(prompt, int):
@@ -632,14 +642,12 @@ def _validate_and_add_requests(
             raise ValueError("The lengths of prompts and lora_request "
                              "must be the same.")
 
-        if isinstance(params, list):
-            params = [
-                self._add_guided_processor(param, guided_options)
-                if isinstance(param, SamplingParams) else param
-                for param in params
-            ]
-        elif isinstance(params, SamplingParams):
-            params = self._add_guided_processor(params, guided_options)
+        for sp in params if isinstance(params, list) else (params, ):
+            if isinstance(sp, SamplingParams):
+                self._add_guided_processor(sp, guided_options)
+
+                # We only care about the final output
+                sp.output_kind = RequestOutputKind.FINAL_ONLY
 
         # Add requests to the engine.
         for i, request_inputs in enumerate(inputs):
@@ -699,9 +707,6 @@ def _run_engine(
                          f"output: {0:.2f} toks/s"),
             )
 
-        # In the loop below, only finished outputs are used
-        self.llm_engine.step_return_finished_only = True
-
         # Run the engine.
         outputs: List[Union[RequestOutput, EmbeddingRequestOutput]] = []
         total_in_toks = 0
@@ -714,6 +719,7 @@ def _run_engine(
                     if use_tqdm:
                         if isinstance(output, RequestOutput):
                             # Calculate tokens only for RequestOutput
+                            assert output.prompt_token_ids is not None
                             total_in_toks += len(output.prompt_token_ids)
                             in_spd = total_in_toks / pbar.format_dict["elapsed"]
                             total_out_toks += sum(
@@ -725,9 +731,6 @@ def _run_engine(
                                 f"output: {out_spd:.2f} toks/s")
                         pbar.update(1)
 
-        # Restore original behavior
-        self.llm_engine.step_return_finished_only = False
-
         if use_tqdm:
             pbar.close()
         # Sort the outputs by request ID.
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 374196044b7e8..7e9f53b1816d1 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -12,7 +12,8 @@
 from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
 from vllm.entrypoints.openai.logits_processors import get_logits_processors
 from vllm.pooling_params import PoolingParams
-from vllm.sampling_params import LogitsProcessor, SamplingParams
+from vllm.sampling_params import (LogitsProcessor, RequestOutputKind,
+                                  SamplingParams)
 from vllm.sequence import Logprob
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import random_uuid
@@ -316,6 +317,8 @@ def to_sampling_params(
             length_penalty=self.length_penalty,
             logits_processors=logits_processors,
             truncate_prompt_tokens=self.truncate_prompt_tokens,
+            output_kind=RequestOutputKind.DELTA if self.stream \
+                else RequestOutputKind.FINAL_ONLY,
         )
 
     @model_validator(mode="before")
@@ -559,6 +562,8 @@ def to_sampling_params(
             length_penalty=self.length_penalty,
             logits_processors=logits_processors,
             truncate_prompt_tokens=self.truncate_prompt_tokens,
+            output_kind=RequestOutputKind.DELTA if self.stream \
+                else RequestOutputKind.FINAL_ONLY,
         )
 
     @model_validator(mode="before")
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index 278be8cd11a12..b745410fe6b3b 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -1,4 +1,5 @@
 import asyncio
+from http import HTTPStatus
 from io import StringIO
 from typing import Awaitable, Callable, List, Optional
 
@@ -135,6 +136,25 @@ async def write_file(path_or_url: str, data: str) -> None:
             f.write(data)
 
 
+def make_error_request_output(request: BatchRequestInput,
+                              error_msg: str) -> BatchRequestOutput:
+    batch_output = BatchRequestOutput(
+        id=f"vllm-{random_uuid()}",
+        custom_id=request.custom_id,
+        response=BatchResponseData(
+            status_code=HTTPStatus.BAD_REQUEST,
+            request_id=f"vllm-batch-{random_uuid()}",
+        ),
+        error=error_msg,
+    )
+    return batch_output
+
+
+async def make_async_error_request_output(
+        request: BatchRequestInput, error_msg: str) -> BatchRequestOutput:
+    return make_error_request_output(request, error_msg)
+
+
 async def run_request(serving_engine_func: Callable,
                       request: BatchRequestInput,
                       tracker: BatchProgressTracker) -> BatchRequestOutput:
@@ -158,7 +178,8 @@ async def run_request(serving_engine_func: Callable,
             error=response,
         )
     else:
-        raise ValueError("Request must not be sent in stream mode")
+        batch_output = make_error_request_output(
+            request, error_msg="Request must not be sent in stream mode")
 
     tracker.completed()
     return batch_output
@@ -174,7 +195,6 @@ async def main(args):
     engine = AsyncLLMEngine.from_engine_args(
         engine_args, usage_context=UsageContext.OPENAI_BATCH_RUNNER)
 
-    # When using single vLLM without engine_use_ray
     model_config = await engine.get_model_config()
 
     if args.disable_log_requests:
@@ -225,8 +245,12 @@ async def main(args):
                             tracker))
             tracker.submitted()
         else:
-            raise ValueError("Only /v1/chat/completions and /v1/embeddings are"
-                             "supported in the batch endpoint.")
+            response_futures.append(
+                make_async_error_request_output(
+                    request,
+                    error_msg="Only /v1/chat/completions and "
+                    "/v1/embeddings are supported in the batch endpoint.",
+                ))
 
     with tracker.pbar():
         responses = await asyncio.gather(*response_futures)
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 8ed81e9c88cb2..58e42fb5363fb 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -11,7 +11,8 @@
 from vllm.config import ModelConfig
 from vllm.engine.protocol import AsyncEngineClient
 from vllm.entrypoints.chat_utils import (ConversationMessage,
-                                         apply_chat_template,
+                                         apply_hf_chat_template,
+                                         apply_mistral_chat_template,
                                          load_chat_template,
                                          parse_chat_messages_futures)
 from vllm.entrypoints.logger import RequestLogger
@@ -35,7 +36,7 @@
 from vllm.sequence import Logprob
 from vllm.tracing import (contains_trace_headers, extract_trace_headers,
                           log_tracing_disabled_warning)
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
 from vllm.utils import iterate_with_cancellation, random_uuid
 
 logger = init_logger(__name__)
@@ -121,15 +122,27 @@ async def create_chat_completion(
                 tool.model_dump() for tool in request.tools
             ]
 
-            prompt = apply_chat_template(
-                tokenizer,
-                conversation=conversation,
-                chat_template=request.chat_template or self.chat_template,
-                add_generation_prompt=request.add_generation_prompt,
-                tools=tool_dicts,
-                documents=request.documents,
-                **(request.chat_template_kwargs or {}),
-            )
+            prompt: Union[str, List[int]]
+            if isinstance(tokenizer, MistralTokenizer):
+                prompt = apply_mistral_chat_template(
+                    tokenizer,
+                    messages=request.messages,
+                    chat_template=request.chat_template or self.chat_template,
+                    add_generation_prompt=request.add_generation_prompt,
+                    tools=tool_dicts,
+                    documents=request.documents,
+                    **(request.chat_template_kwargs or {}),
+                )
+            else:
+                prompt = apply_hf_chat_template(
+                    tokenizer,
+                    conversation=conversation,
+                    chat_template=request.chat_template or self.chat_template,
+                    add_generation_prompt=request.add_generation_prompt,
+                    tools=tool_dicts,
+                    documents=request.documents,
+                    **(request.chat_template_kwargs or {}),
+                )
         except Exception as e:
             logger.error("Error in applying chat template from request: %s", e)
             return self.create_error_response(str(e))
@@ -233,8 +246,7 @@ async def create_chat_completion(
     def get_chat_request_role(self, request: ChatCompletionRequest) -> str:
         if request.add_generation_prompt:
             return self.response_role
-        else:
-            return request.messages[-1]["role"]
+        return request.messages[-1]["role"]
 
     async def chat_completion_stream_generator(
         self,
@@ -251,15 +263,37 @@ async def chat_completion_stream_generator(
 
         # Send response for each token for each request.n (index)
         num_choices = 1 if request.n is None else request.n
-        previous_texts = [""] * num_choices
         previous_num_tokens = [0] * num_choices
         finish_reason_sent = [False] * num_choices
 
+        num_prompt_tokens = 0
+
         tool_parser: Optional[ToolParser] = self.tool_parser(
             tokenizer) if self.tool_parser else None
 
+        if isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam):
+            tool_choice_function_name = request.tool_choice.function.name
+        else:
+            tool_choice_function_name = None
+
+        # Determine whether tools are in use with "auto" tool choice
+        tool_choice_auto = (
+            not tool_choice_function_name
+            and self._should_stream_with_auto_tool_parsing(request))
+
+        all_previous_token_ids: Optional[List[List[int]]]
+        if tool_choice_auto:
+            # These are only required in "auto" tool choice case
+            previous_texts = [""] * num_choices
+            all_previous_token_ids = [[]] * num_choices
+        else:
+            previous_texts, all_previous_token_ids = None, None
+
         try:
             async for res in result_generator:
+                if res.prompt_token_ids is not None:
+                    num_prompt_tokens = len(res.prompt_token_ids)
+
                 # We need to do it here, because if there are exceptions in
                 # the result_generator, it needs to be sent as the FIRST
                 # response (by the try...catch).
@@ -292,10 +326,10 @@ async def chat_completion_stream_generator(
                                 and request.stream_options.include_usage):
                             # if continuous usage stats are requested, add it
                             if request.stream_options.continuous_usage_stats:
-                                prompt_tokens = len(res.prompt_token_ids)
-                                usage = UsageInfo(prompt_tokens=prompt_tokens,
-                                                  completion_tokens=0,
-                                                  total_tokens=prompt_tokens)
+                                usage = UsageInfo(
+                                    prompt_tokens=num_prompt_tokens,
+                                    completion_tokens=0,
+                                    total_tokens=num_prompt_tokens)
                                 chunk.usage = usage
                             # otherwise don't
                             else:
@@ -307,11 +341,10 @@ async def chat_completion_stream_generator(
                     # Send response to echo the input portion of the
                     # last message
                     if request.echo:
-                        last_msg_content: Optional[str] = ""
-                        if conversation and conversation[-1].get(
-                                "content") and conversation[-1].get(
-                                    "role") == role:
-                            last_msg_content = conversation[-1]["content"]
+                        last_msg_content: str = ""
+                        if conversation and "content" in conversation[
+                                -1] and conversation[-1].get("role") == role:
+                            last_msg_content = conversation[-1]["content"] or ""
 
                         if last_msg_content:
                             for i in range(num_choices):
@@ -332,12 +365,10 @@ async def chat_completion_stream_generator(
                                         request.stream_options.include_usage):
                                     if (request.stream_options.
                                             continuous_usage_stats):
-                                        prompt_tokens = len(
-                                            res.prompt_token_ids)
                                         usage = UsageInfo(
-                                            prompt_tokens=prompt_tokens,
+                                            prompt_tokens=num_prompt_tokens,
                                             completion_tokens=0,
-                                            total_tokens=prompt_tokens)
+                                            total_tokens=num_prompt_tokens)
                                         chunk.usage = usage
                                     else:
                                         chunk.usage = None
@@ -348,65 +379,66 @@ async def chat_completion_stream_generator(
                     first_iteration = False
 
                 for output in res.outputs:
-
                     i = output.index
 
                     if finish_reason_sent[i]:
                         continue
 
-                    delta_token_ids = output.token_ids[previous_num_tokens[i]:]
-                    out_logprobs = output.logprobs[
-                        previous_num_tokens[i]:] if output.logprobs else None
-
                     if request.logprobs and request.top_logprobs is not None:
-                        assert out_logprobs is not None, (
+                        assert output.logprobs is not None, (
                             "Did not output logprobs")
                         logprobs = self._create_chat_logprobs(
-                            token_ids=delta_token_ids,
-                            top_logprobs=out_logprobs,
+                            token_ids=output.token_ids,
+                            top_logprobs=output.logprobs,
                             tokenizer=tokenizer,
                             num_output_top_logprobs=request.top_logprobs,
                         )
                     else:
                         logprobs = None
 
-                    delta_text = output.text[len(previous_texts[i]):]
-                    delta_message: Optional[DeltaMessage] = None
+                    delta_text = output.text
+                    delta_message: Optional[DeltaMessage]
 
                     # handle streaming deltas for tools with named tool_choice
-                    if (request.tool_choice and type(request.tool_choice) is
-                            ChatCompletionNamedToolChoiceParam):
+                    if tool_choice_function_name:
                         delta_message = DeltaMessage(tool_calls=[
                             DeltaToolCall(function=DeltaFunctionCall(
-                                name=request.tool_choice.function.name,
+                                name=tool_choice_function_name,
                                 arguments=delta_text),
                                           index=i)
                         ])
 
                     # handle streaming deltas for tools with "auto" tool choice
-                    elif (self._should_stream_with_auto_tool_parsing(request)
-                          and tool_parser):
+                    elif tool_choice_auto:
+                        assert previous_texts is not None
+                        assert all_previous_token_ids is not None
+                        assert tool_parser is not None
+                        #TODO optimize manipulation of these lists
+                        previous_text = previous_texts[i]
+                        previous_token_ids = all_previous_token_ids[i]
+                        current_text = previous_text + delta_text
+                        current_token_ids = previous_token_ids + list(
+                            output.token_ids)
+
                         delta_message = (
                             tool_parser.extract_tool_calls_streaming(
-                                previous_text=previous_texts[i],
-                                current_text=output.text,
+                                previous_text=previous_text,
+                                current_text=current_text,
                                 delta_text=delta_text,
-                                previous_token_ids= \
-                                    output.token_ids[
-                                    :-1 * len(delta_token_ids)
-                                    ],
-                                current_token_ids=output.token_ids,
-                                delta_token_ids=delta_token_ids
-                            )
-                        )
+                                previous_token_ids=previous_token_ids,
+                                current_token_ids=current_token_ids,
+                                delta_token_ids=output.token_ids))
+
+                        # update the previous values for the next iteration
+                        previous_texts[i] = current_text
+                        all_previous_token_ids[i] = current_token_ids
 
                     # handle streaming just a content delta
                     else:
                         delta_message = DeltaMessage(content=delta_text)
 
                     # set the previous values for the next iteration
-                    previous_texts[i] = output.text
-                    previous_num_tokens[i] = len(output.token_ids)
+                    previous_num_tokens[i] += len(output.token_ids)
 
                     # if the message delta is None (e.g. because it was a
                     # "control token" for tool calls or the parser otherwise
@@ -433,13 +465,12 @@ async def chat_completion_stream_generator(
                         # handle usage stats if requested & if continuous
                         if (request.stream_options
                                 and request.stream_options.include_usage):
-                            if (request.stream_options.continuous_usage_stats):
-                                prompt_tokens = len(res.prompt_token_ids)
+                            if request.stream_options.continuous_usage_stats:
                                 completion_tokens = len(output.token_ids)
                                 usage = UsageInfo(
-                                    prompt_tokens=prompt_tokens,
+                                    prompt_tokens=num_prompt_tokens,
                                     completion_tokens=completion_tokens,
-                                    total_tokens=prompt_tokens +
+                                    total_tokens=num_prompt_tokens +
                                     completion_tokens,
                                 )
                                 chunk.usage = usage
@@ -470,7 +501,7 @@ async def chat_completion_stream_generator(
                                 tool_parser.prev_tool_call_arr[index].get(
                                     "arguments", {}))
 
-                            # get what we've streamed so for for arguments
+                            # get what we've streamed so far for arguments
                             # for the current tool
                             actual_call = tool_parser.streamed_args_for_tool[
                                 index]
@@ -488,7 +519,6 @@ async def chat_completion_stream_generator(
                             ])
 
                         # Send the finish response for each request.n only once
-                        prompt_tokens = len(res.prompt_token_ids)
                         choice_data = ChatCompletionResponseStreamChoice(
                             index=i,
                             delta=delta_message,
@@ -506,13 +536,12 @@ async def chat_completion_stream_generator(
                             model=model_name)
                         if (request.stream_options
                                 and request.stream_options.include_usage):
-                            if (request.stream_options.continuous_usage_stats):
-                                prompt_tokens = len(res.prompt_token_ids)
+                            if request.stream_options.continuous_usage_stats:
                                 completion_tokens = len(output.token_ids)
                                 usage = UsageInfo(
-                                    prompt_tokens=prompt_tokens,
+                                    prompt_tokens=num_prompt_tokens,
                                     completion_tokens=completion_tokens,
-                                    total_tokens=prompt_tokens +
+                                    total_tokens=num_prompt_tokens +
                                     completion_tokens,
                                 )
                                 chunk.usage = usage
@@ -526,10 +555,11 @@ async def chat_completion_stream_generator(
             # is sent, send the usage
             if (request.stream_options
                     and request.stream_options.include_usage):
+                completion_tokens = previous_num_tokens[i]
                 final_usage = UsageInfo(
-                    prompt_tokens=prompt_tokens,
-                    completion_tokens=previous_num_tokens[i],
-                    total_tokens=prompt_tokens + previous_num_tokens[i],
+                    prompt_tokens=num_prompt_tokens,
+                    completion_tokens=completion_tokens,
+                    total_tokens=num_prompt_tokens + completion_tokens,
                 )
 
                 final_usage_chunk = ChatCompletionStreamResponse(
@@ -595,7 +625,7 @@ async def chat_completion_full_generator(
 
             # if auto tools are not enabled, and a named tool choice using
             #   outlines is not being used
-            if not (self.enable_auto_tools
+            if (not self.enable_auto_tools
                     or not self.tool_parser) and not isinstance(
                         request.tool_choice,
                         ChatCompletionNamedToolChoiceParam):
@@ -659,8 +689,8 @@ async def chat_completion_full_generator(
 
         if request.echo:
             last_msg_content = ""
-            if conversation and conversation[-1].get(
-                    "content") and conversation[-1].get("role") == role:
+            if conversation and "content" in conversation[-1] and conversation[
+                    -1].get("role") == role:
                 last_msg_content = conversation[-1]["content"] or ""
 
             for choice in choices:
@@ -668,6 +698,7 @@ async def chat_completion_full_generator(
                                                    or "")
                 choice.message.content = full_message
 
+        assert final_res.prompt_token_ids is not None
         num_prompt_tokens = len(final_res.prompt_token_ids)
         num_generated_tokens = sum(
             len(output.token_ids) for output in final_res.outputs)
@@ -777,9 +808,9 @@ def _should_check_for_unstreamed_tool_arg_tokens(
         return bool(
             # if there is a delta message that includes tool calls which
             # include a function that has arguments
-            self.enable_auto_tools and self.tool_parser and delta_message
+            output.finish_reason is not None
+            and self.enable_auto_tools and self.tool_parser and delta_message
             and delta_message.tool_calls and delta_message.tool_calls[0]
             and delta_message.tool_calls[0].function
             and delta_message.tool_calls[0].function.arguments is not None
-            and output.finish_reason is not None
         )
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 34f1200753f8d..42142efb5f23e 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -223,9 +223,10 @@ async def completion_stream_generator(
         tokenizer: AnyTokenizer,
     ) -> AsyncGenerator[str, None]:
         num_choices = 1 if request.n is None else request.n
-        previous_texts = [""] * num_choices * num_prompts
+        previous_text_lens = [0] * num_choices * num_prompts
         previous_num_tokens = [0] * num_choices * num_prompts
         has_echoed = [False] * num_choices * num_prompts
+        num_prompt_tokens = [0] * num_prompts
 
         try:
             async for prompt_idx, res in result_generator:
@@ -233,6 +234,10 @@ async def completion_stream_generator(
                 prompt_logprobs = res.prompt_logprobs
                 prompt_text = res.prompt
 
+                # Prompt details are excluded from later streamed outputs
+                if res.prompt_token_ids is not None:
+                    num_prompt_tokens[prompt_idx] = len(res.prompt_token_ids)
+
                 delta_token_ids: GenericSequence[int]
                 out_logprobs: Optional[GenericSequence[Optional[Dict[
                     int, Logprob]]]]
@@ -244,6 +249,7 @@ async def completion_stream_generator(
 
                     assert request.max_tokens is not None
                     if request.echo and request.max_tokens == 0:
+                        assert prompt_token_ids is not None
                         assert prompt_text is not None
                         # only return the prompt
                         delta_text = prompt_text
@@ -252,6 +258,7 @@ async def completion_stream_generator(
                         has_echoed[i] = True
                     elif (request.echo and request.max_tokens > 0
                           and not has_echoed[i]):
+                        assert prompt_token_ids is not None
                         assert prompt_text is not None
                         assert prompt_logprobs is not None
                         # echo the prompt and first token
@@ -266,11 +273,9 @@ async def completion_stream_generator(
                         has_echoed[i] = True
                     else:
                         # return just the delta
-                        delta_text = output.text[len(previous_texts[i]):]
-                        delta_token_ids = output.token_ids[
-                            previous_num_tokens[i]:]
-                        out_logprobs = output.logprobs[previous_num_tokens[
-                            i]:] if output.logprobs else None
+                        delta_text = output.text
+                        delta_token_ids = output.token_ids
+                        out_logprobs = output.logprobs
 
                     if request.logprobs is not None:
                         assert out_logprobs is not None, (
@@ -280,13 +285,13 @@ async def completion_stream_generator(
                             top_logprobs=out_logprobs,
                             num_output_top_logprobs=request.logprobs,
                             tokenizer=tokenizer,
-                            initial_text_offset=len(previous_texts[i]),
+                            initial_text_offset=previous_text_lens[i],
                         )
                     else:
                         logprobs = None
 
-                    previous_texts[i] = output.text
-                    previous_num_tokens[i] = len(output.token_ids)
+                    previous_text_lens[i] += len(output.text)
+                    previous_num_tokens[i] += len(output.token_ids)
                     finish_reason = output.finish_reason
                     stop_reason = output.stop_reason
 
@@ -307,8 +312,8 @@ async def completion_stream_generator(
                             and request.stream_options.include_usage):
                         if (request.stream_options.continuous_usage_stats
                                 or output.finish_reason is not None):
-                            prompt_tokens = len(prompt_token_ids)
-                            completion_tokens = len(output.token_ids)
+                            prompt_tokens = num_prompt_tokens[prompt_idx]
+                            completion_tokens = previous_num_tokens[i]
                             usage = UsageInfo(
                                 prompt_tokens=prompt_tokens,
                                 completion_tokens=completion_tokens,
@@ -356,6 +361,7 @@ def request_output_to_completion_response(
 
         for final_res in final_res_batch:
             prompt_token_ids = final_res.prompt_token_ids
+            assert prompt_token_ids is not None
             prompt_logprobs = final_res.prompt_logprobs
             prompt_text = final_res.prompt
 
@@ -411,9 +417,9 @@ def request_output_to_completion_response(
                 )
                 choices.append(choice_data)
 
+                num_generated_tokens += len(output.token_ids)
+
             num_prompt_tokens += len(prompt_token_ids)
-            num_generated_tokens += sum(
-                len(output.token_ids) for output in final_res.outputs)
 
         usage = UsageInfo(
             prompt_tokens=num_prompt_tokens,
diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py
index 69a5ad5b62cfa..6e802b71ae2b4 100644
--- a/vllm/entrypoints/openai/serving_tokenization.py
+++ b/vllm/entrypoints/openai/serving_tokenization.py
@@ -2,7 +2,8 @@
 
 from vllm.config import ModelConfig
 from vllm.engine.protocol import AsyncEngineClient
-from vllm.entrypoints.chat_utils import (apply_chat_template,
+from vllm.entrypoints.chat_utils import (apply_hf_chat_template,
+                                         apply_mistral_chat_template,
                                          load_chat_template,
                                          parse_chat_messages_futures)
 from vllm.entrypoints.logger import RequestLogger
@@ -18,6 +19,7 @@
 from vllm.entrypoints.openai.serving_engine import (LoRAModulePath,
                                                     OpenAIServing)
 from vllm.logger import init_logger
+from vllm.transformers_utils.tokenizer import MistralTokenizer
 from vllm.utils import random_uuid
 
 logger = init_logger(__name__)
@@ -66,6 +68,7 @@ async def create_tokenize(
 
         tokenizer = await self.async_engine_client.get_tokenizer(lora_request)
 
+        prompt: Union[str, List[int]]
         if isinstance(request, TokenizeChatRequest):
             model_config = self.model_config
 
@@ -77,12 +80,20 @@ async def create_tokenize(
                 logger.warning(
                     "Multi-modal inputs are ignored during tokenization")
 
-            prompt = apply_chat_template(
-                tokenizer,
-                conversation=conversation,
-                chat_template=self.chat_template,
-                add_generation_prompt=request.add_generation_prompt,
-            )
+            if isinstance(tokenizer, MistralTokenizer):
+                prompt = apply_mistral_chat_template(
+                    tokenizer,
+                    messages=request.messages,
+                    chat_template=self.chat_template,
+                    add_generation_prompt=request.add_generation_prompt,
+                )
+            else:
+                prompt = apply_hf_chat_template(
+                    tokenizer,
+                    conversation=conversation,
+                    chat_template=self.chat_template,
+                    add_generation_prompt=request.add_generation_prompt,
+                )
         else:
             prompt = request.prompt
 
diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
index bde9b47ce60d5..ad6f536838a88 100644
--- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
@@ -33,7 +33,6 @@ def __init__(self, tokenizer: AnyTokenizer):
         self.current_tool_name_sent: bool = False
         self.prev_tool_call_arr: List[Dict] = []
         self.current_tool_id: int = -1
-        self.current_tool_name_sent = False
         self.streamed_args_for_tool: List[str] = [
         ]  # map what has been streamed for each tool so far to a list
 
diff --git a/vllm/envs.py b/vllm/envs.py
index 4eebaa6c46903..68f38df56b929 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -64,13 +64,13 @@
     VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
     VLLM_TEST_FORCE_FP8_MARLIN: bool = False
     VLLM_RPC_GET_DATA_TIMEOUT_MS: int = 5000
-    VLLM_ALLOW_ENGINE_USE_RAY: bool = False
     VLLM_PLUGINS: Optional[List[str]] = None
     VLLM_TORCH_PROFILER_DIR: Optional[str] = None
     VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
     VLLM_SYNC_SERVER_ACCUM_REQUESTS: int = 1
     VLLM_SYNC_SERVER_ENGINE_STEPS_BETWEEN_POLLS: int = 1
     VLLM_MOE_PADDING: bool = True
+    VLLM_FP8_PADDING: bool = False
 
 
 def get_default_cache_root():
@@ -227,6 +227,16 @@ def get_default_config_root():
     (os.environ.get("VLLM_DYNAMO_USE_CUSTOM_DISPATCHER", "True").lower() in
      ("true", "1")),
 
+    # Internal flag to control whether we use custom op,
+    # or use the native pytorch implementation
+    "VLLM_TEST_COMPILE_NO_CUSTOM_OPS":
+    lambda: int(os.environ.get("VLLM_TEST_COMPILE_NO_CUSTOM_OPS", "0")),
+
+    # Internal flag to enable Dynamo fullgraph capture
+    "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE":
+    lambda: bool(
+        os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"),
+
     # small gemms custom implementation for MI3* cards
     "VLLM_USE_ROCM_SKINNY_GEMM":
     lambda: (os.getenv("VLLM_USE_ROCM_SKINNY_GEMM", "True").lower() in
@@ -425,14 +435,6 @@ def get_default_config_root():
     "VLLM_RPC_GET_DATA_TIMEOUT_MS":
     lambda: int(os.getenv("VLLM_RPC_GET_DATA_TIMEOUT_MS", "5000")),
 
-    # If set, allow running the engine as a separate ray actor,
-    # which is a deprecated feature soon to be removed.
-    # See https://github.com/vllm-project/vllm/issues/7045
-    "VLLM_ALLOW_ENGINE_USE_RAY":
-    lambda:
-    (os.environ.get("VLLM_ALLOW_ENGINE_USE_RAY", "0").strip().lower() in
-     ("1", "true")),
-
     # a list of plugin names to load, separated by commas.
     # if this is not set, it means all plugins will be loaded
     # if this is set to an empty string, no plugins will be loaded
diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py
index ec9b24ce1318f..7380b73ad6548 100644
--- a/vllm/executor/cpu_executor.py
+++ b/vllm/executor/cpu_executor.py
@@ -5,7 +5,8 @@
 import torch
 
 import vllm.envs as envs
-from vllm.config import CacheConfig, ModelConfig, SchedulerConfig
+from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
+                         SchedulerConfig)
 from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
                                                   ResultHandler, WorkerMonitor)
@@ -60,6 +61,8 @@ def _init_executor(self) -> None:
         self.cache_config = _verify_and_get_cache_config(self.cache_config)
         self.scheduler_config = _verify_and_get_scheduler_config(
             self.scheduler_config)
+        self.parallel_config = _verify_and_get_parallel_config(
+            self.parallel_config)
 
         # Multiprocessing-based executor does not support multi-node setting.
         # Since it only works for single node, we can use the loopback address
@@ -359,6 +362,16 @@ def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig:
     return config
 
 
+def _verify_and_get_parallel_config(config: ParallelConfig) -> ParallelConfig:
+    if (config.distributed_executor_backend is not None
+            and config.distributed_executor_backend != "mp"):
+        logger.warning(
+            "%s is not supported on CPU, fallback to mp distributed executor "
+            "backend.", config.distributed_executor_backend)
+        config.distributed_executor_backend = "mp"
+    return config
+
+
 def _driver_method_invoker(driver, method: str, *args, **kwargs):
     return getattr(driver, method)(*args, **kwargs)
 
diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py
index 1359a0d310a70..b124fe2e08ea6 100644
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -242,6 +242,9 @@ def sort_by_driver_then_worker_ip(worker):
             VLLM_INSTANCE_ID,
             "VLLM_TRACE_FUNCTION":
             str(envs.VLLM_TRACE_FUNCTION),
+            **({
+                "VLLM_ATTENTION_BACKEND": envs.VLLM_ATTENTION_BACKEND
+            } if envs.VLLM_ATTENTION_BACKEND is not None else {})
         }, ) for (node_id, _) in worker_node_and_gpu_ids]
 
         self._env_vars_for_all_workers = (
diff --git a/vllm/executor/ray_tpu_executor.py b/vllm/executor/ray_tpu_executor.py
index 8c8b5f741488b..732b69d6e5954 100644
--- a/vllm/executor/ray_tpu_executor.py
+++ b/vllm/executor/ray_tpu_executor.py
@@ -68,8 +68,12 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
             )
 
             assert self.speculative_config is None
-            worker_module_name = "vllm.worker.tpu_worker"
-            worker_class_name = "TPUWorker"
+            if self.scheduler_config.is_multi_step:
+                worker_module_name = "vllm.worker.multi_step_tpu_worker"
+                worker_class_name = "MultiStepTPUWorker"
+            else:
+                worker_module_name = "vllm.worker.tpu_worker"
+                worker_class_name = "TPUWorker"
 
             # GKE does not fetch environment information from metadata server
             # and instead sets these from within the Ray process. Therefore we
diff --git a/vllm/executor/tpu_executor.py b/vllm/executor/tpu_executor.py
index 0af8ba41e24d5..972649dedf33e 100644
--- a/vllm/executor/tpu_executor.py
+++ b/vllm/executor/tpu_executor.py
@@ -62,11 +62,17 @@ def _create_worker(
         rank: int = 0,
         distributed_init_method: Optional[str] = None,
     ):
-        from vllm.worker.tpu_worker import TPUWorker
-
-        worker = TPUWorker(**self._get_worker_kwargs(local_rank, rank,
-                                                     distributed_init_method))
-        return worker
+        if self.scheduler_config.is_multi_step:
+            from vllm.worker.multi_step_tpu_worker import MultiStepTPUWorker
+            worker = MultiStepTPUWorker(**self._get_worker_kwargs(
+                local_rank, rank, distributed_init_method))
+            return worker
+        else:
+            from vllm.worker.tpu_worker import TPUWorker
+
+            worker = TPUWorker(**self._get_worker_kwargs(
+                local_rank, rank, distributed_init_method))
+            return worker
 
     def initialize_cache(
         self,
diff --git a/vllm/inputs/parse.py b/vllm/inputs/parse.py
index b5e8ef7860598..ac9d355c64c80 100644
--- a/vllm/inputs/parse.py
+++ b/vllm/inputs/parse.py
@@ -5,7 +5,8 @@
 from vllm.utils import is_list_of
 
 from .data import (EncoderDecoderLLMInputs, ExplicitEncoderDecoderPrompt,
-                   LLMInputs, PromptInputs)
+                   LLMInputs, PromptInputs, SingletonPromptInputs, TextPrompt,
+                   TokensPrompt)
 
 
 class ParsedText(TypedDict):
@@ -60,8 +61,38 @@ def parse_and_batch_prompt(
                     for elem in prompt
                 ]
 
-    raise ValueError("prompt must be a string, array of strings, "
-                     "array of tokens, or array of token arrays")
+    raise TypeError("prompt must be a string, array of strings, "
+                    "array of tokens, or array of token arrays")
+
+
+class ParsedStrPrompt(TypedDict):
+    type: Literal["str"]
+    content: str
+
+
+class ParsedTextPrompt(TypedDict):
+    type: Literal["text"]
+    content: TextPrompt
+
+
+class ParsedTokensPrompt(TypedDict):
+    type: Literal["tokens"]
+    content: TokensPrompt
+
+
+def parse_singleton_prompt(
+    inputs: SingletonPromptInputs,
+) -> Union[ParsedStrPrompt, ParsedTextPrompt, ParsedTokensPrompt]:
+    if isinstance(inputs, str):
+        return ParsedStrPrompt(type="str", content=inputs)
+    elif isinstance(inputs, dict):
+        if "prompt_token_ids" in inputs:
+            return ParsedTokensPrompt(type="tokens",
+                                      content=inputs)  # type: ignore
+        elif "prompt" in inputs:
+            return ParsedTextPrompt(type="text", content=inputs)
+
+    raise TypeError("inputs must be a string, TextPrompt, or TokensPrompt")
 
 
 def is_explicit_encoder_decoder_prompt(
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
new file mode 100644
index 0000000000000..be2aa5f8cb7d0
--- /dev/null
+++ b/vllm/inputs/preprocess.py
@@ -0,0 +1,536 @@
+import asyncio
+from typing import TYPE_CHECKING, List, Optional, Tuple, Union
+
+from typing_extensions import assert_never
+
+from vllm.config import ModelConfig
+from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
+from vllm.prompt_adapter.request import PromptAdapterRequest
+from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
+
+from .data import (EncoderDecoderLLMInputs, LLMInputs, PromptInputs,
+                   SingletonPromptInputs)
+from .parse import is_explicit_encoder_decoder_prompt, parse_singleton_prompt
+
+if TYPE_CHECKING:
+    from vllm.multimodal import MultiModalDataDict
+
+logger = init_logger(__name__)
+
+PromptComponents = Tuple[Optional[str], List[int],
+                         Optional["MultiModalDataDict"]]
+DecoderPromptComponents = Tuple[Optional[str], Optional[List[int]],
+                                Optional["MultiModalDataDict"]]
+
+
+class InputPreprocessor:
+
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        tokenizer: Optional[BaseTokenizerGroup],
+    ) -> None:
+        super().__init__()
+
+        self.model_config = model_config
+        self.tokenizer = tokenizer
+
+    def get_tokenizer_group(self) -> BaseTokenizerGroup:
+        if self.tokenizer is None:
+            raise ValueError("You cannot pass text prompts when "
+                             "`skip_tokenizer_init` is True")
+
+        return self.tokenizer
+
+    def get_bos_token_id(self,
+                         lora_request: Optional[LoRARequest] = None
+                         ) -> Optional[int]:
+        if self.tokenizer is None:
+            logger.warning("Using None for BOS token id because tokenizer "
+                           "is not initialized")
+            return None
+
+        return self.tokenizer.get_lora_tokenizer(lora_request).bos_token_id
+
+    def get_eos_token_id(self,
+                         lora_request: Optional[LoRARequest] = None
+                         ) -> Optional[int]:
+        if self.tokenizer is None:
+            logger.warning("Using None for EOS token id because tokenizer "
+                           "is not initialized")
+            return None
+
+        return self.tokenizer.get_lora_tokenizer(lora_request).eos_token_id
+
+    def get_decoder_start_token_id(self) -> Optional[int]:
+        '''
+        Obtain the decoder start token id employed by an encoder/decoder
+        model. Returns None for non-encoder/decoder models or if the
+        model config is unavailable.
+        '''
+
+        if not self.is_encoder_decoder_model():
+            logger.warning("Using None for decoder start token id because "
+                           "this is not an encoder/decoder model.")
+            return None
+
+        if (self.model_config is None or self.model_config.hf_config is None):
+            logger.warning("Using None for decoder start token id because "
+                           "model config is not available.")
+            return None
+
+        dec_start_token_id = getattr(self.model_config.hf_config,
+                                     'decoder_start_token_id', None)
+        if dec_start_token_id is None:
+            logger.warning("Falling back on <BOS> for decoder start token id "
+                           "because decoder start token id is not available.")
+            dec_start_token_id = self.get_bos_token_id()
+
+        return dec_start_token_id
+
+    def _get_default_enc_dec_decoder_prompt(self) -> List[int]:
+        '''
+        Specifically for encoder/decoder models:
+        generate a default decoder prompt for when
+        the user specifies only the encoder prompt.
+
+        Encoder/decoder models utilize the decoder
+        prompt in different ways; as new models are
+        added, it is intended that this function
+        will be extended to produce differing
+        default decoder prompts, depending on the
+        model variety.
+
+        Absent a special case, the default behavior
+        of this method is to mirror the behavior of
+        the HuggingFace (HF) GenerationMixin for a None
+        decoder prompt, which is to employ a logit processor
+        setting to force the first decoded token to be <BOS>.
+        Here, this behavior is approximated by having the
+        "default" decoder prompt be <BOS>.
+
+        However, it is possible that in the future
+        other models may have different or more 
+        complex logic for the default decoder prompt.
+        This motivates having a special helper method
+        for default decoder prompts.
+
+        Returns:
+
+        * prompt_token_ids
+        '''
+
+        bos_token_id = self.get_bos_token_id()
+        assert bos_token_id is not None
+        return [bos_token_id]
+
+    def _prepare_decoder_input_ids_for_generation(
+        self,
+        decoder_input_ids: Optional[List[int]],
+    ) -> List[int]:
+        """
+        Prepares `decoder_input_ids` for generation with encoder-decoder models.
+
+        Based on
+
+        https://github.com/huggingface/transformers/blob/
+        4037a2b5b1278736e566aec12e169100275545ea/
+        src/transformers/generation/utils.py
+
+        specifically GenerationMixin._prepare_decoder_input_ids_for_generation()
+
+        Arguments:
+
+        * decoder_input_ids: input token ids to preprocess
+
+        Returns:
+
+        * Processed token list
+        """
+
+        decoder_start_token_id = self.get_decoder_start_token_id()
+        assert decoder_start_token_id is not None
+
+        if decoder_input_ids is None:
+            # no decoder prompt input ->
+            # use decoder_start_token_id as decoder_input_ids
+            decoder_input_ids = self._get_default_enc_dec_decoder_prompt()
+
+        if (len(decoder_input_ids) == 0
+                or decoder_input_ids[0] != decoder_start_token_id):
+            decoder_input_ids = [decoder_start_token_id] + decoder_input_ids
+
+        return decoder_input_ids
+
+    def _apply_prompt_adapter(
+        self,
+        prompt_token_ids: List[int],
+        prompt_adapter_request: Optional[PromptAdapterRequest],
+    ) -> List[int]:
+        if prompt_adapter_request:
+            prompt_token_ids = (
+                [0] * prompt_adapter_request.prompt_adapter_num_virtual_tokens
+                + prompt_token_ids)
+
+        return prompt_token_ids
+
+    def _tokenize_prompt(
+        self,
+        prompt: str,
+        request_id: str,
+        lora_request: Optional[LoRARequest],
+    ) -> List[int]:
+        """
+        Apply the model's tokenizer to a text prompt, returning the
+        corresponding token IDs.
+        """
+        tokenizer = self.get_tokenizer_group()
+
+        return tokenizer.encode(request_id=request_id,
+                                prompt=prompt,
+                                lora_request=lora_request)
+
+    async def _tokenize_prompt_async(
+        self,
+        prompt: str,
+        request_id: str,
+        lora_request: Optional[LoRARequest],
+    ) -> List[int]:
+        """Async version of :meth:`_tokenize_prompt`."""
+        tokenizer = self.get_tokenizer_group()
+
+        return await tokenizer.encode_async(request_id=request_id,
+                                            prompt=prompt,
+                                            lora_request=lora_request)
+
+    def _extract_prompt_components(
+        self,
+        inputs: SingletonPromptInputs,
+        request_id: str,
+        lora_request: Optional[LoRARequest] = None,
+    ) -> PromptComponents:
+        '''
+        Extract the components of any single encoder or decoder input prompt.
+
+        Arguments:
+
+        * request_id
+        * inputs: single encoder or decoder input prompt
+        * lora_request: this is only valid for decoder prompts
+
+        Returns:
+
+        * prompt
+        * prompt_token_ids
+        * multi_modal_data
+        '''
+
+        parsed = parse_singleton_prompt(inputs)
+
+        if parsed["type"] == "str":
+            prompt = parsed["content"]
+            prompt_token_ids = self._tokenize_prompt(
+                prompt,
+                request_id=request_id,
+                lora_request=lora_request,
+            )
+            multi_modal_data = None
+        elif parsed["type"] == "tokens":
+            prompt = None
+            prompt_token_ids = parsed["content"]["prompt_token_ids"]
+            multi_modal_data = parsed["content"].get("multi_modal_data")
+        elif parsed["type"] == "text":
+            prompt = parsed["content"]["prompt"]
+            prompt_token_ids = self._tokenize_prompt(
+                prompt,
+                request_id=request_id,
+                lora_request=lora_request,
+            )
+            multi_modal_data = parsed["content"].get("multi_modal_data")
+        else:
+            assert_never(parsed)
+
+        return prompt, prompt_token_ids, multi_modal_data
+
+    async def _extract_prompt_components_async(
+        self,
+        inputs: SingletonPromptInputs,
+        request_id: str,
+        lora_request: Optional[LoRARequest] = None,
+    ) -> PromptComponents:
+        """Async version of :meth:`_extract_prompt_components`."""
+        parsed = parse_singleton_prompt(inputs)
+
+        if parsed["type"] == "str":
+            prompt = parsed["content"]
+            prompt_token_ids = await self._tokenize_prompt_async(
+                prompt,
+                request_id=request_id,
+                lora_request=lora_request,
+            )
+            multi_modal_data = None
+        elif parsed["type"] == "tokens":
+            prompt = None
+            prompt_token_ids = parsed["content"]["prompt_token_ids"]
+            multi_modal_data = parsed["content"].get("multi_modal_data")
+        elif parsed["type"] == "text":
+            prompt = parsed["content"]["prompt"]
+            prompt_token_ids = await self._tokenize_prompt_async(
+                prompt,
+                request_id=request_id,
+                lora_request=lora_request,
+            )
+            multi_modal_data = parsed["content"].get("multi_modal_data")
+        else:
+            assert_never(parsed)
+
+        return prompt, prompt_token_ids, multi_modal_data
+
+    def _build_enc_dec_llm_inputs(
+        self,
+        encoder_comps: PromptComponents,
+        decoder_comps: DecoderPromptComponents,
+    ) -> EncoderDecoderLLMInputs:
+        encoder_prompt, encoder_prompt_ids, encoder_mm_data = encoder_comps
+        decoder_prompt, decoder_prompt_ids, decoder_mm_data = decoder_comps
+
+        if encoder_mm_data is not None or decoder_mm_data is not None:
+            raise ValueError("Multi-modal encoder-decoder models are "
+                             "not supported yet")
+
+        decoder_prompt_ids = (
+            self._prepare_decoder_input_ids_for_generation(decoder_prompt_ids))
+
+        return EncoderDecoderLLMInputs(
+            prompt_token_ids=decoder_prompt_ids,
+            prompt=decoder_prompt,
+            encoder_prompt_token_ids=encoder_prompt_ids,
+            encoder_prompt=encoder_prompt,
+        )
+
+    def _process_encoder_decoder_prompt(
+        self,
+        inputs: PromptInputs,
+        request_id: str,
+    ) -> EncoderDecoderLLMInputs:
+        '''
+        For encoder/decoder models only:
+        Process an input prompt into an
+        :class:`EncoderDecoderLLMInputs` instance.
+
+        There are two types of input prompts:
+        singleton prompts which carry only the
+        encoder prompt, and explicit encoder/decoder
+        prompts which carry both the encoder and the
+        decoder prompts as member variables.
+
+        This function handles the following scenarios:
+        * Singleton encoder prompt: extract encoder prompt
+          token ids & infer default decoder prompt token ids
+        * Explicit encoder/decoder prompt: extract encoder
+          and decoder prompt token ids
+
+        Note that for Explicit encoder/decoder prompts,
+        each sub-prompt (encoder or decoder prompt) can
+        have any possible singleton type; thus this
+        method relies on helper functions to obtain
+        token ids for the sub-prompts.
+        
+        Arguments:
+
+        * inputs: an input prompt
+        * request_id
+
+        Returns:
+
+        * :class:`EncoderDecoderLLMInputs` instance
+        '''
+
+        encoder_comps: PromptComponents
+        decoder_comps: DecoderPromptComponents
+
+        if is_explicit_encoder_decoder_prompt(inputs):
+            encoder_comps = self._extract_prompt_components(
+                inputs["encoder_prompt"],
+                request_id=request_id,
+            )
+
+            if (decoder_input := inputs["decoder_prompt"]) is None:
+                decoder_comps = None, None, None
+            else:
+                decoder_comps = self._extract_prompt_components(
+                    decoder_input,
+                    request_id=request_id,
+                )
+        else:
+            encoder_comps = self._extract_prompt_components(
+                inputs,
+                request_id=request_id,
+            )
+
+            decoder_comps = None, None, None
+
+        return self._build_enc_dec_llm_inputs(encoder_comps, decoder_comps)
+
+    async def _process_encoder_decoder_prompt_async(
+        self,
+        inputs: PromptInputs,
+        request_id: str,
+    ) -> EncoderDecoderLLMInputs:
+        """Async version of :meth:`_process_encoder_decoder_prompt`."""
+        encoder_comps: PromptComponents
+        decoder_comps: DecoderPromptComponents
+
+        if is_explicit_encoder_decoder_prompt(inputs):
+            encoder_task = self._extract_prompt_components_async(
+                inputs["encoder_prompt"],
+                request_id=request_id,
+            )
+
+            if (decoder_input := inputs["decoder_prompt"]) is None:
+                encoder_comps = await encoder_task
+                decoder_comps = None, None, None
+            else:
+                decoder_task = self._extract_prompt_components_async(
+                    decoder_input,
+                    request_id=request_id,
+                )
+
+                encoder_comps, decoder_comps = await asyncio.gather(
+                    encoder_task, decoder_task)
+        else:
+            encoder_comps = await self._extract_prompt_components_async(
+                inputs,
+                request_id=request_id,
+            )
+
+            decoder_comps = None, None, None
+
+        return self._build_enc_dec_llm_inputs(encoder_comps, decoder_comps)
+
+    def _build_decoder_only_llm_inputs(
+        self,
+        prompt_comps: PromptComponents,
+        prompt_adapter_request: Optional[PromptAdapterRequest],
+    ) -> LLMInputs:
+        prompt, prompt_token_ids, multi_modal_data = prompt_comps
+
+        prompt_token_ids = self._apply_prompt_adapter(
+            prompt_token_ids, prompt_adapter_request=prompt_adapter_request)
+
+        return LLMInputs(prompt_token_ids=prompt_token_ids,
+                         prompt=prompt,
+                         multi_modal_data=multi_modal_data)
+
+    def _process_decoder_only_prompt(
+        self,
+        inputs: SingletonPromptInputs,
+        request_id: str,
+        lora_request: Optional[LoRARequest] = None,
+        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
+    ) -> LLMInputs:
+        '''
+        For decoder-only models:
+        Process an input prompt into an :class:`LLMInputs` instance.
+
+        Arguments:
+
+        * inputs: input prompt
+        * request_id
+        * lora_request
+        * prompt_adapter_request
+
+        Returns:
+
+        * :class:`LLMInputs` instance
+        '''
+
+        prompt_comps = self._extract_prompt_components(
+            inputs,
+            request_id=request_id,
+            lora_request=lora_request,
+        )
+
+        return self._build_decoder_only_llm_inputs(
+            prompt_comps,
+            prompt_adapter_request=prompt_adapter_request,
+        )
+
+    async def _process_decoder_only_prompt_async(
+        self,
+        inputs: SingletonPromptInputs,
+        request_id: str,
+        lora_request: Optional[LoRARequest] = None,
+        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
+    ) -> LLMInputs:
+        """Async version of :meth:`_process_decoder_only_prompt`."""
+        prompt_comps = await self._extract_prompt_components_async(
+            inputs,
+            request_id=request_id,
+            lora_request=lora_request,
+        )
+
+        return self._build_decoder_only_llm_inputs(
+            prompt_comps,
+            prompt_adapter_request=prompt_adapter_request,
+        )
+
+    def preprocess(
+        self,
+        inputs: PromptInputs,
+        request_id: str,
+        lora_request: Optional[LoRARequest] = None,
+        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
+    ) -> Union[LLMInputs, EncoderDecoderLLMInputs]:
+        """Preprocess the input prompt."""
+        if self.is_encoder_decoder_model():
+            # Encoder-decoder model requires special mapping of
+            # input prompts to encoder & decoder
+            return self._process_encoder_decoder_prompt(
+                inputs,
+                request_id=request_id,
+            )
+
+        if is_explicit_encoder_decoder_prompt(inputs):
+            raise ValueError("Cannot pass encoder-decoder prompt "
+                             "to decoder-only models")
+
+        # Decoder-only operation
+        return self._process_decoder_only_prompt(
+            inputs,
+            request_id=request_id,
+            lora_request=lora_request,
+            prompt_adapter_request=prompt_adapter_request,
+        )
+
+    async def preprocess_async(
+        self,
+        inputs: PromptInputs,
+        request_id: str,
+        lora_request: Optional[LoRARequest] = None,
+        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
+    ) -> Union[LLMInputs, EncoderDecoderLLMInputs]:
+        """Async version of :meth:`preprocess`."""
+        if self.is_encoder_decoder_model():
+            # Encoder-decoder model requires special mapping of
+            # input prompts to encoder & decoder
+            return await self._process_encoder_decoder_prompt_async(
+                inputs,
+                request_id=request_id,
+            )
+
+        if is_explicit_encoder_decoder_prompt(inputs):
+            raise ValueError("Cannot pass encoder-decoder prompt "
+                             "to decoder-only models")
+
+        # Decoder-only operation
+        return await self._process_decoder_only_prompt_async(
+            inputs,
+            request_id=request_id,
+            lora_request=lora_request,
+            prompt_adapter_request=prompt_adapter_request,
+        )
+
+    def is_encoder_decoder_model(self):
+        return self.model_config.is_encoder_decoder_model
diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py
index 49247cd5de42a..9102b5e19ebec 100644
--- a/vllm/model_executor/custom_op.py
+++ b/vllm/model_executor/custom_op.py
@@ -1,5 +1,6 @@
 import torch.nn as nn
 
+import vllm.envs as envs
 from vllm.platforms import current_platform
 from vllm.utils import is_cpu, is_hip, is_xpu
 
@@ -53,6 +54,10 @@ def forward_gaudi(self, *args, **kwargs):
     def dispatch_forward(self):
         # NOTE(woosuk): Here we assume that vLLM was built for only one
         # specific backend. Currently, we do not support dynamic dispatching.
+
+        if envs.VLLM_TEST_COMPILE_NO_CUSTOM_OPS:
+            return self.forward_native
+
         if is_hip():
             return self.forward_hip
         elif is_cpu():
diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py
index 589a9b3bdc3f6..1c9a22c115826 100644
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -123,9 +123,7 @@ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
     def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
         from vllm._ipex_ops import ipex_ops as ops
 
-        out = torch.empty_like(x)
-        ops.gelu_new(out, x)
-        return out
+        return ops.gelu_new(x)
 
 
 class FastGELU(CustomOp):
@@ -145,9 +143,7 @@ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
     def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
         from vllm._ipex_ops import ipex_ops as ops
 
-        out = torch.empty_like(x)
-        ops.gelu_fast(out, x)
-        return out
+        return ops.gelu_fast(x)
 
 
 class QuickGELU(CustomOp):
@@ -164,6 +160,13 @@ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
         ops.gelu_quick(out, x)
         return out
 
+    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
+        from vllm._ipex_ops import ipex_ops as ops
+
+        out = torch.empty_like(x)
+        ops.gelu_quick(out, x)
+        return out
+
     # TODO implement forward_xpu for QuickGELU
     # def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
 
diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py
index 56f86a1bfa593..97465b66edf60 100755
--- a/vllm/model_executor/layers/fused_moe/__init__.py
+++ b/vllm/model_executor/layers/fused_moe/__init__.py
@@ -2,17 +2,22 @@
     FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
 from vllm.triton_utils import HAS_TRITON
 
-__all__ = ["FusedMoE", "FusedMoEMethodBase", "FusedMoeWeightScaleSupported"]
+__all__ = [
+    "FusedMoE",
+    "FusedMoEMethodBase",
+    "FusedMoeWeightScaleSupported",
+]
 
 if HAS_TRITON:
-
+    from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+        fused_marlin_moe, single_marlin_moe)
     from vllm.model_executor.layers.fused_moe.fused_moe import (
-        fused_experts, fused_marlin_moe, fused_moe, fused_topk,
-        get_config_file_name, grouped_topk, invoke_fused_moe_kernel,
-        moe_align_block_size)
+        fused_experts, fused_moe, fused_topk, get_config_file_name,
+        grouped_topk, invoke_fused_moe_kernel, moe_align_block_size)
 
     __all__ += [
         "fused_marlin_moe",
+        "single_marlin_moe",
         "fused_moe",
         "fused_topk",
         "fused_experts",
diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
new file mode 100644
index 0000000000000..200a6148978aa
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
@@ -0,0 +1,219 @@
+"""Fused MoE utilities for GPTQ."""
+import functools
+from typing import Any, Dict, Optional
+
+import torch
+
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.fused_moe.fused_moe import (
+    fused_topk, moe_align_block_size, try_get_optimal_moe_config)
+
+
+def single_marlin_moe(
+        hidden_states: torch.Tensor,
+        w: torch.Tensor,
+        scales: torch.Tensor,
+        gating_output: torch.Tensor,
+        g_idx: torch.Tensor,
+        perm: torch.Tensor,
+        topk: int,
+        renormalize: bool,
+        override_config: Optional[Dict[str, Any]] = None) -> torch.Tensor:
+    """
+    This function computes the multiplication of hidden_states with expert
+    weights used in Marlin MoE, using weights w and top-k gating mechanism.
+    Its purpose is testing and debugging the fused MoE kernel.
+
+    Parameters:
+    - hidden_states (torch.Tensor): The input tensor to the Marlin Mul.
+    - w (torch.Tensor): The set of expert weights.
+    - scales (torch.Tensor): The quantization scales.
+    - gating_output (torch.Tensor): The output of the gating operation
+        (before softmax).
+    - g_idx (torch.Tensor): The act_order indices.
+    - perm (torch.Tensor): The act_order input permutation.
+    - topk (int): The number of top-k experts to select.
+    - renormalize (bool): If True, renormalize the top-k weights to sum to 1.
+    - override_config (Optional[Dict[str, Any]]): Optional override
+        for the kernel configuration.
+
+    Returns:
+    - torch.Tensor: The output tensor after applying the MoE layer.
+    """
+    # Check constraints.
+    assert hidden_states.shape[0] == gating_output.shape[0], (
+        "Number of tokens mismatch")
+    assert hidden_states.shape[1] == w.shape[1] * 16, "Hidden size mismatch"
+    assert gating_output.shape[1] == w.shape[0], "Number of experts mismatch"
+    assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
+    assert w.is_contiguous(), "Expert weights must be contiguous"
+    assert hidden_states.dtype == torch.float16
+
+    M, K = hidden_states.shape
+    E = w.shape[0]
+    N = w.shape[2] // 2
+
+    topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk,
+                                        renormalize)
+
+    # This might not be an optimal config for a single MMM
+    get_config_func = functools.partial(try_get_optimal_moe_config,
+                                        w.shape,
+                                        w.shape,
+                                        topk_ids.shape[1],
+                                        None,
+                                        override_config=override_config,
+                                        is_marlin=True)
+    config = get_config_func(M)
+
+    block_size_m = config['BLOCK_SIZE_M']
+
+    sorted_token_ids, _, _ = moe_align_block_size(topk_ids, block_size_m, E)
+
+    max_workspace_size = (N // 64) * 16
+    workspace = torch.zeros(max_workspace_size,
+                            dtype=torch.int,
+                            device="cuda",
+                            requires_grad=False)
+
+    intermediate_cache = torch.ops._moe_C.marlin_gemm_moe(
+        hidden_states, w, sorted_token_ids, topk_weights, topk_ids, scales,
+        g_idx, perm, workspace, M, N, K, True, E, topk, block_size_m, True,
+        False)
+
+    return torch.sum(intermediate_cache.view(*intermediate_cache.shape), dim=1)
+
+
+def fused_marlin_moe(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    gating_output: torch.Tensor,
+    g_idx1: torch.Tensor,
+    g_idx2: torch.Tensor,
+    perm1: torch.Tensor,
+    perm2: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    override_config: Optional[Dict[str, Any]] = None,
+    w1_scale: Optional[torch.Tensor] = None,
+    w2_scale: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    """
+    This function computes a Mixture of Experts (MoE) layer using two sets of
+    weights, w1 and w2, and top-k gating mechanism.
+
+    Parameters:
+    - hidden_states (torch.Tensor): The input tensor to the MoE layer.
+    - w1 (torch.Tensor): The first set of expert weights.
+    - w2 (torch.Tensor): The second set of expert weights.
+    - gating_output (torch.Tensor): The output of the gating operation
+        (before softmax).
+    - g_idx1 (torch.Tensor): The first set of act_order indices.
+    - g_idx2 (torch.Tensor): The second set of act_order indices.
+    - perm1 (torch.Tensor): The first act_order input permutation.
+    - perm2 (torch.Tensor): The second act_order input permutation.
+    - topk_weights (torch.Tensor): Top-k weights.
+    - topk_ids (torch.Tensor): Indices of topk-k elements.
+    - renormalize (bool): If True, renormalize the top-k weights to sum to 1.
+    - override_config (Optional[Dict[str, Any]]): Optional override
+        for the kernel configuration.
+    - w1_scale (Optional[torch.Tensor]): Optional scale to be used for
+        w1.
+    - w2_scale (Optional[torch.Tensor]): Optional scale to be used for
+        w2.
+
+    Returns:
+    - torch.Tensor: The output tensor after applying the MoE layer.
+    """
+    # Check constraints.
+    assert hidden_states.shape[0] == gating_output.shape[
+        0], "Number of tokens mismatch"
+    assert hidden_states.shape[
+        1] == w1.shape[1] * 16, "Hidden size mismatch w1"
+    assert hidden_states.shape[
+        1] == w2.shape[2] // 2, "Hidden size mismatch w2"
+    assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch"
+    assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
+    assert w1.is_contiguous(), "Expert weights1 must be contiguous"
+    assert w2.is_contiguous(), "Expert weights2 must be contiguous"
+    assert hidden_states.dtype == torch.float16
+
+    M, K = hidden_states.shape
+    E = w1.shape[0]
+    N = w2.shape[1] * 16
+    topk = topk_ids.shape[1]
+
+    get_config_func = functools.partial(
+        try_get_optimal_moe_config,
+        w1.shape,
+        w2.shape,
+        topk_ids.shape[1],
+        None,
+        override_config=override_config,
+        is_marlin=True,
+    )
+    config = get_config_func(M)
+
+    block_size_m = config["BLOCK_SIZE_M"]
+
+    sorted_token_ids, _, _ = moe_align_block_size(topk_ids, block_size_m, E)
+
+    max_workspace_size = ((M + 255) // 256) * (max(2 * N, K) // 64) * 16
+    workspace = torch.zeros(max_workspace_size,
+                            dtype=torch.int,
+                            device="cuda",
+                            requires_grad=False)
+
+    intermediate_cache2 = torch.empty(
+        (M * topk_ids.shape[1], N),
+        device=hidden_states.device,
+        dtype=hidden_states.dtype,
+    )
+
+    intermediate_cache1 = torch.ops._moe_C.marlin_gemm_moe(
+        hidden_states,
+        w1,
+        sorted_token_ids,
+        topk_weights,
+        topk_ids,
+        w1_scale,
+        g_idx1,
+        perm1,
+        workspace,
+        M,
+        2 * N,
+        K,
+        True,
+        E,
+        topk,
+        block_size_m,
+        True,
+        False,
+    )
+
+    ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
+
+    intermediate_cache3 = torch.ops._moe_C.marlin_gemm_moe(
+        intermediate_cache2,
+        w2,
+        sorted_token_ids,
+        topk_weights,
+        topk_ids,
+        w2_scale,
+        g_idx2,
+        perm2,
+        workspace,
+        M,
+        K,
+        N,
+        True,
+        E,
+        topk,
+        block_size_m,
+        False,
+        True,
+    )
+
+    return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape),
+                     dim=1)
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 624e7f24739d1..14601e39fc43c 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -324,15 +324,22 @@ def get_moe_configs(E: int, N: int,
     return None
 
 
-def get_default_config(M: int, E: int, N: int, K: int, topk: int,
-                       dtype: Optional[str],
-                       is_marlin: bool) -> Dict[str, int]:
+def get_default_config(
+    M: int,
+    E: int,
+    N: int,
+    K: int,
+    topk: int,
+    dtype: Optional[str],
+    is_marlin: bool,
+) -> Dict[str, int]:
     config = {
         'BLOCK_SIZE_M': 64,
         'BLOCK_SIZE_N': 64,
         'BLOCK_SIZE_K': 32,
         'GROUP_SIZE_M': 8
     }
+    # A heuristic: fused marlin works faster with this config for small M
     if M <= E or (is_marlin and M <= 32):
         config = {
             'BLOCK_SIZE_M': 16,
@@ -343,14 +350,15 @@ def get_default_config(M: int, E: int, N: int, K: int, topk: int,
     return config
 
 
-def try_get_optimal_moe_config(w1_shape: Tuple[int, ...],
-                               w2_shape: Tuple[int, ...],
-                               top_k: int,
-                               dtype: Optional[str],
-                               M: int,
-                               override_config: Optional[Dict[str,
-                                                              Any]] = None,
-                               is_marlin: bool = False):
+def try_get_optimal_moe_config(
+    w1_shape: Tuple[int, ...],
+    w2_shape: Tuple[int, ...],
+    top_k: int,
+    dtype: Optional[str],
+    M: int,
+    override_config: Optional[Dict[str, Any]] = None,
+    is_marlin: bool = False,
+):
     if override_config:
         config = override_config
     else:
@@ -392,6 +400,7 @@ def fused_topk(
                                         topk,
                                         dtype=torch.int32,
                                         device=hidden_states.device)
+
     ops.topk_softmax(
         topk_weights,
         topk_ids,
@@ -402,6 +411,7 @@ def fused_topk(
 
     if renormalize:
         topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
+
     return topk_weights, topk_ids
 
 
@@ -435,114 +445,8 @@ def grouped_topk(hidden_states: torch.Tensor,
 
     if renormalize:
         topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
-    return topk_weights, topk_ids
-
-
-def fused_marlin_moe(hidden_states: torch.Tensor,
-                     w1: torch.Tensor,
-                     w2: torch.Tensor,
-                     gating_output: torch.Tensor,
-                     g_idx1: torch.Tensor,
-                     g_idx2: torch.Tensor,
-                     rand_perm1: torch.Tensor,
-                     rand_perm2: torch.Tensor,
-                     topk: int,
-                     custom_routing_function: Optional[Callable] = None,
-                     renormalize: bool = True,
-                     override_config: Optional[Dict[str, Any]] = None,
-                     use_fp8: bool = False,
-                     w1_scale: Optional[torch.Tensor] = None,
-                     w2_scale: Optional[torch.Tensor] = None) -> torch.Tensor:
-    """
-    This function computes a Mixture of Experts (MoE) layer using two sets of
-    weights, w1 and w2, and top-k gating mechanism.
-    Parameters:
-    - hidden_states (torch.Tensor): The input tensor to the MoE layer.
-    - w1 (torch.Tensor): The first set of expert weights.
-    - w2 (torch.Tensor): The second set of expert weights.
-    - gating_output (torch.Tensor): The output of the gating operation
-        (before softmax).
-    - topk (int): The number of top-k experts to select.
-    - renormalize (bool): If True, renormalize the top-k weights to sum to 1.
-    - inplace (bool): If True, perform the operation in-place.
-        Defaults to False.
-    - override_config (Optional[Dict[str, Any]]): Optional override
-        for the kernel configuration.
-    - use_fp8 (bool): If True, use fp8 arithmetic to compute the inner
-        products for w1 and w2. Defaults to False.
-    - w1_scale (Optional[torch.Tensor]): Optional scale to be used for
-        w1.
-    - w2_scale (Optional[torch.Tensor]): Optional scale to be used for
-        w2.
-    Returns:
-    - torch.Tensor: The output tensor after applying the MoE layer.
-    """
-    # Check constraints.
-    assert hidden_states.shape[0] == gating_output.shape[0], (
-        "Number of tokens mismatch")
-    assert hidden_states.shape[
-        1] == w1.shape[1] * 16, "Hidden size mismatch w1"
-    assert hidden_states.shape[
-        1] == w2.shape[2] // 2, "Hidden size mismatch w2"
-    assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch"
-    assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
-    assert w1.is_contiguous(), "Expert weights1 must be contiguous"
-    assert w2.is_contiguous(), "Expert weights2 must be contiguous"
-    assert hidden_states.dtype in [
-        torch.float32, torch.float16, torch.bfloat16
-    ]
-
-    #TODO fp8 is not implemented yet
-    assert not use_fp8
-
-    M, K = hidden_states.shape
-    E = w1.shape[0]
-    N = w2.shape[1] * 16
-
-    if custom_routing_function is None:
-        topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk,
-                                            renormalize)
-    else:
-        topk_weights, topk_ids = custom_routing_function(
-            hidden_states, gating_output, topk, renormalize)
-
-    get_config_func = functools.partial(try_get_optimal_moe_config,
-                                        w1.shape,
-                                        w2.shape,
-                                        topk_ids.shape[1],
-                                        "float8" if use_fp8 else None,
-                                        override_config=override_config,
-                                        is_marlin=True)
-    config = get_config_func(M)
-
-    block_size_m = config['BLOCK_SIZE_M']
-
-    sorted_token_ids, _, _ = moe_align_block_size(topk_ids, block_size_m, E)
-
-    max_workspace_size = ((M + 255) // 256) * (max(2 * N, K) // 64) * 16
-    workspace = torch.zeros(max_workspace_size,
-                            dtype=torch.int,
-                            device="cuda",
-                            requires_grad=False)
-
-    intermediate_cache2 = torch.empty((M * topk_ids.shape[1], N),
-                                      device=hidden_states.device,
-                                      dtype=hidden_states.dtype)
-
-    intermediate_cache1 = torch.ops._moe_C.marlin_gemm_moe(
-        hidden_states, w1, sorted_token_ids, topk_weights, topk_ids, w1_scale,
-        g_idx1, rand_perm1, workspace, M, 2 * N, K, True, E, topk,
-        block_size_m, True, False)
-
-    ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
-
-    intermediate_cache3 = torch.ops._moe_C.marlin_gemm_moe(
-        intermediate_cache2, w2, sorted_token_ids, topk_weights, topk_ids,
-        w2_scale, g_idx2, rand_perm2, workspace, M, K, N, True, E, topk,
-        block_size_m, False, True)
 
-    return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape),
-                     dim=1)
+    return topk_weights, topk_ids.to(torch.int32)
 
 
 def get_config_dtype_str(dtype: torch.dtype,
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index dea77e7a399ec..4d0ff111a9397 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -321,10 +321,28 @@ def _load_single_value(self, param: torch.nn.Parameter,
         # Input scales can be loaded directly and should be equal.
         param_data[expert_id] = loaded_weight
 
+    def _load_g_idx(self, shard_id: str, expert_data: torch.Tensor,
+                    shard_dim: int, loaded_weight: torch.tensor, tp_rank: int):
+
+        if shard_id == "w2":
+            self._load_w2(shard_id=shard_id,
+                          shard_dim=shard_dim,
+                          loaded_weight=loaded_weight,
+                          expert_data=expert_data,
+                          tp_rank=tp_rank)
+        else:
+            assert shard_id in ("w1", "w3")
+            expert_data.copy_(loaded_weight)
+
     def weight_loader(self, param: torch.nn.Parameter,
                       loaded_weight: torch.Tensor, weight_name: str,
                       shard_id: str, expert_id: int) -> None:
 
+        # compressed-tensors represents weights on disk which are flipped
+        loaded_weight = loaded_weight.t().contiguous() if (
+            self.quant_method.__class__.__name__
+            == "CompressedTensorsMoEMethod") else loaded_weight
+
         if shard_id not in ("w1", "w2", "w3"):
             raise ValueError(f"shard_id must be ['w1','w2','w3'] but "
                              f"got {shard_id}.")
@@ -340,19 +358,41 @@ def weight_loader(self, param: torch.nn.Parameter,
         expert_data = param.data[expert_id]
         tp_rank = get_tensor_model_parallel_rank()
 
-        # is_transposed: whether or not the parameter is transposed on disk
-        # If transposed, the loaded weight will be transposed and the dim
-        # to shard the loaded weight will be flipped.
+        # is_transposed: if the dim to shard the weight
+        # should be flipped. Required by GPTQ, compressed-tensors
+        # should be whatever dimension intermediate_size is
         is_transposed = getattr(param, "is_transposed", False)
         shard_dim = SHARD_ID_TO_SHARDED_DIM[shard_id]
         if is_transposed:
-            loaded_weight = loaded_weight.t().contiguous()
             shard_dim = ~shard_dim
 
-        # Case weight_scales
-        if "weight_scale" in weight_name:
-            # load the weight scaling based on the quantization scheme
-            # supported weight scales can be found in
+        # Case input scale: input_scale loading is only supported for fp8
+        if "input_scale" in weight_name:
+            if param.data[expert_id] != 1 and (param.data[expert_id] -
+                                               loaded_weight).abs() > 1e-5:
+                raise ValueError(
+                    "input_scales of w1 and w3 of a layer "
+                    f"must be equal. But got {param.data[expert_id]} "
+                    f"vs. {loaded_weight}")
+
+            self._load_single_value(param=param,
+                                    loaded_weight=loaded_weight,
+                                    expert_id=expert_id)
+            return
+
+        # Case g_idx
+        if "g_idx" in weight_name:
+            self._load_g_idx(shard_dim=0,
+                             shard_id=shard_id,
+                             loaded_weight=loaded_weight,
+                             expert_data=expert_data,
+                             tp_rank=tp_rank)
+            return
+
+        # Case weight scales and zero_points
+        if ("scale" in weight_name or "zero" in weight_name):
+            # load the weight scales and zp based on the quantization scheme
+            # supported weight scales/zp can be found in
             # FusedMoeWeightScaleSupported
             # TODO @dsikka: once hardened, refactor to use vLLM Parameters
             # specific to each case
@@ -381,22 +421,9 @@ def weight_loader(self, param: torch.nn.Parameter,
                     f"quant method must be one of {WEIGHT_SCALE_SUPPORTED}")
             return
 
+        # Case weight_shape
         if "weight_shape" in weight_name:
-            self._load_single_value(param=param,
-                                    loaded_weight=loaded_weight,
-                                    expert_id=expert_id)
-            return
-
-        # Case input scale
-        if "input_scale" in weight_name:
-            # Note: input_scale loading is only supported for fp8
-            if param.data[expert_id] != 1 and (param.data[expert_id] -
-                                               loaded_weight).abs() > 1e-5:
-                raise ValueError(
-                    "input_scales of w1 and w3 of a layer "
-                    f"must be equal. But got {param.data[expert_id]} "
-                    f"vs. {loaded_weight}")
-
+            # only required by compressed-tensors
             self._load_single_value(param=param,
                                     loaded_weight=loaded_weight,
                                     expert_id=expert_id)
diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py
index a2a825b27b632..0fd295a32da15 100644
--- a/vllm/model_executor/layers/layernorm.py
+++ b/vllm/model_executor/layers/layernorm.py
@@ -94,14 +94,11 @@ def forward_xpu(
                 self.variance_epsilon,
             )
             return x, residual
-        out = torch.empty_like(x)
-        ops.rms_norm(
-            out,
+        return ops.rms_norm(
             x,
             self.weight.data,
             self.variance_epsilon,
         )
-        return out
 
     def extra_repr(self) -> str:
         s = f"hidden_size={self.weight.data.size(0)}"
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 5de66bf209d20..4fe13ce0ba0b1 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -26,7 +26,8 @@
     "CompressedTensorsLinearMethod", "AWQMarlinLinearMethod",
     "AWQLinearMethod", "GPTQMarlinLinearMethod", "Fp8LinearMethod",
     "MarlinLinearMethod", "QQQLinearMethod", "GPTQMarlin24LinearMethod",
-    "TPUInt8LinearMethod", "GPTQLinearMethod", "FBGEMMFp8LinearMethod"
+    "TPUInt8LinearMethod", "GPTQLinearMethod", "FBGEMMFp8LinearMethod",
+    "ModelOptFp8LinearMethod"
 ]
 
 
diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py
index aa5c288962d91..3c38f0a006070 100644
--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -22,6 +22,7 @@
 from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
     GPTQMarlin24Config)
 from vllm.model_executor.layers.quantization.marlin import MarlinConfig
+from vllm.model_executor.layers.quantization.modelopt import ModelOptFp8Config
 from vllm.model_executor.layers.quantization.neuron_quant import (
     NeuronQuantConfig)
 from vllm.model_executor.layers.quantization.qqq import QQQConfig
@@ -34,6 +35,7 @@
     "tpu_int8": Int8TpuConfig,
     "fp8": Fp8Config,
     "fbgemm_fp8": FBGEMMFp8Config,
+    "modelopt": ModelOptFp8Config,
     # The order of gptq methods is important for config.py iteration over
     # override_quantization_method(..)
     "marlin": MarlinConfig,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 0768b37044aac..b5b2570966600 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -116,15 +116,19 @@ def get_config_filenames(cls) -> List[str]:
     def _check_scheme_supported(self,
                                 min_capability: int,
                                 error: bool = True) -> bool:
-        capability = current_platform.get_device_capability()
-        capability = capability[0] * 10 + capability[1]
-        supported = capability >= min_capability
-        if error and not supported:
-            raise RuntimeError(
-                "Quantization scheme is not supported for ",
-                f"the current GPU. Min capability: {min_capability}. ",
-                f"Current capability: {capability}.")
-        return supported
+        capability = current_platform.get_device_capability()  # type: ignore
+
+        if capability is not None:
+            capability = capability[0] * 10 + capability[1]
+            supported = capability >= min_capability
+            if error and not supported:
+                raise RuntimeError(
+                    "Quantization scheme is not supported for ",
+                    f"the current GPU. Min capability: {min_capability}. ",
+                    f"Current capability: {capability}.")
+            return supported
+        else:
+            return False
 
     def _is_static_tensor_w8a8(self, weight_quant: BaseModel,
                                input_quant: BaseModel) -> bool:
@@ -232,7 +236,8 @@ def _get_scheme_from_parts(
                 return CompressedTensorsWNA16(
                     num_bits=weight_quant.num_bits,
                     strategy=weight_quant.strategy,
-                    group_size=weight_quant.group_size)
+                    group_size=weight_quant.group_size,
+                    actorder=weight_quant.actorder)
 
         # Detect If Activation Quantization.
         # TODO @dsikka: clean-up conditions
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 36323493d601e..49c29c2775cb6 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -5,9 +5,7 @@
 import torch
 
 from vllm import _custom_ops as ops
-from vllm.model_executor.layers.fused_moe import FusedMoEMethodBase
-from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
-    WNA16_SUPPORTED_BITS)
+from vllm.model_executor.layers.fused_moe import FusedMoE, FusedMoEMethodBase
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
     CompressionFormat)
 from vllm.model_executor.utils import set_weight_attrs
@@ -40,11 +38,10 @@ def __init__(
 
         if not (self.quant_config.quant_format
                 == CompressionFormat.pack_quantized.value
-                and self.num_bits in WNA16_SUPPORTED_BITS):
+                and self.num_bits == 4):
             raise ValueError("For Fused MoE layers, only ",
                              f"{CompressionFormat.pack_quantized.value} ",
-                             "is supported for the following bits: ",
-                             f"{WNA16_SUPPORTED_BITS}")
+                             "is supported for 4 bits")
 
     def create_weights(self, layer: torch.nn.Module, num_experts: int,
                        hidden_size: int, intermediate_size: int,
@@ -269,19 +266,30 @@ def apply(
         custom_routing_function: Optional[Callable] = None,
     ) -> torch.Tensor:
 
-        from vllm.model_executor.layers.fused_moe.fused_moe import (
+        from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
             fused_marlin_moe)
 
-        return fused_marlin_moe(x,
-                                layer.w13_weight_packed,
-                                layer.w2_weight_packed,
-                                router_logits,
-                                layer.w13_g_idx,
-                                layer.w2_g_idx,
-                                layer.w13_g_idx_sort_indices,
-                                layer.w2_g_idx_sort_indices,
-                                top_k,
-                                custom_routing_function,
-                                renormalize=renormalize,
-                                w1_scale=layer.w13_weight_scale,
-                                w2_scale=layer.w2_weight_scale)
+        topk_weights, topk_ids = FusedMoE.select_experts(
+            hidden_states=x,
+            router_logits=router_logits,
+            use_grouped_topk=use_grouped_topk,
+            top_k=top_k,
+            renormalize=renormalize,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            custom_routing_function=custom_routing_function)
+
+        return fused_marlin_moe(
+            x,
+            layer.w13_weight_packed,
+            layer.w2_weight_packed,
+            router_logits,
+            layer.w13_g_idx,
+            layer.w2_g_idx,
+            layer.w13_g_idx_sort_indices,
+            layer.w2_g_idx_sort_indices,
+            topk_weights,
+            topk_ids,
+            w1_scale=layer.w13_weight_scale,
+            w2_scale=layer.w2_weight_scale,
+        )
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
index 7ca8eecb9283e..3cade3d3fbcd0 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
@@ -5,20 +5,24 @@
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
+from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
+    ActivationOrdering)
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
     apply_gptq_marlin_linear, marlin_make_empty_g_idx, marlin_make_workspace,
-    marlin_permute_scales, replace_tensor, verify_marlin_supported,
+    marlin_permute_scales, marlin_repeat_scales_on_all_ranks,
+    marlin_sort_g_idx, replace_tensor, verify_marlin_supported,
     verify_marlin_supports_shape)
 from vllm.model_executor.parameter import (BasevLLMParameter,
                                            ChannelQuantScaleParameter,
                                            GroupQuantScaleParameter,
-                                           PackedvLLMParameter)
+                                           PackedvLLMParameter,
+                                           RowvLLMParameter)
 from vllm.scalar_type import scalar_types
 
 __all__ = ["CompressedTensorsWNA16"]
 WNA16_SUPPORTED_TYPES_MAP = {
     4: scalar_types.uint4b8,
-    8: scalar_types.uint8b128,
+    8: scalar_types.uint8b128
 }
 WNA16_SUPPORTED_BITS = list(WNA16_SUPPORTED_TYPES_MAP.keys())
 
@@ -28,11 +32,13 @@ class CompressedTensorsWNA16(CompressedTensorsScheme):
     def __init__(self,
                  strategy: str,
                  num_bits: int,
-                 group_size: Optional[int] = None):
+                 group_size: Optional[int] = None,
+                 actorder: Optional[ActivationOrdering] = None):
 
         self.pack_factor = 32 // num_bits
         self.strategy = strategy
         self.group_size = -1 if group_size is None else group_size
+        self.has_g_idx = actorder == ActivationOrdering.GROUP
 
         if self.group_size == -1 and self.strategy != "channel":
             raise ValueError("Marlin kernels require group quantization or "
@@ -64,12 +70,10 @@ def create_weights(self, layer: torch.nn.Module, input_size: int,
         output_size_per_partition = sum(output_partition_sizes)
 
         # If group_size is -1, we are in channelwise case.
-        channelwise = (self.group_size == -1)
         group_size = self.group_size if self.group_size != -1 else input_size
         row_parallel = (input_size != input_size_per_partition)
-        # In the case of channelwise quantization, we need to replicate the
-        # scales across all gpus.
-        partition_scales = (row_parallel and not channelwise)
+        partition_scales = not marlin_repeat_scales_on_all_ranks(
+            self.has_g_idx, self.group_size, row_parallel)
 
         verify_marlin_supports_shape(
             output_size_per_partition=output_size_per_partition,
@@ -123,6 +127,16 @@ def create_weights(self, layer: torch.nn.Module, input_size: int,
         layer.register_parameter("weight_scale", weight_scale)
         layer.register_parameter("weight_shape", weight_shape)
 
+        # group index (for activation reordering)
+        if self.has_g_idx:
+            weight_g_idx = RowvLLMParameter(data=torch.empty(
+                input_size_per_partition,
+                dtype=torch.int32,
+            ),
+                                            input_dim=0,
+                                            weight_loader=weight_loader)
+            layer.register_parameter("weight_g_idx", weight_g_idx)
+
         layer.input_size_per_partition = input_size_per_partition
         layer.output_size_per_partition = output_size_per_partition
         layer.input_size = input_size
@@ -137,9 +151,14 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         layer.workspace = marlin_make_workspace(
             layer.output_size_per_partition, device)
 
-        # Act-order not supported in compressed-tensors yet, so set to empty.
-        layer.g_idx = marlin_make_empty_g_idx(device)
-        layer.g_idx_sort_indices = marlin_make_empty_g_idx(device)
+        # Handle sorting for activation reordering if needed.
+        if self.has_g_idx:
+            g_idx, g_idx_sort_indices = marlin_sort_g_idx(layer.weight_g_idx)
+            layer.g_idx_sort_indices = g_idx_sort_indices
+            replace_tensor(layer, "weight_g_idx", g_idx)
+        else:
+            layer.weight_g_idx = marlin_make_empty_g_idx(device)
+            layer.g_idx_sort_indices = marlin_make_empty_g_idx(device)
 
         # No zero-point
         layer.weight_zp = marlin_make_empty_g_idx(device)
@@ -159,9 +178,11 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         replace_tensor(layer, "weight_packed", marlin_qweight)
 
         # Permute scales from compressed-tensors format to marlin format.
+        # scale is required on all partitions if activation reordering
         marlin_scales = marlin_permute_scales(
             layer.weight_scale,
-            size_k=layer.input_size_per_partition,
+            size_k=(layer.input_size
+                    if self.has_g_idx else layer.input_size_per_partition),
             size_n=layer.output_size_per_partition,
             group_size=layer.group_size)
         replace_tensor(layer, "weight_scale", marlin_scales)
@@ -174,7 +195,7 @@ def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
             weight=layer.weight_packed,
             weight_scale=layer.weight_scale,
             weight_zp=layer.weight_zp,
-            g_idx=layer.g_idx,
+            g_idx=layer.weight_g_idx,
             g_idx_sort_indices=layer.g_idx_sort_indices,
             workspace=layer.workspace,
             wtype=self.quant_type,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
index 7912cbde5721f..fc531b9d666e3 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
@@ -1,8 +1,8 @@
 import re
 from enum import Enum
-from typing import Any, Dict, Iterable, Optional
+from typing import Any, Dict, Iterable, Optional, Union
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, field_validator
 from torch.nn import Module
 
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
@@ -40,6 +40,19 @@ class QuantizationStrategy(str, Enum):
     TOKEN = "token"
 
 
+class ActivationOrdering(str, Enum):
+    """
+    Enum storing strategies for activation ordering
+
+    Group: reorder groups and weight\n
+    Weight: only reorder weight, not groups. Slightly lower latency and
+    accuracy compared to group actorder\n
+    """
+
+    GROUP = "group"
+    WEIGHT = "weight"
+
+
 class QuantizationArgs(BaseModel):
     """
     User facing arguments used to define a quantization config 
@@ -58,6 +71,8 @@ class QuantizationArgs(BaseModel):
         observed with every sample. Defaults to False for static
         quantization. Note that enabling dynamic quantization 
         will change the default observer to a memoryless one
+    :param actorder: whether to apply group quantization in decreasing order of
+        activation. Defaults to None for arbitrary ordering
     """
 
     num_bits: int = 8
@@ -67,6 +82,7 @@ class QuantizationArgs(BaseModel):
     strategy: Optional[QuantizationStrategy] = None
     block_structure: Optional[str] = None
     dynamic: bool = False
+    actorder: Union[ActivationOrdering, bool, None] = None
     observer: str = Field(
         default="minmax",
         description=("The class to use to compute the quantization param - "
@@ -79,6 +95,16 @@ class QuantizationArgs(BaseModel):
          "Observers constructor excluding quantization range or symmetry"),
     )
 
+    @field_validator("actorder", mode="before")
+    def validate_actorder(cls, value) -> Optional[ActivationOrdering]:
+        if isinstance(value, bool):
+            return ActivationOrdering.GROUP if value else None
+
+        if isinstance(value, str):
+            return ActivationOrdering(value.lower())
+
+        return value
+
 
 def is_activation_quantization_format(format: str) -> bool:
     _ACTIVATION_QUANTIZATION_FORMATS = [
diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py
index a6a1ed5b0dee5..dc83017bcc7f9 100644
--- a/vllm/model_executor/layers/quantization/gguf.py
+++ b/vllm/model_executor/layers/quantization/gguf.py
@@ -55,7 +55,10 @@ def get_scaled_act_names(self) -> List[str]:
 def _fuse_mul_mat(x: torch.Tensor, qweight: torch.Tensor,
                   qweight_type: int) -> torch.Tensor:
     # use dequantize mulmat for IQmatrix, mmq for k-quants
-    if qweight_type >= 16:
+    if x.shape[0] == 1:
+        # enable mmvq in contiguous batching
+        y = ops.ggml_mul_mat_vec_a8(qweight, x, qweight_type, qweight.shape[0])
+    elif qweight_type >= 16:
         block_size, type_size = gguf.GGML_QUANT_SIZES[qweight_type]
         shape = (qweight.shape[0], qweight.shape[1] // type_size * block_size)
         weight = ops.ggml_dequantize(qweight, qweight_type, *shape)
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index b06ff7bd2bace..3617a32f80fc1 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -1,18 +1,22 @@
-from typing import Any, Dict, List, Optional
+from typing import Any, Callable, Dict, List, Optional, Union
 
 import torch
 from torch.nn import Parameter
 
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
-from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
+from vllm.model_executor.layers.fused_moe.layer import (
+    FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
+from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
+                                               set_weight_attrs)
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
     apply_gptq_marlin_linear, check_marlin_supported, marlin_is_k_full,
-    marlin_make_empty_g_idx, marlin_make_workspace, marlin_permute_scales,
-    marlin_repeat_scales_on_all_ranks, marlin_sort_g_idx, replace_tensor,
-    verify_marlin_supported, verify_marlin_supports_shape)
+    marlin_make_empty_g_idx, marlin_make_workspace, marlin_moe_permute_scales,
+    marlin_permute_scales, marlin_repeat_scales_on_all_ranks,
+    marlin_sort_g_idx, replace_tensor, verify_marlin_supported,
+    verify_marlin_supports_shape)
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
                                            GroupQuantScaleParameter,
@@ -33,8 +37,14 @@ class GPTQMarlinConfig(QuantizationConfig):
         (8, True): scalar_types.uint8b128,
     }
 
-    def __init__(self, weight_bits: int, group_size: int, desc_act: bool,
-                 is_sym: bool, lm_head_quantized: bool) -> None:
+    def __init__(
+        self,
+        weight_bits: int,
+        group_size: int,
+        desc_act: bool,
+        is_sym: bool,
+        lm_head_quantized: bool,
+    ) -> None:
         if desc_act and group_size == -1:
             # In this case, act_order == True is the same as act_order == False
             # (since we have only one group per output channel)
@@ -105,11 +115,14 @@ def override_quantization_method(cls, hf_quant_cfg,
                         " faster inference")
         return None
 
-    def get_quant_method(self, layer: torch.nn.Module,
-                         prefix: str) -> Optional["GPTQMarlinLinearMethod"]:
-        if (isinstance(layer, LinearBase) or
-            (isinstance(layer, ParallelLMHead) and self.lm_head_quantized)):
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional[Union["GPTQMarlinLinearMethod", "GPTQMarlinMoEMethod"]]:
+        if isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead)
+                                             and self.lm_head_quantized):
             return GPTQMarlinLinearMethod(self)
+        elif isinstance(layer, FusedMoE):
+            return GPTQMarlinMoEMethod(self)
         return None
 
     def get_scaled_act_names(self) -> List[str]:
@@ -179,7 +192,8 @@ def create_weights(
             output_size_per_partition=output_size_per_partition,
             input_size_per_partition=input_size_per_partition,
             input_size=input_size,
-            group_size=group_size)
+            group_size=group_size,
+        )
 
         # Determine sharding
         if marlin_repeat_scales_on_all_ranks(self.quant_config.desc_act,
@@ -299,7 +313,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
             perm=layer.g_idx_sort_indices,
             size_k=layer.input_size_per_partition,
             size_n=layer.output_size_per_partition,
-            num_bits=self.quant_config.quant_type.size_bits)
+            num_bits=self.quant_config.quant_type.size_bits,
+        )
         replace_tensor(layer, "qweight", marlin_qweight)
 
         # Permute scales from autogptq format to marlin format.
@@ -308,7 +323,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
             size_k=(layer.input_size if self.quant_config.desc_act else
                     layer.input_size_per_partition),
             size_n=layer.output_size_per_partition,
-            group_size=self.quant_config.group_size)
+            group_size=self.quant_config.group_size,
+        )
         replace_tensor(layer, "scales", marlin_scales)
 
     def apply(
@@ -329,4 +345,270 @@ def apply(
             output_size_per_partition=layer.output_size_per_partition,
             input_size_per_partition=layer.input_size_per_partition,
             is_k_full=layer.is_k_full,
-            bias=bias)
+            bias=bias,
+        )
+
+
+class GPTQMarlinMoEMethod(FusedMoEMethodBase):
+    """MoE Marlin method with quantization."""
+
+    def __init__(self, quant_config: GPTQMarlinConfig) -> None:
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        # Currently assuming is_k_full is always True
+        # (input size per partition is the same as full input size)
+        # Supports only sym for now (no zp)
+        if self.quant_config.group_size != -1:
+            scales_size13 = hidden_size // self.quant_config.group_size
+            scales_size2 = intermediate_size // self.quant_config.group_size
+            strategy = FusedMoeWeightScaleSupported.GROUP.value
+        else:
+            scales_size13 = 1
+            scales_size2 = 1
+            strategy = FusedMoeWeightScaleSupported.CHANNEL.value
+
+        extra_weight_attrs.update({
+            "quant_method": strategy,
+            "is_transposed": True
+        })
+        # Fused gate_up_proj (column parallel)
+        w13_qweight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size // self.quant_config.pack_factor,
+                2 * intermediate_size,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_qweight", w13_qweight)
+        set_weight_attrs(w13_qweight, extra_weight_attrs)
+        # down_proj (row parallel)
+        w2_qweight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                intermediate_size // self.quant_config.pack_factor,
+                hidden_size,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_qweight", w2_qweight)
+        set_weight_attrs(w2_qweight, extra_weight_attrs)
+        # up_proj scales
+        w13_scales = torch.nn.Parameter(
+            torch.empty(num_experts,
+                        scales_size13,
+                        2 * intermediate_size,
+                        dtype=torch.half),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_scales", w13_scales)
+        set_weight_attrs(w13_scales, extra_weight_attrs)
+        # down_proj scales
+        w2_scales = torch.nn.Parameter(
+            torch.empty(num_experts,
+                        scales_size2,
+                        hidden_size,
+                        dtype=torch.half),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_scales", w2_scales)
+        set_weight_attrs(w2_scales, extra_weight_attrs)
+        # up_proj scales
+        w13_qzeros = torch.nn.Parameter(
+            torch.empty(num_experts,
+                        scales_size13,
+                        2 * intermediate_size // self.quant_config.pack_factor,
+                        dtype=params_dtype),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_qzeros", w13_qzeros)
+        set_weight_attrs(w13_qzeros, extra_weight_attrs)
+        # down_proj scales
+        w2_qzeros = torch.nn.Parameter(
+            torch.empty(num_experts,
+                        scales_size2,
+                        hidden_size // self.quant_config.pack_factor,
+                        dtype=params_dtype),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_qzeros", w2_qzeros)
+        set_weight_attrs(w2_qzeros, extra_weight_attrs)
+        w13_g_idx = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_g_idx", w13_g_idx)
+        set_weight_attrs(w13_g_idx, extra_weight_attrs)
+        w2_g_idx = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                intermediate_size,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_g_idx", w2_g_idx)
+        set_weight_attrs(w2_g_idx, extra_weight_attrs)
+        w13_g_idx_sort_indices = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_g_idx_sort_indices",
+                                 w13_g_idx_sort_indices)
+        set_weight_attrs(w13_g_idx_sort_indices, extra_weight_attrs)
+        w2_g_idx_sort_indices = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                intermediate_size,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_g_idx_sort_indices",
+                                 w2_g_idx_sort_indices)
+        set_weight_attrs(w2_g_idx_sort_indices, extra_weight_attrs)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+
+        # Process act_order
+        if self.quant_config.desc_act:
+            # Get sorting based on g_idx
+            num_experts = layer.w13_g_idx.shape[0]
+            w13_g_idx_sort_indices = torch.empty_like(layer.w13_g_idx)
+            w2_g_idx_sort_indices = torch.empty_like(layer.w2_g_idx)
+            w13_sorted_g_idx = torch.empty_like(layer.w13_g_idx)
+            w2_sorted_g_idx = torch.empty_like(layer.w2_g_idx)
+            for e in range(num_experts):
+                w13_g_idx_sort_indices[e] = torch.argsort(
+                    layer.w13_g_idx[e]).to(torch.int32)
+                w2_g_idx_sort_indices[e] = torch.argsort(layer.w2_g_idx[e]).to(
+                    torch.int32)
+                w13_sorted_g_idx[e] = layer.w13_g_idx[e][
+                    w13_g_idx_sort_indices[e]]
+                w2_sorted_g_idx[e] = layer.w2_g_idx[e][
+                    w2_g_idx_sort_indices[e]]
+            replace_tensor(layer, "w13_g_idx", w13_sorted_g_idx)
+            replace_tensor(layer, "w2_g_idx", w2_sorted_g_idx)
+            replace_tensor(layer, "w13_g_idx_sort_indices",
+                           w13_g_idx_sort_indices)
+            replace_tensor(layer, "w2_g_idx_sort_indices",
+                           w2_g_idx_sort_indices)
+        else:
+            # Reset g_idx related tensors
+            num_experts = layer.w13_g_idx.shape[0]
+            device = layer.w13_g_idx.device
+            layer.w13_g_idx = torch.nn.Parameter(
+                torch.empty((num_experts, 0), dtype=torch.int32,
+                            device=device),
+                requires_grad=False,
+            )
+            layer.w2_g_idx = torch.nn.Parameter(
+                torch.empty((num_experts, 0), dtype=torch.int32,
+                            device=device),
+                requires_grad=False,
+            )
+            layer.w13_g_idx_sort_indices = torch.nn.Parameter(
+                torch.empty((num_experts, 0), dtype=torch.int32,
+                            device=device),
+                requires_grad=False,
+            )
+            layer.w2_g_idx_sort_indices = torch.nn.Parameter(
+                torch.empty((num_experts, 0), dtype=torch.int32,
+                            device=device),
+                requires_grad=False,
+            )
+        # Repack weights
+        marlin_w13_qweight = ops.gptq_marlin_moe_repack(
+            layer.w13_qweight,
+            layer.w13_g_idx_sort_indices,
+            layer.w13_qweight.shape[1] * self.quant_config.pack_factor,
+            layer.w13_qweight.shape[2],
+            self.quant_config.quant_type.size_bits,
+        )
+        replace_tensor(layer, "w13_qweight", marlin_w13_qweight)
+        marlin_w2_qweight = ops.gptq_marlin_moe_repack(
+            layer.w2_qweight,
+            layer.w2_g_idx_sort_indices,
+            layer.w2_qweight.shape[1] * self.quant_config.pack_factor,
+            layer.w2_qweight.shape[2],
+            self.quant_config.quant_type.size_bits,
+        )
+        replace_tensor(layer, "w2_qweight", marlin_w2_qweight)
+        # Repack scales
+        marlin_w13_scales = marlin_moe_permute_scales(
+            s=layer.w13_scales,
+            size_k=layer.intermediate_size_per_partition,
+            size_n=layer.w13_scales.shape[2],
+            group_size=self.quant_config.group_size,
+        )
+        replace_tensor(layer, "w13_scales", marlin_w13_scales)
+        marlin_w2_scales = marlin_moe_permute_scales(
+            s=layer.w2_scales,
+            size_k=layer.w2_scales.shape[1] * self.quant_config.pack_factor,
+            size_n=layer.w2_scales.shape[2],
+            group_size=self.quant_config.group_size,
+        )
+        replace_tensor(layer, "w2_scales", marlin_w2_scales)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool = True,
+        use_grouped_topk: bool = False,
+        num_expert_group: Optional[int] = None,
+        topk_group: Optional[int] = None,
+        custom_routing_function: Optional[Callable] = None,
+    ) -> torch.Tensor:
+        from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+            fused_marlin_moe)
+
+        # The input must currently be float16
+        orig_dtype = x.dtype
+        x = x.half()
+
+        topk_weights, topk_ids = FusedMoE.select_experts(
+            hidden_states=x,
+            router_logits=router_logits,
+            use_grouped_topk=use_grouped_topk,
+            top_k=top_k,
+            renormalize=renormalize,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            custom_routing_function=None)
+
+        return fused_marlin_moe(
+            x,
+            layer.w13_qweight,
+            layer.w2_qweight,
+            router_logits,
+            layer.w13_g_idx,
+            layer.w2_g_idx,
+            layer.w13_g_idx_sort_indices,
+            layer.w2_g_idx_sort_indices,
+            topk_weights,
+            topk_ids,
+            w1_scale=layer.w13_scales,
+            w2_scale=layer.w2_scales,
+        ).to(orig_dtype)
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
new file mode 100644
index 0000000000000..dc5f47eb9b0fb
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -0,0 +1,163 @@
+from typing import Any, Dict, List, Optional
+
+import torch
+from torch.nn import Module
+from torch.nn.parameter import Parameter
+
+from vllm.logger import init_logger
+from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig, QuantizeMethodBase)
+from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+    apply_fp8_linear, cutlass_fp8_supported, requantize_with_max_scale)
+from vllm.model_executor.parameter import (ModelWeightParameter,
+                                           PerTensorScaleParameter)
+
+logger = init_logger(__name__)
+
+ACTIVATION_SCHEMES = ["static"]
+
+
+class ModelOptFp8Config(QuantizationConfig):
+    """Config class for ModelOpt FP8."""
+
+    def __init__(
+        self,
+        is_checkpoint_fp8_serialized: bool = False,
+    ) -> None:
+        self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
+        if is_checkpoint_fp8_serialized:
+            logger.warning("Detected ModelOpt fp8 checkpoint. Please note that"
+                           " the format is experimental and could change.")
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "modelopt"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.bfloat16, torch.half]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 89
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return ["hf_quant_config.json"]
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> "ModelOptFp8Config":
+        quant_config = cls.get_from_keys(config, ["quantization"])
+        quant_method = quant_config["quant_algo"]
+        is_checkpoint_fp8_serialized = ("FP8" in quant_method)
+        if not is_checkpoint_fp8_serialized:
+            raise ValueError("ModelOpt currently only supports static FP8"
+                             "quantization in vLLM. Please check the "
+                             "`hf_quant_config.json` file for your model's "
+                             "quant configuration.")
+        return cls(is_checkpoint_fp8_serialized)
+
+    def get_quant_method(self, layer: torch.nn.Module,
+                         prefix: str) -> Optional["QuantizeMethodBase"]:
+        from vllm.attention.layer import Attention  # Avoid circular import
+        if isinstance(layer, LinearBase):
+            return ModelOptFp8LinearMethod(self)
+        elif isinstance(layer, Attention):
+            return ModelOptFp8KVCacheMethod(self)
+        return None
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+class ModelOptFp8KVCacheMethod(BaseKVCacheMethod):
+    """
+    Supports loading kv-cache scaling factors from FP8 checkpoints.
+    """
+
+    def __init__(self, quant_config: ModelOptFp8Config):
+        super().__init__(quant_config)
+
+
+class ModelOptFp8LinearMethod(LinearMethodBase):
+    """Linear method for Model Optimizer static quantization.
+    Supports loading FP8 checkpoints with static weight scale and
+    activation scale. Future support might be added for dynamic 
+    scales.
+
+    Limitations:
+    1. Only support per-tensor quantization due to torch._scaled_mm support.
+    2. Only support float8_e4m3fn datatype 
+        Args: quant_config: The ModelOpt quantization config.
+    """
+
+    def __init__(self, quant_config: ModelOptFp8Config):
+        self.quant_config = quant_config
+        self.cutlass_fp8_supported = cutlass_fp8_supported()
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        del input_size, output_size
+        output_size_per_partition = sum(output_partition_sizes)
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        layer.logical_widths = output_partition_sizes
+        layer.input_size_per_partition = input_size_per_partition
+        layer.output_size_per_partition = output_size_per_partition
+        weight_dtype = (torch.float8_e4m3fn
+                        if self.quant_config.is_checkpoint_fp8_serialized else
+                        params_dtype)
+        weight = ModelWeightParameter(data=torch.empty(
+            output_size_per_partition,
+            input_size_per_partition,
+            dtype=weight_dtype),
+                                      input_dim=1,
+                                      output_dim=0,
+                                      weight_loader=weight_loader)
+        layer.register_parameter("weight", weight)
+
+        if self.quant_config.is_checkpoint_fp8_serialized:
+            # WEIGHT SCALE
+            weight_scale = PerTensorScaleParameter(data=torch.empty(
+                len(output_partition_sizes), dtype=torch.float32),
+                                                   weight_loader=weight_loader)
+            weight_scale[:] = torch.finfo(torch.float32).min
+            layer.register_parameter("weight_scale", weight_scale)
+            # INPUT SCALE
+            scale = PerTensorScaleParameter(data=torch.empty(
+                len(output_partition_sizes), dtype=torch.float32),
+                                            weight_loader=weight_loader)
+
+            scale[:] = torch.finfo(torch.float32).min
+            layer.register_parameter("input_scale", scale)
+
+    def process_weights_after_loading(self, layer: Module) -> None:
+        max_w_scale, weight = requantize_with_max_scale(
+            layer.weight, layer.weight_scale, layer.logical_widths)
+        layer.weight = Parameter(weight.t(), requires_grad=False)
+        layer.weight_scale = Parameter(max_w_scale, requires_grad=False)
+        layer.input_scale = Parameter(layer.input_scale.max(),
+                                      requires_grad=False)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        return apply_fp8_linear(
+            input=x,
+            weight=layer.weight,
+            weight_scale=layer.weight_scale,
+            input_scale=layer.input_scale,
+            bias=bias,
+            cutlass_fp8_supported=self.cutlass_fp8_supported)
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
index 0ec68ac5b0f21..699d5f1844146 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
@@ -176,6 +176,23 @@ def marlin_permute_scales(s: torch.Tensor, size_k: int, size_n: int,
     return s
 
 
+def marlin_moe_permute_scales(
+    s: torch.Tensor,
+    size_k: int,
+    size_n: int,
+    group_size: int,
+):
+    num_experts = s.shape[0]
+    output = torch.empty(
+        (num_experts, s.shape[1], s.shape[2]),
+        device=s.device,
+        dtype=s.dtype,
+    )
+    for e in range(num_experts):
+        output[e] = marlin_permute_scales(s[e], size_k, size_n, group_size)
+    return output
+
+
 def marlin_zero_points(zp: torch.Tensor, size_k: int, size_n: int,
                        num_bits: int) -> torch.Tensor:
     # Permute zero-points in a similar way to scales, but do not use the
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py
index 7d08ac6f87469..4a06c5d63d52d 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py
@@ -1,6 +1,6 @@
 """Utility functions used for tests and benchmarks"""
 
-from typing import List
+from typing import List, Optional
 
 import numpy as np
 import torch
@@ -92,8 +92,11 @@ def get_weight_perm(num_bits: int):
     return perm
 
 
-def marlin_quantize(w: torch.Tensor, quant_type: ScalarType, group_size: int,
-                    act_order: bool):
+def marlin_quantize(w: torch.Tensor,
+                    quant_type: ScalarType,
+                    group_size: int,
+                    act_order: bool,
+                    test_perm: Optional[torch.Tensor] = None):
     size_k, size_n = w.shape
     num_bits = quant_type.size_bits
 
@@ -104,7 +107,7 @@ def marlin_quantize(w: torch.Tensor, quant_type: ScalarType, group_size: int,
 
     # Quantize (and apply act_order if provided)
     w_ref, q_w, s, g_idx, rand_perm = gptq_quantize_weights(
-        w, quant_type, group_size, act_order)
+        w, quant_type, group_size, act_order, test_perm)
 
     # For act_order, sort the "weights" and "g_idx" so that group ids are
     # increasing
diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py
index 33f24ff5d54d3..bdfda31de852b 100644
--- a/vllm/model_executor/layers/quantization/utils/quant_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py
@@ -1,5 +1,5 @@
 """This file is used for /tests and /benchmarks"""
-from typing import List
+from typing import List, Optional
 
 import numpy
 import torch
@@ -53,7 +53,10 @@ def get_pack_factor(num_bits):
     return 32 // num_bits
 
 
-def permute_rows(q_w: torch.Tensor, w_ref: torch.Tensor, group_size: int):
+def permute_rows(q_w: torch.Tensor,
+                 w_ref: torch.Tensor,
+                 group_size: int,
+                 test_perm: Optional[torch.Tensor] = None):
     assert q_w.shape == w_ref.shape
 
     orig_device = q_w.device
@@ -64,7 +67,7 @@ def permute_rows(q_w: torch.Tensor, w_ref: torch.Tensor, group_size: int):
         g_idx[i] = i // group_size
 
     # Simulate act_order by doing a random permutation on K
-    rand_perm = torch.randperm(k_size)
+    rand_perm = test_perm if test_perm is not None else torch.randperm(k_size)
 
     g_idx = g_idx[rand_perm].contiguous()
     q_w = q_w[rand_perm, :].contiguous()
@@ -164,8 +167,11 @@ def reshape_w(w):
     )
 
 
-def gptq_quantize_weights(w: torch.Tensor, quant_type: ScalarType,
-                          group_size: int, act_order: bool):
+def gptq_quantize_weights(w: torch.Tensor,
+                          quant_type: ScalarType,
+                          group_size: int,
+                          act_order: bool,
+                          test_perm: Optional[torch.Tensor] = None):
     size_k, _ = w.shape
 
     assert w.is_floating_point(), "w must be float"
@@ -186,7 +192,8 @@ def gptq_quantize_weights(w: torch.Tensor, quant_type: ScalarType,
         ), "For act_order, groupsize = {} must be less than size_k = {}".format(
             group_size, size_k)
 
-        w_ref, w_q, g_idx, rand_perm = permute_rows(w_q, w_ref, group_size)
+        w_ref, w_q, g_idx, rand_perm = permute_rows(w_q, w_ref, group_size,
+                                                    test_perm)
 
     return w_ref, w_q, w_s, g_idx, rand_perm
 
diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
index 20c96fbcaed90..e3d9f6929adfc 100644
--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -87,7 +87,7 @@ def apply_fp8_linear(
     input: torch.Tensor,
     weight: torch.Tensor,
     weight_scale: torch.Tensor,
-    out_dtype: torch.dtype,
+    out_dtype: Optional[torch.dtype] = None,
     input_scale: Optional[torch.Tensor] = None,
     input_scale_ub: Optional[torch.Tensor] = None,
     bias: Optional[torch.Tensor] = None,
@@ -98,6 +98,9 @@ def apply_fp8_linear(
     #   If dynamic, layer.input_scale is None and x_scale computed from x.
     #   If static, layer.input_scale is scalar and x_scale is input_scale.
 
+    if out_dtype is None:
+        out_dtype = input.dtype
+
     # cutlass_scaled_mm supports per tensor/channel W and per tensor/token A
     if cutlass_fp8_supported:
         qinput, x_scale = ops.scaled_fp8_quant(
@@ -168,11 +171,13 @@ def apply_fp8_linear(
             # GEMM
             # This computes C = (X * W).
             # Output in fp32 to allow subsequent ops to happen in-place
-            output, _ = torch._scaled_mm(qinput,
-                                         weight,
-                                         scale_a=TORCH_DEVICE_IDENTITY,
-                                         scale_b=TORCH_DEVICE_IDENTITY,
-                                         out_dtype=torch.float32)
+            output = torch._scaled_mm(qinput,
+                                      weight,
+                                      scale_a=TORCH_DEVICE_IDENTITY,
+                                      scale_b=TORCH_DEVICE_IDENTITY,
+                                      out_dtype=torch.float32)
+            if type(output) is tuple and len(output) == 2:
+                output = output[0]
             # Unpad (undo num_token_padding)
             output = torch.narrow(output, 0, 0, input.shape[0])
             x_scale = torch.narrow(x_scale, 0, 0, input.shape[0])
diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py
index d323f6cc432a2..d4e9ed87ed54f 100644
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -712,6 +712,179 @@ def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
         return new_freqs
 
 
+class MRotaryEmbedding(RotaryEmbedding):
+    """Rotary Embedding with Multimodal Sections."""
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        dtype: torch.dtype,
+        mrope_section: Optional[List[int]] = None,
+    ) -> None:
+        super().__init__(head_size, rotary_dim, max_position_embeddings, base,
+                         is_neox_style, dtype)
+
+        self.mrope_section = mrope_section
+        if self.mrope_section:
+            assert sum(self.mrope_section) == rotary_dim // 2
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """PyTorch-native implementation equivalent to forward().
+
+        Args:
+            positions:
+                [num_tokens,] (text only) or
+                [3, num_tokens] (T/H/W positions with multimodal inputs)
+            query: [num_tokens, num_heads * head_size]
+            key: [num_tokens, num_kv_heads * head_size]
+        """
+        assert positions.ndim == 1 or positions.ndim == 2
+
+        num_tokens = positions.shape[-1]
+        cos_sin = self.cos_sin_cache[positions]
+        cos, sin = cos_sin.chunk(2, dim=-1)
+        if positions.ndim == 2:
+            assert self.mrope_section
+
+            cos = torch.cat([
+                m[i]
+                for i, m in enumerate(cos.split(self.mrope_section, dim=-1))
+            ],
+                            dim=-1)
+            sin = torch.cat([
+                m[i]
+                for i, m in enumerate(sin.split(self.mrope_section, dim=-1))
+            ],
+                            dim=-1)
+
+        query_shape = query.shape
+        query = query.view(num_tokens, -1, self.head_size)
+        query_rot = query[..., :self.rotary_dim]
+        query_pass = query[..., self.rotary_dim:]
+        query_rot = _apply_rotary_emb(query_rot, cos, sin, self.is_neox_style)
+        query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
+
+        key_shape = key.shape
+        key = key.view(num_tokens, -1, self.head_size)
+        key_rot = key[..., :self.rotary_dim]
+        key_pass = key[..., self.rotary_dim:]
+        key_rot = _apply_rotary_emb(key_rot, cos, sin, self.is_neox_style)
+        key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
+        return query, key
+
+    @staticmethod
+    def get_input_positions(
+        input_tokens: List[int],
+        image_grid_thw: Union[List[List[int]], torch.Tensor],
+        video_grid_thw: Union[List[List[int]], torch.Tensor],
+        image_token_id: int,
+        video_token_id: int,
+        vision_start_token_id: int,
+        vision_end_token_id: int,
+        spatial_merge_size: int,
+        context_len: int = 0,
+    ) -> Tuple[List[List[int]], int]:
+        """Get mrope input positions and delta value."""
+
+        if isinstance(image_grid_thw, torch.Tensor):
+            image_grid_thw = image_grid_thw.tolist()
+        if isinstance(video_grid_thw, torch.Tensor):
+            video_grid_thw = video_grid_thw.tolist()
+
+        input_tokens_tensor = torch.tensor(input_tokens)
+        vision_start_indices = torch.argwhere(
+            input_tokens_tensor == vision_start_token_id).squeeze(1)
+        vision_tokens = input_tokens_tensor[vision_start_indices + 1]
+        image_nums = (vision_tokens == image_token_id).sum()
+        video_nums = (vision_tokens == video_token_id).sum()
+        llm_pos_ids_list: list = []
+
+        st = 0
+        remain_images, remain_videos = image_nums, video_nums
+
+        image_index, video_index = 0, 0
+        for _ in range(image_nums + video_nums):
+            if image_token_id in input_tokens and remain_images > 0:
+                ed_image = input_tokens.index(image_token_id, st)
+            else:
+                ed_image = len(input_tokens) + 1
+            if video_token_id in input_tokens and remain_videos > 0:
+                ed_video = input_tokens.index(video_token_id, st)
+            else:
+                ed_video = len(input_tokens) + 1
+            if ed_image < ed_video:
+                t, h, w = (
+                    image_grid_thw[image_index][0],
+                    image_grid_thw[image_index][1],
+                    image_grid_thw[image_index][2],
+                )
+                image_index += 1
+                remain_images -= 1
+                ed = ed_image
+            else:
+                t, h, w = (
+                    video_grid_thw[video_index][0],
+                    video_grid_thw[video_index][1],
+                    video_grid_thw[video_index][2],
+                )
+                video_index += 1
+                remain_videos -= 1
+                ed = ed_video
+            llm_grid_t, llm_grid_h, llm_grid_w = \
+                t, h // spatial_merge_size, w // spatial_merge_size
+            text_len = ed - st
+
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(
+                llm_pos_ids_list) > 0 else 0
+            llm_pos_ids_list.append(
+                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
+
+            t_index = torch.arange(llm_grid_t).view(-1, 1).expand(
+                -1, llm_grid_h * llm_grid_w).flatten()
+            h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(
+                llm_grid_t, -1, llm_grid_w).flatten()
+            w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(
+                llm_grid_t, llm_grid_h, -1).flatten()
+            llm_pos_ids_list.append(
+                torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
+            st = ed + llm_grid_t * llm_grid_h * llm_grid_w
+
+        if st < len(input_tokens):
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(
+                llm_pos_ids_list) > 0 else 0
+            text_len = len(input_tokens) - st
+            llm_pos_ids_list.append(
+                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
+
+        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+        llm_positions = llm_positions[:, context_len:]
+        mrope_position_delta = (llm_positions.max() + 1 -
+                                len(input_tokens)).item()
+
+        return llm_positions.tolist(), mrope_position_delta
+
+    @staticmethod
+    def get_next_input_positions(
+        mrope_position_delta: int,
+        context_len: int,
+        seq_len: int,
+    ) -> List[List[int]]:
+        return [
+            list(
+                range(context_len + mrope_position_delta,
+                      seq_len + mrope_position_delta)) for _ in range(3)
+        ]
+
+
 _ROPE_DICT: Dict[Tuple, RotaryEmbedding] = {}
 
 
@@ -752,7 +925,7 @@ def get_rope(
         # The correct one should be "longrope" but keep "su" here
         # for backward compatible
         if scaling_type not in {"su", "longrope"}:
-            scaling_factor = rope_scaling["factor"]
+            scaling_factor = rope_scaling.get("factor", 1.0)
         if scaling_type == "llama3":
             low_freq_factor = rope_scaling["low_freq_factor"]
             high_freq_factor = rope_scaling["high_freq_factor"]
@@ -816,6 +989,16 @@ def get_rope(
                 head_size, rotary_dim, max_position, original_max_position,
                 base, is_neox_style, dtype, short_factor, long_factor,
                 **extra_kwargs)
+        elif scaling_type == "mrope":
+            rotary_emb = MRotaryEmbedding(
+                head_size,
+                rotary_dim,
+                max_position,
+                base,
+                is_neox_style,
+                dtype,
+                mrope_section=rope_scaling["mrope_section"],
+            )
         else:
             raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
     _ROPE_DICT[key] = rotary_emb
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index f59eb805ea907..ac869e56ce198 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -95,8 +95,9 @@ def _get_quantization_config(
     """Get the quantization config."""
     if model_config.quantization is not None:
         quant_config = get_quant_config(model_config, load_config)
-        if not current_platform.is_tpu():
-            capability = current_platform.get_device_capability()
+        capability = current_platform.get_device_capability()  # type: ignore
+
+        if capability is not None:
             capability = capability[0] * 10 + capability[1]
             if capability < quant_config.get_min_capability():
                 raise ValueError(
diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py
index 4bb943ab3afe4..0052489d99dc4 100644
--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -24,10 +24,18 @@ def get_model_architecture(
     # Special handling for quantized Mixtral.
     # FIXME(woosuk): This is a temporary hack.
     mixtral_supported = ["fp8", "compressed-tensors"]
+    # for gptq_marlin, only run fused MoE for int4
+    if model_config.quantization == "gptq_marlin":
+        hf_quant_config = getattr(model_config.hf_config,
+                                  "quantization_config", None)
+        if hf_quant_config and hf_quant_config.get("bits") == 4:
+            mixtral_supported.append("gptq_marlin")
+
     if (model_config.quantization is not None
             and model_config.quantization not in mixtral_supported
             and "MixtralForCausalLM" in architectures):
         architectures = ["QuantMixtralForCausalLM"]
+
     return ModelRegistry.resolve_model_cls(architectures)
 
 
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 075451292a8e4..5051d45dd1154 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -192,6 +192,13 @@ def get_quant_config(model_config: ModelConfig,
 
         if model_config.quantization == "bitsandbytes":
             config["adapter_name_or_path"] = model_name_or_path
+        elif model_config.quantization == "modelopt":
+            if config["producer"]["name"] == "modelopt":
+                return quant_cls.from_config(config)
+            else:
+                raise ValueError(
+                    f"Unsupported quantization config"
+                    f" found for {model_config.quantization} in {f}.")
 
     return quant_cls.from_config(config)
 
diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
index 8345caebedca2..2763606c7c729 100644
--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -44,6 +44,7 @@
     "MptForCausalLM": ("mpt", "MPTForCausalLM"),
     "MPTForCausalLM": ("mpt", "MPTForCausalLM"),
     "MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"),
+    "MiniCPM3ForCausalLM": ("minicpm3", "MiniCPM3ForCausalLM"),
     "NemotronForCausalLM": ("nemotron", "NemotronForCausalLM"),
     "OlmoForCausalLM": ("olmo", "OlmoForCausalLM"),
     "OPTForCausalLM": ("opt", "OPTForCausalLM"),
@@ -54,6 +55,8 @@
     "PhiMoEForCausalLM": ("phimoe", "PhiMoEForCausalLM"),
     "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"),
     "Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"),
+    "Qwen2VLForConditionalGeneration":
+    ("qwen2_vl", "Qwen2VLForConditionalGeneration"),
     "RWForCausalLM": ("falcon", "FalconForCausalLM"),
     "StableLMEpochForCausalLM": ("stablelm", "StablelmForCausalLM"),
     "StableLmForCausalLM": ("stablelm", "StablelmForCausalLM"),
@@ -81,14 +84,20 @@
     "InternVLChatModel": ("internvl", "InternVLChatModel"),
     "LlavaForConditionalGeneration":
     ("llava", "LlavaForConditionalGeneration"),
-    "LlavaNextForConditionalGeneration":
-    ("llava_next", "LlavaNextForConditionalGeneration"),
+    "LlavaNextForConditionalGeneration": ("llava_next",
+                                          "LlavaNextForConditionalGeneration"),
+    "LlavaNextVideoForConditionalGeneration":
+    ("llava_next_video", "LlavaNextVideoForConditionalGeneration"),
     "MiniCPMV": ("minicpmv", "MiniCPMV"),
     "PaliGemmaForConditionalGeneration": ("paligemma",
                                           "PaliGemmaForConditionalGeneration"),
     "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
-    "UltravoxModel": ("ultravox", "UltravoxModel"),
+    "PixtralForConditionalGeneration": ("pixtral",
+                                        "PixtralForConditionalGeneration"),
     "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
+    "Qwen2VLForConditionalGeneration": ("qwen2_vl",
+                                        "Qwen2VLForConditionalGeneration"),
+    "UltravoxModel": ("ultravox", "UltravoxModel"),
 }
 _CONDITIONAL_GENERATION_MODELS = {
     "BartModel": ("bart", "BartForConditionalGeneration"),
diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index 70f1522ae2524..078928f281c26 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -355,6 +355,19 @@ def __init__(self,
             quant_config=quant_config,
             num_hidden_layers_override=num_hidden_layers_override)
 
+        if len(self.encoder.layers) > config.num_hidden_layers:
+            raise ValueError(
+                f"The original encoder only has {config.num_hidden_layers} "
+                f"layers, but you requested {len(self.encoder.layers)} layers."
+            )
+        elif len(self.encoder.layers) == config.num_hidden_layers:
+            self.post_layernorm = nn.LayerNorm(embed_dim,
+                                               eps=config.layer_norm_eps)
+        else:
+            # post_layernorm is unused when we extract intermediate features
+            # In this case, we can skip it to conserve memory
+            self.post_layernorm = None
+
     def forward(
         self,
         pixel_values: torch.Tensor,
@@ -364,7 +377,10 @@ def forward(
         hidden_states = self.pre_layrnorm(hidden_states)
         hidden_states = self.encoder(inputs_embeds=hidden_states)
 
-        return hidden_states
+        if self.post_layernorm is None:
+            return hidden_states
+
+        return self.post_layernorm(hidden_states)
 
 
 class CLIPVisionModel(nn.Module):
@@ -386,9 +402,12 @@ def __init__(self,
             quant_config=quant_config,
             num_hidden_layers_override=num_hidden_layers_override)
 
-    def forward(self, pixel_values: Optional[torch.Tensor] = None):
+    @property
+    def _require_post_layernorm(self) -> bool:
+        return self.vision_model.post_layernorm is not None
 
-        return self.vision_model(pixel_values=pixel_values)
+    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        return self.vision_model(pixel_values)
 
     @property
     def device(self):
@@ -408,8 +427,10 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
 
         for name, loaded_weight in weights:
             # post_layernorm is not needed in CLIPVisionModel
-            if "vision_model.post_layernorm" in name:
+            if ("vision_model.post_layernorm" in name
+                    and not self._require_post_layernorm):
                 continue
+
             # omit layers when num_hidden_layers_override is set
             if "vision_model.encoder.layers." in name:
                 layer_idx = int(name.split(".")[3])
diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py
index 90449ec51ef0b..f9d9f9e7567c8 100644
--- a/vllm/model_executor/models/gemma2.py
+++ b/vllm/model_executor/models/gemma2.py
@@ -312,6 +312,14 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA):
     # Gemma does not apply LoRA to the embedding layer.
     embedding_modules = {}
     embedding_padding_modules = []
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
 
     def __init__(
         self,
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index 0cf63d9e1fb22..507d7014714a2 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -17,6 +17,7 @@
 
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, MultiModalConfig
+from vllm.distributed import get_pp_group
 from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.sampler import SamplerOutput
@@ -269,6 +270,7 @@ def input_mapper_for_internvl(ctx: InputContext, data: object):
         # Add an N dimension for number of images per prompt (currently 1).
         data = data.unsqueeze(0)
     elif is_list_of(data, Image.Image):
+        # we can't stack here because the images may have different num_patches
         data = [
             image_to_pixel_values(img,
                                   image_size,
@@ -276,7 +278,6 @@ def input_mapper_for_internvl(ctx: InputContext, data: object):
                                   max_num,
                                   use_thumbnail=use_thumbnail) for img in data
         ]
-        data = torch.stack(data)
     model_config = ctx.model_config
     tokenizer = cached_get_tokenizer(model_config.tokenizer,
                                      trust_remote_code=True)
@@ -448,11 +449,12 @@ def _parse_and_validate_image_input(
             if not isinstance(pixel_values, (torch.Tensor, list)):
                 raise ValueError("Incorrect type of pixel values. "
                                  f"Got type: {type(pixel_values)}")
-
+            # We need to flatten (B, N, P) to (B*N*P),
+            # so we call flatten_bn twice.
             return InternVLImagePixelInputs(
                 type="pixel_values",
                 data=self._validate_pixel_values(
-                    flatten_bn(pixel_values, concat=True).flatten(0, 1)),
+                    flatten_bn(flatten_bn(pixel_values), concat=True)),
             )
 
         raise AssertionError("This line should be unreachable.")
@@ -480,7 +482,7 @@ def forward(
         **kwargs: object,
     ) -> SamplerOutput:
         image_input = self._parse_and_validate_image_input(**kwargs)
-        if image_input is not None:
+        if image_input is not None and get_pp_group().is_first_rank:
             inputs_embeds = self.language_model.model.get_input_embeddings(
                 input_ids)
             vision_embeddings = self._process_image_input(image_input)
diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index 29dd09afac5ad..9b7cc22869765 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -733,7 +733,7 @@ def _clean_up_first_bs_blocks(self, batch_size: int,
                                   indices_for_current_run: List[int]):
         # move out all of the occupied but currently not running blocks
         # outside of the first n blocks
-        destination_indices = set([range(batch_size)])
+        destination_indices = range(batch_size)
         max_possible_batch_size = self.mamba_cache[0].shape[1]
         for destination_index in destination_indices:
             if destination_index in self._get_all_occupied_indices() and  \
diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py
new file mode 100644
index 0000000000000..7fe85e5e4ab3d
--- /dev/null
+++ b/vllm/model_executor/models/llava_next_video.py
@@ -0,0 +1,471 @@
+import itertools
+import math
+from typing import (Iterable, List, Literal, Mapping, Optional, Tuple,
+                    TypedDict, Union)
+
+import numpy as np
+import torch
+import torch.nn as nn
+from transformers import (CLIPVisionConfig, LlavaNextVideoConfig,
+                          SiglipVisionConfig)
+
+from vllm.attention import AttentionMetadata
+from vllm.config import CacheConfig, MultiModalConfig
+from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs
+from vllm.logger import init_logger
+from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig)
+from vllm.model_executor.layers.sampler import SamplerOutput
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.clip import CLIPVisionModel
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.utils import (cached_get_tokenizer,
+                                   repeat_and_pad_placeholder_tokens)
+from vllm.sequence import IntermediateTensors
+from vllm.utils import is_list_of
+
+from .clip import dummy_image_for_clip, dummy_seq_data_for_clip
+from .interfaces import SupportsMultiModal
+from .siglip import (SiglipVisionModel, dummy_image_for_siglip,
+                     dummy_seq_data_for_siglip)
+from .utils import (filter_weights, init_vllm_registered_model,
+                    merge_multimodal_embeddings)
+
+logger = init_logger(__name__)
+
+# For profile run
+_MAX_FRAMES_PER_VIDEO = 32
+_MAX_NUM_VIDEOS = 1
+
+
+class LlavaNextVideoPixelInputs(TypedDict):
+    type: Literal["pixel_values_videos"]
+    data: Union[torch.Tensor, List[torch.Tensor]]
+    """
+    Shape: `(batch_size, num_frames, num_channels, height, width)`
+
+    Note that `num_frames` may be different for each batch, in which case
+    the data is passed as a list instead of a batched tensor.
+
+    Note that it only supports one video input for one batch.
+    """
+
+
+def get_llava_next_video_frame_feature_size(
+        hf_config: LlavaNextVideoConfig) -> int:
+    # Support both CLIPVisionConfig and SiglipVisionConfig
+    image_size = hf_config.vision_config.image_size
+    patch_size = hf_config.vision_config.patch_size
+    spatial_pool_stride = hf_config.spatial_pool_stride
+
+    return int((image_size / patch_size / spatial_pool_stride)**2)
+
+
+def _get_max_llm_tokens(ctx: InputContext) -> int:
+    """
+    Calculated from the maximum video frames under the context length
+    constraints of the language model.
+    """
+    hf_text_config = ctx.model_config.hf_text_config
+    model_config = ctx.model_config
+    max_tokens = model_config.max_model_len
+    rope_scaling = model_config.rope_scaling
+
+    if rope_scaling:
+        rope_scaling_factor = hf_text_config.rope_scaling["factor"]
+    else:
+        rope_scaling_factor = 1
+
+    max_tokens *= rope_scaling_factor
+
+    return max_tokens
+
+
+def get_max_llava_next_video_tokens(ctx: InputContext) -> int:
+    # Currently set to 32 frames
+    # TODO: max_tokens = _get_max_llm_tokens(ctx)
+    hf_config = ctx.get_hf_config(LlavaNextVideoConfig)
+    tokens_per_frame = get_llava_next_video_frame_feature_size(hf_config)
+    return _MAX_FRAMES_PER_VIDEO * tokens_per_frame
+
+
+def dummy_data_for_llava_next_video(ctx: InputContext, seq_len: int,
+                                    mm_counts: Mapping[str, int]):
+    hf_config = ctx.get_hf_config(LlavaNextVideoConfig)
+    vision_config = hf_config.vision_config
+
+    # TODO: support multiple videos
+    num_videos = mm_counts["video"]
+    if num_videos != _MAX_NUM_VIDEOS:
+        raise NotImplementedError(
+            f"Only {_MAX_NUM_VIDEOS} videos are supported")
+
+    # TODO: support configuring the number of frames
+    frames_per_video = _MAX_FRAMES_PER_VIDEO
+    # num_images = num_videos * frames_per_video
+
+    # fills the sequence with as longer video data as possible
+    tokens_per_frame = get_llava_next_video_frame_feature_size(hf_config)
+    video_feature_size = frames_per_video * tokens_per_frame
+
+    if isinstance(vision_config, CLIPVisionConfig):
+        seq_data = dummy_seq_data_for_clip(
+            vision_config,
+            seq_len,
+            num_videos,
+            image_token_id=hf_config.video_token_index,
+            image_feature_size_override=video_feature_size,
+        )
+
+        pil_frame = dummy_image_for_clip(vision_config, num_images=1)
+        np_frame = np.array(pil_frame["image"])
+        mm_data_per_video = np.repeat([np_frame], frames_per_video, axis=0)
+        mm_data = {"video": mm_data_per_video}
+        return seq_data, mm_data
+    elif isinstance(vision_config, SiglipVisionConfig):
+        seq_data = dummy_seq_data_for_siglip(
+            vision_config,
+            seq_len,
+            num_videos,
+            image_token_id=hf_config.video_token_index,
+            image_feature_size_override=video_feature_size,
+        )
+
+        pil_frame = dummy_image_for_siglip(vision_config, num_images=1)
+        np_frame = np.array(pil_frame["image"])
+        mm_data_per_video = np.repeat([np_frame], frames_per_video, axis=0)
+        mm_data = {"video": mm_data_per_video}
+        return seq_data, mm_data
+
+    msg = f"Unsupported vision config: {type(vision_config)}"
+    raise NotImplementedError(msg)
+
+
+def input_processor_for_llava_next_video(ctx: InputContext,
+                                         llm_inputs: LLMInputs):
+    multi_modal_data = llm_inputs.get("multi_modal_data")
+    if multi_modal_data is None or "video" not in multi_modal_data:
+        return llm_inputs
+    video_data = multi_modal_data["video"]
+
+    model_config = ctx.model_config
+    hf_config = ctx.get_hf_config(LlavaNextVideoConfig)
+    vision_config = hf_config.vision_config
+
+    if isinstance(video_data, np.ndarray):
+        # Supports both CLIP and Siglip
+        num_frames = video_data.shape[0]
+        frame_feature_size = \
+            get_llava_next_video_frame_feature_size(hf_config)
+        video_feature_size = num_frames * frame_feature_size
+
+        tokenizer = cached_get_tokenizer(model_config.tokenizer)
+
+        new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens(
+            tokenizer,
+            llm_inputs.get("prompt"),
+            llm_inputs["prompt_token_ids"],
+            placeholder_token_id=hf_config.video_token_index,
+            repeat_count=video_feature_size,
+        )
+
+        return LLMInputs(prompt_token_ids=new_token_ids,
+                         prompt=new_prompt,
+                         multi_modal_data=multi_modal_data)
+
+    elif is_list_of(video_data, np.ndarray):
+        raise NotImplementedError(
+            "Processing multiple videos is not supported")
+
+    msg = f"Unsupported vision config: {type(vision_config)}"
+    raise NotImplementedError(msg)
+
+
+def _init_vision_tower(hf_config: LlavaNextVideoConfig):
+    vision_config = hf_config.vision_config
+
+    # Initialize the vision tower only up to the required feature layer
+    vision_feature_layer = hf_config.vision_feature_layer
+    if vision_feature_layer < 0:
+        num_hidden_layers = hf_config.vision_config.num_hidden_layers \
+            + vision_feature_layer + 1
+    else:
+        num_hidden_layers = vision_feature_layer + 1
+
+    if isinstance(vision_config, CLIPVisionConfig):
+        return CLIPVisionModel(
+            vision_config,
+            num_hidden_layers_override=num_hidden_layers,
+        )
+    elif isinstance(vision_config, SiglipVisionConfig):
+        return SiglipVisionModel(
+            vision_config,
+            num_hidden_layers_override=num_hidden_layers,
+        )
+
+    msg = f"Unsupported vision config: {type(vision_config)}"
+    raise NotImplementedError(msg)
+
+
+# adopted from transformers modeling_llava_next_video.py
+class LlavaNextVideoPooler(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+
+        mode = config.spatial_pool_mode
+        stride = config.spatial_pool_stride
+        image_size = config.vision_config.image_size
+        patch_size = config.vision_config.patch_size
+        self.image_size = image_size // patch_size**2
+
+        if mode == "average":
+            self.pool = nn.AvgPool2d(kernel_size=stride, stride=stride)
+        elif mode == "max":
+            self.pool = nn.MaxPool2d(kernel_size=stride, stride=stride)
+        else:
+            # TODO: Support Conv2d pooling layer, need to load weights
+            raise ValueError(
+                f"Unknown pooling mode: {mode}. Expected [`average`, `max`]")
+
+    def forward(self, image_features):
+        ori_width = int(
+            math.sqrt(image_features.shape[1] * self.image_size //
+                      self.image_size))
+        ori_height = int(ori_width * self.image_size // self.image_size)
+
+        batch_size, _, dim = image_features.shape
+        image_features_spatial = image_features \
+            .view(batch_size, ori_height, ori_height, dim) \
+            .permute(0, 3, 1, 2)
+        image_features_spatial = self.pool(image_features_spatial)
+
+        return image_features_spatial.flatten(2).transpose(1, 2).contiguous()
+
+
+class LlavaNextMultiModalProjector(nn.Module):
+
+    def __init__(self, vision_hidden_size: int, text_hidden_size: int,
+                 projector_hidden_act: str):
+        super().__init__()
+
+        self.linear_1 = nn.Linear(vision_hidden_size,
+                                  text_hidden_size,
+                                  bias=True)
+        self.act = get_act_fn(projector_hidden_act)
+        self.linear_2 = nn.Linear(text_hidden_size,
+                                  text_hidden_size,
+                                  bias=True)
+
+    def forward(self, image_features: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.linear_1(image_features)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+        return hidden_states
+
+
+@MULTIMODAL_REGISTRY.register_input_mapper("video")
+@MULTIMODAL_REGISTRY.register_max_multimodal_tokens(
+    "video", get_max_llava_next_video_tokens)
+@INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava_next_video)
+@INPUT_REGISTRY.register_input_processor(input_processor_for_llava_next_video)
+class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal):
+
+    def __init__(self,
+                 config: LlavaNextVideoConfig,
+                 multimodal_config: MultiModalConfig,
+                 cache_config: Optional[CacheConfig] = None,
+                 quant_config: Optional[QuantizationConfig] = None) -> None:
+        super().__init__()
+
+        self.config = config
+        self.multimodal_config = multimodal_config
+
+        # Initialize the vision tower only up to the required feature layer
+        self.vision_tower = _init_vision_tower(config)
+        self.multi_modal_projector = LlavaNextMultiModalProjector(
+            vision_hidden_size=config.vision_config.hidden_size,
+            text_hidden_size=config.text_config.hidden_size,
+            projector_hidden_act=config.projector_hidden_act)
+        self.language_model = init_vllm_registered_model(
+            config.text_config, cache_config, quant_config)
+        self.vision_resampler = LlavaNextVideoPooler(config)
+
+    def _validate_video_pixel_values(
+        self, data: Union[torch.Tensor, List[torch.Tensor]]
+    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+
+        h = w = self.config.vision_config.image_size
+        expected_dims = (3, h, w)
+
+        def _validate_shape(d: torch.Tensor):
+            actual_dims = tuple(d.shape[2:])
+
+            if actual_dims != expected_dims:
+                expected_expr = ("num_frames", *map(str, expected_dims))
+                raise ValueError(
+                    "The expected shape of pixel values in each video frame "
+                    f"is {expected_expr}. You supplied {tuple(d.shape)}.")
+
+        for d in data:
+            _validate_shape(d)
+
+        return data
+
+    def _parse_and_validate_video_input(
+            self, **kwargs: object) -> Optional[LlavaNextVideoPixelInputs]:
+        """
+        A legal video input should have the following dimensions:
+        {
+            "pixel_values_videos" : 
+                List[b, Tensor(nb_frames, nb_channels, height, width)]
+        }
+        """
+        pixel_values = kwargs.pop("pixel_values_videos", None)
+
+        if pixel_values is None:
+            return None
+
+        if not (is_list_of(pixel_values,
+                           (torch.Tensor))  # different shape videos 
+                or isinstance(pixel_values,
+                              torch.Tensor)):  # same shape videos
+            raise ValueError("Incorrect type of pixel values. "
+                             f"Got type: {type(pixel_values)}")
+
+        return LlavaNextVideoPixelInputs(
+            type="pixel_values_videos",
+            data=pixel_values,
+        )
+
+    def _select_image_features(self, image_features: torch.Tensor, *,
+                               strategy: str) -> torch.Tensor:
+        if strategy == "default":
+            return image_features[:, 1:]
+        elif strategy == "full":
+            return image_features
+
+        raise ValueError(f"Unexpected select feature strategy: {strategy}")
+
+    def _video_pixels_to_features(
+        self,
+        vision_tower: Union[CLIPVisionModel, SiglipVisionModel],
+        pixel_values: torch.Tensor,
+    ) -> torch.Tensor:
+
+        # NOTE: we skip the step to select the vision feature layer since
+        # this is already done inside the vision tower
+        image_features = vision_tower(pixel_values)
+        image_features = self._select_image_features(
+            image_features,
+            strategy=self.config.vision_feature_select_strategy,
+        )
+        image_features = self.vision_resampler(image_features)
+        image_features = self.multi_modal_projector(image_features)
+        return image_features
+
+    def _process_video_pixels(self, inputs: LlavaNextVideoPixelInputs):
+        assert self.vision_tower is not None
+
+        video_pixels = inputs["data"]
+
+        if isinstance(video_pixels, torch.Tensor):
+            # TODO: support multiple videos per input
+            b, num_videos, num_frames, c, h, w = video_pixels.shape
+            assert (num_videos == 1)
+            stacked_pixels = video_pixels.view(b * num_videos * num_frames, c,
+                                               h, w)
+            stacked_embeddings = self._video_pixels_to_features(
+                self.vision_tower, stacked_pixels)
+            return stacked_embeddings.view(b, num_frames,
+                                           *stacked_embeddings.shape[1:])
+
+        elif is_list_of(video_pixels, torch.Tensor):
+            frames_per_videos = [v.shape[0] for v in video_pixels]
+            stacked_pixels = torch.cat(video_pixels, dim=0)
+            stacked_embeddings = self._video_pixels_to_features(
+                self.vision_tower, stacked_pixels)
+            return torch.split(stacked_embeddings, frames_per_videos, dim=0)
+
+        else:
+            raise ValueError(
+                f"Unsupported type of video input {type(video_pixels)}")
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        **kwargs: object,
+    ) -> SamplerOutput:
+        """Run forward pass for LlaVA-NeXT-Video.
+        Args:
+            input_ids: Flattened (concatenated) input_ids corresponding to a
+                batch.
+            pixel_values_videos: Pixels in each frames for each input videos.
+        """
+        video_input = self._parse_and_validate_video_input(**kwargs)
+
+        # merge video embeddings into input embeddings
+        if video_input is not None:
+            video_embeddings = self._process_video_pixels(video_input)
+            inputs_embeds = self.language_model \
+                .model.get_input_embeddings(input_ids)
+
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids, inputs_embeds, video_embeddings,
+                self.config.video_token_index)
+
+            input_ids = None
+        else:
+            inputs_embeds = None
+
+        hidden_states = self.language_model.model(input_ids,
+                                                  positions,
+                                                  kv_caches,
+                                                  attn_metadata,
+                                                  None,
+                                                  inputs_embeds=inputs_embeds)
+
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
+        return self.language_model.compute_logits(hidden_states,
+                                                  sampling_metadata)
+
+    def sample(
+        self,
+        logits: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        return self.language_model.sample(logits, sampling_metadata)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        # prepare weight iterators
+        vit_weights, mlp_weights, newline_weights, llm_weights = itertools.tee(
+            weights, 4)
+
+        # load vision encoder
+        vit_weights = filter_weights(vit_weights, "vision_tower")
+        self.vision_tower.load_weights(vit_weights)
+
+        # load mlp projector
+        mlp_weights = filter_weights(mlp_weights, "multi_modal_projector")
+        mlp_params_dict = dict(self.multi_modal_projector.named_parameters())
+        for name, loaded_weight in mlp_weights:
+            param = mlp_params_dict[name]
+            weight_loader = getattr(param, "weight_loader",
+                                    default_weight_loader)
+            weight_loader(param, loaded_weight)
+
+        # load llm backbone
+        llm_weights = filter_weights(llm_weights, "language_model")
+        self.language_model.load_weights(llm_weights)
diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py
index a135118bc748e..963ad7553fe1d 100644
--- a/vllm/model_executor/models/minicpm.py
+++ b/vllm/model_executor/models/minicpm.py
@@ -270,38 +270,47 @@ def __init__(
     ) -> None:
         super().__init__()
         self.config = config
+        self.cache_config = cache_config
+        self.quant_config = quant_config
         self.hidden_size = config.hidden_size
-        rope_theta = getattr(config, "rope_theta", 10000)
-        rope_scaling = getattr(config, "rope_scaling", None)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          8192)
+        self.rope_theta = getattr(config, "rope_theta", 10000)
+        self.rope_scaling = getattr(config, "rope_scaling", None)
+        self.max_position_embeddings = getattr(config,
+                                               "max_position_embeddings", 8192)
+        self._init_attn_block()
+        self._init_ffn_block()
+
+    def _init_attn_block(self):
+        self.input_layernorm = RMSNorm(self.config.hidden_size,
+                                       eps=self.config.rms_norm_eps)
         self.self_attn = MiniCPMAttention(
             hidden_size=self.hidden_size,
-            num_heads=config.num_attention_heads,
-            num_kv_heads=config.num_key_value_heads,
-            rope_theta=rope_theta,
-            rope_scaling=rope_scaling,
-            max_position_embeddings=max_position_embeddings,
-            cache_config=cache_config,
-            quant_config=quant_config,
+            num_heads=self.config.num_attention_heads,
+            num_kv_heads=self.config.num_key_value_heads,
+            rope_theta=self.rope_theta,
+            rope_scaling=self.rope_scaling,
+            max_position_embeddings=self.max_position_embeddings,
+            cache_config=self.cache_config,
+            quant_config=self.quant_config,
         )
+
+    def _init_ffn_block(self):
+        self.post_attention_layernorm = RMSNorm(self.config.hidden_size,
+                                                eps=self.config.rms_norm_eps)
         self.num_experts = getattr(self.config, "num_experts", 0)
         if self.num_experts == 0:
             self.mlp = MiniCPMMLP(
                 hidden_size=self.hidden_size,
-                intermediate_size=config.intermediate_size,
-                hidden_act=config.hidden_act,
-                quant_config=quant_config,
+                intermediate_size=self.config.intermediate_size,
+                hidden_act=self.config.hidden_act,
+                quant_config=self.quant_config,
             )
         else:
-            self.mlp = MiniCPMMoE(num_experts=config.num_experts,
-                                  top_k=config.num_experts_per_tok,
-                                  hidden_size=config.hidden_size,
-                                  intermediate_size=config.intermediate_size)
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
+            self.mlp = MiniCPMMoE(
+                num_experts=self.config.num_experts,
+                top_k=self.config.num_experts_per_tok,
+                hidden_size=self.config.hidden_size,
+                intermediate_size=self.config.intermediate_size)
 
     def forward(
         self,
@@ -344,6 +353,8 @@ def __init__(
     ) -> None:
         super().__init__()
         self.config = config
+        self.cache_config = cache_config
+        self.quant_config = quant_config
         self.padding_idx = config.pad_token_id
         lora_vocab = (lora_config.lora_extra_vocab_size *
                       (lora_config.max_loras or 1)) if lora_config else 0
@@ -354,11 +365,15 @@ def __init__(
             config.hidden_size,
             org_num_embeddings=config.vocab_size,
         )
+        self._init_layers()
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def _init_layers(self):
         self.layers = nn.ModuleList([
-            MiniCPMDecoderLayer(config, cache_config, quant_config)
-            for _ in range(config.num_hidden_layers)
+            MiniCPMDecoderLayer(self.config, self.cache_config,
+                                self.quant_config)
+            for _ in range(self.config.num_hidden_layers)
         ])
-        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         embedding = self.embed_tokens(input_ids)
@@ -431,13 +446,11 @@ def __init__(
 
         self.config = config
         self.lora_config = lora_config
+        self.cache_config = cache_config
+        self.quant_config = quant_config
 
         self.num_experts = getattr(self.config, "num_experts", 0)
-        self.quant_config = quant_config
-        self.model = MiniCPMModel(config,
-                                  cache_config,
-                                  quant_config,
-                                  lora_config=lora_config)
+        self._init_model()
         unpadded_vocab_size = config.vocab_size
         if lora_config:
             unpadded_vocab_size += lora_config.lora_extra_vocab_size
@@ -458,6 +471,12 @@ def __init__(
                                                 config.vocab_size)
         self.sampler = Sampler()
 
+    def _init_model(self):
+        self.model = MiniCPMModel(config=self.config,
+                                  cache_config=self.cache_config,
+                                  quant_config=self.quant_config,
+                                  lora_config=self.lora_config)
+
     def forward(
         self,
         input_ids: torch.Tensor,
diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py
new file mode 100644
index 0000000000000..a048a3dba0415
--- /dev/null
+++ b/vllm/model_executor/models/minicpm3.py
@@ -0,0 +1,216 @@
+# coding=utf-8
+# Adapted from
+# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
+# Copyright 2024 The ModelBest team.
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only MiniCPM3 model compatible with HuggingFace weights."""
+from typing import Any, Dict, Optional
+
+import torch
+from torch import nn
+
+from vllm.attention import Attention, AttentionMetadata
+from vllm.config import CacheConfig
+from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               ReplicatedLinear,
+                                               RowParallelLinear)
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig)
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.models.minicpm import (MiniCPMDecoderLayer,
+                                                MiniCPMForCausalLM,
+                                                MiniCPMModel)
+
+
+class MiniCPM3Attention(nn.Module):
+
+    def __init__(
+        self,
+        config,
+        hidden_size: int,
+        num_heads: int,
+        qk_nope_head_dim: int,
+        qk_rope_head_dim: int,
+        v_head_dim: int,
+        q_lora_rank: int,
+        kv_lora_rank: int,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.q_lora_rank = q_lora_rank
+        self.kv_lora_rank = kv_lora_rank
+        self.num_heads = num_heads
+
+        tp_size = get_tensor_model_parallel_world_size()
+        assert self.num_heads % tp_size == 0
+        self.num_local_heads = num_heads // tp_size
+
+        self.scaling = self.qk_head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.q_a_proj = ReplicatedLinear(self.hidden_size,
+                                         self.q_lora_rank,
+                                         bias=False,
+                                         quant_config=quant_config)
+        self.q_a_layernorm = RMSNorm(self.q_lora_rank, eps=config.rms_norm_eps)
+        self.q_b_proj = ColumnParallelLinear(q_lora_rank,
+                                             self.num_heads * self.qk_head_dim,
+                                             bias=False,
+                                             quant_config=quant_config)
+
+        self.kv_a_proj_with_mqa = ReplicatedLinear(self.hidden_size,
+                                                   self.kv_lora_rank +
+                                                   self.qk_rope_head_dim,
+                                                   bias=False,
+                                                   quant_config=quant_config)
+        self.kv_a_layernorm = RMSNorm(self.kv_lora_rank,
+                                      eps=config.rms_norm_eps)
+        self.kv_b_proj = ColumnParallelLinear(
+            self.kv_lora_rank,
+            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
+            bias=False,
+            quant_config=quant_config)
+        # O projection.
+        self.o_proj = RowParallelLinear(self.num_heads * self.v_head_dim,
+                                        self.hidden_size,
+                                        bias=False,
+                                        quant_config=quant_config)
+
+        self.rotary_emb = get_rope(
+            self.qk_rope_head_dim,
+            rotary_dim=self.qk_rope_head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
+        self.attn = Attention(self.num_local_heads,
+                              self.qk_head_dim,
+                              self.scaling,
+                              num_kv_heads=self.num_local_heads,
+                              cache_config=cache_config,
+                              quant_config=quant_config)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+    ) -> torch.Tensor:
+        q, _ = self.q_a_proj(hidden_states)
+        q = self.q_a_layernorm(q)
+        q, _ = self.q_b_proj(q)
+        q = q.view(-1, self.num_local_heads, self.qk_head_dim)
+        _, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim],
+                          dim=-1)
+        latent_cache, _ = self.kv_a_proj_with_mqa(hidden_states)
+        kv_a, _ = latent_cache.split(
+            [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
+        latent_cache = latent_cache.unsqueeze(1)
+        kv_a = self.kv_a_layernorm(kv_a.contiguous())
+        kv, _ = self.kv_b_proj(kv_a)
+        kv = kv.view(-1, self.num_local_heads,
+                     self.qk_nope_head_dim + self.v_head_dim)
+        k_nope, v = kv.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
+
+        k_pe = latent_cache[:, :, self.kv_lora_rank:]
+
+        q_pe, k_pe = self.rotary_emb(
+            positions,
+            q_pe.reshape(-1, self.num_local_heads * self.qk_rope_head_dim),
+            k_pe.reshape(-1, self.qk_rope_head_dim))
+        q_pe = q_pe.view(-1, self.num_local_heads, self.qk_rope_head_dim)
+        k_pe = k_pe.view(-1, 1, self.qk_rope_head_dim)
+
+        q[..., self.qk_nope_head_dim:] = q_pe
+
+        k = torch.empty_like(q)
+
+        k[..., :self.qk_nope_head_dim] = k_nope
+        k[..., self.qk_nope_head_dim:] = k_pe
+
+        q = q.reshape(-1, self.num_local_heads * self.qk_head_dim)
+        k = k.view(-1, self.num_local_heads * self.qk_head_dim)
+        v = torch.nn.functional.pad(
+            v, [0, self.qk_head_dim - self.v_head_dim],
+            value=0).view(-1, self.num_local_heads * self.qk_head_dim)
+
+        attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
+        attn_output = attn_output.view(
+            -1, self.num_local_heads,
+            self.qk_head_dim)[..., :self.v_head_dim].reshape(
+                -1, self.num_local_heads * self.v_head_dim)
+
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class MiniCPM3DecoderLayer(MiniCPMDecoderLayer):
+
+    def _init_attn_block(self):
+        self.input_layernorm = RMSNorm(self.config.hidden_size,
+                                       eps=self.config.rms_norm_eps)
+        self.self_attn = MiniCPM3Attention(
+            config=self.config,
+            hidden_size=self.hidden_size,
+            num_heads=self.config.num_attention_heads,
+            qk_nope_head_dim=self.config.qk_nope_head_dim,
+            qk_rope_head_dim=self.config.qk_rope_head_dim,
+            v_head_dim=self.config.v_head_dim,
+            q_lora_rank=self.config.q_lora_rank,
+            kv_lora_rank=self.config.kv_lora_rank,
+            rope_theta=self.rope_theta,
+            rope_scaling=self.rope_scaling,
+            max_position_embeddings=self.max_position_embeddings,
+            cache_config=self.cache_config,
+            quant_config=self.quant_config,
+        )
+
+
+class MiniCPM3Model(MiniCPMModel):
+
+    def _init_layers(self):
+        self.layers = nn.ModuleList([
+            MiniCPM3DecoderLayer(self.config, self.cache_config,
+                                 self.quant_config)
+            for _ in range(self.config.num_hidden_layers)
+        ])
+
+
+class MiniCPM3ForCausalLM(MiniCPMForCausalLM):
+
+    def _init_model(self):
+        self.model = MiniCPM3Model(config=self.config,
+                                   cache_config=self.cache_config,
+                                   quant_config=self.quant_config,
+                                   lora_config=self.lora_config)
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index e744e36ac08bf..10cbfcf6432b3 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -435,7 +435,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                     continue
                 name = name.replace(weight_name, param_name)
                 # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
+                if ((name.endswith(".bias") or name.endswith("_bias"))
+                        and name not in params_dict):
                     continue
                 # Skip layers on other devices.
                 if is_pp_missing_parameter(name, self):
@@ -454,6 +455,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                     # Skip layers on other devices.
                     if is_pp_missing_parameter(name, self):
                         continue
+                    if ((name.endswith(".bias") or name.endswith("_bias"))
+                            and name not in params_dict):
+                        continue
                     param = params_dict[name]
                     weight_loader = param.weight_loader
                     weight_loader(param,
@@ -464,7 +468,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                     break
                 else:
                     # Skip loading extra bias for GPTQ models.
-                    if name.endswith(".bias") and name not in params_dict:
+                    if ((name.endswith(".bias") or name.endswith("_bias"))
+                            and name not in params_dict):
                         continue
                     # Skip layers on other devices.
                     if is_pp_missing_parameter(name, self):
diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py
index 25bc0590c745c..5036f55803c20 100644
--- a/vllm/model_executor/models/phimoe.py
+++ b/vllm/model_executor/models/phimoe.py
@@ -600,7 +600,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                     weight_loader(
                         param,
                         loaded_weight,
-                        weight_name,
+                        name,
                         shard_id=shard_id,
                         expert_id=expert_id,
                     )
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
new file mode 100644
index 0000000000000..682b78bbed093
--- /dev/null
+++ b/vllm/model_executor/models/pixtral.py
@@ -0,0 +1,566 @@
+from array import array
+from dataclasses import dataclass, fields
+from itertools import tee
+from typing import Iterable, List, Mapping, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mistral_common.protocol.instruct.messages import ImageChunk
+from PIL import Image
+from transformers import PretrainedConfig
+from xformers.ops.fmha import memory_efficient_attention
+from xformers.ops.fmha.attn_bias import BlockDiagonalMask
+
+from vllm.attention import AttentionMetadata
+from vllm.config import CacheConfig, MultiModalConfig
+from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.sampler import SamplerOutput
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.utils import merge_multimodal_embeddings
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.base import MultiModalInputs
+from vllm.multimodal.utils import cached_get_tokenizer
+from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
+                           SequenceData)
+
+from .interfaces import SupportsMultiModal
+from .utils import init_vllm_registered_model
+
+
+def get_max_pixtral_image_tokens(ctx: InputContext):
+    tokenizer = cached_get_tokenizer(
+        ctx.model_config.tokenizer,
+        tokenizer_mode=ctx.model_config.tokenizer_mode)
+    mm_encoder = tokenizer.instruct.mm_encoder
+
+    max_image_size = mm_encoder.mm_config.max_image_size
+    image_patch_size = mm_encoder.mm_config.image_patch_size
+
+    return ((max_image_size // image_patch_size)**2)
+
+
+def dummy_data_for_pixtral(ctx: InputContext, seq_len: int,
+                           mm_counts: Mapping[str, int]):
+    tokenizer = cached_get_tokenizer(
+        ctx.model_config.tokenizer,
+        tokenizer_mode=ctx.model_config.tokenizer_mode)
+
+    mm_encoder = tokenizer.mistral.instruct_tokenizer.mm_encoder
+    patch_size = mm_encoder.mm_config.image_patch_size
+    image_token_id = mm_encoder.special_ids.img
+
+    mm_config = ctx.model_config.multimodal_config
+    num_images = mm_config.limit_per_prompt.get("image", 1)
+
+    # dummy size
+    size = 256
+    image = Image.new("RGB", (size, size), color=0)
+
+    image_feature_size = (size**2) // (patch_size**2)
+
+    num_image_tokens = image_feature_size * num_images
+
+    token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE,
+                      [image_token_id]) * num_image_tokens
+    token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE,
+                       [0]) * (seq_len - num_image_tokens)
+
+    seq_data = SequenceData(token_ids)
+    mm_data = {"image": num_images * [image]}
+    return seq_data, mm_data
+
+
+def input_mapper_for_pixtral(ctx: InputContext,
+                             data: object) -> MultiModalInputs:
+    """Maps the input data to its MultiModalInputs (if any).
+
+    Args:
+        ctx: Context of the loaded model.
+        data: data potentially containing image/image embeddings to be mapped
+            to pixel_values in .forward() for a visual QWenLMHeadModel model.
+
+    Returns:
+        MultiModalInputs containing the stacked normalized images tensor or
+        image embeddings.
+    """
+    # Early exit if we have provided an image to a language only Qwen model
+    model_config = ctx.model_config
+    tokenizer = cached_get_tokenizer(
+        model_config.tokenizer, tokenizer_mode=model_config.tokenizer_mode)
+
+    data_list = data if isinstance(data, list) else [data]
+
+    images = []
+    for image_data in data_list:
+        image = ImageChunk(image=image_data)
+        encoding = tokenizer.instruct.mm_encoder(image)
+        image = torch.from_numpy(encoding.image).to(device="cuda",
+                                                    dtype=torch.float16)
+        images.append(image)
+
+    return MultiModalInputs({"images": images})
+
+
+def input_processor_for_pixtral(ctx: InputContext, llm_inputs: LLMInputs):
+    multi_modal_data = llm_inputs.get("multi_modal_data")
+    if multi_modal_data is not None and "image" in multi_modal_data:
+        tokenizer = cached_get_tokenizer(
+            ctx.model_config.tokenizer,
+            tokenizer_mode=ctx.model_config.tokenizer_mode)
+
+        mm_encoder = tokenizer.mistral.instruct_tokenizer.mm_encoder
+        image_token_id = mm_encoder.special_ids.img
+
+        if image_token_id not in llm_inputs['prompt_token_ids']:
+            raise ValueError(
+                (f"You've passed {llm_inputs=} without {image_token_id=}"
+                 " Make sure to process your input via mistral_common's"
+                 " tokenizer or pass a chat completion request. For more"
+                 " For more info, see: "
+                 "https://github.com/vllm-project/vllm/issues/8411."))
+
+    return llm_inputs
+
+
+@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_pixtral)
+@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_pixtral_image_tokens)
+@INPUT_REGISTRY.register_dummy_data(dummy_data_for_pixtral)
+@INPUT_REGISTRY.register_input_processor(input_processor_for_pixtral)
+class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal):
+
+    def __init__(self,
+                 config: PretrainedConfig,
+                 multimodal_config: MultiModalConfig,
+                 cache_config: Optional[CacheConfig] = None,
+                 quant_config: Optional[QuantizationConfig] = None) -> None:
+        super().__init__()
+
+        self.config = config
+        self.multimodal_config = multimodal_config
+
+        dataclass_fields = {field.name for field in fields(VisionEncoderArgs)}
+        vision_args = {
+            key: value
+            for key, value in self.config.vision_config.to_dict().items()
+            if key in dataclass_fields
+        }
+
+        self.vision_args = VisionEncoderArgs(**vision_args)
+
+        # init MistralForCausalLM
+        self.language_model = init_vllm_registered_model(
+            config.text_config, cache_config, quant_config)
+
+        self.vision_encoder = VisionTransformer(self.vision_args)
+        self.vision_language_adapter = VisionLanguageAdapter(
+            self.vision_args, dim=config.text_config.hidden_size)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        **kwargs: object,
+    ) -> SamplerOutput:
+        """Run forward pass for pixtral.
+
+        TODO
+
+        """
+        image_input = self._parse_and_validate_image_input(**kwargs)
+
+        if image_input is not None:
+            vision_embeddings = self._process_image_input(image_input)
+            inputs_embeds = self.language_model.model.get_input_embeddings(
+                input_ids)
+
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids, inputs_embeds, vision_embeddings,
+                self.vision_args.image_token_id)
+
+            input_ids = None
+        else:
+            inputs_embeds = None
+
+        hidden_states = self.language_model.model(input_ids,
+                                                  positions,
+                                                  kv_caches,
+                                                  attn_metadata,
+                                                  None,
+                                                  inputs_embeds=inputs_embeds)
+
+        return hidden_states
+
+    def _parse_and_validate_image_input(
+        self,
+        images: Optional[Union[List[List[torch.Tensor]], List[torch.Tensor],
+                               torch.Tensor]] = None
+    ) -> Optional[List[torch.Tensor]]:
+        if images is None:
+            return None
+
+        if isinstance(images, torch.Tensor):
+            # if passed as batch take all images
+            N, B, C, W, H = images.shape
+            images = images.reshape(N * B, C, W, H)
+            images = [images[i] for i in range(images.size(0))]
+        elif isinstance(images, list):
+            # if passed as list flatten lists of tensors
+            flatten_images = []
+            for imgs_per_req in images:
+                imgs_per_req = [
+                    imgs_per_req[i] for i in range(imgs_per_req.size(0))
+                ] if isinstance(imgs_per_req, torch.Tensor) else imgs_per_req
+
+                flatten_images.extend(imgs_per_req)
+
+            images = flatten_images
+
+        return images
+
+    def _process_image_input(self,
+                             image_input: List[torch.Tensor]) -> torch.Tensor:
+        return self.vision_language_adapter(self.vision_encoder(image_input))
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
+        return self.language_model.compute_logits(hidden_states,
+                                                  sampling_metadata)
+
+    def sample(
+        self,
+        logits: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        return self.language_model.sample(logits, sampling_metadata)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+
+        def is_vision_encoder_weights(weight: Tuple[str, torch.Tensor]):
+            return weight[0].startswith("vision_encoder")
+
+        def is_vision_lang_adapter_weights(weight: Tuple[str, torch.Tensor]):
+            return weight[0].startswith("vision_language_adapter")
+
+        def is_vision_weights(weight: Tuple[str, torch.Tensor]):
+            return is_vision_encoder_weights(
+                weight) or is_vision_lang_adapter_weights(weight)
+
+        llm_weights, vision_encoder_weights, vision_lang_adapter_weights = tee(
+            weights, 3)
+
+        # llm
+        llm_weights = filter(lambda x: not is_vision_weights(x), llm_weights)
+        self.language_model.load_weights(llm_weights)
+
+        # vision encoder
+        vision_encoder_weights = filter(is_vision_encoder_weights,
+                                        vision_encoder_weights)
+        vision_encoder_dict = dict(self.vision_encoder.named_parameters())
+        for name, loaded_weight in vision_encoder_weights:
+            # cut 'vision_encoder.'
+            name = '.'.join(name.split(".")[1:])
+            param = vision_encoder_dict[name]
+
+            default_weight_loader(param, loaded_weight)
+
+        # adapter
+        vision_lang_adapter_weights = filter(is_vision_lang_adapter_weights,
+                                             vision_lang_adapter_weights)
+        vision_lang_adpter_dict = dict(
+            self.vision_language_adapter.named_parameters())
+        for name, loaded_weight in vision_lang_adapter_weights:
+            # cut 'vision_language_adapter.'
+            name = '.'.join(name.split(".")[1:])
+            param = vision_lang_adpter_dict[name]
+            default_weight_loader(param, loaded_weight)
+
+
+# Vision encoder
+@dataclass
+class VisionEncoderArgs:
+    hidden_size: int
+    num_channels: int
+    image_size: int
+    patch_size: int
+    intermediate_size: int
+    num_hidden_layers: int
+    num_attention_heads: int
+    rope_theta: float  # for rope-2D
+    image_token_id: int
+
+
+def _reshape_for_broadcast(freqs_cis: torch.Tensor,
+                           x: torch.Tensor) -> torch.Tensor:
+    """
+    freqs_cis: complex - (seq_len, head_dim / 2)
+    x: complex - (bsz, seq_len, head_dim / 2)
+    """
+    ndim = x.ndim
+    assert ndim > 1
+    assert freqs_cis.shape == (x.shape[1], x.shape[-1]), (
+        freqs_cis.shape,
+        (x.shape[1], x.shape[-1]),
+    )
+    shape = [
+        d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)
+    ]
+    return freqs_cis.view(*shape)
+
+
+def precompute_freqs_cis_2d(
+    dim: int,
+    height: int,
+    width: int,
+    theta: float,
+) -> torch.Tensor:
+    """
+    freqs_cis: 2D complex tensor of shape (height, width, dim // 2)
+        to be indexed by (height, width) position tuples
+    """
+    # (dim / 2) frequency bases
+    freqs = 1.0 / (theta**(torch.arange(0, dim, 2).float() / dim))
+
+    h = torch.arange(height, device=freqs.device)
+    w = torch.arange(width, device=freqs.device)
+
+    freqs_h = torch.outer(h, freqs[::2]).float()
+    freqs_w = torch.outer(w, freqs[1::2]).float()
+    freqs_2d = torch.cat(
+        [
+            freqs_h[:, None, :].repeat(1, width, 1),
+            freqs_w[None, :, :].repeat(height, 1, 1),
+        ],
+        dim=-1,
+    )
+    return torch.polar(torch.ones_like(freqs_2d), freqs_2d)
+
+
+def apply_rotary_emb_vit(
+    xq: torch.Tensor,
+    xk: torch.Tensor,
+    freqs_cis: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
+    xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
+    assert freqs_cis.dtype == torch.complex64
+    freqs_cis = _reshape_for_broadcast(freqs_cis, xq_)
+    xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
+    xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
+    return xq_out.type_as(xq), xk_out.type_as(xk)
+
+
+class FeedForward(nn.Module):
+
+    def __init__(self, args: VisionEncoderArgs):
+        super().__init__()
+        assert args.intermediate_size is not None
+        self.w1 = nn.Linear(args.hidden_size,
+                            args.intermediate_size,
+                            bias=False)
+        self.w2 = nn.Linear(args.intermediate_size,
+                            args.hidden_size,
+                            bias=False)
+        self.w3 = nn.Linear(args.hidden_size,
+                            args.intermediate_size,
+                            bias=False)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.w2(F.silu(self.w1(x)) * self.w3(x))
+
+
+class Attention(nn.Module):
+
+    def __init__(self, args: VisionEncoderArgs):
+        super().__init__()
+        self.args = args
+        assert not args.hidden_size % args.num_attention_heads
+        self.n_heads = args.num_attention_heads
+        self.head_dim = args.hidden_size // args.num_attention_heads
+
+        self.wq = nn.Linear(args.hidden_size, args.hidden_size, bias=False)
+        self.wk = nn.Linear(args.hidden_size, args.hidden_size, bias=False)
+        self.wv = nn.Linear(args.hidden_size, args.hidden_size, bias=False)
+        self.wo = nn.Linear(args.hidden_size, args.hidden_size, bias=False)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        mask: BlockDiagonalMask,
+        freqs_cis: torch.Tensor,
+    ) -> torch.Tensor:
+        batch, patches, _ = x.shape
+
+        q, k, v = self.wq(x), self.wk(x), self.wv(x)
+        q = q.reshape(batch, patches, self.n_heads, self.head_dim)
+        k = k.reshape(batch, patches, self.n_heads, self.head_dim)
+        v = v.reshape(batch, patches, self.n_heads, self.head_dim)
+
+        q, k = apply_rotary_emb_vit(q, k, freqs_cis=freqs_cis)
+        out = memory_efficient_attention(q, k, v, attn_bias=mask)
+        out = out.reshape(batch, patches, self.n_heads * self.head_dim)
+        return self.wo(out)
+
+
+class TransformerBlock(nn.Module):
+
+    def __init__(self, args: VisionEncoderArgs):
+        super().__init__()
+        self.attention = Attention(args)
+        self.feed_forward = FeedForward(args)
+        self.attention_norm = RMSNorm(args.hidden_size, eps=1e-5)
+        self.ffn_norm = RMSNorm(args.hidden_size, eps=1e-5)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        mask: BlockDiagonalMask,
+        freqs_cis: torch.Tensor,
+    ) -> torch.Tensor:
+        r = self.attention.forward(self.attention_norm(x),
+                                   mask=mask,
+                                   freqs_cis=freqs_cis)
+        h = x + r
+        r = self.feed_forward.forward(self.ffn_norm(h))
+        out = h + r
+        return out
+
+
+class Transformer(nn.Module):
+
+    def __init__(self, args: VisionEncoderArgs):
+        super().__init__()
+        self.layers = torch.nn.ModuleList()
+        for _ in range(args.num_hidden_layers):
+            self.layers.append(TransformerBlock(args))
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        mask: BlockDiagonalMask,
+        freqs_cis: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        for layer in self.layers:
+            x = layer(x, mask=mask, freqs_cis=freqs_cis)
+        return x
+
+
+def position_meshgrid(patch_embeds_list: List[torch.Tensor], ) -> torch.Tensor:
+    positions = torch.cat([
+        torch.stack(
+            torch.meshgrid(
+                torch.arange(p.shape[-2]),
+                torch.arange(p.shape[-1]),
+                indexing="ij",
+            ),
+            dim=-1,
+        ).reshape(-1, 2) for p in patch_embeds_list
+    ])
+    return positions
+
+
+class VisionTransformer(nn.Module):
+
+    def __init__(self, args: VisionEncoderArgs):
+        super().__init__()
+        self.args = args
+        self.patch_conv = nn.Conv2d(
+            in_channels=args.num_channels,
+            out_channels=args.hidden_size,
+            kernel_size=args.patch_size,
+            stride=args.patch_size,
+            bias=False,
+        )
+        self.ln_pre = RMSNorm(args.hidden_size, eps=1e-5)
+        self.transformer = Transformer(args)
+
+        head_dim = self.args.hidden_size // self.args.num_attention_heads
+        assert head_dim % 2 == 0, "ROPE requires even head_dim"
+        self._freqs_cis: Optional[torch.Tensor] = None
+
+    @property
+    def max_patches_per_side(self) -> int:
+        return self.args.image_size // self.args.patch_size
+
+    @property
+    def device(self) -> torch.device:
+        return next(self.parameters()).device
+
+    @property
+    def dtype(self) -> torch.device:
+        return next(self.parameters()).dtype
+
+    @property
+    def freqs_cis(self) -> torch.Tensor:
+        if self._freqs_cis is None:
+            self._freqs_cis = precompute_freqs_cis_2d(
+                dim=self.args.hidden_size // self.args.num_attention_heads,
+                height=self.max_patches_per_side,
+                width=self.max_patches_per_side,
+                theta=self.args.rope_theta,
+            )
+
+        if self._freqs_cis.device != self.device:
+            self._freqs_cis = self._freqs_cis.to(device=self.device)
+
+        return self._freqs_cis
+
+    def forward(
+        self,
+        images: List[torch.Tensor],
+    ) -> torch.Tensor:
+        """
+        Args:
+            images: list of N_img images of variable sizes, 
+                each of shape (C, H, W)
+        Returns:
+            image_features: tensor of token features for 
+                all tokens of all images of shape (N_toks, D)
+        """
+        # pass images through initial convolution independently
+        patch_embeds_list = [
+            self.patch_conv(img.unsqueeze(0).to(self.dtype)) for img in images
+        ]
+
+        # flatten to a single sequence
+        patch_embeds = torch.cat(
+            [p.flatten(2).permute(0, 2, 1) for p in patch_embeds_list], dim=1)
+        patch_embeds = self.ln_pre(patch_embeds)
+
+        # positional embeddings
+        positions = position_meshgrid(patch_embeds_list).to(self.device)
+        freqs_cis = self.freqs_cis[positions[:, 0], positions[:, 1]]
+
+        # pass through Transformer with a block diagonal mask delimiting images
+        mask = BlockDiagonalMask.from_seqlens(
+            [p.shape[-2] * p.shape[-1] for p in patch_embeds_list], )
+        out = self.transformer(patch_embeds, mask=mask, freqs_cis=freqs_cis)
+
+        # remove batch dimension of the single sequence
+        return out.squeeze(0)
+
+
+class VisionLanguageAdapter(nn.Module):
+
+    def __init__(self, args: VisionEncoderArgs, dim: int):
+        super().__init__()
+        assert isinstance(args, VisionEncoderArgs)
+        self.w_in = nn.Linear(
+            args.hidden_size,
+            dim,
+            bias=True,
+        )
+        self.gelu = nn.GELU()
+        self.w_out = nn.Linear(dim, dim, bias=True)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.w_out(self.gelu(self.w_in(x)))
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index a726ec10984c0..18bc6b303f485 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -47,6 +47,7 @@
 from vllm.multimodal.utils import cached_get_tokenizer
 from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
                            SequenceData)
+from vllm.utils import is_list_of
 
 from .utils import flatten_bn, is_pp_missing_parameter, make_layers
 
@@ -684,9 +685,12 @@ def input_processor_for_qwen(ctx: InputContext,
             raise ValueError(
                 f"Expected img embeds to be have 3 dimensions, got {num_dims}")
         num_images = 1 if num_dims == 2 else image_data.shape[0]
-    else:
-        # TODO - handle multiple image inputs once the API is solidified
+    elif isinstance(image_data, Image.Image):
         num_images = 1
+    elif is_list_of(image_data, Image.Image):
+        num_images = len(image_data)
+    else:
+        raise TypeError(f"Invalid image type: {type(image_data)}")
 
     if prompt is None:
         prompt = tokenizer.decode(prompt_token_ids)
@@ -767,11 +771,11 @@ def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs:
                 f"[# images, {MAX_QWEN_IMG_TOKENS}, {img_emb_size}], but "
                 f"received shape [{data.shape}]")
         pixel_values = data
-
     else:
         transform = build_normalization_transform(image_size)
-        # TODO - handle multiple image inputs once the API is solidified
-        transformed_images = [transform(data)]
+        if not isinstance(data, (list, tuple)):
+            data = [data]
+        transformed_images = [transform(datum) for datum in data]
         pixel_values = torch.stack(transformed_images, dim=0)
     return MultiModalInputs({"pixel_values": pixel_values})
 
diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py
index 56129515ca8d1..d80064601d993 100644
--- a/vllm/model_executor/models/qwen2_moe.py
+++ b/vllm/model_executor/models/qwen2_moe.py
@@ -469,7 +469,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                     continue
                 name = name.replace(weight_name, param_name)
                 # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
+                if ((name.endswith(".bias") or name.endswith("_bias"))
+                        and name not in params_dict):
                     continue
                 # Skip layers on other devices.
                 if is_pp_missing_parameter(name, self):
@@ -490,6 +491,10 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                     # Skip layers on other devices.
                     if is_pp_missing_parameter(name, self):
                         continue
+                    # Skip loading extra bias for GPTQ models.
+                    if ((name.endswith(".bias") or name.endswith("_bias"))
+                            and name not in params_dict):
+                        continue
                     param = params_dict[name]
                     weight_loader = param.weight_loader
                     weight_loader(param,
@@ -500,7 +505,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                     break
                 else:
                     # Skip loading extra bias for GPTQ models.
-                    if name.endswith(".bias") and name not in params_dict:
+                    if ((name.endswith(".bias") or name.endswith("_bias"))
+                            and name not in params_dict):
                         continue
                     # Skip layers on other devices.
                     if is_pp_missing_parameter(name, self):
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
new file mode 100644
index 0000000000000..179399a12a3d5
--- /dev/null
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -0,0 +1,1094 @@
+# coding=utf-8
+# Adapted from
+# https://github.com/huggingface/transformers/blob/19e6e80e10118f855137b90740936c0b11ac397f/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
+# Copyright 2024 The Qwen team.
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only Qwen2-VL model compatible with HuggingFace weights."""
+from array import array
+from functools import lru_cache, partial
+from typing import (Iterable, List, Mapping, Optional, Tuple, Type, TypedDict,
+                    Union)
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange, repeat
+from PIL import Image
+from transformers import Qwen2VLConfig
+from transformers.image_utils import (get_image_size,
+                                      infer_channel_dimension_format,
+                                      to_numpy_array)
+from transformers.models.qwen2_vl.configuration_qwen2_vl import (
+    Qwen2VLVisionConfig)
+from transformers.models.qwen2_vl.image_processing_qwen2_vl import (
+    make_batched_images, make_batched_videos, smart_resize)
+
+import vllm.envs as envs
+from vllm.attention import AttentionMetadata
+from vllm.attention.selector import (_Backend, backend_name_to_enum,
+                                     get_global_forced_attn_backend)
+from vllm.config import CacheConfig, MultiModalConfig
+from vllm.distributed import parallel_state
+from vllm.distributed import utils as dist_utils
+from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs
+from vllm.logger import init_logger
+from vllm.model_executor import SamplingMetadata
+from vllm.model_executor.layers.activation import QuickGELU
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               RowParallelLinear)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
+from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.interfaces import SupportsMultiModal
+from vllm.model_executor.models.qwen2 import Qwen2Model
+from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict,
+                             MultiModalInputs)
+from vllm.multimodal.base import MultiModalData
+from vllm.multimodal.image import cached_get_image_processor
+from vllm.platforms import current_platform
+from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
+                           SequenceData)
+from vllm.transformers_utils.processor import get_processor
+
+logger = init_logger(__name__)
+
+# === Vision Inputs === #
+
+
+class Qwen2VLImageInputs(TypedDict):
+    pixel_values: torch.Tensor
+    """Shape: 
+    `(num_patches, num_channels * patch_size * patch_size)`
+    """
+
+    image_grid_thw: torch.Tensor
+    """Shape: `(num_images, 3)`
+    
+    This should be in `(grid_t, grid_h, grid_w)` format.
+    """
+
+
+class Qwen2VLVideoInputs(TypedDict):
+    pixel_values_videos: torch.Tensor
+    """Shape: 
+    `(num_patches, 
+      num_channels * temporal_patch_size * patch_size * patch_size)`
+    """
+
+    video_grid_thw: torch.Tensor
+    """Shape: `(num_videos, 3)`
+    
+    This should be in `(grid_t, grid_h, grid_w)` format.
+    """
+
+
+# === Vision Encoder === #
+
+
+class Qwen2VisionMLP(nn.Module):
+
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: int = None,
+        act_layer: Type[nn.Module] = QuickGELU,
+        quant_config: Optional[QuantizationConfig] = None,
+    ):
+        super().__init__()
+        self.fc1 = ColumnParallelLinear(in_features,
+                                        hidden_features,
+                                        quant_config=quant_config)
+        self.act = act_layer()
+        self.fc2 = RowParallelLinear(hidden_features,
+                                     in_features,
+                                     quant_config=quant_config)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x_parallel, _ = self.fc1(x)
+        x_parallel = self.act(x_parallel)
+        x, _ = self.fc2(x_parallel)
+        return x
+
+
+def rotate_half(x: torch.Tensor, interleaved: bool = False) -> torch.Tensor:
+    if not interleaved:
+        x1, x2 = x.chunk(2, dim=-1)
+        return torch.cat((-x2, x1), dim=-1)
+    else:
+        x1, x2 = x[..., ::2], x[..., 1::2]
+        return rearrange(torch.stack((-x2, x1), dim=-1),
+                         "... d two -> ... (d two)",
+                         two=2)
+
+
+def apply_rotary_emb_torch(x: torch.Tensor,
+                           cos: torch.Tensor,
+                           sin: torch.Tensor,
+                           interleaved: bool = False) -> torch.Tensor:
+    """
+    x: (batch_size, seqlen, nheads, headdim)
+    cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2)
+    """
+    ro_dim = cos.shape[-1] * 2
+    assert ro_dim <= x.shape[-1]
+    cos = repeat(
+        cos,
+        "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)")
+    sin = repeat(
+        sin,
+        "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)")
+    return torch.cat(
+        [
+            x[..., :ro_dim] * cos +
+            rotate_half(x[..., :ro_dim], interleaved) * sin, x[..., ro_dim:]
+        ],
+        dim=-1,
+    )
+
+
+def apply_rotary_pos_emb_vision(t: torch.Tensor,
+                                freqs: torch.Tensor) -> torch.Tensor:
+    t_ = t.float()
+    cos = freqs.cos()
+    sin = freqs.sin()
+    output = apply_rotary_emb_torch(t_, cos, sin).type_as(t)
+    return output
+
+
+class Qwen2VisionAttention(nn.Module):
+
+    def __init__(
+        self,
+        embed_dim: Optional[int] = None,
+        num_heads: Optional[int] = None,
+        projection_size: Optional[int] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        # Per attention head and per partition values.
+        world_size = parallel_state.get_tensor_model_parallel_world_size()
+        self.hidden_size_per_attention_head = dist_utils.divide(
+            projection_size, num_heads)
+        self.num_attention_heads_per_partition = dist_utils.divide(
+            num_heads, world_size)
+
+        self.qkv = ColumnParallelLinear(input_size=embed_dim,
+                                        output_size=3 * projection_size,
+                                        quant_config=quant_config)
+        self.proj = RowParallelLinear(input_size=projection_size,
+                                      output_size=embed_dim,
+                                      quant_config=quant_config)
+
+        # Detect attention implementation.
+        selected_backend: Optional[_Backend] = get_global_forced_attn_backend()
+        if selected_backend is None:
+            backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND
+            if backend_by_env_var is not None:
+                selected_backend = backend_name_to_enum(backend_by_env_var)
+        if selected_backend is None:
+            # For Volta and Turing GPUs, use xformers instead.
+            device_available = current_platform.get_device_capability()[0] >= 8
+            if device_available:
+                from transformers.utils import is_flash_attn_2_available
+
+                if is_flash_attn_2_available():
+                    self._use_flash_attn = True
+                else:
+                    logger.warning(
+                        "Current Qwen2-VL implementation has a bug with "
+                        "`vllm-flash-attn` inside vision module, so we use "
+                        "xformers backend instead. You can run `pip install "
+                        "flash-attn to use flash-attention backend.")
+                    self._use_flash_attn = False
+            else:
+                self._use_flash_attn = False
+        else:
+            if selected_backend == _Backend.FLASH_ATTN:
+                self._use_flash_attn = True
+            elif selected_backend == _Backend.XFORMERS:
+                self._use_flash_attn = False
+            else:
+                raise RuntimeError(
+                    f"Qwen2-VL does not support {selected_backend} backend now."
+                )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: torch.Tensor = None,
+    ) -> torch.Tensor:
+        # [s, b, c] --> [s, b, head * 3 * head_dim]
+        x, _ = self.qkv(x)
+
+        # [s, b, head * 3 * head_dim] --> [s, b, head, 3 * head_dim]
+        new_x_shape = x.size()[:-1] + (
+            self.num_attention_heads_per_partition,
+            3 * self.hidden_size_per_attention_head,
+        )
+        x = x.view(*new_x_shape)
+
+        # [s, b, head, 3 * head_dim] --> 3 [s, b, head, head_dim]
+        q, k, v = dist_utils.split_tensor_along_last_dim(x, 3)
+        batch_size = q.shape[1]
+
+        q, k, v = [
+            rearrange(x, "s b ... -> b s ...").contiguous() for x in (q, k, v)
+        ]
+        if rotary_pos_emb is not None:
+            q = apply_rotary_pos_emb_vision(q, rotary_pos_emb)
+            k = apply_rotary_pos_emb_vision(k, rotary_pos_emb)
+
+        if self._use_flash_attn:
+            # from vllm_flash_attn.flash_attn_interface import (
+            #   flash_attn_varlen_func)
+            from flash_attn import flash_attn_varlen_func
+
+            q, k, v = [rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]]
+
+            max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
+            output = flash_attn_varlen_func(q,
+                                            k,
+                                            v,
+                                            cu_seqlens_q=cu_seqlens,
+                                            cu_seqlens_k=cu_seqlens,
+                                            max_seqlen_q=max_seqlen,
+                                            max_seqlen_k=max_seqlen,
+                                            dropout_p=0,
+                                            causal=False)
+
+            context_layer = rearrange(output,
+                                      "(b s) ... -> b s ...",
+                                      b=batch_size)
+        else:
+            from xformers import ops as xops
+            from xformers.ops.fmha.attn_bias import BlockDiagonalMask
+
+            seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
+            attn_bias = BlockDiagonalMask.from_seqlens(q_seqlen=seqlens,
+                                                       kv_seqlen=None)
+
+            context_layer = xops.memory_efficient_attention_forward(
+                q, k, v, attn_bias=attn_bias, p=0, scale=None)
+        context_layer = rearrange(context_layer,
+                                  "b s h d -> s b (h d)").contiguous()
+
+        output, _ = self.proj(context_layer)
+        return output
+
+
+class Qwen2VisionBlock(nn.Module):
+
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        mlp_ratio: float,
+        act_layer: Type[nn.Module] = QuickGELU,
+        norm_layer: Type[nn.Module] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        if norm_layer is None:
+            norm_layer = partial(nn.LayerNorm, eps=1e-6)
+        self.norm1 = norm_layer(dim)
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+
+        self.attn = Qwen2VisionAttention(embed_dim=dim,
+                                         num_heads=num_heads,
+                                         projection_size=dim,
+                                         quant_config=quant_config)
+        self.mlp = Qwen2VisionMLP(dim,
+                                  mlp_hidden_dim,
+                                  act_layer=act_layer,
+                                  quant_config=quant_config)
+
+    def forward(self, x: torch.Tensor, cu_seqlens: torch.Tensor,
+                rotary_pos_emb: torch.Tensor) -> torch.Tensor:
+        x = x + self.attn(self.norm1(x),
+                          cu_seqlens=cu_seqlens,
+                          rotary_pos_emb=rotary_pos_emb)
+        x = x + self.mlp(self.norm2(x))
+        return x
+
+
+class Qwen2VisionPatchEmbed(nn.Module):
+
+    def __init__(
+        self,
+        patch_size: int = 14,
+        temporal_patch_size: int = 2,
+        in_chans: int = 3,
+        embed_dim: int = 1152,
+    ) -> None:
+        super().__init__()
+        self.patch_size = patch_size
+        self.temporal_patch_size = temporal_patch_size
+        self.embed_dim = embed_dim
+
+        kernel_size = [temporal_patch_size, patch_size, patch_size]
+        self.proj = nn.Conv3d(in_chans,
+                              embed_dim,
+                              kernel_size=kernel_size,
+                              stride=kernel_size,
+                              bias=False)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        L, C = x.shape
+        x = x.view(L, -1, self.temporal_patch_size, self.patch_size,
+                   self.patch_size)
+        x = self.proj(x).view(L, self.embed_dim)
+        return x
+
+
+class Qwen2VisionPatchMerger(nn.Module):
+
+    def __init__(
+        self,
+        d_model: int,
+        context_dim: int,
+        norm_layer: Type[nn.Module] = None,
+        spatial_merge_size: int = 2,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = context_dim * (spatial_merge_size**2)
+        if norm_layer is None:
+            norm_layer = partial(nn.LayerNorm, eps=1e-6)
+        self.ln_q = norm_layer(context_dim)
+        self.mlp = nn.ModuleList([
+            ColumnParallelLinear(self.hidden_size,
+                                 self.hidden_size,
+                                 bias=True,
+                                 quant_config=quant_config),
+            nn.GELU(),
+            RowParallelLinear(self.hidden_size,
+                              d_model,
+                              bias=True,
+                              quant_config=quant_config),
+        ])
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.ln_q(x)
+        x = x.view(-1, self.hidden_size)
+
+        mlp_fc1, mlp_act, mlp_fc2 = self.mlp
+        x_parallel, _ = mlp_fc1(x)
+        x_parallel = mlp_act(x_parallel)
+        out, _ = mlp_fc2(x_parallel)
+        return out
+
+
+class Qwen2VisionRotaryEmbedding(nn.Module):
+
+    def __init__(self, dim: int, theta: float = 10000.0) -> None:
+        super().__init__()
+        self.dim = dim
+        self.theta = theta
+        inv_freq = 1.0 / (theta
+                          **(torch.arange(0, dim, 2, dtype=torch.float) / dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self._seq_len_cached = 0
+        self._freqs_cached = None
+
+    def update_freqs_cache(self, seqlen: int) -> None:
+        if seqlen > self._seq_len_cached:
+            seqlen *= 2
+            self._seq_len_cached = seqlen
+            self.inv_freq = 1.0 / (self.theta**(torch.arange(
+                0, self.dim, 2, dtype=torch.float, device=self.inv_freq.device)
+                                                / self.dim))
+            seq = torch.arange(seqlen,
+                               device=self.inv_freq.device,
+                               dtype=self.inv_freq.dtype)
+            freqs = torch.outer(seq, self.inv_freq)
+            self._freqs_cached = freqs
+
+    def forward(self, seqlen: int) -> torch.Tensor:
+        self.update_freqs_cache(seqlen)
+        return self._freqs_cached[:seqlen]
+
+
+class Qwen2VisionTransformer(nn.Module):
+
+    def __init__(
+        self,
+        vision_config: Qwen2VLVisionConfig,
+        norm_eps: float = 1e-6,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+
+        patch_size: int = vision_config.patch_size
+        temporal_patch_size: int = vision_config.temporal_patch_size
+        spatial_merge_size: int = vision_config.spatial_merge_size
+        in_chans: int = vision_config.in_chans
+        hidden_size: int = vision_config.hidden_size
+        embed_dim: int = vision_config.embed_dim
+        depth: int = vision_config.depth
+        num_heads: int = vision_config.num_heads
+        mlp_ratio: float = vision_config.mlp_ratio
+
+        self.spatial_merge_size = spatial_merge_size
+
+        self.patch_embed = Qwen2VisionPatchEmbed(
+            patch_size=patch_size,
+            temporal_patch_size=temporal_patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+        )
+
+        norm_layer = partial(nn.LayerNorm, eps=norm_eps)
+        head_dim = embed_dim // num_heads
+        self.rotary_pos_emb = Qwen2VisionRotaryEmbedding(head_dim // 2)
+
+        self.blocks = nn.ModuleList([
+            Qwen2VisionBlock(
+                dim=embed_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                norm_layer=norm_layer,
+                quant_config=quant_config,
+            ) for _ in range(depth)
+        ])
+        self.merger = Qwen2VisionPatchMerger(
+            d_model=hidden_size,
+            context_dim=embed_dim,
+            norm_layer=norm_layer,
+            quant_config=quant_config,
+        )
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return self.blocks[0].mlp.fc2.weight.dtype
+
+    @property
+    def device(self) -> torch.device:
+        return self.blocks[0].mlp.fc2.weight.device
+
+    def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor:
+        pos_ids = []
+        for t, h, w in grid_thw:
+            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
+            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
+            hpos_ids = hpos_ids.reshape(
+                h // self.spatial_merge_size,
+                self.spatial_merge_size,
+                w // self.spatial_merge_size,
+                self.spatial_merge_size,
+            ).permute(0, 2, 1, 3).flatten()
+            wpos_ids = wpos_ids.reshape(
+                h // self.spatial_merge_size,
+                self.spatial_merge_size,
+                w // self.spatial_merge_size,
+                self.spatial_merge_size,
+            ).permute(0, 2, 1, 3).flatten()
+            pos_ids.append(
+                torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
+        pos_ids = torch.cat(pos_ids, dim=0)
+        max_grid_size = grid_thw[:, 1:].max()
+        rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
+        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
+        return rotary_pos_emb
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        grid_thw: torch.Tensor,
+    ) -> torch.Tensor:
+        # patchify
+        x = x.to(device=self.device, dtype=self.dtype)
+        x = self.patch_embed(x)
+
+        # compute position embedding
+        rotary_pos_emb = self.rot_pos_emb(grid_thw)
+
+        # compute cu_seqlens
+        cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2],
+                                             grid_thw[:, 0]).cumsum(
+                                                 dim=0, dtype=torch.int32)
+        cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0)
+
+        # transformers
+        x = x.unsqueeze(1)
+        for blk in self.blocks:
+            x = blk(x, cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb)
+
+        # adapter
+        x = self.merger(x)
+        return x
+
+
+# === Vision input helpers === #
+
+cached_get_processor = lru_cache(get_processor)
+
+
+def mm_input_mapper_for_qwen2_vl(
+    ctx: InputContext,
+    data: MultiModalData[object],
+    data_type_key: str,
+) -> MultiModalInputs:
+    """Input mapper for Qwen2-VL."""
+    model_config = ctx.model_config
+    image_processor = cached_get_image_processor(
+        model_config.model, trust_remote_code=model_config.trust_remote_code)
+    if image_processor is None:
+        raise RuntimeError("No HuggingFace processor is available "
+                           "to process the image object")
+
+    images = None
+    videos = None
+    if data_type_key == "image":
+        images = data
+    else:
+        assert data_type_key == "video"
+        videos = data
+
+    try:
+        batch_data = image_processor \
+            .preprocess(images=images, videos=videos, return_tensors="pt") \
+            .data
+    except Exception:
+        logger.error("Failed to process image (%s)", data)
+        raise
+
+    return MultiModalInputs(batch_data)
+
+
+image_input_mapper_for_qwen2_vl = partial(mm_input_mapper_for_qwen2_vl,
+                                          data_type_key="image")
+video_input_mapper_for_qwen2_vl = partial(mm_input_mapper_for_qwen2_vl,
+                                          data_type_key="video")
+
+
+def _get_vision_info(
+    image_processor,
+    height: int,
+    width: int,
+    min_pixels: int,
+    max_pixels: int,
+    do_resize: bool = True,
+    data_type_key: str = "image",
+    mm_count: int = 1,
+):
+    """Get information (resized height / width and number of vision tokens)
+    of input image / video frame."""
+
+    if do_resize:
+        resized_height, resized_width = smart_resize(
+            height=height,
+            width=width,
+            factor=image_processor.patch_size * image_processor.merge_size,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
+        )
+    else:
+        resized_height, resized_width = height, width
+
+    if data_type_key == "image":
+        grid_t = mm_count
+    else:
+        assert data_type_key == "video"
+        grid_t = max(mm_count // image_processor.temporal_patch_size, 1)
+
+    grid_h = resized_height // image_processor.patch_size
+    grid_w = resized_width // image_processor.patch_size
+    vision_tokens = grid_t * grid_h * grid_w
+    llm_num_vision_tokens = (vision_tokens // image_processor.merge_size //
+                             image_processor.merge_size)
+
+    return resized_height, resized_width, llm_num_vision_tokens
+
+
+def _get_max_image_info(
+    image_processor,
+    data_type_key: str = "image",
+    mm_count: int = 1,
+):
+    return _get_vision_info(
+        image_processor,
+        height=9999999,
+        width=9999999,
+
+        # Limit min / max pixels.
+        min_pixels=max(image_processor.min_pixels, 28 * 28),
+        max_pixels=min(image_processor.max_pixels, 1280 * 28 * 28),
+        data_type_key=data_type_key,
+        mm_count=mm_count,
+    )
+
+
+def get_max_qwen2_vl_mm_tokens(ctx: InputContext, data_type_key: str) -> int:
+    image_processor = cached_get_image_processor(ctx.model_config.model)
+    max_resized_height, max_resized_width, max_llm_image_tokens = \
+        _get_max_image_info(image_processor, data_type_key=data_type_key,
+                            mm_count=1)
+    return max_llm_image_tokens
+
+
+get_max_qwen2_vl_image_tokens = partial(get_max_qwen2_vl_mm_tokens,
+                                        data_type_key="image")
+get_max_qwen2_vl_video_tokens = partial(get_max_qwen2_vl_mm_tokens,
+                                        data_type_key="video")
+
+
+def dummy_data_for_qwen2_vl(
+    ctx: InputContext, seq_len: int, mm_counts: Mapping[str, int]
+) -> Tuple[SequenceData, Optional[MultiModalDataDict]]:
+    image_processor = cached_get_image_processor(ctx.model_config.model)
+
+    num_images = mm_counts["image"]
+    max_resized_height, max_resized_width, max_llm_image_tokens = \
+        _get_max_image_info(image_processor, data_type_key="image",
+                            mm_count=num_images)
+    if seq_len - max_llm_image_tokens - 2 < 0:
+        raise RuntimeError(
+            f"Qwen2-VL cannot process {num_images} images in a prompt, "
+            "please increase max_model_len or reduce image limit by "
+            "--limit-mm-per-prompt.")
+
+    # Check video counts.
+    num_videos = mm_counts["video"]
+    max_resized_height, max_resized_width, max_llm_video_tokens = \
+        _get_max_image_info(image_processor, data_type_key="video",
+                            mm_count=num_videos)
+    if seq_len - max_llm_video_tokens - 2 < 0:
+        raise RuntimeError(
+            f"Qwen2-VL cannot process {num_images} videos in a prompt, "
+            "please increase max_model_len or reduce video limit by "
+            "--limit-mm-per-prompt.")
+
+    hf_config = ctx.get_hf_config(Qwen2VLConfig)
+    token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE,
+                      [hf_config.vision_start_token_id])
+    token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE,
+                       [hf_config.image_token_id]) * max_llm_image_tokens
+    token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE,
+                       [hf_config.vision_end_token_id])
+    token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE,
+                       [0]) * (seq_len - max_llm_image_tokens - 2)
+    dummy_seqdata = SequenceData(token_ids)
+    dummy_image = Image.new("RGB", (max_resized_width, max_resized_height),
+                            color=0)
+
+    return dummy_seqdata, {
+        "image": dummy_image if num_images == 1 else [dummy_image] * num_images
+    }
+
+
+def _get_llm_num_vision_tokens(
+    mm_inputs: list,
+    data_type_key: str,
+    image_processor,
+):
+    """Get number of vision tokens of multimodal inputs.
+
+    This method is derived from `transformers.models.qwen2_vl.
+    image_processing_qwen2_vl.Qwen2VLImageProcessor._preprocess`.
+    """
+    image = to_numpy_array(mm_inputs[0])
+    input_data_format = infer_channel_dimension_format(image)
+    height, width = get_image_size(image, channel_dim=input_data_format)
+    _, _, llm_num_vision_tokens = _get_vision_info(
+        image_processor,
+        height=height,
+        width=width,
+        min_pixels=image_processor.min_pixels,
+        max_pixels=image_processor.max_pixels,
+        do_resize=image_processor.do_resize,
+        data_type_key=data_type_key,
+        mm_count=len(mm_inputs),
+    )
+    return llm_num_vision_tokens
+
+
+def input_processor_for_qwen2_vl(ctx: InputContext,
+                                 llm_inputs: LLMInputs) -> LLMInputs:
+    multi_modal_data = llm_inputs.get("multi_modal_data", None)
+    if multi_modal_data is None:
+        return llm_inputs
+
+    image_inputs = multi_modal_data.get("image", None)
+    video_inputs = multi_modal_data.get("video", None)
+
+    processor = cached_get_processor(ctx.model_config.model)
+    image_processor = processor.image_processor
+    hf_config = ctx.get_hf_config(Qwen2VLConfig)
+
+    # To avoid redundant processing of vision objects (resize, rescale, etc.),
+    # we extract code of calculating number of vision tokens from
+    # `transformers.models.qwen2_vl.processing_qwen2_vl.Qwen2VLProcessor`.
+    #
+    # The following code is equivalent to:
+    #    prompt = llm_inputs["prompt"]
+    #    inputs = processor(text=[prompt],
+    #                       images=image_inputs,
+    #                       videos=video_inputs,
+    #                       padding=True,
+    #                       return_tensors="pt")
+    #    prompt_token_ids = inputs["input_ids"][0].tolist()
+
+    prompt_token_ids = llm_inputs.get("prompt_token_ids", None)
+    if prompt_token_ids is None:
+        prompt = llm_inputs["prompt"]
+        prompt_token_ids = processor.tokenizer(
+            prompt,
+            padding=True,
+            return_tensors=None,
+        )["input_ids"]
+
+    # Expand image pad tokens.
+    if image_inputs is not None:
+        image_indices = [
+            idx for idx, token in enumerate(prompt_token_ids)
+            if token == hf_config.image_token_id
+        ]
+        image_inputs = make_batched_images(image_inputs)
+        assert len(image_indices) == len(image_inputs)
+
+        prompt_token_ids_with_image = []
+        for image_cnt, image in enumerate(image_inputs):
+            num_image_tokens = _get_llm_num_vision_tokens(
+                [image],
+                data_type_key="image",
+                image_processor=image_processor,
+            )
+            if image_cnt == 0:
+                non_image_tokens = prompt_token_ids[:image_indices[image_cnt]]
+            else:
+                non_image_tokens = prompt_token_ids[image_indices[image_cnt -
+                                                                  1] +
+                                                    1:image_indices[image_cnt]]
+            prompt_token_ids_with_image.extend(non_image_tokens)
+            prompt_token_ids_with_image.extend(
+                hf_config.image_token_id for _ in range(num_image_tokens))
+        prompt_token_ids_with_image.extend(prompt_token_ids[image_indices[-1] +
+                                                            1:])
+        prompt_token_ids = prompt_token_ids_with_image
+
+    # Expand video pad tokens.
+    if video_inputs is not None:
+        video_indices = [
+            idx for idx, token in enumerate(prompt_token_ids)
+            if token == hf_config.video_token_id
+        ]
+        video_inputs = make_batched_videos(video_inputs)
+        assert len(video_indices) == len(video_inputs)
+
+        prompt_token_ids_with_video = []
+        for video_cnt, video in enumerate(video_inputs):
+            num_video_tokens = _get_llm_num_vision_tokens(
+                video,
+                data_type_key="video",
+                image_processor=image_processor,
+            )
+            if video_cnt == 0:
+                non_video_tokens = prompt_token_ids[:video_indices[video_cnt]]
+            else:
+                non_video_tokens = prompt_token_ids[video_indices[video_cnt -
+                                                                  1] +
+                                                    1:video_indices[video_cnt]]
+            prompt_token_ids_with_video.extend(non_video_tokens)
+            prompt_token_ids_with_video.extend(
+                hf_config.video_token_id for _ in range(num_video_tokens))
+        prompt_token_ids_with_video.extend(prompt_token_ids[video_indices[-1] +
+                                                            1:])
+        prompt_token_ids = prompt_token_ids_with_video
+
+    return LLMInputs(
+        prompt_token_ids=prompt_token_ids,
+        prompt=llm_inputs["prompt"],
+        multi_modal_data=multi_modal_data,
+    )
+
+
+@MULTIMODAL_REGISTRY.register_image_input_mapper(
+    image_input_mapper_for_qwen2_vl)
+@MULTIMODAL_REGISTRY.register_input_mapper("video",
+                                           video_input_mapper_for_qwen2_vl)
+@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_qwen2_vl_image_tokens)
+@MULTIMODAL_REGISTRY.register_max_multimodal_tokens(
+    "video", get_max_qwen2_vl_video_tokens)
+@INPUT_REGISTRY.register_dummy_data(dummy_data_for_qwen2_vl)
+@INPUT_REGISTRY.register_input_processor(input_processor_for_qwen2_vl)
+class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal):
+
+    def __init__(self,
+                 config: Qwen2VLConfig,
+                 multimodal_config: MultiModalConfig,
+                 cache_config: Optional[CacheConfig] = None,
+                 quant_config: Optional[QuantizationConfig] = None) -> None:
+        super().__init__()
+
+        assert not cache_config.enable_prefix_caching, \
+            "Qwen2-VL currently does not support prefix caching"
+
+        self.config = config
+        self.multimodal_config = multimodal_config
+
+        self.visual = Qwen2VisionTransformer(
+            config.vision_config,
+            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+
+            # NOTE: Qwen2-VL vision encoder does not support any
+            # quantization method now.
+            quant_config=None,
+        )
+
+        self.model = Qwen2Model(config, cache_config, quant_config)
+
+        if config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(config.vocab_size,
+                                          config.hidden_size,
+                                          quant_config=quant_config)
+
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+        self.sampler = Sampler()
+
+    def _validate_and_reshape_mm_tensor(self,
+                                        mm_input: Union[torch.Tensor,
+                                                        List[torch.Tensor]],
+                                        name: str) -> torch.Tensor:
+        if not isinstance(mm_input, (torch.Tensor, list)):
+            raise ValueError(f"Incorrect type of {name}. "
+                             f"Got type: {type(mm_input)}")
+        if isinstance(mm_input, torch.Tensor):
+            if mm_input.ndim == 2:
+                return mm_input
+            if mm_input.ndim != 3:
+                raise ValueError(f"{name} should be 2D or batched 3D tensor. "
+                                 f"Got ndim: {mm_input.ndim}")
+            return torch.concat(list(mm_input))
+        else:
+            return torch.concat(mm_input)
+
+    def _parse_and_validate_image_input(
+            self, **kwargs: object) -> Optional[Qwen2VLImageInputs]:
+        pixel_values = kwargs.pop("pixel_values", None)
+        image_grid_thw = kwargs.pop("image_grid_thw", None)
+
+        if pixel_values is None:
+            return None
+
+        pixel_values = self._validate_and_reshape_mm_tensor(
+            pixel_values, "image pixel values")
+        image_grid_thw = self._validate_and_reshape_mm_tensor(
+            image_grid_thw, "image grid_thw")
+
+        if not isinstance(pixel_values, (torch.Tensor, list)):
+            raise ValueError("Incorrect type of image pixel values. "
+                             f"Got type: {type(pixel_values)}")
+
+        return Qwen2VLImageInputs(pixel_values=pixel_values,
+                                  image_grid_thw=image_grid_thw)
+
+    def _parse_and_validate_video_input(
+            self, **kwargs: object) -> Optional[Qwen2VLVideoInputs]:
+        pixel_values_videos = kwargs.pop("pixel_values_videos", None)
+        video_grid_thw = kwargs.pop("video_grid_thw", None)
+
+        if pixel_values_videos is None:
+            return None
+
+        pixel_values_videos = self._validate_and_reshape_mm_tensor(
+            pixel_values_videos, "video pixel values")
+        video_grid_thw = self._validate_and_reshape_mm_tensor(
+            video_grid_thw, "video grid_thw")
+
+        return Qwen2VLVideoInputs(
+            pixel_values_videos=pixel_values_videos,
+            video_grid_thw=video_grid_thw,
+        )
+
+    def _process_image_input(self,
+                             image_input: Qwen2VLImageInputs) -> torch.Tensor:
+        pixel_values = image_input["pixel_values"].type(self.visual.dtype)
+        image_embeds = self.visual(pixel_values,
+                                   grid_thw=image_input["image_grid_thw"])
+        return image_embeds
+
+    def _process_video_input(self,
+                             video_input: Qwen2VLVideoInputs) -> torch.Tensor:
+        pixel_values_videos = video_input["pixel_values_videos"].type(
+            self.visual.dtype)
+        video_embeds = self.visual(pixel_values_videos,
+                                   grid_thw=video_input["video_grid_thw"])
+        return video_embeds
+
+    def _merge_multimodal_embeddings(
+        self,
+        input_ids: torch.Tensor,
+        inputs_embeds: torch.Tensor,
+        multimodal_embeddings: torch.Tensor,
+        placeholder_token_id: int,
+    ) -> torch.Tensor:
+        mask = (input_ids == placeholder_token_id)
+        inputs_embeds[mask, :] = multimodal_embeddings
+        return inputs_embeds
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        **kwargs: object,
+    ) -> SamplerOutput:
+        """Run forward pass for Qwen2-VL.
+
+        Args:
+            input_ids: Flattened (concatenated) input_ids corresponding to a
+                batch.
+            positions: Flattened (concatenated) position ids corresponding to a
+                batch.
+                **NOTE**: If mrope is enabled (default setting for Qwen2-VL
+                opensource models), the shape will be `(3, seq_len)`,
+                otherwise it will be `(seq_len,).
+            pixel_values: Pixel values to be fed to a model.
+                `None` if no images are passed.
+            image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM.
+                `None` if no images are passed.
+            pixel_values_videos: Pixel values of videos to be fed to a model.
+                `None` if no videos are passed.
+            video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM.
+                `None` if no videos are passed.
+        """
+
+        image_input = self._parse_and_validate_image_input(**kwargs)
+        video_input = self._parse_and_validate_video_input(**kwargs)
+
+        if image_input is None and video_input is None:
+            inputs_embeds = None
+        else:
+            if getattr(self.config, "rope_scaling", {}).get("type",
+                                                            None) == "mrope":
+                assert positions.ndim == 2 and positions.size(0) == 3, (
+                    "multimodal section rotary embedding requires "
+                    f"(3, seq_len) positions, but got {positions.size()}")
+
+            inputs_embeds = self.model.embed_tokens(input_ids)
+
+            if image_input is not None:
+                image_embeds = self._process_image_input(image_input)
+                inputs_embeds = self._merge_multimodal_embeddings(
+                    input_ids,
+                    inputs_embeds,
+                    image_embeds,
+                    placeholder_token_id=self.config.image_token_id,
+                )
+
+            if video_input is not None:
+                video_embeds = self._process_video_input(video_input)
+                inputs_embeds = self._merge_multimodal_embeddings(
+                    input_ids,
+                    inputs_embeds,
+                    video_embeds,
+                    placeholder_token_id=self.config.video_token_id,
+                )
+
+            input_ids = None
+
+        hidden_states = self.model(
+            input_ids=input_ids,
+            positions=positions,
+            kv_caches=kv_caches,
+            attn_metadata=attn_metadata,
+            inputs_embeds=inputs_embeds,
+        )
+        return hidden_states
+
+    def compute_logits(self, hidden_states: torch.Tensor,
+                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        logits = self.logits_processor(self.lm_head, hidden_states,
+                                       sampling_metadata)
+        return logits
+
+    def sample(
+        self,
+        logits: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(logits, sampling_metadata)
+        return next_tokens
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "up_proj", 1),
+            ("gate_up_proj", "gate_proj", 0),
+        ]
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if "visual" in name and "qkv.weight" in name:
+                    visual_num_heads = self.config.vision_config.num_heads
+                    visual_embed_dim = self.config.vision_config.embed_dim
+                    head_size = visual_embed_dim // visual_num_heads
+                    loaded_weight = loaded_weight.view(3, visual_num_heads,
+                                                       head_size,
+                                                       visual_embed_dim)
+                    loaded_weight = loaded_weight.transpose(0, 1)
+                    loaded_weight = loaded_weight.reshape(-1, visual_embed_dim)
+                elif "visual" in name and "qkv.bias" in name:
+                    visual_num_heads = self.config.vision_config.num_heads
+                    visual_embed_dim = self.config.vision_config.embed_dim
+                    head_size = visual_embed_dim // visual_num_heads
+                    loaded_weight = loaded_weight.view(3, visual_num_heads,
+                                                       head_size)
+                    loaded_weight = loaded_weight.transpose(0, 1)
+                    loaded_weight = loaded_weight.reshape(-1)
+                try:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                except KeyError:
+                    print(params_dict.keys())
+                    raise
+
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(param, loaded_weight)
diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py
index 13d09e4cd4c23..f7976eba7420b 100644
--- a/vllm/model_executor/models/siglip.py
+++ b/vllm/model_executor/models/siglip.py
@@ -443,27 +443,26 @@ def __init__(
         self.config = config
         embed_dim = config.hidden_size
 
-        if (num_hidden_layers_override is None
-                or num_hidden_layers_override == config.num_hidden_layers):
-            self.need_post_layernorm = True
-        elif num_hidden_layers_override > config.num_hidden_layers:
-            raise ValueError(
-                "num_hidden_layers_override cannot be greater than "
-                "num_hidden_layers")
-        else:
-            self.need_post_layernorm = False
-
         self.embeddings = SiglipVisionEmbeddings(config)
         self.encoder = SiglipEncoder(
             config,
             quant_config=quant_config,
             num_hidden_layers_override=num_hidden_layers_override,
         )
-        if self.need_post_layernorm:
+
+        if len(self.encoder.layers) > config.num_hidden_layers:
+            raise ValueError(
+                f"The original encoder only has {config.num_hidden_layers} "
+                f"layers, but you requested {len(self.encoder.layers)} layers."
+            )
+        elif len(self.encoder.layers) == config.num_hidden_layers:
             self.post_layernorm = nn.LayerNorm(embed_dim,
                                                eps=config.layer_norm_eps)
         else:
-            self.post_layernorm = nn.Identity()
+            # post_layernorm is unused when we extract intermediate features
+            # In this case, we can skip it to conserve memory
+            self.post_layernorm = None
+
         self.use_head = (True if not hasattr(config, "vision_use_head") else
                          config.vision_use_head)
         if self.use_head:
@@ -482,6 +481,9 @@ def forward(
 
         encoder_outputs = self.encoder(inputs_embeds=hidden_states)
 
+        if self.post_layernorm is None:
+            return encoder_outputs
+
         last_hidden_state = self.post_layernorm(encoder_outputs)
         # TODO: add this back when pooled_output is used in inference
         # if self.use_head:
@@ -512,8 +514,8 @@ def __init__(
         )
 
     @property
-    def need_post_layernorm(self):
-        return self.vision_model.need_post_layernorm
+    def _require_post_layernorm(self) -> bool:
+        return self.vision_model.post_layernorm is not None
 
     def get_input_embeddings(self) -> nn.Module:
         return self.vision_model.embeddings.patch_embedding
@@ -541,7 +543,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         for name, loaded_weight in weights:
             # post_layernorm is optional in SiglipVisionModel
             if ("vision_model.post_layernorm" in name
-                    and not self.need_post_layernorm):
+                    and not self._require_post_layernorm):
                 continue
 
             # omit layers when num_hidden_layers_override is set
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 17ef9938d0572..032964fe0ac4e 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -79,14 +79,12 @@ def batch(inputs_list: List["MultiModalInputs"]) -> BatchedTensorInputs:
         if len(inputs_list) == 0:
             return {}
 
-        keys = inputs_list[0].keys()
-
         item_lists: Dict[str, List[NestedTensors]] = defaultdict(list)
 
         for inputs in inputs_list:
-            if inputs.keys() != keys:
-                msg = f"Inputs do not share the same keys ({keys})"
-                raise ValueError(msg)
+            # For models that supports multiple modalities (e.g. Qwen2-VL),
+            # different modalities will return different data keys,
+            # so batch() should skip the same key check.
 
             for k, v in inputs.items():
                 item_lists[k].append(v)
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index cd16cdcbd890c..745fc715caf45 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -9,6 +9,7 @@
 from .base import (MultiModalDataDict, MultiModalInputMapper, MultiModalInputs,
                    MultiModalPlugin, MultiModalTokensCalc, NestedTensors)
 from .image import ImagePlugin
+from .video import VideoPlugin
 
 logger = init_logger(__name__)
 
@@ -34,7 +35,7 @@ class MultiModalRegistry:
     :class:`~vllm.multimodal.MultiModalPlugin` for each modality.
     """
 
-    DEFAULT_PLUGINS = (ImagePlugin(), AudioPlugin())
+    DEFAULT_PLUGINS = (ImagePlugin(), AudioPlugin(), VideoPlugin())
 
     def __init__(
             self,
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index b76b765bc677a..3c801464383ad 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -4,6 +4,7 @@
 from typing import Any, List, Optional, Tuple, TypeVar, Union
 
 import numpy as np
+import numpy.typing as npt
 from PIL import Image
 
 from vllm.connections import global_http_connection
@@ -187,6 +188,47 @@ def rescale_image_size(image: Image.Image,
     return image
 
 
+def try_import_video_packages() -> Any:
+    try:
+        import cv2
+    except ImportError:
+        raise ImportError(
+            "Please install vllm[video] for video support.") from None
+    return cv2
+
+
+def resize_video(frames: npt.NDArray, size: Tuple[int, int]) -> npt.NDArray:
+    cv2 = try_import_video_packages()
+
+    num_frames, _, _, channels = frames.shape
+    new_height, new_width = size
+    resized_frames = np.empty((num_frames, new_height, new_width, channels),
+                              dtype=frames.dtype)
+    for i, frame in enumerate(frames):
+        resized_frame = cv2.resize(frame, (new_width, new_height))
+        resized_frames[i] = resized_frame
+    return resized_frames
+
+
+def rescale_video_size(frames: npt.NDArray, size_factor: float) -> npt.NDArray:
+    _, height, width, _ = frames.shape
+    new_height = int(height * size_factor)
+    new_width = int(width * size_factor)
+
+    return resize_video(frames, (new_height, new_width))
+
+
+def sample_frames_from_video(frames: npt.NDArray,
+                             num_frames: int) -> npt.NDArray:
+    total_frames = frames.shape[0]
+    if num_frames == -1:
+        return frames
+    else:
+        frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
+        sampled_frames = frames[frame_indices, ...]
+        return sampled_frames
+
+
 # Utilities for input processors
 _T = TypeVar("_T", str, int)
 
diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
new file mode 100644
index 0000000000000..4401d13157923
--- /dev/null
+++ b/vllm/multimodal/video.py
@@ -0,0 +1,71 @@
+from functools import lru_cache
+from typing import List, Union
+
+import numpy as np
+
+from vllm.config import ModelConfig
+from vllm.inputs.registry import InputContext
+from vllm.logger import init_logger
+from vllm.transformers_utils.image_processor import get_video_processor
+from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.utils import is_list_of
+
+from .base import MultiModalData, MultiModalInputs
+from .image import ImagePlugin
+
+logger = init_logger(__name__)
+
+cached_get_video_processor = lru_cache(get_video_processor)
+cached_get_tokenizer = lru_cache(get_tokenizer)
+
+VideoInput = Union[
+    "np.ndarray",  # single video input
+    List["np.ndarray"],
+    # TODO: support more types
+    # List[Image.Image], List[List[Image.Image]],
+    # "torch.Tensor",
+    # List["torch.Tensor"],
+    # List[List["np.ndarrray"]],
+    # List[List["torch.Tensor"]],
+]
+
+
+class VideoPlugin(ImagePlugin):
+    """Plugin for video data."""
+
+    def get_data_key(self) -> str:
+        return "video"
+
+    def _get_hf_video_processor(self, model_config: ModelConfig):
+        return cached_get_video_processor(
+            model_config.model,
+            trust_remote_code=model_config.trust_remote_code)
+
+    def _default_input_mapper(
+        self,
+        ctx: InputContext,
+        data: MultiModalData[object],
+    ) -> MultiModalInputs:
+        model_config = ctx.model_config
+
+        # single video input as np.ndarray
+        if isinstance(data, np.ndarray):
+            video_processor = self._get_hf_video_processor(model_config)
+            if video_processor is None:
+                raise RuntimeError("No HuggingFace processor is available "
+                                   "to process the image object")
+            try:
+                batch_data = video_processor(data, return_tensors="pt").data
+            except Exception:
+                logger.error("Failed to process image (%s)", data)
+                raise
+
+            return MultiModalInputs(batch_data)
+        elif is_list_of(data, np.ndarray):
+            raise NotImplementedError(
+                "Multi video for a prompt is not supported yet")
+
+        raise TypeError(f"Invalid video type: {type(data)}")
+
+    def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
+        return 4096
diff --git a/vllm/outputs.py b/vllm/outputs.py
index e091b576f5972..85ea9196b25df 100644
--- a/vllm/outputs.py
+++ b/vllm/outputs.py
@@ -5,6 +5,7 @@
 from typing import Union
 
 from vllm.lora.request import LoRARequest
+from vllm.sampling_params import RequestOutputKind
 from vllm.sequence import (PromptLogprobs, RequestMetrics, SampleLogprobs,
                            SequenceGroup, SequenceStatus)
 
@@ -92,7 +93,7 @@ def __init__(
         self,
         request_id: str,
         prompt: Optional[str],
-        prompt_token_ids: List[int],
+        prompt_token_ids: Optional[List[int]],
         prompt_logprobs: Optional[PromptLogprobs],
         outputs: List[CompletionOutput],
         finished: bool,
@@ -113,19 +114,26 @@ def __init__(
         self.encoder_prompt_token_ids = encoder_prompt_token_ids
 
     @classmethod
-    def from_seq_group(cls, seq_group: SequenceGroup) -> "RequestOutput":
-        if seq_group.sampling_params is None:
+    def from_seq_group(cls,
+                       seq_group: SequenceGroup) -> Optional["RequestOutput"]:
+        sampling_params = seq_group.sampling_params
+        if sampling_params is None:
             raise ValueError(
                 "Sampling parameters are missing for a CompletionRequest.")
+        finished = seq_group.is_finished()
+        if sampling_params.output_kind == RequestOutputKind.FINAL_ONLY and (
+                not finished):
+            return None
+
         seqs = seq_group.get_seqs()
         if len(seqs) == 1:
             top_n_seqs = seqs
         else:
             # Get the top-n sequences.
-            n = seq_group.sampling_params.n
-            if seq_group.sampling_params.use_beam_search:
+            n = sampling_params.n
+            if sampling_params.use_beam_search:
                 sorting_key = lambda seq: seq.get_beam_search_score(
-                    seq_group.sampling_params.length_penalty)
+                    sampling_params.length_penalty)
             else:
                 sorting_key = lambda seq: seq.get_cumulative_logprob()
             sorted_seqs = sorted(seqs, key=sorting_key, reverse=True)
@@ -135,26 +143,49 @@ def from_seq_group(cls, seq_group: SequenceGroup) -> "RequestOutput":
         # NOTE: We need omit logprobs here explicitly because the sequence
         # always has the logprobs of the sampled tokens even if the
         # logprobs are not requested.
-        include_logprobs = seq_group.sampling_params.logprobs is not None
-        text_buffer_length = seq_group.sampling_params.output_text_buffer_length
-        outputs = [
-            CompletionOutput(
-                seqs.index(seq),
-                seq.get_output_text_to_return(text_buffer_length),
-                seq.data._output_token_ids,
-                seq.get_cumulative_logprob() if include_logprobs else None,
-                seq.output_logprobs if include_logprobs else None,
-                SequenceStatus.get_finished_reason(seq.status),
-                seq.stop_reason) for seq in top_n_seqs
-        ]
+        include_logprobs = sampling_params.logprobs is not None
+        text_buffer_length = sampling_params.output_text_buffer_length
+        delta = sampling_params.output_kind == RequestOutputKind.DELTA
+
+        outputs = []
+        include_prompt = True
+        for seq in top_n_seqs:
+            output_text = seq.get_output_text_to_return(
+                text_buffer_length, delta)
+            output_token_ids = seq.get_output_token_ids_to_return(delta)
+            output_logprobs = seq.output_logprobs if include_logprobs else None
+
+            if delta:
+                # Slice logprobs delta if applicable
+                if output_logprobs:
+                    output_logprobs = output_logprobs[-len(output_token_ids):]
+                # Don't include prompt if this is after the first output
+                # containing decode token ids
+                if include_prompt and seq.get_output_len() > len(
+                        output_token_ids):
+                    include_prompt = False
+
+            outputs.append(
+                CompletionOutput(
+                    seqs.index(seq), output_text, output_token_ids,
+                    seq.get_cumulative_logprob() if include_logprobs else None,
+                    output_logprobs,
+                    SequenceStatus.get_finished_reason(seq.status),
+                    seq.stop_reason))
 
         # Every sequence in the sequence group should have the same prompt.
-        prompt = seq_group.prompt
-        prompt_token_ids = seq_group.prompt_token_ids
-        encoder_prompt = seq_group.encoder_prompt
-        encoder_prompt_token_ids = seq_group.encoder_prompt_token_ids
-        prompt_logprobs = seq_group.prompt_logprobs
-        finished = seq_group.is_finished()
+        if include_prompt:
+            prompt = seq_group.prompt
+            prompt_token_ids = seq_group.prompt_token_ids
+            encoder_prompt = seq_group.encoder_prompt
+            encoder_prompt_token_ids = seq_group.encoder_prompt_token_ids
+            prompt_logprobs = seq_group.prompt_logprobs
+        else:
+            prompt = None
+            prompt_token_ids = None
+            encoder_prompt = None
+            encoder_prompt_token_ids = None
+            prompt_logprobs = None
         finished_time = time.time() if finished else None
         seq_group.set_finished_time(finished_time)
         return cls(seq_group.request_id,
diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py
index aedf3c3a950ee..a483614d067e9 100644
--- a/vllm/platforms/__init__.py
+++ b/vllm/platforms/__init__.py
@@ -42,6 +42,13 @@
 except Exception:
     pass
 
+is_cpu = False
+try:
+    from importlib.metadata import version
+    is_cpu = "cpu" in version("vllm")
+except Exception:
+    pass
+
 if is_tpu:
     # people might install pytorch built with cuda but run on tpu
     # so we need to check tpu first
@@ -53,6 +60,9 @@
 elif is_rocm:
     from .rocm import RocmPlatform
     current_platform = RocmPlatform()
+elif is_cpu:
+    from .cpu import CpuPlatform
+    current_platform = CpuPlatform()
 else:
     current_platform = UnspecifiedPlatform()
 
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
new file mode 100644
index 0000000000000..4736e898b6a52
--- /dev/null
+++ b/vllm/platforms/cpu.py
@@ -0,0 +1,15 @@
+import torch
+
+from .interface import Platform, PlatformEnum
+
+
+class CpuPlatform(Platform):
+    _enum = PlatformEnum.CPU
+
+    @staticmethod
+    def get_device_name(device_id: int = 0) -> str:
+        return "cpu"
+
+    @staticmethod
+    def inference_mode():
+        return torch.no_grad()
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 25b6f26676ef0..676f4c9fccf5a 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -1,5 +1,5 @@
 import enum
-from typing import Tuple
+from typing import Optional, Tuple
 
 import torch
 
@@ -8,6 +8,7 @@ class PlatformEnum(enum.Enum):
     CUDA = enum.auto()
     ROCM = enum.auto()
     TPU = enum.auto()
+    CPU = enum.auto()
     UNSPECIFIED = enum.auto()
 
 
@@ -23,9 +24,12 @@ def is_rocm(self) -> bool:
     def is_tpu(self) -> bool:
         return self._enum == PlatformEnum.TPU
 
+    def is_cpu(self) -> bool:
+        return self._enum == PlatformEnum.CPU
+
     @staticmethod
-    def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
-        raise NotImplementedError
+    def get_device_capability(device_id: int = 0) -> Optional[Tuple[int, int]]:
+        return None
 
     @staticmethod
     def get_device_name(device_id: int = 0) -> str:
diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py
index 5e32bee1c5511..393fc230da0b9 100644
--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -1,5 +1,3 @@
-from typing import Tuple
-
 import torch
 
 from .interface import Platform, PlatformEnum
@@ -8,10 +6,6 @@
 class TpuPlatform(Platform):
     _enum = PlatformEnum.TPU
 
-    @staticmethod
-    def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
-        raise RuntimeError("TPU does not have device capability.")
-
     @staticmethod
     def inference_mode():
         return torch.no_grad()
diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py
index 765f74fe7356f..7939688ef0da3 100644
--- a/vllm/plugins/__init__.py
+++ b/vllm/plugins/__init__.py
@@ -1,4 +1,5 @@
 import logging
+from typing import Callable, Optional, Union
 
 import vllm.envs as envs
 
@@ -29,3 +30,15 @@ def load_general_plugins():
             except Exception:
                 logger.exception("Failed to load general plugin: %s",
                                  plugin.name)
+
+
+_torch_compile_backend: Optional[Union[Callable, str]] = None
+
+
+def set_torch_compile_backend(backend: Union[Callable, str]):
+    global _torch_compile_backend
+    _torch_compile_backend = backend
+
+
+def get_torch_compile_backend() -> Optional[Union[Callable, str]]:
+    return _torch_compile_backend
diff --git a/vllm/prompt_adapter/models.py b/vllm/prompt_adapter/models.py
index 93eb3bde646ac..18a5f86c341a9 100644
--- a/vllm/prompt_adapter/models.py
+++ b/vllm/prompt_adapter/models.py
@@ -14,6 +14,7 @@
 from vllm.prompt_adapter.layers import (
     VocabParallelEmbeddingWithPromptAdapter)  # yapf: disable
 from vllm.prompt_adapter.layers import PromptAdapterMapping
+from vllm.prompt_adapter.utils import load_peft_weights
 
 logger = logging.getLogger(__name__)
 
@@ -90,7 +91,6 @@ def from_local_checkpoint(
         config: PromptAdapterConfig,
         device: str = "cuda",
     ) -> "PromptAdapterModel":
-        from peft.utils import load_peft_weights
 
         if num_virtual_tokens > config.max_prompt_adapter_token:
             raise ValueError(
diff --git a/vllm/prompt_adapter/utils.py b/vllm/prompt_adapter/utils.py
new file mode 100644
index 0000000000000..989cc5a0f87c8
--- /dev/null
+++ b/vllm/prompt_adapter/utils.py
@@ -0,0 +1,93 @@
+# code borrowed from: https://github.com/huggingface/peft/blob/v0.12.0/src/peft/utils/save_and_load.py#L420
+
+import os
+from typing import Optional
+
+import torch
+from huggingface_hub import file_exists, hf_hub_download
+from huggingface_hub.utils import EntryNotFoundError
+from safetensors.torch import load_file as safe_load_file
+
+WEIGHTS_NAME = "adapter_model.bin"
+SAFETENSORS_WEIGHTS_NAME = "adapter_model.safetensors"
+
+
+# Get current device name based on available devices
+def infer_device() -> str:
+    if torch.cuda.is_available():
+        return "cuda"
+    return "cpu"
+
+
+def load_peft_weights(model_id: str,
+                      device: Optional[str] = None,
+                      **hf_hub_download_kwargs) -> dict:
+    r"""
+    A helper method to load the PEFT weights from the HuggingFace Hub or locally
+
+    Args:
+        model_id (`str`):
+            The local path to the adapter weights or the name of the adapter to
+            load from the HuggingFace Hub.
+        device (`str`):
+            The device to load the weights onto.
+        hf_hub_download_kwargs (`dict`):
+            Additional arguments to pass to the `hf_hub_download` method when 
+            loading from the HuggingFace Hub.
+    """
+    path = (os.path.join(model_id, hf_hub_download_kwargs["subfolder"])
+            if hf_hub_download_kwargs.get("subfolder", None) is not None else
+            model_id)
+
+    if device is None:
+        device = infer_device()
+
+    if os.path.exists(os.path.join(path, SAFETENSORS_WEIGHTS_NAME)):
+        filename = os.path.join(path, SAFETENSORS_WEIGHTS_NAME)
+        use_safetensors = True
+    elif os.path.exists(os.path.join(path, WEIGHTS_NAME)):
+        filename = os.path.join(path, WEIGHTS_NAME)
+        use_safetensors = False
+    else:
+        token = hf_hub_download_kwargs.get("token", None)
+        if token is None:
+            token = hf_hub_download_kwargs.get("use_auth_token", None)
+
+        hub_filename = (os.path.join(hf_hub_download_kwargs["subfolder"],
+                                     SAFETENSORS_WEIGHTS_NAME)
+                        if hf_hub_download_kwargs.get("subfolder", None)
+                        is not None else SAFETENSORS_WEIGHTS_NAME)
+        has_remote_safetensors_file = file_exists(
+            repo_id=model_id,
+            filename=hub_filename,
+            revision=hf_hub_download_kwargs.get("revision", None),
+            repo_type=hf_hub_download_kwargs.get("repo_type", None),
+            token=token,
+        )
+        use_safetensors = has_remote_safetensors_file
+
+        if has_remote_safetensors_file:
+            # Priority 1: load safetensors weights
+            filename = hf_hub_download(
+                model_id,
+                SAFETENSORS_WEIGHTS_NAME,
+                **hf_hub_download_kwargs,
+            )
+        else:
+            try:
+                filename = hf_hub_download(model_id, WEIGHTS_NAME,
+                                           **hf_hub_download_kwargs)
+            except EntryNotFoundError:
+                raise ValueError(  # noqa: B904
+                    f"Can't find weights for {model_id} in {model_id} or \
+                    in the Hugging Face Hub. "
+                    f"Please check that the file {WEIGHTS_NAME} or \
+                    {SAFETENSORS_WEIGHTS_NAME} is present at {model_id}.")
+
+    if use_safetensors:
+        adapters_weights = safe_load_file(filename, device=device)
+    else:
+        adapters_weights = torch.load(filename,
+                                      map_location=torch.device(device))
+
+    return adapters_weights
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index c83ed5cca6791..5edbc8e424e81 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -1,6 +1,6 @@
 """Sampling parameters for text generation."""
 import copy
-from enum import IntEnum
+from enum import Enum, IntEnum
 from functools import cached_property
 from typing import Any, Callable, Dict, List, Optional, Set, Union
 
@@ -33,6 +33,15 @@ class SamplingType(IntEnum):
 to sample from."""
 
 
+class RequestOutputKind(Enum):
+    # Return entire output so far in every RequestOutput
+    CUMULATIVE = 0
+    # Return only deltas in each RequestOutput
+    DELTA = 1
+    # Do not return intermediate RequestOuputs
+    FINAL_ONLY = 2
+
+
 class SamplingParams(
         msgspec.Struct,
         omit_defaults=True,  # type: ignore[call-arg]
@@ -147,6 +156,7 @@ class SamplingParams(
     logits_processors: Optional[Any] = None
     include_stop_str_in_output: bool = False
     truncate_prompt_tokens: Optional[Annotated[int, msgspec.Meta(ge=1)]] = None
+    output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE
 
     # The below fields are not supposed to be used as an input.
     # They are set in post_init.
@@ -182,6 +192,7 @@ def from_optional(
         logits_processors: Optional[List[LogitsProcessor]] = None,
         truncate_prompt_tokens: Optional[Annotated[int,
                                                    msgspec.Meta(ge=1)]] = None,
+        output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE,
     ) -> "SamplingParams":
         return SamplingParams(
             n=1 if n is None else n,
@@ -213,6 +224,7 @@ def from_optional(
             spaces_between_special_tokens=spaces_between_special_tokens,
             logits_processors=logits_processors,
             truncate_prompt_tokens=truncate_prompt_tokens,
+            output_kind=output_kind,
         )
 
     def __post_init__(self) -> None:
@@ -317,6 +329,9 @@ def _verify_args(self) -> None:
             raise ValueError(
                 "stop strings are only supported when detokenize is True. "
                 "Set detokenize=True to use stop.")
+        if self.best_of != self.n and self.output_kind == (
+                RequestOutputKind.DELTA):
+            raise ValueError("best_of must equal n to use output_kind=DELTA")
 
     def _verify_beam_search(self) -> None:
         if self.best_of == 1:
diff --git a/vllm/sequence.py b/vllm/sequence.py
index a5ebf152ce776..07ceccf123541 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -5,8 +5,9 @@
 from array import array
 from collections import defaultdict
 from dataclasses import dataclass
-from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Mapping,
-                    Optional, Set, Tuple, Union, cast)
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Mapping, Optional
+from typing import Sequence as GenericSequence
+from typing import Set, Tuple, Union, cast
 
 import msgspec
 import torch
@@ -165,6 +166,9 @@ class SequenceData(msgspec.Struct,
     # is called.
     _new_appended_tokens: List[int] = msgspec.field(default_factory=list)
 
+    # It is used to compute mrope_position_ids.
+    _mrope_position_delta: Optional[int] = None
+
     def __post_init__(self) -> None:
         assert self._prompt_token_ids.typecode == "l"
         assert self._output_token_ids.typecode == "l"
@@ -219,6 +223,14 @@ def output_token_ids_array(self) -> array:
         assert isinstance(self._output_token_ids, array)
         return self._output_token_ids
 
+    @property
+    def mrope_position_delta(self) -> Optional[int]:
+        return self._mrope_position_delta
+
+    @mrope_position_delta.setter
+    def mrope_position_delta(self, new_mrope_position_delta):
+        self._mrope_position_delta = new_mrope_position_delta
+
     def append_token_id(self, token_id: int, logprob: float) -> None:
         self._output_token_ids.append(token_id)
         self._new_appended_tokens.append(token_id)
@@ -396,6 +408,10 @@ def __init__(
         self.status = SequenceStatus.WAITING
         self.stop_reason: Union[int, str, None] = None
 
+        # These are used to keep track of delta outputs
+        self._last_token_ids_offset: int = 0
+        self._last_output_text_offset: int = 0
+
         # Used for incremental detokenization
         self.prefix_offset = 0
         self.read_offset = 0
@@ -451,11 +467,37 @@ def prompt_adapter_id(self) -> int:
         return self.prompt_adapter_request.prompt_adapter_id \
                         if self.prompt_adapter_request else 0
 
-    def get_output_text_to_return(self, buffer_length: int):
+    def get_output_text_to_return(self, buffer_length: int,
+                                  delta: bool) -> str:
+        """If delta is True, only new text since the last call to
+        this method is returned"""
+
         # We return the full output text if the sequence is finished.
         truncate = buffer_length and not self.is_finished()
-        return self.output_text[:-buffer_length] if truncate else (
-            self.output_text)
+        if not delta:
+            return self.output_text[:-buffer_length] if truncate else (
+                self.output_text)
+        length = len(self.output_text)
+        if truncate:
+            length -= buffer_length
+        last_offset = self._last_output_text_offset
+        if last_offset < length:
+            self._last_output_text_offset = length
+            return self.output_text[last_offset:length]
+        return ""
+
+    def get_output_token_ids_to_return(self,
+                                       delta: bool) -> GenericSequence[int]:
+        """If delta is True, only new tokens since the last call to
+        this method are returned"""
+        if not delta:
+            return self.get_output_token_ids()
+        length = self.get_output_len()
+        last_offset = self._last_token_ids_offset
+        if last_offset < length:
+            self._last_token_ids_offset = length
+            return self.data._output_token_ids[last_offset:]
+        return ()
 
     def hash_of_block(self, logical_idx: int) -> int:
         # TODO This can produce incorrect hash when block size > prompt size
diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py
index 6e35e40294381..1e403637d2388 100644
--- a/vllm/spec_decode/draft_model_runner.py
+++ b/vllm/spec_decode/draft_model_runner.py
@@ -2,7 +2,6 @@
 
 import torch
 
-from vllm import _custom_ops as ops
 from vllm.model_executor.layers.sampler import SamplerOutput
 
 try:
@@ -116,18 +115,9 @@ def _gpu_advance_step(
         # Update attn_metadata
         attn_metadata = model_input.attn_metadata
         assert isinstance(attn_metadata, FlashAttentionMetadata)
-        attn_metadata.advance_step(num_seqs, num_queries)
-
-        # Update GPU tensors
-        ops.advance_step(num_seqs=num_seqs,
-                         num_queries=num_queries,
-                         block_size=self.block_size,
-                         input_tokens=model_input.input_tokens,
-                         sampled_token_ids=sampled_token_ids,
-                         input_positions=model_input.input_positions,
-                         seq_lens=attn_metadata.seq_lens_tensor,
-                         slot_mapping=attn_metadata.slot_mapping,
-                         block_tables=attn_metadata.block_tables)
+
+        attn_metadata.advance_step(model_input, sampled_token_ids,
+                                   self.block_size, num_seqs, num_queries)
 
         # Update sampling_metadata
         sampling_metadata = model_input.sampling_metadata
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 64b2e29764cdf..55eb03725790e 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -4,7 +4,9 @@
 from pathlib import Path
 from typing import Any, Dict, Optional, Type, Union
 
-from huggingface_hub import file_exists, hf_hub_download
+import huggingface_hub
+from huggingface_hub import (file_exists, hf_hub_download,
+                             try_to_load_from_cache)
 from transformers import GenerationConfig, PretrainedConfig
 from transformers.models.auto.image_processing_auto import (
     get_image_processor_config)
@@ -71,7 +73,22 @@ def file_or_path_exists(model: Union[str, Path], config_name, revision,
     if Path(model).exists():
         return (Path(model) / config_name).is_file()
 
-    return file_exists(model, HF_CONFIG_NAME, revision=revision, token=token)
+    # Offline mode support: Check if config file is cached already
+    cached_filepath = try_to_load_from_cache(repo_id=model,
+                                             filename=config_name,
+                                             revision=revision)
+    if isinstance(cached_filepath, str):
+        # The config file exists in cache- we can continue trying to load
+        return True
+
+    # NB: file_exists will only check for the existence of the config file on
+    # hf_hub. This will fail in offline mode.
+    try:
+        return file_exists(model, config_name, revision=revision, token=token)
+    except huggingface_hub.errors.OfflineModeIsEnabled:
+        # Don't raise in offline mode, all we know is that we don't have this
+        # file cached.
+        return False
 
 
 def get_config(
@@ -103,6 +120,15 @@ def get_config(
                                  token=kwargs.get("token")):
             config_format = ConfigFormat.MISTRAL
         else:
+            # If we're in offline mode and found no valid config format, then
+            # raise an offline mode error to indicate to the user that they
+            # don't have files cached and may need to go online.
+            # This is conveniently triggered by calling file_exists().
+            file_exists(model,
+                        HF_CONFIG_NAME,
+                        revision=revision,
+                        token=kwargs.get("token"))
+
             raise ValueError(f"No supported config format found in {model}")
 
     if config_format == ConfigFormat.HF:
@@ -210,14 +236,27 @@ def recurse_elems(elem: Any):
     config_dict["hidden_act"] = config_dict.get("activation", "silu")
     config_dict["tie_word_embeddings"] = config_dict.get(
         "tie_embeddings", False)
+    config_dict["max_seq_len"] = config_dict.get("max_seq_len", 128_000)
+    config_dict["max_position_embeddings"] = config_dict.get(
+        "max_position_embeddings", 128_000)
 
-    if config_dict["model_type"] == "transformer":
-        if "moe" in config_dict:
-            config_dict["architectures"] = ["MixtralForCausalLM"]
-        else:
-            config_dict["architectures"] = ["MistralForCausalLM"]
+    if config_dict.get("moe") is not None:
+        config_dict["architectures"] = ["MixtralForCausalLM"]
+    else:
+        config_dict["architectures"] = ["MistralForCausalLM"]
+
+    if config_dict.get("vision_encoder") is not None:
+        multimodal_config = config_dict.pop("vision_encoder")
 
-    return recurse_elems(config_dict)
+        config_dict = {
+            "text_config": config_dict,
+            "vision_config": multimodal_config
+        }
+        config_dict["architectures"] = ["PixtralForConditionalGeneration"]
+        config_dict["model_type"] = "pixtral"
+
+    config = recurse_elems(config_dict)
+    return config
 
 
 def get_hf_image_processor_config(
diff --git a/vllm/transformers_utils/image_processor.py b/vllm/transformers_utils/image_processor.py
index c7d9eabd06f0e..4cffac3724ba8 100644
--- a/vllm/transformers_utils/image_processor.py
+++ b/vllm/transformers_utils/image_processor.py
@@ -1,6 +1,33 @@
 from typing import cast
 
 
+def get_video_processor(
+    processor_name: str,
+    trust_remote_code: bool = False,
+):
+    """
+    Gets a processor for the given model name via HuggingFace.
+    """
+    from transformers import AutoProcessor
+
+    try:
+        processor = AutoProcessor.from_pretrained(processor_name)
+        video_processor = processor.video_processor
+
+    except ValueError as e:
+        if not trust_remote_code:
+            err_msg = (
+                "Failed to load the processor. If the processor is "
+                "a custom processor not yet available in the HuggingFace "
+                "transformers library, consider setting "
+                "`trust_remote_code=True` in LLM or using the "
+                "`--trust-remote-code` flag in the CLI.")
+            raise RuntimeError(err_msg) from e
+        else:
+            raise e
+    return video_processor
+
+
 def get_image_processor(
     processor_name: str,
     *args,
diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py
new file mode 100644
index 0000000000000..2001746c5f7f9
--- /dev/null
+++ b/vllm/transformers_utils/processor.py
@@ -0,0 +1,37 @@
+from typing import cast
+
+
+def get_processor(
+    processor_name: str,
+    *args,
+    trust_remote_code: bool = False,
+    **kwargs,
+):
+    """Gets a processor for the given model name via HuggingFace."""
+    # don't put this import at the top level
+    # it will call torch.cuda.device_count()
+    from transformers import AutoProcessor
+    from transformers.processing_utils import ProcessorMixin
+
+    try:
+        processor = AutoProcessor.from_pretrained(
+            processor_name,
+            *args,
+            trust_remote_code=trust_remote_code,
+            **kwargs)
+    except ValueError as e:
+        # If the error pertains to the processor class not existing or not
+        # currently being imported, suggest using the --trust-remote-code flag.
+        # Unlike AutoTokenizer, AutoProcessor does not separate such errors
+        if not trust_remote_code:
+            err_msg = (
+                "Failed to load the processor. If the processor is "
+                "a custom processor not yet available in the HuggingFace "
+                "transformers library, consider setting "
+                "`trust_remote_code=True` in LLM or using the "
+                "`--trust-remote-code` flag in the CLI.")
+            raise RuntimeError(err_msg) from e
+        else:
+            raise e
+
+    return cast(ProcessorMixin, processor)
diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index 533a86b787325..ea1910ed20ec3 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -16,7 +16,7 @@
                                                      Tekkenizer)
 
 if TYPE_CHECKING:
-    from vllm.entrypoints.chat_utils import ConversationMessage
+    from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
 
 
 @dataclass
@@ -45,26 +45,25 @@ class MistralTokenizer:
     def __init__(self, tokenizer: PublicMistralTokenizer) -> None:
         self.mistral = tokenizer
         self.instruct = tokenizer.instruct_tokenizer
-        self.tokenizer = tokenizer.instruct_tokenizer.tokenizer
 
-        self.vocab_size = len(self.tokenizer.vocab())
-
-        assert isinstance(self.tokenizer,
-                          (Tekkenizer, SentencePieceTokenizer)), type(
-                              self.tokenizer)
-
-        if (is_tekken := isinstance(self.tokenizer, Tekkenizer)):
+        tokenizer_ = tokenizer.instruct_tokenizer.tokenizer
+        if isinstance(tokenizer_, Tekkenizer):
             # Make sure special tokens will not raise
-            self.tokenizer.special_token_policy = SpecialTokenPolicy.IGNORE
-
-        self._is_tekken = is_tekken
+            tokenizer_.special_token_policy = SpecialTokenPolicy.IGNORE
+
+            self._vocab = {
+                token: idx
+                for idx, token in enumerate(tokenizer_.vocab())
+            }
+        elif isinstance(tokenizer_, SentencePieceTokenizer):
+            self._vocab = {
+                token: idx
+                for idx, token in enumerate(tokenizer_.vocab())
+            }
+        else:
+            raise TypeError(f"Unsupported tokenizer: {type(tokenizer_)}")
 
-        # the following attributes are set to fit VLLM's design
-        self.is_fast = True
-        self.chat_template = True
-        self.all_special_ids: List[Any] = []
-        self.all_special_tokens: List[Any] = []
-        self.all_special_tokens_extended: List[Any] = []
+        self.tokenizer = tokenizer_
 
     @classmethod
     def from_pretrained(cls,
@@ -102,6 +101,38 @@ def _download_mistral_tokenizer_from_hf(tokenizer_name: str,
                                          revision=revision)
         return tokenizer_file
 
+    # the following attributes are set to fit VLLM's design
+    @property
+    def all_special_tokens_extended(self) -> List[str]:
+        return []
+
+    @property
+    def all_special_tokens(self) -> List[str]:
+        return []
+
+    @property
+    def all_special_ids(self) -> List[int]:
+        return []
+
+    @property
+    def bos_token_id(self) -> int:
+        return self.tokenizer.bos_id
+
+    @property
+    def eos_token_id(self) -> int:
+        return self.tokenizer.eos_id
+
+    @property
+    def is_fast(self) -> bool:
+        return True
+
+    @property
+    def vocab_size(self) -> int:
+        return len(self._vocab)
+
+    def __len__(self) -> int:
+        return self.vocab_size
+
     def __call__(
         self,
         prompt: str,
@@ -117,31 +148,34 @@ def __call__(
 
         return Encoding(input_ids=input_ids)
 
-    def get_added_vocab(self) -> List[str]:
+    def get_vocab(self) -> Dict[str, int]:
+        return self._vocab
+
+    def get_added_vocab(self) -> Dict[str, int]:
         # Mistral tokenizers have no added vocabulary
-        return []
+        return {}
 
     def encode(self, prompt: str) -> List[int]:
-        # `encode ` should only be used for prompt completion
+        # `encode` should only be used for prompt completion
         # it should never be used for chat_completion.
         # For chat completion use `apply_chat_template`
         return self.tokenizer.encode(prompt, bos=True, eos=False)
 
     def apply_chat_template(self,
-                            conversation: List["ConversationMessage"],
+                            messages: List["ChatCompletionMessageParam"],
                             tools: Optional[Dict[str, Any]] = None,
                             **kwargs) -> List[int]:
         assert tools is None, "`tools` are not yet supported."
 
         request = ChatCompletionRequest(
-            messages=conversation)  # type: ignore[type-var]
+            messages=messages)  # type: ignore[type-var]
         encoded = self.mistral.encode_chat_completion(request)
 
         # encode-decode to get clean prompt
         return encoded.tokens
 
     def convert_tokens_to_string(self, tokens: List[str]) -> str:
-        if self._is_tekken:
+        if isinstance(self.tokenizer, Tekkenizer):
             return "".join(tokens)
         else:
             return self.tokenizer.decode(tokens)  # type: ignore[arg-type]
@@ -151,14 +185,11 @@ def decode(self, ids: Union[List[int], int]) -> str:
             ids = [ids]
         return self.tokenizer.decode(ids)
 
-    @property
-    def eos_token_id(self):
-        return self.tokenizer.eos_id
-
     def convert_ids_to_tokens(
-            self,
-            ids: List[int],
-            skip_special_tokens: Optional[bool] = True) -> List[str]:
+        self,
+        ids: List[int],
+        skip_special_tokens: bool = True,
+    ) -> List[str]:
         # TODO(Patrick) - potentially allow special tokens to not be skipped
         assert (
             skip_special_tokens
@@ -170,6 +201,3 @@ def convert_ids_to_tokens(
 
         tokens = [self.tokenizer.id_to_piece(id) for id in ids]
         return tokens
-
-    def __len__(self):
-        return self.vocab_size
diff --git a/vllm/utils.py b/vllm/utils.py
index a22081ebe8df0..aba243071b69a 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -82,6 +82,9 @@
                                        "currently supported with encoder/"
                                        "decoder models.")
 
+STR_NOT_IMPL_ENC_DEC_CPU = ("CPU is not currently supported with "
+                            "encoder/decoder models.")
+
 # Efficiently import all enc/dec error strings
 # rather than having to import all of the above
 STR_NOT_IMPL_ENC_DEC_ERR_STRS = {
@@ -97,6 +100,7 @@
     "STR_NOT_IMPL_ENC_DEC_CUDA_GRAPH": STR_NOT_IMPL_ENC_DEC_CUDAGRAPH,
     "STR_NOT_IMPL_ENC_DEC_BACKEND": STR_NOT_IMPL_ENC_DEC_BACKEND,
     "STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER": STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER,
+    "STR_NOT_IMPL_ENC_DEC_CPU": STR_NOT_IMPL_ENC_DEC_CPU
 }
 
 # Constants related to forcing the attention backend selection
diff --git a/vllm/version.py b/vllm/version.py
index 039f6369b8ed5..0ddc7fb99ad45 100644
--- a/vllm/version.py
+++ b/vllm/version.py
@@ -2,6 +2,7 @@
 
 try:
     import vllm.commit_id
+
     __commit__ = vllm.commit_id.__commit__
 except Exception as e:
     warnings.warn(f"Failed to read commit hash:\n{e}",
@@ -9,4 +10,4 @@
                   stacklevel=2)
     __commit__ = "COMMIT_HASH_PLACEHOLDER"
 
-__version__ = "0.6.0"
+__version__ = "0.6.1.post2"
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index 7205b1a7beb8d..7b2caf4973589 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -15,7 +15,7 @@
 from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
                              MultiModalInputs)
 from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
-from vllm.utils import make_tensor_with_pad
+from vllm.utils import STR_NOT_IMPL_ENC_DEC_ERR_STRS, make_tensor_with_pad
 from vllm.worker.model_runner_base import (
     ModelRunnerBase, ModelRunnerInputBase,
     _add_attn_metadata_broadcastable_dict,
@@ -121,6 +121,10 @@ def __init__(
         # Lazy initialization.
         self.model: nn.Module  # Set after init_Model
 
+        if self.model_config.is_encoder_decoder_model:
+            raise NotImplementedError(
+                STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_CPU'])
+
     def load_model(self) -> None:
         self.model = get_model(model_config=self.model_config,
                                load_config=self.load_config,
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index 52d1806018f51..5e36fba6ccdea 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -207,7 +207,8 @@ def stop_profile(self):
 
     def init_device(self) -> None:
         if self.local_omp_cpuid != "all":
-            torch.ops._C_utils.init_cpu_threads_env(self.local_omp_cpuid)
+            ret = torch.ops._C_utils.init_cpu_threads_env(self.local_omp_cpuid)
+            logger.info(ret)
 
         self.init_distributed_environment()
         # Set random seed.
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 74f7d4e0860d3..9df9ae783b9fa 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -30,6 +30,7 @@
 from vllm.lora.request import LoRARequest
 from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
 from vllm.model_executor import SamplingMetadata, SamplingMetadataCache
+from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.model_loader import get_model
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
@@ -52,7 +53,7 @@
     _add_attn_metadata_broadcastable_dict,
     _add_sampling_metadata_broadcastable_dict,
     _init_attn_metadata_from_tensor_dict,
-    _init_sampling_metadata_from_tensor_dict)
+    _init_sampling_metadata_from_tensor_dict, dump_input_when_exception)
 
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionBackend
@@ -74,6 +75,10 @@
 
 TModelInputForGPU = TypeVar('TModelInputForGPU', bound="ModelInputForGPU")
 
+# For now, bump up cache limits for recompilations during CUDA graph warmups.
+torch._dynamo.config.cache_size_limit = 128
+torch._dynamo.config.accumulated_cache_size_limit = 128
+
 
 @dataclass(frozen=True)
 class ModelInputForGPU(ModelRunnerInputBase):
@@ -181,6 +186,7 @@ class InterDataForSeqGroup:
         def simple_reinit(self):
             self.input_tokens[0].clear()  # type: ignore
             self.input_positions[0].clear()  # type: ignore
+            self.mrope_input_positions = None  # type: ignore
             self.seq_lens[0] = 0  # type: ignore
             self.orig_seq_lens[0] = 0  # type: ignore
             self.query_lens[0] = 0  # type: ignore
@@ -206,6 +212,7 @@ def __init__(
             # Input tokens and positions.
             input_tokens: Optional[List[List[int]]] = None,
             input_positions: Optional[List[List[int]]] = None,
+            mrope_input_positions: Optional[List[List[List[int]]]] = None,
 
             # The sequence length (may be capped to the sliding window).
             seq_lens: Optional[List[int]] = None,
@@ -266,6 +273,8 @@ def __init__(
                         for seq_id in range(len(self.seq_ids)):
                             self.input_positions[seq_id].clear()
 
+                    self.mrope_input_positions = None
+
                     if seq_lens:
                         self.seq_lens = seq_lens
                     else:
@@ -327,6 +336,7 @@ def __init__(
             else:
                 self.input_tokens = input_tokens or []
                 self.input_positions = input_positions or []
+                self.mrope_input_positions = mrope_input_positions or None
                 self.seq_lens = seq_lens or []
                 self.orig_seq_lens = orig_seq_lens or []
                 self.query_lens = query_lens or []
@@ -357,6 +367,7 @@ def __post_init__(self):
 
             self.input_tokens = [[] for _ in range(self.n_seqs)]
             self.input_positions = [[] for _ in range(self.n_seqs)]
+            self.mrope_input_positions = None
             self.seq_lens = [0] * self.n_seqs
             self.orig_seq_lens = [0] * self.n_seqs
             self.query_lens = [0] * self.n_seqs
@@ -493,6 +504,17 @@ def _compute_lens(self, inter_data: InterDataForSeqGroup, seq_idx: int,
         inter_data.query_lens[
             seq_idx] = seq_len - context_len if inter_data.is_prompt else 1
 
+        if seq_data.mrope_position_delta is not None:
+            if inter_data.mrope_input_positions is None:
+                inter_data.mrope_input_positions = [None] * inter_data.n_seqs
+
+            inter_data.mrope_input_positions[
+                seq_idx] = MRotaryEmbedding.get_next_input_positions(
+                    seq_data.mrope_position_delta,
+                    context_len,
+                    seq_len,
+                )
+
     def _compute_for_prefix_cache_hit(
             self, inter_data: InterDataForSeqGroup, seq_idx: int,
             seq_group_metadata: SequenceGroupMetadata):
@@ -636,6 +658,40 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup,
         mm_kwargs = self.multi_modal_input_mapper(mm_data)
         inter_data.multi_modal_inputs = mm_kwargs
 
+        # special processing for mrope position deltas.
+        if self.runner.model_is_mrope:
+            image_grid_thw = mm_kwargs.get("image_grid_thw", None)
+            video_grid_thw = mm_kwargs.get("video_grid_thw", None)
+            assert image_grid_thw is not None or video_grid_thw is not None, (
+                "mrope embedding type requires multi-modal input mapper "
+                "returns 'image_grid_thw' or 'video_grid_thw'.")
+
+            hf_config = self.runner.model_config.hf_config
+
+            inter_data.mrope_input_positions = [None] * inter_data.n_seqs
+            for seq_idx in range(inter_data.n_seqs):
+                seq_data = seq_group_metadata.seq_data[
+                    inter_data.seq_ids[seq_idx]]
+                token_ids = seq_data.get_token_ids()
+
+                mrope_input_positions, mrope_position_delta = \
+                    MRotaryEmbedding.get_input_positions(
+                        token_ids,
+                        image_grid_thw=image_grid_thw,
+                        video_grid_thw=video_grid_thw,
+                        image_token_id=hf_config.image_token_id,
+                        video_token_id=hf_config.video_token_id,
+                        vision_start_token_id=hf_config.vision_start_token_id,
+                        vision_end_token_id=hf_config.vision_end_token_id,
+                        spatial_merge_size=hf_config.vision_config.
+                        spatial_merge_size,
+                        context_len=inter_data.context_lens[seq_idx],
+                    )
+
+                seq_data.mrope_position_delta = mrope_position_delta
+                inter_data.mrope_input_positions[
+                    seq_idx] = mrope_input_positions
+
     def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata):
         """Add a sequence group to the builder."""
         seq_ids = seq_group_metadata.seq_data.keys()
@@ -684,10 +740,27 @@ def build(self) -> ModelInputForGPU:
             # prefix caching and there is no decode request.
             return self.model_input_cls()
 
-        input_positions = []
-        for inter_data in self.inter_data_list:
-            for cur_input_positions in inter_data.input_positions:
-                input_positions.extend(cur_input_positions)
+        mrope_input_positions: Optional[List[List[int]]] = None
+        if any(inter_data.mrope_input_positions is not None
+               for inter_data in self.inter_data_list):
+            mrope_input_positions = [[] for _ in range(3)]
+            for idx in range(3):
+                for inter_data in self.inter_data_list:
+                    msections = inter_data.mrope_input_positions
+                    if msections is None:
+                        for _seq_input_positions in inter_data.input_positions:
+                            mrope_input_positions[idx].extend(
+                                _seq_input_positions)
+                    else:
+                        for _seq_mrope_input_positions in msections:
+                            mrope_input_positions[idx].extend(
+                                _seq_mrope_input_positions[idx])
+            input_positions = None
+        else:
+            input_positions = []
+            for inter_data in self.inter_data_list:
+                for cur_input_positions in inter_data.input_positions:
+                    input_positions.extend(cur_input_positions)
 
         seq_lens = []
         max_decode_seq_len = 0
@@ -724,15 +797,24 @@ def build(self) -> ModelInputForGPU:
         # Tokens and positions.
         if cuda_graph_pad_size:
             input_tokens.extend(itertools.repeat(0, cuda_graph_pad_size))
-            input_positions.extend(itertools.repeat(0, cuda_graph_pad_size))
         assert self.runner.device is not None
         input_tokens_tensor = async_tensor_h2d(input_tokens, torch.long,
                                                self.runner.device,
                                                self.runner.pin_memory)
-        input_positions_tensor = async_tensor_h2d(input_positions, torch.long,
-                                                  self.runner.device,
-                                                  self.runner.pin_memory)
-
+        if mrope_input_positions is not None:
+            for idx in range(3):
+                mrope_input_positions[idx].extend(
+                    itertools.repeat(0, cuda_graph_pad_size))
+            input_positions_tensor = async_tensor_h2d(mrope_input_positions,
+                                                      torch.long,
+                                                      self.runner.device,
+                                                      self.runner.pin_memory)
+        else:
+            input_positions.extend(itertools.repeat(0, cuda_graph_pad_size))
+            input_positions_tensor = async_tensor_h2d(input_positions,
+                                                      torch.long,
+                                                      self.runner.device,
+                                                      self.runner.pin_memory)
         # Sequence and query lengths.
         if cuda_graph_pad_size:
             seq_lens.extend(itertools.repeat(1, cuda_graph_pad_size))
@@ -982,9 +1064,13 @@ def load_model(self) -> None:
                     "This may lead to less accurate results!")
 
         if envs.VLLM_TEST_DYNAMO_GRAPH_CAPTURE and supports_dynamo():
-            self.model = torch.compile(self.model,
-                                       fullgraph=True,
-                                       backend="eager")
+            from vllm.compilation.backends import vllm_backend
+            from vllm.plugins import get_torch_compile_backend
+            backend = get_torch_compile_backend() or vllm_backend
+            self.model = torch.compile(
+                self.model,
+                fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
+                backend=backend)
 
     def save_sharded_state(
         self,
@@ -1199,6 +1285,15 @@ def list_prompt_adapters(self) -> Set[int]:
             raise RuntimeError("PromptAdapter is not enabled.")
         return self.prompt_adapter_manager.list_adapters()
 
+    @property
+    def model_is_mrope(self) -> bool:
+        """Detect if the model has "mrope" rope_scaling type.
+        mrope requires keep "rope_deltas" between prompt and decoding phases."""
+        rope_scaling = getattr(self.model_config.hf_config, "rope_scaling", {})
+        if rope_scaling is None:
+            return False
+        return rope_scaling.get("type", None) == "mrope"
+
     @torch.inference_mode()
     def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None:
         """Cuda graph capture a model.
@@ -1229,7 +1324,8 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None:
         max_batch_size = self.max_batchsize_to_capture
         input_tokens = torch.zeros(max_batch_size, dtype=torch.long).cuda()
         input_positions = torch.zeros(max_batch_size, dtype=torch.long).cuda()
-
+        if self.model_is_mrope:
+            input_positions = torch.tile(input_positions, (3, 1))
         # Prepare dummy previous_hidden_states only if needed by the model.
         # This is used by draft models such as EAGLE.
         previous_hidden_states = None
@@ -1293,7 +1389,7 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None:
                         "input_ids":
                         input_tokens[:batch_size],
                         "positions":
-                        input_positions[:batch_size],
+                        input_positions[..., :batch_size],
                         "hidden_or_intermediate_states":
                         hidden_or_intermediate_states[
                             virtual_engine]  # type: ignore
@@ -1396,6 +1492,7 @@ def prepare_model_input(
                                    virtual_engine=virtual_engine)
 
     @torch.inference_mode()
+    @dump_input_when_exception(exclude_args=[0], exclude_kwargs=["self"])
     def execute_model(
         self,
         model_input: ModelInputForGPUWithSamplingMetadata,
diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py
index f8fd9d801d289..94d2507968382 100644
--- a/vllm/worker/model_runner_base.py
+++ b/vllm/worker/model_runner_base.py
@@ -1,5 +1,8 @@
 import dataclasses
+import pickle
 from abc import ABC, abstractmethod
+from datetime import datetime
+from functools import wraps
 from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Type,
                     TypeVar)
 
@@ -98,6 +101,37 @@ def _init_frozen_model_input_from_tensor_dict(
     return tensor_dict
 
 
+def dump_input_when_exception(exclude_args: Optional[List[int]] = None,
+                              exclude_kwargs: Optional[List[str]] = None):
+
+    def _inner(func):
+
+        @wraps(func)
+        def _wrapper(*args, **kwargs):
+            try:
+                return func(*args, **kwargs)
+            except Exception as err:
+                timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+                filename = f"/tmp/err_{func.__name__}_input_{timestamp}.pkl"
+                with open(filename, "wb") as filep:
+                    dumped_inputs = {
+                        k: v
+                        for k, v in kwargs.items()
+                        if k not in (exclude_kwargs or [])
+                    }
+                    for i, arg in enumerate(args):
+                        if i not in (exclude_args or []):
+                            dumped_inputs[f"arg_{i}"] = arg
+                    pickle.dump(dumped_inputs, filep)
+                raise type(err)(
+                    f"Error in model execution (input dumped to {filename}): "
+                    f"{str(err)}") from err
+
+        return _wrapper
+
+    return _inner
+
+
 class BroadcastableModelInput(ABC):
 
     @abstractmethod
diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py
index b13cf39bd846e..2c76775cd3231 100644
--- a/vllm/worker/multi_step_model_runner.py
+++ b/vllm/worker/multi_step_model_runner.py
@@ -4,16 +4,8 @@
 from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple,
                     Union)
 
-try:
-    from vllm.attention.backends.flash_attn import FlashAttentionMetadata
-except ModuleNotFoundError:
-    # vllm_flash_attn is not installed, use the identical ROCm FA metadata
-    from vllm.attention.backends.rocm_flash_attn import (
-        ROCmFlashAttentionMetadata as FlashAttentionMetadata)
-
 import torch
 
-from vllm import _custom_ops as ops
 from vllm.distributed import get_pp_group
 from vllm.logger import init_logger
 from vllm.model_executor.layers.sampler import (PromptLogprobs, SampleLogprobs,
@@ -37,6 +29,8 @@
 
 logger = init_logger(__name__)
 
+MULTI_STEP_ATTENTION_BACKENDS = ["flash-attn", "rocm-flash-attn", "flashinfer"]
+
 
 def seq_output_builder():
     return SequenceOutput(
@@ -231,12 +225,15 @@ def __init__(self, base_model_runner: GPUModelRunnerBase, *args, **kwargs):
         self._base_model_runner: GPUModelRunnerBase = base_model_runner
 
         self.is_multi_step = self.scheduler_config.is_multi_step
-        # used to copy tensors from GPU to CPU asynchronously
-        self._copy_stream = torch.cuda.Stream()
         self.pinned_sampled_token_ids: Optional[torch.Tensor] = None
 
         self.pythonization_cache = PythonizationCache()
 
+    @functools.cached_property
+    def _copy_stream(self):
+        # used to copy tensors from GPU to CPU asynchronously
+        return torch.cuda.Stream()
+
     def make_model_input_from_broadcasted_tensor_dict(
             self, tensor_dict: Dict[str, Any]) -> StatefulModelInput:
         model_input = (StatefulModelInput.from_broadcasted_tensor_dict(
@@ -487,35 +484,27 @@ def _update_sampling_metadata(self, sampling_metadata, num_seqs,
 
     def _advance_step(self, model_input: StatefulModelInput,
                       out: SamplerOutput) -> StatefulModelInput:
-        frozen_model_input = model_input.frozen_model_input
-        assert frozen_model_input is not None
-        assert frozen_model_input.attn_metadata is not None
+        if self.attn_backend.get_name() not in MULTI_STEP_ATTENTION_BACKENDS:
+            raise ValueError(
+                f"Multi-step not supported for attention backend: "
+                f"{self.attn_backend.get_name()}. Set VLLM_ATTENTION_BACKEND "
+                f"to a value from {MULTI_STEP_ATTENTION_BACKENDS}.")
 
+        sampled_token_ids = model_input.cached_outputs[-1].sampled_token_ids
         num_seqs = model_input.num_seqs
         num_queries = model_input.num_queries
-        assert num_seqs > 0
-        assert num_queries > 0
-        assert num_seqs >= num_queries
-
+        frozen_model_input = model_input.frozen_model_input
+        assert frozen_model_input is not None
         attn_metadata = frozen_model_input.attn_metadata
-        assert isinstance(attn_metadata, FlashAttentionMetadata)
-        attn_metadata.advance_step(num_seqs, num_queries)
-
-        # Update GPU tensors
-        ops.advance_step(
-            num_seqs=num_seqs,
-            num_queries=num_queries,
-            block_size=self.block_size,
-            input_tokens=frozen_model_input.input_tokens,
-            sampled_token_ids=model_input.cached_outputs[-1].sampled_token_ids,
-            input_positions=frozen_model_input.input_positions,
-            seq_lens=attn_metadata.seq_lens_tensor,
-            slot_mapping=attn_metadata.slot_mapping,
-            block_tables=attn_metadata.block_tables)
-
-        if frozen_model_input.seq_lens is not None:
-            for i in range(num_queries):
-                frozen_model_input.seq_lens[i] = attn_metadata.seq_lens[i]
+        assert attn_metadata is not None
+
+        attn_metadata.advance_step(
+            frozen_model_input,
+            sampled_token_ids,
+            self.block_size,
+            num_seqs,
+            num_queries,
+        )
 
         return model_input
 
diff --git a/vllm/worker/multi_step_tpu_worker.py b/vllm/worker/multi_step_tpu_worker.py
new file mode 100644
index 0000000000000..e654f7172b266
--- /dev/null
+++ b/vllm/worker/multi_step_tpu_worker.py
@@ -0,0 +1,105 @@
+import dataclasses
+from typing import Dict, Optional, Tuple
+
+import torch
+
+from vllm.distributed import broadcast_tensor_dict
+from vllm.sequence import ExecuteModelRequest
+from vllm.worker.tpu_model_runner import ModelInputForTPU
+from vllm.worker.tpu_worker import TPUWorker
+from vllm.worker.worker_base import WorkerInput
+
+
+class MultiStepTPUWorker(TPUWorker):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.cached_model_input: Optional[ModelInputForTPU] = None
+
+    def _get_driver_input_and_broadcast(
+        self, execute_model_req: ExecuteModelRequest
+    ) -> Tuple[ModelInputForTPU, WorkerInput, Dict[str, torch.Tensor]]:
+        assert self.is_driver_worker
+        assert execute_model_req.virtual_engine == 0
+
+        is_first_multi_step = execute_model_req.is_first_multi_step
+        is_last_step = execute_model_req.is_last_step
+        if is_first_multi_step:
+            worker_input: WorkerInput = self.prepare_worker_input(
+                execute_model_req=execute_model_req)
+            worker_input = dataclasses.replace(
+                worker_input,
+                num_steps=execute_model_req.num_lookahead_slots + 1)
+            model_input: ModelInputForTPU = (
+                self.model_runner.prepare_model_input(
+                    execute_model_req.seq_group_metadata_list,
+                    execute_model_req.virtual_engine,
+                    execute_model_req.finished_requests_ids))
+
+            if execute_model_req.async_callback:
+                model_input = dataclasses.replace(
+                    model_input,
+                    async_callback=execute_model_req.async_callback)
+        else:
+            assert self.cached_model_input is not None
+            model_input = self.cached_model_input
+            worker_input = WorkerInput()
+        model_input = dataclasses.replace(
+            model_input,
+            is_first_multi_step=is_first_multi_step,
+            is_last_step=is_last_step)
+
+        if self.do_metadata_broadcast:
+            if is_first_multi_step:
+                broadcast_data = worker_input.as_broadcastable_tensor_dict()
+                broadcast_data.update(
+                    model_input.as_broadcastable_tensor_dict())
+                broadcast_tensor_dict(broadcast_data, src=0)
+            else:
+                broadcast_data = {
+                    "is_first_multi_step": is_first_multi_step,
+                    "is_last_step": is_last_step,
+                }
+                broadcast_tensor_dict(broadcast_data, src=0)
+
+        # Retuning empty dict here to keep this compatible with
+        # `LocalOrDistributedWorkerBase._get_driver_input_and_broadcast`
+        return model_input, worker_input, {}
+
+    def prepare_input(
+        self,
+        execute_model_req: Optional[ExecuteModelRequest] = None,
+    ) -> Optional[Tuple[ModelInputForTPU, WorkerInput, Dict[str,
+                                                            torch.Tensor]]]:
+        if self.is_driver_worker:
+            if execute_model_req is None:
+                if self.do_metadata_broadcast:
+                    broadcast_tensor_dict({}, src=0)
+                return None
+
+            model_input, worker_input, _ = self._get_driver_input_and_broadcast(
+                execute_model_req)
+            if model_input.is_first_multi_step:
+                self.cached_model_input = model_input
+            return model_input, worker_input, {}
+        else:
+            broadcast_data = broadcast_tensor_dict(src=0)
+            if not broadcast_data:
+                return None
+
+            if len(broadcast_data) == 2:
+                assert self.cached_model_input is not None
+                self.cached_model_input = dataclasses.replace(
+                    self.cached_model_input,
+                    is_first_multi_step=broadcast_data["is_first_multi_step"],
+                    is_last_step=broadcast_data["is_last_step"])
+                empty_worker_input = WorkerInput()
+                return self.cached_model_input, empty_worker_input, {}
+
+            worker_input = WorkerInput.from_broadcasted_tensor_dict(
+                broadcast_data)
+            model_input = (
+                self.model_runner.
+                make_model_input_from_broadcasted_tensor_dict(broadcast_data))
+            self.cached_model_input = model_input
+            return model_input, worker_input, {}
diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py
index db306bc743d3a..575769ca1aa4a 100644
--- a/vllm/worker/tpu_model_runner.py
+++ b/vllm/worker/tpu_model_runner.py
@@ -51,6 +51,8 @@ class ModelInputForTPU(ModelRunnerInputBase):
     num_samples: int
     best_of: List[int]
     seq_groups: List[List[int]]
+    is_first_multi_step: bool = True
+    is_last_step: bool = True
     virtual_engine: int = 0
     async_callback: Optional[Callable] = None
 
@@ -65,6 +67,8 @@ def as_broadcastable_tensor_dict(
             "num_samples": self.num_samples,
             "best_of": self.best_of,
             "seq_groups": self.seq_groups,
+            "is_first_multi_step": self.is_first_multi_step,
+            "is_last_step": self.is_last_step,
             "virtual_engine": self.virtual_engine,
         }
         _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata)
@@ -118,6 +122,7 @@ def __init__(
             self.block_size,
             False,
         )
+        self.cached_step_outputs: List[torch.Tensor] = []
 
     def load_model(self) -> None:
         self.device = self.device_config.device
@@ -518,97 +523,159 @@ def execute_model(
         num_steps: int = 1,
     ) -> List[SamplerOutput]:
         assert intermediate_tensors is None
-        if num_steps > 1:
-            raise ValueError(
-                "TPUModelRunner does not support multi-step execution.")
-
-        def _execute_model(*args):
-            """Move input args from CPU to device and execute the model."""
-
-            new_args = []
-            for arg in args:
-                if isinstance(arg, torch.Tensor):
-                    arg = arg.to(self.device)
-                elif isinstance(arg, AttentionMetadata):
-                    arg.slot_mapping = arg.slot_mapping.to(self.device)
-                    if getattr(arg, "block_tables", None) is not None:
-                        arg.block_tables = arg.block_tables.to(self.device)
-                    if getattr(arg, "context_lens", None) is not None:
-                        arg.context_lens = arg.context_lens.to(self.device)
-                new_args.append(arg)
-            return self.model(*new_args, is_prompt=is_prompt)
-
-        num_prefills = model_input.attn_metadata.num_prefills
-        is_prompt = num_prefills > 0
+        if not model_input.is_first_multi_step:
+            if not model_input.is_last_step:
+                return []
+
+            use_async_out_proc = model_input.async_callback is not None
+            sampler_outputs = []
+            num_outputs = len(self.cached_step_outputs)
+            for i in range(num_outputs):
+                next_token_ids = self.cached_step_outputs.pop(0)
+                next_token_ids = next_token_ids.cpu().tolist()
+                sampler_output = _make_decode_output(next_token_ids,
+                                                     model_input.seq_groups)
+                sampler_outputs.append(sampler_output)
+
+                if i < num_outputs - 1 and use_async_out_proc:
+                    assert model_input.async_callback is not None
+                    ctx = model_input.async_callback.keywords[  # type: ignore
+                        "ctx"]
+                    ctx.append_output(
+                        outputs=[sampler_output],
+                        seq_group_metadata_list=ctx.seq_group_metadata_list,
+                        scheduler_outputs=ctx.scheduler_outputs,
+                        is_async=False,
+                        is_last_step=False)
+                    model_input.async_callback()
+            if use_async_out_proc:
+                return [sampler_outputs[-1]]
+            else:
+                return sampler_outputs
+
+        is_prompt = model_input.attn_metadata.num_prefills > 0
         if is_prompt:
+            assert num_steps == 1
             # NOTE(woosuk): Since the FlashAttention kernel does not support
             # ragged inputs, we split the prompts into different batches and
             # process them separately. This is a temporary hack that should be
             # optimized by using SplashAttention.
-            next_token_ids = []
             orig_slot_mapping = model_input.attn_metadata.slot_mapping
             batch_size = model_input.input_lens.shape[0]
             start_idx = 0
+            next_token_ids = []
             for i in range(batch_size):
                 # Get the actual prefill_len.
                 prefill_len = model_input.input_lens[i:i + 1].item()
                 prefill_len = _get_padded_prefill_len(prefill_len)
                 end_idx = start_idx + prefill_len
 
-                model_input.attn_metadata.slot_mapping = orig_slot_mapping[
-                    None, start_idx:end_idx]
-                model_input.attn_metadata.num_prefills = 1
-                output_token_ids = _execute_model(
-                    model_input.token_ids[None, start_idx:end_idx],
-                    model_input.position_ids[None, start_idx:end_idx],
-                    model_input.attn_metadata, model_input.input_lens[i:i + 1],
-                    model_input.t[i:i + 1], model_input.p[i:i + 1],
-                    model_input.num_samples, kv_caches)
-                if i == 0 and model_input.async_callback is not None:
-                    model_input.async_callback()
-                # Retrieve the outputs to CPU.
-                next_token_ids += output_token_ids.cpu().tolist()
+                token_ids = model_input.token_ids[None, start_idx:end_idx].to(
+                    self.device)
+                position_ids = model_input.position_ids[None,
+                                                        start_idx:end_idx].to(
+                                                            self.device)
+                attn_metadata = model_input.attn_metadata
+                attn_metadata.num_prefills = 1
+                attn_metadata.slot_mapping = orig_slot_mapping[
+                    None, start_idx:end_idx].to(self.device)
+                input_lens = model_input.input_lens[i:i + 1].to(self.device)
+                t = model_input.t[i:i + 1].to(self.device)
+                p = model_input.p[i:i + 1].to(self.device)
+                output_token_ids = self.model(token_ids,
+                                              position_ids,
+                                              attn_metadata,
+                                              input_lens,
+                                              t,
+                                              p,
+                                              model_input.num_samples,
+                                              kv_caches,
+                                              is_prompt=True)
+                next_token_ids.append(output_token_ids[0])
                 start_idx = end_idx
-        else:
-            # Execute the model.
-            output_token_ids = _execute_model(
-                model_input.token_ids, model_input.position_ids,
-                model_input.attn_metadata, model_input.input_lens,
-                model_input.t, model_input.p, model_input.num_samples,
-                kv_caches)
+
             if model_input.async_callback is not None:
                 model_input.async_callback()
             # Retrieve the outputs to CPU.
-            next_token_ids = output_token_ids.cpu().tolist()
-
-        # NOTE(woosuk): Minimal code to construct the sampler outputs.
-        # The TPU backend does not reuse the sampler, since the TPU backend
-        # does not support the advanced sampling parameters such as logprobs.
-        zero_logprob = Logprob(0.0)
-        batch_idx = 0
-        sampler_outputs = []
-        for seq_group in model_input.seq_groups:
-            seq_ids = seq_group
-            seq_outputs = []
-            if is_prompt:
+            next_token_ids = [
+                output_token_ids.cpu().tolist()
+                for output_token_ids in next_token_ids
+            ]
+
+            # NOTE(woosuk): Minimal code to construct the sampler outputs.
+            # The TPU backend does not reuse the sampler, since the TPU backend
+            # does not support advanced sampling parameters such as logprobs.
+            zero_logprob = Logprob(0.0)
+            sampler_outputs = []
+            for i, seq_group in enumerate(model_input.seq_groups):
+                seq_ids = seq_group
                 assert len(seq_ids) == 1
                 seq_id = seq_ids[0]
-                for i in range(model_input.best_of[batch_idx]):
-                    next_token_id = next_token_ids[batch_idx][i]
+                seq_outputs = []
+                for j in range(model_input.best_of[i]):
+                    next_token_id = next_token_ids[i][j]
                     seq_outputs.append(
                         SequenceOutput(seq_id, next_token_id,
                                        {next_token_id: zero_logprob}))
-                batch_idx += 1
-            else:
-                for seq_id in seq_ids:
-                    next_token_id = next_token_ids[batch_idx]
-                    seq_outputs.append(
-                        SequenceOutput(seq_id, next_token_id,
-                                       {next_token_id: zero_logprob}))
-                    batch_idx += 1
-            sampler_outputs.append(
-                CompletionSequenceGroupOutput(seq_outputs, None))
-        return [SamplerOutput(sampler_outputs)]
+                sampler_outputs.append(
+                    CompletionSequenceGroupOutput(seq_outputs, None))
+            return [SamplerOutput(sampler_outputs)]
+        else:
+            token_ids = model_input.token_ids.to(self.device)
+            position_ids = model_input.position_ids.to(self.device)
+            attn_metadata = model_input.attn_metadata
+            attn_metadata.slot_mapping = attn_metadata.slot_mapping.to(
+                self.device)
+            attn_metadata.block_tables = attn_metadata.block_tables.to(
+                self.device)
+            attn_metadata.context_lens = attn_metadata.context_lens.to(
+                self.device)
+            t = model_input.t.to(self.device)
+            p = model_input.p.to(self.device)
+            input_lens = model_input.input_lens.to(self.device)
+            for i in range(num_steps):
+                slot_mapping = attn_metadata.slot_mapping
+                output_token_ids = self.model(token_ids,
+                                              position_ids,
+                                              attn_metadata,
+                                              input_lens,
+                                              t,
+                                              p,
+                                              model_input.num_samples,
+                                              kv_caches,
+                                              is_prompt=False)
+                self.cached_step_outputs.append(output_token_ids)
+
+                if i < num_steps - 1:
+                    # Prepare the inputs for the next step.
+                    token_ids = output_token_ids.unsqueeze(dim=1).int()
+                    position_ids = position_ids + 1
+                    attn_metadata.context_lens = attn_metadata.context_lens + 1
+
+                    block_tables = attn_metadata.block_tables
+                    block_number = block_tables.gather(
+                        1,
+                        position_ids.long() // self.block_size)
+                    block_offset = position_ids % self.block_size
+
+                    is_padding = slot_mapping == _PAD_SLOT_ID
+                    slot_mapping = block_number * self.block_size + block_offset
+                    slot_mapping = slot_mapping.long()
+                    slot_mapping = torch.where(is_padding, _PAD_SLOT_ID,
+                                               slot_mapping)
+                    attn_metadata.slot_mapping = slot_mapping
+
+            if model_input.async_callback is not None:
+                model_input.async_callback()
+
+            if num_steps > 1:
+                return []
+            # Retrieve the outputs to CPU.
+            next_token_ids = self.cached_step_outputs.pop(0)
+            next_token_ids = next_token_ids.cpu().tolist()
+            sampler_output = _make_decode_output(next_token_ids,
+                                                 model_input.seq_groups)
+            return [sampler_output]
 
 
 class ModelWrapper(TorchCompileWrapperWithCustomDispatcher):
@@ -756,3 +823,24 @@ def _apply_top_p(logits: torch.Tensor, p: torch.Tensor) -> torch.Tensor:
     cutoff_logit = torch.gather(logits_sorted, -1, cutoff_index)
     logits = logits.masked_fill_(logits < cutoff_logit, -float("inf"))
     return logits
+
+
+def _make_decode_output(
+    next_token_ids: List[int],
+    seq_groups: List[List[int]],
+) -> SamplerOutput:
+    zero_logprob = Logprob(0.0)
+    sampler_outputs = []
+    batch_idx = 0
+    for seq_group in seq_groups:
+        seq_ids = seq_group
+        seq_outputs = []
+        for seq_id in seq_ids:
+            next_token_id = next_token_ids[batch_idx]
+            seq_outputs.append(
+                SequenceOutput(seq_id, next_token_id,
+                               {next_token_id: zero_logprob}))
+            batch_idx += 1
+        sampler_outputs.append(CompletionSequenceGroupOutput(
+            seq_outputs, None))
+    return SamplerOutput(sampler_outputs)
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 0ff559a9af53e..52092dc2dc291 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -166,6 +166,7 @@ def init_device(self) -> None:
             torch.cuda.set_device(self.device)
 
             _check_if_gpu_supports_dtype(self.model_config.dtype)
+            gc.collect()
             torch.cuda.empty_cache()
             self.init_gpu_memory = torch.cuda.mem_get_info()[0]
         else: