From 6e9ff050c8e83ad6d5e5eab621e83549e35933a1 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Mon, 25 Nov 2024 17:04:50 -0800
Subject: [PATCH 1/8] [misc] do not read HOST_IP (#10644)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/envs.py                      | 2 +-
 vllm/executor/ray_gpu_executor.py | 4 ++--
 vllm/executor/ray_hpu_executor.py | 4 ++--
 vllm/utils.py                     | 7 +++++++
 4 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/vllm/envs.py b/vllm/envs.py
index 14c1617f1be19..c896770e5f6bc 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -153,7 +153,7 @@ def get_default_config_root():
     # If you are using multi-node inference, you should set this differently
     # on each node.
     'VLLM_HOST_IP':
-    lambda: os.getenv('VLLM_HOST_IP', "") or os.getenv("HOST_IP", ""),
+    lambda: os.getenv('VLLM_HOST_IP', ""),
 
     # used in distributed environment to manually set the communication port
     # Note: if VLLM_PORT is set, and some code asks for multiple ports, the
diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py
index 810b0f06ff7b2..6542b18ae70b1 100644
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -216,8 +216,8 @@ def sort_by_driver_then_worker_ip(worker):
                 f"Every node should have a unique IP address. Got {n_nodes}"
                 f" nodes with node ids {list(node_workers.keys())} and "
                 f"{n_ips} unique IP addresses {all_ips}. Please check your"
-                " network configuration. If you set `VLLM_HOST_IP` or "
-                "`HOST_IP` environment variable, make sure it is unique for"
+                " network configuration. If you set `VLLM_HOST_IP`"
+                " environment variable, make sure it is unique for"
                 " each node.")
 
         VLLM_INSTANCE_ID = get_vllm_instance_id()
diff --git a/vllm/executor/ray_hpu_executor.py b/vllm/executor/ray_hpu_executor.py
index 6fe8c6c403358..a74328e5aa272 100644
--- a/vllm/executor/ray_hpu_executor.py
+++ b/vllm/executor/ray_hpu_executor.py
@@ -192,8 +192,8 @@ def sort_by_driver_then_worker_ip(worker):
                 f"Every node should have a unique IP address. Got {n_nodes}"
                 f" nodes with node ids {list(node_workers.keys())} and "
                 f"{n_ips} unique IP addresses {all_ips}. Please check your"
-                " network configuration. If you set `VLLM_HOST_IP` or "
-                "`HOST_IP` environment variable, make sure it is unique for"
+                " network configuration. If you set `VLLM_HOST_IP` "
+                "environment variable, make sure it is unique for"
                 " each node.")
 
         VLLM_INSTANCE_ID = get_vllm_instance_id()
diff --git a/vllm/utils.py b/vllm/utils.py
index dd4283e3ac381..bec876d983701 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -467,6 +467,13 @@ async def collect_from_async_generator(
 
 def get_ip() -> str:
     host_ip = envs.VLLM_HOST_IP
+    if "HOST_IP" in os.environ and "VLLM_HOST_IP" not in os.environ:
+        logger.warning(
+            "The environment variable HOST_IP is deprecated and ignored, as"
+            " it is often used by Docker and other software to"
+            " interact with the container's network stack. Please "
+            "use VLLM_HOST_IP instead to set the IP address for vLLM processes"
+            " to communicate with each other.")
     if host_ip:
         return host_ip
 

From 45ac4ff270b267765457159c0b75e1bb7ebf6d79 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Mon, 25 Nov 2024 18:32:09 -0800
Subject: [PATCH 2/8] [bugfix] fix aria model and add torch.compile (#10645)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/model_executor/models/aria.py  | 26 ++++----------------------
 vllm/model_executor/models/llama.py | 16 ++++++++++------
 2 files changed, 14 insertions(+), 28 deletions(-)

diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index 0356435e9c257..fa6b95f5481ad 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -29,7 +29,7 @@
                                               LlamaModel)
 from vllm.model_executor.models.utils import (AutoWeightsLoader, WeightsMapper,
                                               is_pp_missing_parameter,
-                                              make_layers, maybe_prefix,
+                                              maybe_prefix,
                                               merge_multimodal_embeddings)
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.base import MultiModalInputs
@@ -363,27 +363,9 @@ class AriaMoELMModel(LlamaModel):
     """
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        super().__init__(vllm_config=vllm_config, prefix=prefix)
-
-        config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
-        quant_config = vllm_config.quant_config
-
-        # FIXME: this is a hack to disable the compilation of the model
-        self.do_not_compile = True
-
-        self.layers = None
-
-        self.start_layer, self.end_layer, self.layers = make_layers(
-            config.num_hidden_layers,
-            lambda prefix: MoEDecoderLayer(
-                config=config,
-                cache_config=cache_config,
-                quant_config=quant_config,
-                prefix=prefix,
-            ),
-            prefix=f"{prefix}.layers",
-        )
+        super().__init__(vllm_config=vllm_config,
+                         prefix=prefix,
+                         layer_type=MoEDecoderLayer)
 
     # Adapted from LlamaModel.load_weights with the modification of adding
     # the expert weights mapping to `stacked_params_mapping`
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 33d78d74129c8..355b2f3ef8b28 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -20,7 +20,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only LLaMA model compatible with HuggingFace weights."""
-from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union
+from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Type, Union
 
 import torch
 from torch import nn
@@ -273,7 +273,11 @@ def forward(
 @support_torch_compile
 class LlamaModel(nn.Module):
 
-    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+    def __init__(self,
+                 *,
+                 vllm_config: VllmConfig,
+                 prefix: str = "",
+                 layer_type: Type[LlamaDecoderLayer] = LlamaDecoderLayer):
         super().__init__()
 
         config = vllm_config.model_config.hf_config
@@ -299,10 +303,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             self.embed_tokens = PPMissingLayer()
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
-            lambda prefix: LlamaDecoderLayer(config=config,
-                                             cache_config=cache_config,
-                                             quant_config=quant_config,
-                                             prefix=prefix),
+            lambda prefix: layer_type(config=config,
+                                      cache_config=cache_config,
+                                      quant_config=quant_config,
+                                      prefix=prefix),
             prefix=f"{prefix}.layers",
         )
         if get_pp_group().is_last_rank:

From a6760f6456b714409685e23301c820a85da856ca Mon Sep 17 00:00:00 2001
From: Sanket Kale <sanket.kale@fujitsu.com>
Date: Tue, 26 Nov 2024 08:02:39 +0530
Subject: [PATCH 3/8] [Feature] vLLM ARM Enablement for AARCH64 CPUs (#9228)

Signed-off-by: Sanket Kale <sanketk.kale@fujitsu.com>
Co-authored-by: Sanket Kale <sanketk.kale@fujitsu.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
---
 Dockerfile.arm                                |  62 +++
 cmake/cpu_extension.cmake                     |  33 +-
 csrc/cpu/attention.cpp                        |  18 +-
 csrc/cpu/cpu_types.hpp                        |   6 +-
 csrc/cpu/cpu_types_arm.hpp                    | 515 ++++++++++++++++++
 .../getting_started/arm-installation.rst      |  50 ++
 docs/source/index.rst                         |   1 +
 examples/offline_inference.py                 |   2 +-
 requirements-cpu.txt                          |   7 +-
 9 files changed, 678 insertions(+), 16 deletions(-)
 create mode 100644 Dockerfile.arm
 create mode 100644 csrc/cpu/cpu_types_arm.hpp
 create mode 100644 docs/source/getting_started/arm-installation.rst

diff --git a/Dockerfile.arm b/Dockerfile.arm
new file mode 100644
index 0000000000000..093ee2209222f
--- /dev/null
+++ b/Dockerfile.arm
@@ -0,0 +1,62 @@
+# This vLLM Dockerfile is used to construct an image that can build and run vLLM on ARM CPU platform.
+
+FROM ubuntu:22.04 AS cpu-test-arm
+
+ENV CCACHE_DIR=/root/.cache/ccache
+
+ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
+
+RUN --mount=type=cache,target=/var/cache/apt \
+    apt-get update -y \
+    && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
+    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
+    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+
+# py-cpuinfo is used to gather CPU info and optimize based on ARM Neoverse cores.
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install py-cpuinfo  # Use this to gather CPU info and optimize based on ARM Neoverse cores
+
+# Preload tcmalloc on ARM: it provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects.
+ENV LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4"
+
+RUN echo 'ulimit -c 0' >> ~/.bashrc
+
+WORKDIR /workspace
+
+ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
+ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
+    pip install --upgrade pip && \
+    pip install -r requirements-build.txt
+
+FROM cpu-test-arm AS build
+
+WORKDIR /workspace/vllm
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
+    --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
+    pip install -v -r requirements-cpu.txt
+
+COPY . .
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
+
+# Disabling AVX512 specific optimizations for ARM
+ARG VLLM_CPU_DISABLE_AVX512="true"
+ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/ccache \
+    --mount=type=bind,source=.git,target=.git \
+    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
+    pip install dist/*.whl && \
+    rm -rf dist
+
+WORKDIR /workspace/
+
+RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
\ No newline at end of file
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index 426189481575b..68f7ca1af05ad 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -16,16 +16,15 @@ include_directories("${CMAKE_SOURCE_DIR}/csrc")
 #
 # Check the compile flags
 #
-if (CMAKE_SYSTEM_PROCESSOR STREQUAL "ppc64le")
-    list(APPEND CXX_COMPILE_FLAGS
-        "-fopenmp"
-        "-DVLLM_CPU_EXTENSION")
-else()
+
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
     list(APPEND CXX_COMPILE_FLAGS
-        "-fopenmp"
         "-mf16c"
-        "-DVLLM_CPU_EXTENSION")
+    )
 endif()
+list(APPEND CXX_COMPILE_FLAGS
+    "-fopenmp"
+    "-DVLLM_CPU_EXTENSION")
 
 execute_process(COMMAND cat /proc/cpuinfo
                 RESULT_VARIABLE CPUINFO_RET
@@ -59,6 +58,8 @@ find_isa(${CPUINFO} "avx2" AVX2_FOUND)
 find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
 find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
 find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
+find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
+find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
 
 if (AVX512_FOUND AND NOT AVX512_DISABLED)
     list(APPEND CXX_COMPILE_FLAGS
@@ -78,9 +79,11 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
     else()
         message(WARNING "Disable AVX512-BF16 ISA support, no avx512_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512BF16=1.")
     endif()
+    
 elseif (AVX2_FOUND)
     list(APPEND CXX_COMPILE_FLAGS "-mavx2")
     message(WARNING "vLLM CPU backend using AVX2 ISA")
+    
 elseif (POWER9_FOUND OR POWER10_FOUND)
     message(STATUS "PowerPC detected")
     # Check for PowerPC VSX support
@@ -88,8 +91,20 @@ elseif (POWER9_FOUND OR POWER10_FOUND)
         "-mvsx"
         "-mcpu=native"
         "-mtune=native")
+
+elseif (ASIMD_FOUND)
+    message(STATUS "ARMv8 or later architecture detected")
+    if(ARM_BF16_FOUND)
+        message(STATUS "BF16 extension detected")
+        set(MARCH_FLAGS "-march=armv8.2-a+bf16+dotprod+fp16")
+        add_compile_definitions(ARM_BF16_SUPPORT)
+    else()
+        message(WARNING "BF16 functionality is not available")
+        set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16")  
+    endif()
+    list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS})     
 else()
-    message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 or Power9+ ISA support.")
+    message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA or ARMv8 support.")
 endif()
 
 #
@@ -159,4 +174,4 @@ define_gpu_extension_target(
     WITH_SOABI
 )
 
-message(STATUS "Enabling C extension.")
+message(STATUS "Enabling C extension.")
\ No newline at end of file
diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp
index e6c03dcb034fd..e21832ba7582f 100644
--- a/csrc/cpu/attention.cpp
+++ b/csrc/cpu/attention.cpp
@@ -51,6 +51,10 @@ struct KernelVecType<c10::BFloat16> {
   using v_load_vec_type = vec_op::BF16Vec16;
 };
 #else
+  #ifdef __aarch64__
+    #ifndef ARM_BF16_SUPPORT
+    // pass
+    #else
 template <>
 struct KernelVecType<c10::BFloat16> {
   using q_load_vec_type = vec_op::BF16Vec8;
@@ -60,6 +64,18 @@ struct KernelVecType<c10::BFloat16> {
   using qk_acc_vec_type = vec_op::FP32Vec16;
   using v_load_vec_type = vec_op::BF16Vec16;
 };
+    #endif
+  #else
+template <>
+struct KernelVecType<c10::BFloat16> {
+  using q_load_vec_type = vec_op::BF16Vec8;
+  using q_vec_type = vec_op::FP32Vec16;
+  using k_load_vec_type = vec_op::BF16Vec16;
+  using k_vec_type = vec_op::FP32Vec16;
+  using qk_acc_vec_type = vec_op::FP32Vec16;
+  using v_load_vec_type = vec_op::BF16Vec16;
+};
+  #endif
 #endif
 
 template <typename T>
@@ -779,4 +795,4 @@ void paged_attention_v2(
                                  CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(scalar_t);
                                  CPU_KERNEL_GUARD_OUT(paged_attention_v2_impl)
                                });
-}
+}
\ No newline at end of file
diff --git a/csrc/cpu/cpu_types.hpp b/csrc/cpu/cpu_types.hpp
index 0213be09105ed..28db0479748bf 100644
--- a/csrc/cpu/cpu_types.hpp
+++ b/csrc/cpu/cpu_types.hpp
@@ -1,4 +1,3 @@
-
 #ifndef CPU_TYPES_HPP
 #define CPU_TYPES_HPP
 
@@ -8,8 +7,11 @@
 #elif defined(__POWER9_VECTOR__)
   //ppc implementation
   #include "cpu_types_vsx.hpp"
+#elif defined(__aarch64__)
+  //arm implementation
+  #include "cpu_types_arm.hpp"
 #else
   #warning "unsupported vLLM cpu implementation"
 #endif
 
-#endif
+#endif
\ No newline at end of file
diff --git a/csrc/cpu/cpu_types_arm.hpp b/csrc/cpu/cpu_types_arm.hpp
new file mode 100644
index 0000000000000..73e0f8cb2e0fb
--- /dev/null
+++ b/csrc/cpu/cpu_types_arm.hpp
@@ -0,0 +1,515 @@
+#include <arm_neon.h>
+#include <torch/all.h> 
+#include <cmath>
+
+namespace vec_op {
+
+#ifdef ARM_BF16_SUPPORT
+  #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)                                 \
+    AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)                         \
+    AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)                         \
+    AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)  
+#else
+  #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)                                 \
+    AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)                         \
+    AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)
+#endif
+
+#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...)                          \
+  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
+
+#ifndef CPU_OP_GUARD
+#define CPU_KERNEL_GUARD_IN(NAME)
+#define CPU_KERNEL_GUARD_OUT(NAME)
+#else
+#define CPU_KERNEL_GUARD_IN(NAME)                                              \
+  std::cout << #NAME << " invoked." << std::endl;
+#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl;
+#endif
+
+#define FORCE_INLINE __attribute__((always_inline)) inline
+
+namespace {
+  template <typename T, T... indexes, typename F>
+  constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F &&f) {
+    (f(std::integral_constant<T, indexes>{}), ...);
+  };
+}; 
+
+template <typename T, T count, typename F,
+          typename = std::enable_if_t<std::is_invocable_v<F, T>>>
+constexpr void unroll_loop(F &&f) {
+  unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
+}
+
+template <typename T> struct Vec {
+  constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; };
+};
+
+struct FP32Vec8;
+struct FP32Vec16;
+
+struct FP16Vec8 : public Vec<FP16Vec8> {
+  constexpr static int VEC_ELEM_NUM = 8;
+
+  float16x8_t reg;
+
+  explicit FP16Vec8(const void *ptr)
+      : reg(vld1q_f16(static_cast<const __fp16 *>(ptr))) {};
+
+  explicit FP16Vec8(const FP32Vec8 &);
+
+  void save(void *ptr) const {
+    vst1q_f16(static_cast<__fp16 *>(ptr), reg);
+  }
+};
+
+struct FP16Vec16 : public Vec<FP16Vec16> {
+    constexpr static int VEC_ELEM_NUM = 16;
+
+    // Two 8-lane halves: val[0] holds elements 0-7, val[1] holds elements 8-15.
+    float16x8x2_t reg;
+
+    explicit FP16Vec16(const void *ptr) {
+        reg.val[0] = vld1q_f16(reinterpret_cast<const __fp16*>(ptr));
+        reg.val[1] = vld1q_f16(reinterpret_cast<const __fp16*>(ptr) + 8);
+    }
+
+    explicit FP16Vec16(const FP32Vec16& vec);
+
+    void save(void *ptr) const {
+        vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]);
+        vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]);
+    }
+
+    // Stores only the first elem_num elements to ptr; elem_num is clamped to [0, 16].
+    void save(void *ptr, const int elem_num) const {
+        __fp16 *dst = reinterpret_cast<__fp16*>(ptr);
+        // Clamp so that reg.val[full_blocks] below never indexes past val[1].
+        const int n = elem_num < 0 ? 0 : (elem_num > VEC_ELEM_NUM ? VEC_ELEM_NUM : elem_num);
+        const int full_blocks = n / 8;
+        const int remainder = n % 8;
+        if (full_blocks > 0) vst1q_f16(dst, reg.val[0]);
+        if (full_blocks > 1) vst1q_f16(dst + 8, reg.val[1]);
+
+        if (remainder > 0) {
+            // Spill the partial half to the stack: vgetq_lane_f16 needs a constant lane index.
+            __fp16 lanes[8];
+            vst1q_f16(lanes, reg.val[full_blocks]);
+            for (int i = 0; i < remainder; ++i) dst[full_blocks * 8 + i] = lanes[i];
+        }
+    }
+};
+
+
+#ifdef ARM_BF16_SUPPORT
+struct BF16Vec8 : public Vec<BF16Vec8> {
+  constexpr static int VEC_ELEM_NUM = 8;
+
+  bfloat16x8_t reg;
+
+  explicit BF16Vec8(const void *ptr)
+      : reg(*reinterpret_cast<const bfloat16x8_t *>(ptr)) {};
+
+  explicit BF16Vec8(bfloat16x8_t data) : reg(data) {};
+
+  explicit BF16Vec8(const FP32Vec8 &);
+
+  explicit BF16Vec8(float32x4x2_t v) : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1])) {};  
+
+  void save(void *ptr) const { *reinterpret_cast<bfloat16x8_t *>(ptr) = reg; }
+};
+
+struct BF16Vec16 : public Vec<BF16Vec16> {
+  constexpr static int VEC_ELEM_NUM = 16;
+
+  bfloat16x8x2_t reg;
+
+  explicit BF16Vec16(const void *ptr)
+      : reg(*reinterpret_cast<const bfloat16x8x2_t *>(ptr)) {};
+
+  explicit BF16Vec16(bfloat16x8x2_t data) : reg(data) {};
+
+  explicit BF16Vec16(const FP32Vec16 &);
+
+  explicit BF16Vec16(float32x4x4_t v) : reg({
+    vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1]),
+    vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[2]), v.val[3])
+  }){};
+
+  void save(void *ptr) const { *reinterpret_cast<bfloat16x8x2_t *>(ptr) = reg; };
+};
+
+struct BF16Vec32 : public Vec<BF16Vec32> {
+  constexpr static int VEC_ELEM_NUM = 32;
+
+  bfloat16x8x4_t reg;
+
+  explicit BF16Vec32(const void *ptr)
+      : reg(*reinterpret_cast<const bfloat16x8x4_t *>(ptr)) {};
+
+  explicit BF16Vec32(bfloat16x8x4_t data) : reg(data) {};
+
+  explicit BF16Vec32(const BF16Vec8 &vec8_data) : reg({
+    vec8_data.reg,
+    vec8_data.reg,
+    vec8_data.reg,
+    vec8_data.reg
+  }) {};
+
+  void save(void *ptr) const { *reinterpret_cast<bfloat16x8x4_t *>(ptr) = reg; };
+};
+#endif
+
+struct FP32Vec4 : public Vec<FP32Vec4> {
+  constexpr static int VEC_ELEM_NUM = 4;
+
+  union AliasReg {
+    float32x4_t reg;
+    float values[VEC_ELEM_NUM];
+  };
+
+  float32x4_t reg;
+
+  explicit FP32Vec4(float v) : reg(vdupq_n_f32(v)) {};
+
+  explicit FP32Vec4() : reg(vdupq_n_f32(0.0f)) {};
+
+  explicit FP32Vec4(const float *ptr) : reg(vld1q_f32(ptr)) {};
+
+  explicit FP32Vec4(float32x4_t data) : reg(data) {};
+
+  explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {};
+};
+
+struct FP32Vec8 : public Vec<FP32Vec8> {
+  constexpr static int VEC_ELEM_NUM = 8;
+  union AliasReg {
+    float32x4x2_t reg;
+    float values[VEC_ELEM_NUM];
+  };
+
+  float32x4x2_t reg;
+
+  explicit FP32Vec8(float v) : reg({vmovq_n_f32(v), vmovq_n_f32(v)}) {};
+
+  explicit FP32Vec8() : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0)}) {};
+
+  explicit FP32Vec8(const float *ptr) : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4)}) {};
+
+  explicit FP32Vec8(float32x4x2_t data) : reg(data) {};
+
+  explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {};
+
+  explicit FP32Vec8(const FP16Vec8 &v) {
+        reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg));  
+        reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg)); 
+    };
+
+  explicit FP32Vec8(float16x8_t v) : reg({vcvt_f32_f16(vget_low_f16(v)), vcvt_f32_f16(vget_high_f16(v))}) {};
+
+  #ifdef ARM_BF16_SUPPORT
+
+  explicit FP32Vec8(bfloat16x8_t v) : reg({vcvtq_low_f32_bf16(v), vcvtq_high_f32_bf16(v)}) {};
+
+  explicit FP32Vec8(const BF16Vec8 &v) : reg({vcvtq_low_f32_bf16(v.reg), vcvtq_high_f32_bf16(v.reg)}) {};
+
+  #endif
+
+  float reduce_sum() const {
+    AliasReg ar;
+    ar.reg = reg;
+    float answer = 0;
+    unroll_loop<int, VEC_ELEM_NUM>([&answer, &ar](int i) { answer += ar.values[i]; });
+
+    return answer;
+  }
+
+  FP32Vec8 exp() const {
+    AliasReg ar;
+    ar.reg = reg;
+
+    float32x2_t exp_vec0 = {expf(ar.values[0]), expf(ar.values[1])};
+    float32x2_t exp_vec1 = {expf(ar.values[2]), expf(ar.values[3])};
+    float32x2_t exp_vec2 = {expf(ar.values[4]), expf(ar.values[5])};
+    float32x2_t exp_vec3 = {expf(ar.values[6]), expf(ar.values[7])};
+
+    float32x4_t result0 = vcombine_f32(exp_vec0, exp_vec1);
+    float32x4_t result1 = vcombine_f32(exp_vec2, exp_vec3);
+
+    float32x4x2_t result;
+    result.val[0] = result0;
+    result.val[1] = result1;
+
+    return FP32Vec8(result);
+  }
+
+  FP32Vec8 tanh() const {
+    AliasReg ar;
+    ar.reg = reg;
+
+    float32x2_t tanh_vec0 = {tanhf(ar.values[0]), tanhf(ar.values[1])};
+    float32x2_t tanh_vec1 = {tanhf(ar.values[2]), tanhf(ar.values[3])};
+    float32x2_t tanh_vec2 = {tanhf(ar.values[4]), tanhf(ar.values[5])};
+    float32x2_t tanh_vec3 = {tanhf(ar.values[6]), tanhf(ar.values[7])};
+
+    float32x4_t result0 = vcombine_f32(tanh_vec0, tanh_vec1);
+    float32x4_t result1 = vcombine_f32(tanh_vec2, tanh_vec3);
+
+    float32x4x2_t result;
+    result.val[0] = result0;
+    result.val[1] = result1;
+
+    return FP32Vec8(result);
+  }
+
+  FP32Vec8 er() const {
+    AliasReg ar;
+    ar.reg = reg;
+
+    float32x2_t er_vec0 = {static_cast<float32_t>(erf(ar.values[0])), static_cast<float32_t>(erf(ar.values[1]))};
+    float32x2_t er_vec1 = {static_cast<float32_t>(erf(ar.values[2])), static_cast<float32_t>(erf(ar.values[3]))};
+    float32x2_t er_vec2 = {static_cast<float32_t>(erf(ar.values[4])), static_cast<float32_t>(erf(ar.values[5]))};
+    float32x2_t er_vec3 = {static_cast<float32_t>(erf(ar.values[6])), static_cast<float32_t>(erf(ar.values[7]))};
+
+    float32x4_t result0 = vcombine_f32(er_vec0, er_vec1);
+    float32x4_t result1 = vcombine_f32(er_vec2, er_vec3);
+
+    float32x4x2_t result;
+    result.val[0] = result0;
+    result.val[1] = result1;
+
+    return FP32Vec8(result);
+  } 
+
+  FP32Vec8 operator*(const FP32Vec8 &b) const {
+    return FP32Vec8(float32x4x2_t({vmulq_f32(reg.val[0], b.reg.val[0]), vmulq_f32(reg.val[1], b.reg.val[1])}));
+  }
+
+  FP32Vec8 operator+(const FP32Vec8 &b) const {
+    return FP32Vec8(float32x4x2_t({vaddq_f32(reg.val[0], b.reg.val[0]), vaddq_f32(reg.val[1], b.reg.val[1])}));
+  }
+
+  FP32Vec8 operator-(const FP32Vec8 &b) const {
+    return FP32Vec8(float32x4x2_t({vsubq_f32(reg.val[0], b.reg.val[0]), vsubq_f32(reg.val[1], b.reg.val[1])}));
+  }
+
+  FP32Vec8 operator/(const FP32Vec8 &b) const {
+    return FP32Vec8(float32x4x2_t({vdivq_f32(reg.val[0], b.reg.val[0]), vdivq_f32(reg.val[1], b.reg.val[1])}));
+  }
+
+  void save(float *ptr) const {
+    vst1q_f32(ptr, reg.val[0]);
+    vst1q_f32(ptr + 4, reg.val[1]);
+  }
+};
+
+struct FP32Vec16 : public Vec<FP32Vec16> {
+  constexpr static int VEC_ELEM_NUM = 16;
+  union AliasReg {
+    float32x4x4_t reg;
+    float values[VEC_ELEM_NUM];
+  };
+
+  float32x4x4_t reg;
+
+  explicit FP32Vec16(float v) : reg({vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v)}) {}
+
+  explicit FP32Vec16() : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0)}) {}
+
+  explicit FP32Vec16(const float *ptr) : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4), vld1q_f32(ptr + 8), vld1q_f32(ptr + 12)}) {}
+
+  explicit FP32Vec16(float32x4x4_t data) : reg(data) {}
+
+  explicit FP32Vec16(const FP32Vec8 &data) {
+        reg.val[0] = data.reg.val[0]; 
+        reg.val[1] = data.reg.val[1]; 
+        reg.val[2] = data.reg.val[0]; 
+        reg.val[3] = data.reg.val[1]; 
+  }
+
+  explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {}
+
+  explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v.reg)) {}
+
+  #ifdef ARM_BF16_SUPPORT
+  explicit FP32Vec16(bfloat16x8x2_t v) : reg({
+    vcvtq_low_f32_bf16(v.val[0]),
+    vcvtq_high_f32_bf16(v.val[0]),
+    vcvtq_low_f32_bf16(v.val[1]),
+    vcvtq_high_f32_bf16(v.val[1])
+  }) {};
+  #endif
+
+  explicit FP32Vec16(const FP32Vec4 &data) {
+    reg.val[0] = data.reg;
+    reg.val[1] = data.reg;
+    reg.val[2] = data.reg;
+    reg.val[3] = data.reg;
+  };
+
+  #ifdef ARM_BF16_SUPPORT
+  explicit FP32Vec16(const BF16Vec16 &v) : reg({
+    vcvtq_low_f32_bf16(v.reg.val[0]),
+    vcvtq_high_f32_bf16(v.reg.val[0]),
+    vcvtq_low_f32_bf16(v.reg.val[1]),
+    vcvtq_high_f32_bf16(v.reg.val[1])
+  }) {};
+
+  explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {};
+  #endif
+
+  explicit FP32Vec16(const FP16Vec16 &v) {
+      reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg.val[0]));
+      reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg.val[0]));
+      reg.val[2] = vcvt_f32_f16(vget_low_f16(v.reg.val[1]));
+      reg.val[3] = vcvt_f32_f16(vget_high_f16(v.reg.val[1]));
+  };
+
+  FP32Vec16 operator+(const FP32Vec16 &b) const {
+    return FP32Vec16(float32x4x4_t({
+        vaddq_f32(reg.val[0], b.reg.val[0]),
+        vaddq_f32(reg.val[1], b.reg.val[1]),
+        vaddq_f32(reg.val[2], b.reg.val[2]),
+        vaddq_f32(reg.val[3], b.reg.val[3])}));
+  };
+
+  FP32Vec16 operator*(const FP32Vec16 &b) const {
+    return FP32Vec16(float32x4x4_t({
+        vmulq_f32(reg.val[0], b.reg.val[0]),
+        vmulq_f32(reg.val[1], b.reg.val[1]),
+        vmulq_f32(reg.val[2], b.reg.val[2]),
+        vmulq_f32(reg.val[3], b.reg.val[3])}));
+  };
+
+  FP32Vec16 operator-(const FP32Vec16 &b) const {
+    return FP32Vec16(float32x4x4_t({
+        vsubq_f32(reg.val[0], b.reg.val[0]),
+        vsubq_f32(reg.val[1], b.reg.val[1]),
+        vsubq_f32(reg.val[2], b.reg.val[2]),
+        vsubq_f32(reg.val[3], b.reg.val[3])
+    }));
+  };
+
+  FP32Vec16 operator/(const FP32Vec16 &b) const {
+    return FP32Vec16(float32x4x4_t({
+        vdivq_f32(reg.val[0], b.reg.val[0]),
+        vdivq_f32(reg.val[1], b.reg.val[1]),
+        vdivq_f32(reg.val[2], b.reg.val[2]),
+        vdivq_f32(reg.val[3], b.reg.val[3])
+    }));
+  };
+
+  float reduce_sum() const {
+    AliasReg ar;
+    ar.reg = reg;
+    float answer = 0;
+    unroll_loop<int, VEC_ELEM_NUM>([&answer, &ar](int i) { answer += ar.values[i]; });
+
+    return answer;
+  };
+
+  template <int group_size> float reduce_sub_sum(int idx) {
+    static_assert(VEC_ELEM_NUM % group_size == 0);
+
+    AliasReg ar;
+    ar.reg = reg;
+    float answer = 0;
+    const int start = idx * group_size;
+    unroll_loop<int, group_size>(
+        [&answer, &start, ar](int i) { answer += ar.values[start + i]; });
+
+    return answer;
+  };
+
+  void save(float *ptr) const {
+    vst1q_f32(ptr, reg.val[0]);
+    vst1q_f32(ptr + 4, reg.val[1]);
+    vst1q_f32(ptr + 8, reg.val[2]);
+    vst1q_f32(ptr + 12, reg.val[3]);
+  };
+};
+
+template <typename T> struct VecType { using vec_type = void; };
+
+template <typename T> using vec_t = typename VecType<T>::vec_type;
+
+template <> struct VecType<float> { using vec_type = FP32Vec8; };
+
+template <> struct VecType<c10::Half> { using vec_type = FP16Vec8; };
+
+#ifdef ARM_BF16_SUPPORT
+template <> struct VecType<c10::BFloat16> { using vec_type = BF16Vec8; };
+#endif
+
+template <typename T> void storeFP32(float v, T *ptr) { *ptr = v; }
+
+template <> inline void storeFP32<c10::Half>(float v, c10::Half *ptr) {
+  *reinterpret_cast<__fp16 *>(ptr) = v;
+}
+
+inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) {
+    float16x4_t low_0 = vcvt_f16_f32(v.reg.val[0]);
+    float16x4_t high_0 = vcvt_f16_f32(v.reg.val[1]);
+    float16x4_t low_1 = vcvt_f16_f32(v.reg.val[2]);
+    float16x4_t high_1 = vcvt_f16_f32(v.reg.val[3]);
+
+    reg.val[0] = vcombine_f16(low_0, high_0);
+    reg.val[1] = vcombine_f16(low_1, high_1);
+};
+
+inline FP16Vec8 :: FP16Vec8(const FP32Vec8 &v) {
+    float16x4_t lower_half = vcvt_f16_f32(v.reg.val[0]);
+    float16x4_t upper_half = vcvt_f16_f32(v.reg.val[1]);
+
+    reg = vcombine_f16(lower_half, upper_half);
+};
+
+inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) {
+
+  acc.reg.val[0] = vfmaq_f32(acc.reg.val[0], a.reg.val[0], b.reg.val[0]);
+  acc.reg.val[1] = vfmaq_f32(acc.reg.val[1], a.reg.val[1], b.reg.val[1]);
+  acc.reg.val[2] = vfmaq_f32(acc.reg.val[2], a.reg.val[2], b.reg.val[2]);
+  acc.reg.val[3] = vfmaq_f32(acc.reg.val[3], a.reg.val[3], b.reg.val[3]);
+};
+
+#ifdef ARM_BF16_SUPPORT
+inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) {
+  // Widen bf16 lanes of a and b to fp32, then accumulate acc += a * b lane-wise.
+  float32x4_t a0_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[0]));
+  float32x4_t a0_high = vcvt_f32_bf16(vget_high_bf16(a.reg.val[0]));
+  float32x4_t a1_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[1]));
+  float32x4_t a1_high = vcvt_f32_bf16(vget_high_bf16(a.reg.val[1]));
+  // NOTE(review): only val[0]/val[1] (16 of the 32 bf16 elements) are read; val[2]/val[3] are ignored - confirm intended.
+  float32x4_t b0_low = vcvt_f32_bf16(vget_low_bf16(b.reg.val[0]));
+  float32x4_t b0_high = vcvt_f32_bf16(vget_high_bf16(b.reg.val[0]));
+  float32x4_t b1_low = vcvt_f32_bf16(vget_low_bf16(b.reg.val[1]));
+  float32x4_t b1_high = vcvt_f32_bf16(vget_high_bf16(b.reg.val[1]));
+  // One fused multiply-add per 4-lane group of the accumulator.
+  acc.reg.val[0] = vfmaq_f32(acc.reg.val[0], a0_low, b0_low);
+  acc.reg.val[1] = vfmaq_f32(acc.reg.val[1], a0_high, b0_high);
+  acc.reg.val[2] = vfmaq_f32(acc.reg.val[2], a1_low, b1_low);
+  acc.reg.val[3] = vfmaq_f32(acc.reg.val[3], a1_high, b1_high);
+};
+#endif
+
+#ifdef ARM_BF16_SUPPORT
+inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1])) {};
+
+inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) : reg({
+    vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1]),
+    vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[2]), v.reg.val[3])
+  }){};
+#endif
+
+inline void prefetch(const void *addr) {
+    __builtin_prefetch(addr, 0, 1);
+};
+
+#ifdef ARM_BF16_SUPPORT
+template <>
+inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) { 
+  *reinterpret_cast<__bf16 *>(ptr) = vcvth_bf16_f32(v);
+};
+#endif
+};
\ No newline at end of file
diff --git a/docs/source/getting_started/arm-installation.rst b/docs/source/getting_started/arm-installation.rst
new file mode 100644
index 0000000000000..7b457df92c11d
--- /dev/null
+++ b/docs/source/getting_started/arm-installation.rst
@@ -0,0 +1,50 @@
+.. _installation_arm:
+
+Installation for ARM CPUs
+=========================
+
+vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the x86 platform documentation covering:
+
+* CPU backend inference capabilities
+* Relevant runtime environment variables
+* Performance optimization tips
+
+The ARM CPU backend currently supports the Float32, FP16 and BFloat16 data types.
+Contents:
+
+1. :ref:`Requirements <arm_backend_requirements>`
+2. :ref:`Quick Start with Dockerfile <arm_backend_quick_start_dockerfile>`
+3. :ref:`Building from Source <build_arm_backend_from_source>`
+
+.. _arm_backend_requirements:
+
+Requirements
+------------
+
+* **Operating System**: Linux or macOS
+* **Compiler**: gcc/g++ >= 12.3.0 (optional, but recommended)
+* **Instruction Set Architecture (ISA)**: NEON support is required
+
+.. _arm_backend_quick_start_dockerfile:
+
+Quick Start with Dockerfile
+---------------------------
+
+You can quickly set up vLLM on ARM using Docker:
+
+.. code-block:: console
+
+    $ docker build -f Dockerfile.arm -t vllm-cpu-env --shm-size=4g .
+    $ docker run -it \
+                 --rm \
+                 --network=host \
+                 --cpuset-cpus=<cpu-id-list, optional> \
+                 --cpuset-mems=<memory-node, optional> \
+                 vllm-cpu-env
+
+.. _build_arm_backend_from_source:
+
+Building from Source
+--------------------
+
+To build vLLM from source on Ubuntu 22.04 or other Linux distributions, follow a process similar to the one used for x86. Compatibility has been tested on AWS Graviton3 instances.
diff --git a/docs/source/index.rst b/docs/source/index.rst
index c2afd806c50f9..0692e949f1c77 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -67,6 +67,7 @@ Documentation
    getting_started/openvino-installation
    getting_started/cpu-installation
    getting_started/gaudi-installation
+   getting_started/arm-installation
    getting_started/neuron-installation
    getting_started/tpu-installation
    getting_started/xpu-installation
diff --git a/examples/offline_inference.py b/examples/offline_inference.py
index 9b758fa2479f6..23cc6e8539431 100644
--- a/examples/offline_inference.py
+++ b/examples/offline_inference.py
@@ -19,4 +19,4 @@
 for output in outputs:
     prompt = output.prompt
     generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
\ No newline at end of file
diff --git a/requirements-cpu.txt b/requirements-cpu.txt
index 749b03a0603d8..db8ad9d3a015d 100644
--- a/requirements-cpu.txt
+++ b/requirements-cpu.txt
@@ -1,6 +1,7 @@
 # Common dependencies
 -r requirements-common.txt
 
-# Dependencies for x86_64 CPUs
-torch == 2.5.1+cpu; platform_machine != "ppc64le"
-torchvision; platform_machine != "ppc64le"   # required for the image processor of phi3v, this must be updated alongside torch
+# Dependencies for CPUs
+torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64"
+torch==2.5.1; platform_machine == "aarch64"
+torchvision; platform_machine != "ppc64le"  # required for the image processor of phi3v, this must be updated alongside torch
\ No newline at end of file

From 519e8e4182af8e25d78b062ba5e613df661e6e5d Mon Sep 17 00:00:00 2001
From: Ricky Xu <rickyx@anyscale.com>
Date: Mon, 25 Nov 2024 21:09:43 -0800
Subject: [PATCH 4/8] [v1] EngineArgs for better config handling for v1
 (#10382)

Signed-off-by: rickyx <rickyx@anyscale.com>
---
 .buildkite/test-pipeline.yaml              |  2 +-
 tests/v1/engine/test_async_llm.py          |  3 ++
 tests/v1/engine/test_engine_args.py        | 42 +++++++++++++++++
 tests/v1/engine/test_engine_core.py        |  3 +-
 tests/v1/engine/test_engine_core_client.py |  6 ++-
 vllm/engine/arg_utils.py                   | 53 ++++++++++++++++++++--
 vllm/engine/async_llm_engine.py            |  2 +-
 vllm/engine/llm_engine.py                  |  2 +-
 vllm/engine/multiprocessing/engine.py      |  2 +-
 vllm/entrypoints/openai/api_server.py      |  4 +-
 vllm/v1/engine/async_llm.py                |  2 +-
 vllm/v1/engine/core.py                     | 13 ------
 vllm/v1/engine/llm_engine.py               |  2 +-
 13 files changed, 109 insertions(+), 27 deletions(-)
 create mode 100644 tests/v1/engine/test_engine_args.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index bff33d35b423e..fc23c9cff0d87 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -172,7 +172,7 @@ steps:
     - vllm/
     - tests/v1
   commands:
-    - pytest -v -s v1
+    - VLLM_USE_V1=1 pytest -v -s v1
 
 - label: Examples Test # 15min
   working_dir: "/vllm-workspace/examples"
diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py
index 1f26fe0fc892f..fffb5b8100ec7 100644
--- a/tests/v1/engine/test_async_llm.py
+++ b/tests/v1/engine/test_async_llm.py
@@ -32,6 +32,9 @@ async def generate(engine: AsyncLLM, request_id: str,
 
 @pytest.mark.asyncio
 async def test_load(monkeypatch):
+    # TODO(rickyx): Remove monkeypatch once we have a better way to test V1
+    # so that in the future when we switch, we don't have to change all the
+    # tests.
     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_V1", "1")
 
diff --git a/tests/v1/engine/test_engine_args.py b/tests/v1/engine/test_engine_args.py
new file mode 100644
index 0000000000000..69cfdf5a395c1
--- /dev/null
+++ b/tests/v1/engine/test_engine_args.py
@@ -0,0 +1,42 @@
+import pytest
+
+from vllm import envs
+from vllm.config import VllmConfig
+from vllm.engine.arg_utils import EngineArgs
+from vllm.usage.usage_lib import UsageContext
+
+if not envs.VLLM_USE_V1:
+    pytest.skip(
+        "Skipping V1 tests. Rerun with `VLLM_USE_V1=1` to test.",
+        allow_module_level=True,
+    )
+
+
+def test_defaults():
+    engine_args = EngineArgs(model="facebook/opt-125m")
+
+    # Assert V1 defaults
+    assert (engine_args.enable_prefix_caching
+            ), "V1 turns on prefix caching by default"
+
+
+def test_defaults_with_usage_context():
+    engine_args = EngineArgs(model="facebook/opt-125m")
+    vllm_config: VllmConfig = engine_args.create_engine_config(
+        UsageContext.LLM_CLASS)
+
+    assert vllm_config.scheduler_config.max_num_seqs == 1024
+    assert vllm_config.scheduler_config.max_num_batched_tokens == 8192
+
+    engine_args = EngineArgs(model="facebook/opt-125m")
+    vllm_config = engine_args.create_engine_config(
+        UsageContext.OPENAI_API_SERVER)
+    assert vllm_config.scheduler_config.max_num_seqs == 1024
+    assert vllm_config.scheduler_config.max_num_batched_tokens == 2048
+
+
+def test_prefix_cache_disabled_with_multimodel():
+    engine_args = EngineArgs(model="llava-hf/llava-1.5-7b-hf")
+
+    vllm_config = engine_args.create_engine_config(UsageContext.LLM_CLASS)
+    assert not vllm_config.cache_config.enable_prefix_caching
diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py
index b3692b594326a..bd11ff1877064 100644
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -43,7 +43,8 @@ def test_engine_core(monkeypatch):
         m.setenv("VLLM_USE_V1", "1")
         """Setup the EngineCore."""
         engine_args = EngineArgs(model=MODEL_NAME)
-        vllm_config = engine_args.create_engine_config()
+        vllm_config = engine_args.create_engine_config(
+            usage_context=UsageContext.UNKNOWN_CONTEXT)
         executor_class = AsyncLLM._get_executor_cls(vllm_config)
 
         engine_core = EngineCore(vllm_config=vllm_config,
diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py
index e248e35ae4069..582192196aaf9 100644
--- a/tests/v1/engine/test_engine_core_client.py
+++ b/tests/v1/engine/test_engine_core_client.py
@@ -82,7 +82,8 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool):
         m.setenv("VLLM_USE_V1", "1")
 
         engine_args = EngineArgs(model=MODEL_NAME, compilation_config=3)
-        vllm_config = engine_args.create_engine_config()
+        vllm_config = engine_args.create_engine_config(
+            UsageContext.UNKNOWN_CONTEXT)
         executor_class = AsyncLLM._get_executor_cls(vllm_config)
         client = EngineCoreClient.make_client(
             vllm_config,
@@ -153,7 +154,8 @@ async def test_engine_core_client_asyncio(monkeypatch):
         m.setenv("VLLM_USE_V1", "1")
 
         engine_args = EngineArgs(model=MODEL_NAME)
-        vllm_config = engine_args.create_engine_config()
+        vllm_config = engine_args.create_engine_config(
+            usage_context=UsageContext.UNKNOWN_CONTEXT)
         executor_class = AsyncLLM._get_executor_cls(vllm_config)
         client = EngineCoreClient.make_client(
             vllm_config,
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index ca68c1d57151c..60ad5ee54a2f2 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -20,6 +20,7 @@
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.platforms import current_platform
 from vllm.transformers_utils.utils import check_gguf_file
+from vllm.usage.usage_lib import UsageContext
 from vllm.utils import FlexibleArgumentParser, StoreBoolean
 
 if TYPE_CHECKING:
@@ -113,7 +114,7 @@ class EngineArgs:
     # NOTE(kzawora): default block size for Gaudi should be 128
     # smaller sizes still work, but very inefficiently
     block_size: int = 16 if not current_platform.is_hpu() else 128
-    enable_prefix_caching: bool = False
+    enable_prefix_caching: Optional[bool] = None
     disable_sliding_window: bool = False
     use_v2_block_manager: bool = True
     swap_space: float = 4  # GiB
@@ -197,6 +198,11 @@ def __post_init__(self):
         if not self.tokenizer:
             self.tokenizer = self.model
 
+        # Override the default value of enable_prefix_caching if it's not set
+        # by user.
+        if self.enable_prefix_caching is None:
+            self.enable_prefix_caching = bool(envs.VLLM_USE_V1)
+
         # support `EngineArgs(compilation_config={...})`
         # without having to manually construct a
         # CompilationConfig object
@@ -953,7 +959,12 @@ def create_load_config(self) -> LoadConfig:
             ignore_patterns=self.ignore_patterns,
         )
 
-    def create_engine_config(self) -> VllmConfig:
+    def create_engine_config(self,
+                             usage_context: Optional[UsageContext] = None
+                             ) -> VllmConfig:
+        if envs.VLLM_USE_V1:
+            self._override_v1_engine_args(usage_context)
+
         # gguf file needs a specific model loader and doesn't use hf_repo
         if check_gguf_file(self.model):
             self.quantization = self.load_format = "gguf"
@@ -1170,7 +1181,7 @@ def create_engine_config(self) -> VllmConfig:
             or "all" in detailed_trace_modules,
         )
 
-        return VllmConfig(
+        config = VllmConfig(
             model_config=model_config,
             cache_config=cache_config,
             parallel_config=parallel_config,
@@ -1185,6 +1196,42 @@ def create_engine_config(self) -> VllmConfig:
             compilation_config=self.compilation_config,
         )
 
+        if envs.VLLM_USE_V1:
+            self._override_v1_engine_config(config)
+        return config
+
+    def _override_v1_engine_args(self, usage_context: UsageContext) -> None:
+        """
+        Override the EngineArgs's args based on the usage context for V1.
+        """
+        assert envs.VLLM_USE_V1, "V1 is not enabled"
+
+        if self.max_num_batched_tokens is None:
+            # When no user override, set the default values based on the
+            # usage context.
+            if usage_context == UsageContext.LLM_CLASS:
+                logger.warning("Setting max_num_batched_tokens to 8192 "
+                               "for LLM_CLASS usage context.")
+                self.max_num_seqs = 1024
+                self.max_num_batched_tokens = 8192
+            elif usage_context == UsageContext.OPENAI_API_SERVER:
+                logger.warning("Setting max_num_batched_tokens to 2048 "
+                               "for OPENAI_API_SERVER usage context.")
+                self.max_num_seqs = 1024
+                self.max_num_batched_tokens = 2048
+
+    def _override_v1_engine_config(self, engine_config: VllmConfig) -> None:
+        """
+        V1 post-build override: disable prefix caching for multimodal models.
+        """
+        assert envs.VLLM_USE_V1, "V1 is not enabled"
+        # TODO (ywang96): Enable APC by default when VLM supports it.
+        if engine_config.model_config.is_multimodal_model:
+            logger.warning(
+                "Prefix caching is currently not supported for multimodal "
+                "models and has been disabled.")
+            engine_config.cache_config.enable_prefix_caching = False
+
 
 @dataclass
 class AsyncEngineArgs(EngineArgs):
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 5a5388708b1c6..3224577c567f8 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -680,7 +680,7 @@ def from_engine_args(
         """Creates an async LLM engine from the engine arguments."""
         # Create the engine configs.
         if engine_config is None:
-            engine_config = engine_args.create_engine_config()
+            engine_config = engine_args.create_engine_config(usage_context)
 
         executor_class = cls._get_executor_cls(engine_config)
 
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index fb21b2dedeb74..a4975cece9a81 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -568,7 +568,7 @@ def from_engine_args(
     ) -> "LLMEngine":
         """Creates an LLM engine from the engine arguments."""
         # Create the engine configs.
-        engine_config = engine_args.create_engine_config()
+        engine_config = engine_args.create_engine_config(usage_context)
         executor_class = cls._get_executor_cls(engine_config)
         # Create the LLM engine.
         engine = cls(
diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py
index 7de23643a2e1c..49a90b321dac4 100644
--- a/vllm/engine/multiprocessing/engine.py
+++ b/vllm/engine/multiprocessing/engine.py
@@ -111,7 +111,7 @@ def from_engine_args(cls, engine_args: AsyncEngineArgs,
         from vllm.plugins import load_general_plugins
         load_general_plugins()
 
-        engine_config = engine_args.create_engine_config()
+        engine_config = engine_args.create_engine_config(usage_context)
         executor_class = LLMEngine._get_executor_cls(engine_config)
 
         use_async_sockets = engine_config.model_config.use_async_output_proc
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index bc018be982bff..6bc31ef83ded4 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -135,8 +135,8 @@ async def build_async_engine_client_from_engine_args(
     # TODO: fill out feature matrix.
     if (MQLLMEngineClient.is_unsupported_config(engine_args)
             or envs.VLLM_USE_V1 or disable_frontend_multiprocessing):
-
-        engine_config = engine_args.create_engine_config()
+        engine_config = engine_args.create_engine_config(
+            UsageContext.OPENAI_API_SERVER)
         uses_ray = getattr(AsyncLLMEngine._get_executor_cls(engine_config),
                            "uses_ray", False)
 
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index c44ebb2a85ba0..a17c8eac4b77c 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -94,7 +94,7 @@ def from_engine_args(
 
         # Create the engine configs.
         if engine_config is None:
-            vllm_config = engine_args.create_engine_config()
+            vllm_config = engine_args.create_engine_config(usage_context)
         else:
             vllm_config = engine_config
 
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 1a978fbe7355f..34f99dd30ef2e 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -41,19 +41,6 @@ def __init__(
         executor_class: Type[GPUExecutor],
         usage_context: UsageContext,
     ):
-        # Override the configs for V1.
-        # FIXME
-        if usage_context == UsageContext.LLM_CLASS:
-            vllm_config.scheduler_config.max_num_seqs = 1024
-            vllm_config.scheduler_config.max_num_batched_tokens = 8192
-        elif usage_context == UsageContext.OPENAI_API_SERVER:
-            vllm_config.scheduler_config.max_num_seqs = 1024
-            vllm_config.scheduler_config.max_num_batched_tokens = 2048
-
-        # TODO (ywang96): Enable APC by default when VLM supports it.
-        if not vllm_config.model_config.is_multimodal_model:
-            vllm_config.cache_config.enable_prefix_caching = True
-
         assert vllm_config.model_config.task != "embedding"
 
         logger.info("Initializing an LLM engine (v%s) with config: %s",
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index 75a77be750acd..7a5482f03b6fa 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -82,7 +82,7 @@ def from_engine_args(
         """Creates an LLM engine from the engine arguments."""
 
         # Create the engine configs.
-        vllm_config = engine_args.create_engine_config()
+        vllm_config = engine_args.create_engine_config(usage_context)
         executor_class = cls._get_executor_cls(vllm_config)
 
         if VLLM_ENABLE_V1_MULTIPROCESSING:

From 9a88f897993a83fad79d1bf6b95595be25a8d68a Mon Sep 17 00:00:00 2001
From: Sage Moore <sage@neuralmagic.com>
Date: Tue, 26 Nov 2024 00:00:16 -0600
Subject: [PATCH 5/8] custom allreduce + torch.compile (#10121)

Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
---
 docs/source/getting_started/debugging.rst     |   1 -
 tests/distributed/test_pynccl.py              |  15 +--
 tests/distributed/test_utils.py               |   2 -
 .../device_communicators/pynccl.py            |  26 ++---
 vllm/distributed/parallel_state.py            | 110 ++++++------------
 vllm/v1/worker/gpu_model_runner.py            |   6 +-
 6 files changed, 59 insertions(+), 101 deletions(-)

diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst
index 77bf550601346..0c1afcbd7c0b9 100644
--- a/docs/source/getting_started/debugging.rst
+++ b/docs/source/getting_started/debugging.rst
@@ -86,7 +86,6 @@ If GPU/CPU communication cannot be established, you can use the following Python
     from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
 
     pynccl = PyNcclCommunicator(group=gloo_group, device=local_rank)
-    pynccl.disabled = False
 
     s = torch.cuda.Stream()
     with torch.cuda.stream(s):
diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py
index f702d7c46ea73..fb24d6bc2c100 100644
--- a/tests/distributed/test_pynccl.py
+++ b/tests/distributed/test_pynccl.py
@@ -60,7 +60,7 @@ def worker_fn():
     tensor = torch.ones(16, 1024, 1024,
                         dtype=torch.float32).cuda(pynccl_comm.rank)
     with pynccl_comm.change_state(enable=True):
-        pynccl_comm.all_reduce(tensor)
+        tensor = pynccl_comm.all_reduce(tensor)
     result = tensor.mean().cpu().item()
     assert result == pynccl_comm.world_size
 
@@ -84,12 +84,12 @@ def multiple_allreduce_worker_fn():
     with pynccl_comm.change_state(enable=True):
         # two groups can communicate independently
         if torch.distributed.get_rank() in [0, 1]:
-            pynccl_comm.all_reduce(tensor)
-            pynccl_comm.all_reduce(tensor)
+            tensor = pynccl_comm.all_reduce(tensor)
+            tensor = pynccl_comm.all_reduce(tensor)
             result = tensor.mean().cpu().item()
             assert result == 4
         else:
-            pynccl_comm.all_reduce(tensor)
+            tensor = pynccl_comm.all_reduce(tensor)
             result = tensor.mean().cpu().item()
             assert result == 2
 
@@ -140,14 +140,11 @@ def worker_fn_with_cudagraph():
         with torch.cuda.graph(
                 graph, stream=pynccl_comm.stream), pynccl_comm.change_state(
                     enable=True):
-            # operation during the graph capture is recorded but not executed
-            # see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#creating-a-graph-using-stream-capture # noqa
-            pynccl_comm.all_reduce(a)
+            a_out = pynccl_comm.all_reduce(a)
         pynccl_comm.stream.synchronize()
-        assert a.mean().cpu().item() == pynccl_comm.world_size**0
         graph.replay()
         pynccl_comm.stream.synchronize()
-        assert a.mean().cpu().item() == pynccl_comm.world_size**1
+        assert a_out.mean().cpu().item() == pynccl_comm.world_size**1
 
 
 @worker_fn_wrapper
diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py
index 686b697c98e03..5fb1ae7b29fd2 100644
--- a/tests/distributed/test_utils.py
+++ b/tests/distributed/test_utils.py
@@ -70,14 +70,12 @@ def gpu_worker(rank, WORLD_SIZE, port1, port2):
                                        rank=rank,
                                        world_size=WORLD_SIZE)
     pynccl1 = PyNcclCommunicator(pg1, device=rank)
-    pynccl1.disabled = False
     if rank <= 2:
         pg2 = StatelessProcessGroup.create(host="127.0.0.1",
                                            port=port2,
                                            rank=rank,
                                            world_size=3)
         pynccl2 = PyNcclCommunicator(pg2, device=rank)
-        pynccl2.disabled = False
     data = torch.tensor([rank]).cuda()
     pynccl1.all_reduce(data)
     pg1.barrier()
diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py
index 7411304eb18fa..d4e3f81747038 100644
--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -106,30 +106,30 @@ def __init__(
             self.stream.synchronize()
             del data
 
-        # by default it is disabled, e.g. in profiling models and prefill phase.
-        # to use it, use under `with obj.change_state(enable=True)`, usually
-        # when we are using CUDA graph.
-        self.disabled = True
-
     def all_reduce(self,
-                   tensor: torch.Tensor,
+                   in_tensor: torch.Tensor,
                    op: ReduceOp = ReduceOp.SUM,
-                   stream=None):
+                   stream=None) -> torch.Tensor:
         if self.disabled:
-            return
+            return None
         # nccl communicator created on a specific device
         # will only work on tensors on the same device
         # otherwise it will cause "illegal memory access"
-        assert tensor.device == self.device, (
+        assert in_tensor.device == self.device, (
             f"this nccl communicator is created to work on {self.device}, "
-            f"but the input tensor is on {tensor.device}")
+            f"but the input tensor is on {in_tensor.device}")
+
+        out_tensor = torch.empty_like(in_tensor)
+
         if stream is None:
             stream = self.stream
-        self.nccl.ncclAllReduce(buffer_type(tensor.data_ptr()),
-                                buffer_type(tensor.data_ptr()), tensor.numel(),
-                                ncclDataTypeEnum.from_torch(tensor.dtype),
+        self.nccl.ncclAllReduce(buffer_type(in_tensor.data_ptr()),
+                                buffer_type(out_tensor.data_ptr()),
+                                in_tensor.numel(),
+                                ncclDataTypeEnum.from_torch(in_tensor.dtype),
                                 ncclRedOpTypeEnum.from_torch(op), self.comm,
                                 cudaStream_t(stream.cuda_stream))
+        return out_tensor
 
     def all_gather(self,
                    output_tensor: torch.Tensor,
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 87ade377266a2..ccbe00386c5da 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -96,42 +96,24 @@ def _register_group(group: "GroupCoordinator") -> None:
     _groups[group.unique_name] = weakref.ref(group)
 
 
-if supports_custom_op():
-
-    def inplace_all_reduce(tensor: torch.Tensor, group_name: str) -> None:
-        assert group_name in _groups, f"Group {group_name} is not found."
-        group = _groups[group_name]()
-        if group is None:
-            raise ValueError(f"Group {group_name} is destroyed.")
-        group._all_reduce_in_place(tensor)
-
-    def inplace_all_reduce_fake(tensor: torch.Tensor, group_name: str) -> None:
-        return
+def all_reduce(tensor: torch.Tensor, group_name: str) -> torch.Tensor:
+    assert group_name in _groups, f"Group {group_name} is not found."
+    group = _groups[group_name]()
+    if group is None:
+        raise ValueError(f"Group {group_name} is destroyed.")
+    return group._all_reduce_out_place(tensor)
 
-    direct_register_custom_op(
-        op_name="inplace_all_reduce",
-        op_func=inplace_all_reduce,
-        mutates_args=["tensor"],
-        fake_impl=inplace_all_reduce_fake,
-    )
 
-    def outplace_all_reduce(tensor: torch.Tensor,
-                            group_name: str) -> torch.Tensor:
-        assert group_name in _groups, f"Group {group_name} is not found."
-        group = _groups[group_name]()
-        if group is None:
-            raise ValueError(f"Group {group_name} is destroyed.")
-        return group._all_reduce_out_place(tensor)
+def all_reduce_fake(tensor: torch.Tensor, group_name: str) -> torch.Tensor:
+    return torch.empty_like(tensor)
 
-    def outplace_all_reduce_fake(tensor: torch.Tensor,
-                                 group_name: str) -> torch.Tensor:
-        return torch.empty_like(tensor)
 
+if supports_custom_op():
     direct_register_custom_op(
-        op_name="outplace_all_reduce",
-        op_func=outplace_all_reduce,
+        op_name="all_reduce",
+        op_func=all_reduce,
         mutates_args=[],
-        fake_impl=outplace_all_reduce_fake,
+        fake_impl=all_reduce_fake,
     )
 
 
@@ -317,30 +299,13 @@ def graph_capture(
             stream.wait_stream(curr_stream)
 
         with torch.cuda.stream(stream), maybe_ca_context:
-            # In graph mode, we have to be very careful about the collective
-            # operations. The current status is:
-            #     allreduce \ Mode   |  Eager  |  Graph  |
-            # --------------------------------------------
-            # custom allreduce       | enabled | enabled |
-            # PyNccl                 | disabled| enabled |
-            # torch.distributed      | enabled | disabled|
-            #
-            # Note that custom allreduce will have a runtime check, if the
-            #  tensor size is too large, it will fallback to the next
-            #  available option.
-            # In summary: When using CUDA graph, we use
-            #  either custom all-reduce kernel or pynccl. When not using
-            #  CUDA graph, we use either custom all-reduce kernel or
-            #  PyTorch NCCL. We always prioritize using custom all-reduce
-            #  kernel but fall back to PyTorch or pynccl if it is
-            #  disabled or not supported.
             pynccl_comm = self.pynccl_comm
             maybe_pynccl_context: Any
             if not pynccl_comm:
                 maybe_pynccl_context = nullcontext()
             else:
                 maybe_pynccl_context = pynccl_comm.change_state(
-                    enable=True, stream=torch.cuda.current_stream())
+                    stream=torch.cuda.current_stream())
             with maybe_pynccl_context:
                 yield graph_capture_context
 
@@ -356,8 +321,8 @@ def all_reduce(self, input_: torch.Tensor) -> torch.Tensor:
          coordinator.
 
         In addition, PyTorch custom ops do not support mutation or returning
-        a new tensor in the same op. So we need to figure out if the op is
-        in-place or out-of-place ahead of time.
+        a new tensor in the same op. So we always make the all-reduce operation
+        out-of-place.
         """
         # Bypass the function if we are using only 1 GPU.
         if self.world_size == 1:
@@ -368,10 +333,6 @@ def all_reduce(self, input_: torch.Tensor) -> torch.Tensor:
             ipex.distributed.all_reduce(input_, group=self.device_group)
             return input_
 
-        if not supports_custom_op():
-            self._all_reduce_in_place(input_)
-            return input_
-
         if self.tpu_communicator is not None and \
             not self.tpu_communicator.disabled:
             # TPU handles Dynamo with its own logic.
@@ -385,30 +346,31 @@ def all_reduce(self, input_: torch.Tensor) -> torch.Tensor:
                 not self.xpu_communicator.disabled:
             return self.xpu_communicator.all_reduce(input_)
 
-        if self.ca_comm is not None and \
-            not self.ca_comm.disabled and \
-                self.ca_comm.should_custom_ar(input_):
-            return torch.ops.vllm.outplace_all_reduce(
-                input_, group_name=self.unique_name)
-        else:
-            torch.ops.vllm.inplace_all_reduce(input_,
-                                              group_name=self.unique_name)
-            return input_
+        return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name)
 
     def _all_reduce_out_place(self, input_: torch.Tensor) -> torch.Tensor:
+        # always try custom allreduce first,
+        # and then pynccl.
         ca_comm = self.ca_comm
-        assert ca_comm is not None
-        assert not ca_comm.disabled
-        out = ca_comm.custom_all_reduce(input_)
-        assert out is not None
-        return out
-
-    def _all_reduce_in_place(self, input_: torch.Tensor) -> None:
+        if ca_comm is not None and not ca_comm.disabled and \
+            ca_comm.should_custom_ar(input_):
+            out = ca_comm.custom_all_reduce(input_)
+            assert out is not None
+            return out
         pynccl_comm = self.pynccl_comm
-        if (pynccl_comm is not None and not pynccl_comm.disabled):
-            pynccl_comm.all_reduce(input_)
-        else:
-            torch.distributed.all_reduce(input_, group=self.device_group)
+        assert pynccl_comm is not None
+        # TODO: pynccl should not use `stream=`
+        # it can just always use the current stream.
+        out = pynccl_comm.all_reduce(input_,
+                                     stream=torch.cuda.current_stream())
+        if out is None:
+            # fall back to the default all-reduce using PyTorch.
+            # this usually happens during testing.
+            # when we run the model, allreduce only happens for the TP
+            # group, where we always have either custom allreduce or pynccl.
+            out = input_.clone()
+            torch.distributed.all_reduce(out, group=self.device_group)
+        return out
 
     def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
         world_size = self.world_size
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 02f9498142bb7..13cbc8fa39c03 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -10,6 +10,7 @@
 
 from vllm.compilation.compile_context import set_compile_context
 from vllm.config import CompilationLevel, VllmConfig
+from vllm.distributed.parallel_state import graph_capture
 from vllm.forward_context import set_forward_context
 from vllm.inputs import INPUT_REGISTRY, InputRegistry
 from vllm.logger import init_logger
@@ -570,8 +571,9 @@ def capture_model(self) -> None:
         # Trigger CUDA graph capture for specific shapes.
         # Capture the large shapes first so that the smaller shapes
         # can reuse the memory pool allocated for the large shapes.
-        for num_tokens in reversed(self.cudagraph_batch_sizes):
-            self._dummy_run(self.model, num_tokens, self.kv_caches)
+        with graph_capture():
+            for num_tokens in reversed(self.cudagraph_batch_sizes):
+                self._dummy_run(self.model, num_tokens, self.kv_caches)
 
         end_time = time.perf_counter()
         end_free_gpu_memory = torch.cuda.mem_get_info()[0]

From 940635343a087a5fb6548449989b84de77af5e73 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Tue, 26 Nov 2024 14:55:00 +0800
Subject: [PATCH 6/8] [Misc] Remove outdated init protocols (#10655)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 vllm/model_executor/models/interfaces.py      | 30 -------------------
 vllm/model_executor/models/interfaces_base.py |  2 +-
 2 files changed, 1 insertion(+), 31 deletions(-)

diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index 4f0c75b2c6a57..9b4a97abf9b51 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -10,7 +10,6 @@
 from .interfaces_base import is_embedding_model
 
 if TYPE_CHECKING:
-    from vllm.config import LoRAConfig, MultiModalConfig, SchedulerConfig
     from vllm.sequence import IntermediateTensors
 
 logger = init_logger(__name__)
@@ -29,9 +28,6 @@ class SupportsMultiModal(Protocol):
         MRO of your model class.
     """
 
-    def __init__(self, *, multimodal_config: "MultiModalConfig") -> None:
-        ...
-
 
 # We can't use runtime_checkable with ClassVar for issubclass checks
 # so we need to treat the class as an instance and use isinstance instead
@@ -39,9 +35,6 @@ def __init__(self, *, multimodal_config: "MultiModalConfig") -> None:
 class _SupportsMultiModalType(Protocol):
     supports_multimodal: Literal[True]
 
-    def __call__(self, *, multimodal_config: "MultiModalConfig") -> None:
-        ...
-
 
 @overload
 def supports_multimodal(
@@ -81,10 +74,6 @@ class SupportsLoRA(Protocol):
     embedding_modules: ClassVar[Dict[str, str]]
     embedding_padding_modules: ClassVar[List[str]]
 
-    # lora_config is None when LoRA is not enabled
-    def __init__(self, *, lora_config: Optional["LoRAConfig"] = None) -> None:
-        ...
-
 
 # We can't use runtime_checkable with ClassVar for issubclass checks
 # so we need to treat the class as an instance and use isinstance instead
@@ -97,9 +86,6 @@ class _SupportsLoRAType(Protocol):
     embedding_modules: Dict[str, str]
     embedding_padding_modules: List[str]
 
-    def __call__(self, *, lora_config: Optional["LoRAConfig"] = None) -> None:
-        ...
-
 
 @overload
 def supports_lora(model: Type[object]) -> TypeIs[Type[SupportsLoRA]]:
@@ -276,21 +262,11 @@ class HasInnerState(Protocol):
         for max_num_seqs, etc. True for e.g. both Mamba and Jamba.
     """
 
-    def __init__(self,
-                 *,
-                 scheduler_config: Optional["SchedulerConfig"] = None) -> None:
-        ...
-
 
 @runtime_checkable
 class _HasInnerStateType(Protocol):
     has_inner_state: ClassVar[Literal[True]]
 
-    def __init__(self,
-                 *,
-                 scheduler_config: Optional["SchedulerConfig"] = None) -> None:
-        ...
-
 
 @overload
 def has_inner_state(model: object) -> TypeIs[HasInnerState]:
@@ -323,17 +299,11 @@ class IsAttentionFree(Protocol):
         True for Mamba but not Jamba.
     """
 
-    def __init__(self) -> None:
-        ...
-
 
 @runtime_checkable
 class _IsAttentionFreeType(Protocol):
     is_attention_free: ClassVar[Literal[True]]
 
-    def __init__(self) -> None:
-        ...
-
 
 @overload
 def is_attention_free(model: object) -> TypeIs[IsAttentionFree]:
diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py
index 7bb43beff255c..957a5a6e26b5c 100644
--- a/vllm/model_executor/models/interfaces_base.py
+++ b/vllm/model_executor/models/interfaces_base.py
@@ -71,7 +71,7 @@ def _check_vllm_model_forward(model: Union[Type[object], object]) -> bool:
                         and issubclass(model, nn.Module)):
         logger.warning(
             "The model (%s) is missing "
-            "vLLM-specific keywords from its initializer: %s",
+            "vLLM-specific keywords from its `forward` method: %s",
             model,
             missing_kws,
         )

From 334d64d1e816cc7c9fa2f67e22d24638e63c8e15 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Tue, 26 Nov 2024 00:20:04 -0800
Subject: [PATCH 7/8] [ci] add vllm_test_utils (#10659)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 Dockerfile                                    |  4 ++
 Dockerfile.cpu                                |  4 ++
 Dockerfile.hpu                                |  3 ++
 Dockerfile.neuron                             |  3 ++
 Dockerfile.openvino                           |  3 ++
 Dockerfile.ppc64le                            |  3 ++
 Dockerfile.rocm                               |  3 ++
 Dockerfile.tpu                                |  3 ++
 Dockerfile.xpu                                |  3 +-
 tests/entrypoints/llm/test_lazy_outlines.py   | 23 +++++---
 tests/test_lazy_torch_compile.py              | 54 +------------------
 tests/vllm_test_utils/setup.py                |  7 +++
 .../vllm_test_utils/__init__.py               |  8 +++
 .../vllm_test_utils/vllm_test_utils/blame.py  | 53 ++++++++++++++++++
 14 files changed, 113 insertions(+), 61 deletions(-)
 create mode 100644 tests/vllm_test_utils/setup.py
 create mode 100644 tests/vllm_test_utils/vllm_test_utils/__init__.py
 create mode 100644 tests/vllm_test_utils/vllm_test_utils/blame.py

diff --git a/Dockerfile b/Dockerfile
index 220dbe26712ec..682f046d4b6ec 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -191,6 +191,10 @@ ADD . /vllm-workspace/
 RUN --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install -r requirements-dev.txt
 
+# install development dependencies (for testing)
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install -e tests/vllm_test_utils
+
 # enable fast downloads from hf (for testing)
 RUN --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install hf_transfer
diff --git a/Dockerfile.cpu b/Dockerfile.cpu
index 287b4958da4e5..d2f72ea975a3d 100644
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@@ -62,4 +62,8 @@ WORKDIR /workspace/
 
 RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
 
+# install development dependencies (for testing)
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -e tests/vllm_test_utils
+
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
diff --git a/Dockerfile.hpu b/Dockerfile.hpu
index d18fc016387bf..87e0c1a6a934e 100644
--- a/Dockerfile.hpu
+++ b/Dockerfile.hpu
@@ -11,6 +11,9 @@ ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
 
 RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install
 
+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils
+
 WORKDIR /workspace/
 
 RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
diff --git a/Dockerfile.neuron b/Dockerfile.neuron
index 2143315d2a078..76dbd4c04d3f3 100644
--- a/Dockerfile.neuron
+++ b/Dockerfile.neuron
@@ -38,4 +38,7 @@ ENV VLLM_TARGET_DEVICE neuron
 RUN --mount=type=bind,source=.git,target=.git \
     pip install --no-build-isolation -v -e .
 
+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils
+
 CMD ["/bin/bash"]
diff --git a/Dockerfile.openvino b/Dockerfile.openvino
index a05ff452cd36e..8bd188ffde408 100644
--- a/Dockerfile.openvino
+++ b/Dockerfile.openvino
@@ -22,4 +22,7 @@ RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVIC
 COPY examples/ /workspace/examples
 COPY benchmarks/ /workspace/benchmarks
 
+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils
+
 CMD ["/bin/bash"]
diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le
index b19c6ddec7948..971248577983f 100644
--- a/Dockerfile.ppc64le
+++ b/Dockerfile.ppc64le
@@ -29,6 +29,9 @@ RUN --mount=type=cache,target=/root/.cache/pip  \
 RUN --mount=type=bind,source=.git,target=.git \
     VLLM_TARGET_DEVICE=cpu python3 setup.py install
 
+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils
+
 WORKDIR /workspace/
 
 RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index 62d4a9b4909c3..e733994f8c33e 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -168,4 +168,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     if ls libs/*.whl; then \
     python3 -m pip install libs/*.whl; fi
 
+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils
+
 CMD ["/bin/bash"]
diff --git a/Dockerfile.tpu b/Dockerfile.tpu
index 0a507b6ecdf60..b617932a85b47 100644
--- a/Dockerfile.tpu
+++ b/Dockerfile.tpu
@@ -22,4 +22,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
         -r requirements-tpu.txt
 RUN python3 setup.py develop
 
+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils
+
 CMD ["/bin/bash"]
diff --git a/Dockerfile.xpu b/Dockerfile.xpu
index 63bc682770422..a374f20d7d949 100644
--- a/Dockerfile.xpu
+++ b/Dockerfile.xpu
@@ -64,5 +64,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 
 ENV VLLM_USAGE_SOURCE production-docker-image \
     TRITON_XPU_PROFILE 1
-
+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
diff --git a/tests/entrypoints/llm/test_lazy_outlines.py b/tests/entrypoints/llm/test_lazy_outlines.py
index cbfb0cc32c1ce..81fb000d8ac56 100644
--- a/tests/entrypoints/llm/test_lazy_outlines.py
+++ b/tests/entrypoints/llm/test_lazy_outlines.py
@@ -1,12 +1,12 @@
 import sys
 
+from vllm_test_utils import blame
+
 from vllm import LLM, SamplingParams
 from vllm.distributed import cleanup_dist_env_and_memory
 
 
-def test_lazy_outlines(sample_regex):
-    """If users don't use guided decoding, outlines should not be imported.
-    """
+def run_normal():
     prompts = [
         "Hello, my name is",
         "The president of the United States is",
@@ -25,13 +25,12 @@ def test_lazy_outlines(sample_regex):
         generated_text = output.outputs[0].text
         print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 
-    # make sure outlines is not imported
-    assert 'outlines' not in sys.modules
-
     # Destroy the LLM object and free up the GPU memory.
     del llm
     cleanup_dist_env_and_memory()
 
+
+def run_lmfe(sample_regex):
     # Create an LLM with guided decoding enabled.
     llm = LLM(model="facebook/opt-125m",
               enforce_eager=True,
@@ -51,5 +50,15 @@ def test_lazy_outlines(sample_regex):
         generated_text = output.outputs[0].text
         print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 
+
+def test_lazy_outlines(sample_regex):
+    """If users don't use guided decoding, outlines should not be imported.
+    """
     # make sure outlines is not imported
-    assert 'outlines' not in sys.modules
+    module_name = "outlines"
+    with blame(lambda: module_name in sys.modules) as result:
+        run_normal()
+        run_lmfe(sample_regex)
+    assert not result.found, (
+        f"Module {module_name} is already imported, the"
+        f" first import location is:\n{result.trace_stack}")
diff --git a/tests/test_lazy_torch_compile.py b/tests/test_lazy_torch_compile.py
index b8ac4dd93732b..4756fac8e2a8d 100644
--- a/tests/test_lazy_torch_compile.py
+++ b/tests/test_lazy_torch_compile.py
@@ -1,61 +1,9 @@
 # Description: Test the lazy import module
 # The utility function cannot be placed in `vllm.utils`
 # this needs to be a standalone script
-
-import contextlib
-import dataclasses
 import sys
-import traceback
-from typing import Callable, Generator
-
-
-@dataclasses.dataclass
-class BlameResult:
-    found: bool = False
-    trace_stack: str = ""
-
-
-@contextlib.contextmanager
-def blame(func: Callable) -> Generator[BlameResult, None, None]:
-    """
-    Trace the function calls to find the first function that satisfies the
-    condition. The trace stack will be stored in the result.
-
-    Usage:
-
-    ```python
-    with blame(lambda: some_condition()) as result:
-        # do something
-    
-    if result.found:
-        print(result.trace_stack)
-    """
-    result = BlameResult()
-
-    def _trace_calls(frame, event, arg=None):
-        nonlocal result
-        if event in ['call', 'return']:
-            # for every function call or return
-            try:
-                # Temporarily disable the trace function
-                sys.settrace(None)
-                # check condition here
-                if not result.found and func():
-                    result.found = True
-                    result.trace_stack = "".join(traceback.format_stack())
-                # Re-enable the trace function
-                sys.settrace(_trace_calls)
-            except NameError:
-                # modules are deleted during shutdown
-                pass
-        return _trace_calls
-
-    sys.settrace(_trace_calls)
-
-    yield result
-
-    sys.settrace(None)
 
+from vllm_test_utils import blame
 
 module_name = "torch._inductor.async_compile"
 
diff --git a/tests/vllm_test_utils/setup.py b/tests/vllm_test_utils/setup.py
new file mode 100644
index 0000000000000..790e891ec837d
--- /dev/null
+++ b/tests/vllm_test_utils/setup.py
@@ -0,0 +1,7 @@
+from setuptools import setup
+
+setup(
+    name='vllm_test_utils',
+    version='0.1',
+    packages=['vllm_test_utils'],
+)
diff --git a/tests/vllm_test_utils/vllm_test_utils/__init__.py b/tests/vllm_test_utils/vllm_test_utils/__init__.py
new file mode 100644
index 0000000000000..bf0b62a5b75e3
--- /dev/null
+++ b/tests/vllm_test_utils/vllm_test_utils/__init__.py
@@ -0,0 +1,8 @@
+"""
+vllm_test_utils is a package for vLLM testing utilities.
+It does not import any vLLM modules.
+"""
+
+from .blame import BlameResult, blame
+
+__all__ = ["blame", "BlameResult"]
diff --git a/tests/vllm_test_utils/vllm_test_utils/blame.py b/tests/vllm_test_utils/vllm_test_utils/blame.py
new file mode 100644
index 0000000000000..ad23ab83c2d81
--- /dev/null
+++ b/tests/vllm_test_utils/vllm_test_utils/blame.py
@@ -0,0 +1,53 @@
+import contextlib
+import dataclasses
+import sys
+import traceback
+from typing import Callable, Generator
+
+
+@dataclasses.dataclass
+class BlameResult:
+    found: bool = False
+    trace_stack: str = ""
+
+
+@contextlib.contextmanager
+def blame(func: Callable) -> Generator[BlameResult, None, None]:
+    """
+    Trace the function calls to find the first function that satisfies the
+    condition. The trace stack will be stored in the result.
+
+    Usage:
+
+    ```python
+    with blame(lambda: some_condition()) as result:
+        # do something
+    if result.found:
+        print(result.trace_stack)
+    ```
+    """
+    result = BlameResult()
+
+    def _trace_calls(frame, event, arg=None):
+        nonlocal result
+        if event in ['call', 'return']:
+            # for every function call or return
+            try:
+                # Temporarily disable the trace function
+                sys.settrace(None)
+                # check condition here
+                if not result.found and func():
+                    result.found = True
+                    result.trace_stack = "".join(traceback.format_stack())
+                # Re-enable the trace function
+                sys.settrace(_trace_calls)
+            except NameError:
+                # modules are deleted during shutdown
+                pass
+        return _trace_calls
+
+    sys.settrace(_trace_calls)
+
+    yield result
+
+    sys.settrace(None)

From 1f6584ee851501cfae672973b9e55d000729818c Mon Sep 17 00:00:00 2001
From: Kunshang Ji <kunshang.ji@intel.com>
Date: Tue, 26 Nov 2024 18:36:45 +0800
Subject: [PATCH 8/8] [V1] Enable profile for LLMEngine (#10665)

---
 vllm/v1/engine/llm_engine.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index 7a5482f03b6fa..bd19d998a4adb 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -161,13 +161,13 @@ def step(self) -> List[RequestOutput]:
     # TODO(rob): Can we get rid of these?
 
     def get_model_config(self):
-        pass
+        return self.model_config
 
     def start_profile(self):
-        pass
+        self.engine_core.profile(True)
 
     def stop_profile(self):
-        pass
+        self.engine_core.profile(False)
 
     def get_tokenizer_group(self, group_type):
         pass