[Hardware][CPU] Support chunked-prefill and prefix-caching on CPU (vllm-project#10355)

Signed-off-by: jiang1.li <[email protected]>
Signed-off-by: Tyler Michael Smith <[email protected]>
bigPYJ1151 authored and tlrmchlsmth committed Nov 23, 2024
1 parent 93ec8fa commit 6475943
Showing 8 changed files with 559 additions and 369 deletions.
9 changes: 8 additions & 1 deletion .buildkite/run-cpu-test.sh
@@ -25,6 +25,7 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg

function cpu_tests() {
set -e
export NUMA_NODE=$2

# offline inference
docker exec cpu-test-avx2-"$NUMA_NODE" bash -c "
@@ -57,6 +58,12 @@ function cpu_tests() {
pytest -s -v \
tests/quantization/test_ipex_quant.py"

# Run chunked-prefill and prefix-cache test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v -k cpu_model \
tests/basic_correctness/test_chunked_prefill.py"

# online inference
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
@@ -75,4 +82,4 @@ function cpu_tests() {

# All of CPU tests are expected to be finished less than 25 mins.
export -f cpu_tests
timeout 25m bash -c "cpu_tests $CORE_RANGE"
timeout 30m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
10 changes: 5 additions & 5 deletions docs/source/getting_started/cpu-installation.rst
@@ -5,11 +5,11 @@ Installation with CPU

vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features:

- Tensor Parallel (``-tp = N``)
- Quantization (``INT8 W8A8, AWQ``)

.. note::
More advanced features on `chunked-prefill`, `prefix-caching` and `FP8 KV cache` are under development and will be available soon.
- Tensor Parallel
- Model Quantization (``INT8 W8A8, AWQ``)
- Chunked-prefill
- Prefix-caching
- FP8-E5M2 KV-Caching (TODO)

Table of contents:

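The documentation change above adds chunked-prefill and prefix-caching to the CPU backend's feature list. A minimal offline-inference sketch of enabling both features together, assuming a vLLM build with the CPU backend; the model, prompts, and token budget are illustrative, not taken from this commit:

    from vllm import LLM, SamplingParams

    # Sketch only: enable both new CPU-backend features on a small model.
    llm = LLM(
        model="facebook/opt-125m",       # small model, same one the new tests use
        dtype="bfloat16",                # BF16 is supported by the CPU backend
        enable_chunked_prefill=True,     # split long prompts into prefill chunks
        enable_prefix_caching=True,      # reuse KV cache for shared prompt prefixes
        max_num_batched_tokens=32,       # small chunk size, for illustration only
    )

    shared_prefix = "You are a helpful assistant. Answer briefly.\n"
    prompts = [
        shared_prefix + "What is chunked prefill?",
        shared_prefix + "What is prefix caching?",
    ]
    outputs = llm.generate(prompts, SamplingParams(temperature=0.0, max_tokens=32))
    for out in outputs:
        print(out.outputs[0].text)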
4 changes: 2 additions & 2 deletions docs/source/serving/compatibility_matrix.rst
@@ -344,15 +344,15 @@ Feature x Hardware
- ✅
- ✅
- ✅
-
-
- ✅
* - :ref:`APC <apc>`
- `<https://github.com/vllm-project/vllm/issues/3687>`__
- ✅
- ✅
- ✅
- ✅
-
-
- ✅
* - :ref:`LoRA <lora>`
- ✅
63 changes: 62 additions & 1 deletion tests/basic_correctness/test_chunked_prefill.py
@@ -12,6 +12,7 @@
import pytest

from tests.kernels.utils import override_backend_env_variable
from vllm.platforms import current_platform

from ..models.utils import check_logprobs_close, check_outputs_equal
from ..utils import multi_gpu_test
@@ -206,12 +207,14 @@ def test_models_with_fp8_kv_cache(
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
@pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("dtype", ["half"])
def test_with_prefix_caching(
vllm_runner,
max_tokens: int,
enforce_eager: bool,
chunk_size: int,
tensor_parallel_size: int,
dtype: str,
) -> None:
"""
Checks exact match decode with and without prefix caching
@@ -233,7 +236,7 @@ def test_with_prefix_caching(
for enable in (True, False):
with vllm_runner(
model,
dtype="half",
dtype=dtype,
max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=True,
enable_prefix_caching=enable,
@@ -260,3 +263,61 @@
name_0="w/o prefix caching",
name_1="with prefix caching",
)


@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
@pytest.mark.parametrize("enforce_eager", [False])
@pytest.mark.parametrize("attention_backend", ["TORCH_SDPA"])
@pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
def test_models_cpu(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
chunked_prefill_token_size: int,
enforce_eager: bool,
attention_backend: str,
monkeypatch,
) -> None:
test_models(
hf_runner,
vllm_runner,
example_prompts,
model,
dtype,
max_tokens,
chunked_prefill_token_size,
enforce_eager,
1,
attention_backend,
monkeypatch,
)


@pytest.mark.parametrize("max_tokens", [16])
@pytest.mark.parametrize("enforce_eager", [False])
@pytest.mark.parametrize("chunk_size", [30, 32])
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
def test_with_prefix_caching_cpu(
vllm_runner,
max_tokens: int,
enforce_eager: bool,
chunk_size: int,
dtype: str,
) -> None:
test_with_prefix_caching(
vllm_runner,
max_tokens,
enforce_eager,
chunk_size,
1,
dtype,
)
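The new test_with_prefix_caching_cpu case reuses test_with_prefix_caching with bfloat16 and chunk sizes of 30 and 32 tokens. A rough sketch of the same comparison outside the test harness — run identical greedy decodes with prefix caching on and off under chunked prefill and check the outputs match; the prompt, chunk size, and in-process re-instantiation of the engine are assumptions for illustration, not the test's exact setup:

    from vllm import LLM, SamplingParams

    prompts = ["The quick brown fox jumps over the lazy dog. " * 8 + "Summarize:"]
    params = SamplingParams(temperature=0.0, max_tokens=16)  # greedy, exact-match friendly

    results = {}
    for enable in (True, False):
        llm = LLM(
            model="facebook/opt-125m",
            dtype="bfloat16",
            enable_chunked_prefill=True,
            enable_prefix_caching=enable,
            max_num_batched_tokens=30,   # mirrors one chunk_size from the parametrization
            max_num_seqs=1,
        )
        results[enable] = [o.outputs[0].text for o in llm.generate(prompts, params)]
        del llm                          # drop the engine before building the next one

    # Prefix caching should not change greedy decode results.
    assert results[True] == results[False], "prefix caching changed the decode output"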
