[Hardware][CPU] Support chunked-prefill and prefix-caching on CPU (vllm-project#10355)

Signed-off-by: jiang1.li <[email protected]>
Signed-off-by: Tyler Michael Smith <[email protected]>
bigPYJ1151 authored and tlrmchlsmth committed Nov 23, 2024
1 parent 93ec8fa commit 6475943
Showing 8 changed files with 559 additions and 369 deletions.
9 changes: 8 additions & 1 deletion .buildkite/run-cpu-test.sh
@@ -25,6 +25,7 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg

function cpu_tests() {
set -e
export NUMA_NODE=$2

# offline inference
docker exec cpu-test-avx2-"$NUMA_NODE" bash -c "
@@ -57,6 +58,12 @@ function cpu_tests() {
pytest -s -v \
tests/quantization/test_ipex_quant.py"

# Run chunked-prefill and prefix-cache test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v -k cpu_model \
tests/basic_correctness/test_chunked_prefill.py"

# online inference
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
@@ -75,4 +82,4 @@ function cpu_tests() {

# All of CPU tests are expected to be finished less than 25 mins.
export -f cpu_tests
timeout 25m bash -c "cpu_tests $CORE_RANGE"
timeout 30m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
10 changes: 5 additions & 5 deletions docs/source/getting_started/cpu-installation.rst
@@ -5,11 +5,11 @@ Installation with CPU

vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features:

- Tensor Parallel (``-tp = N``)
- Quantization (``INT8 W8A8, AWQ``)

.. note::
More advanced features on `chunked-prefill`, `prefix-caching` and `FP8 KV cache` are under development and will be available soon.
- Tensor Parallel
- Model Quantization (``INT8 W8A8, AWQ``)
- Chunked-prefill
- Prefix-caching
- FP8-E5M2 KV-Caching (TODO)

Table of contents:

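The documentation change above adds chunked-prefill and prefix-caching to the CPU backend's feature list. A minimal offline-inference sketch of enabling both features together, assuming a vLLM build with the CPU backend; the model, prompts, and token budget are illustrative, not taken from this commit:

    from vllm import LLM, SamplingParams

    # Sketch only: enable both new CPU-backend features on a small model.
    llm = LLM(
        model="facebook/opt-125m",       # small model, same one the new tests use
        dtype="bfloat16",                # BF16 is supported by the CPU backend
        enable_chunked_prefill=True,     # split long prompts into prefill chunks
        enable_prefix_caching=True,      # reuse KV cache for shared prompt prefixes
        max_num_batched_tokens=32,       # small chunk size, for illustration only
    )

    shared_prefix = "You are a helpful assistant. Answer briefly.\n"
    prompts = [
        shared_prefix + "What is chunked prefill?",
        shared_prefix + "What is prefix caching?",
    ]
    outputs = llm.generate(prompts, SamplingParams(temperature=0.0, max_tokens=32))
    for out in outputs:
        print(out.outputs[0].text)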
4 changes: 2 additions & 2 deletions docs/source/serving/compatibility_matrix.rst
@@ -344,15 +344,15 @@ Feature x Hardware
- ✅
- ✅
- ✅
-
-
- ✅
* - :ref:`APC <apc>`
- `<https://github.com/vllm-project/vllm/issues/3687>`__
- ✅
- ✅
- ✅
- ✅
-
-
- ✅
* - :ref:`LoRA <lora>`
- ✅
63 changes: 62 additions & 1 deletion tests/basic_correctness/test_chunked_prefill.py
@@ -12,6 +12,7 @@
import pytest

from tests.kernels.utils import override_backend_env_variable
from vllm.platforms import current_platform

from ..models.utils import check_logprobs_close, check_outputs_equal
from ..utils import multi_gpu_test
@@ -206,12 +207,14 @@ def test_models_with_fp8_kv_cache(
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
@pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("dtype", ["half"])
def test_with_prefix_caching(
vllm_runner,
max_tokens: int,
enforce_eager: bool,
chunk_size: int,
tensor_parallel_size: int,
dtype: str,
) -> None:
"""
Checks exact match decode with and without prefix caching
@@ -233,7 +236,7 @@ def test_with_prefix_caching(
for enable in (True, False):
with vllm_runner(
model,
dtype="half",
dtype=dtype,
max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=True,
enable_prefix_caching=enable,
@@ -260,3 +263,61 @@
name_0="w/o prefix caching",
name_1="with prefix caching",
)


@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
@pytest.mark.parametrize("enforce_eager", [False])
@pytest.mark.parametrize("attention_backend", ["TORCH_SDPA"])
@pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
def test_models_cpu(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
chunked_prefill_token_size: int,
enforce_eager: bool,
attention_backend: str,
monkeypatch,
) -> None:
test_models(
hf_runner,
vllm_runner,
example_prompts,
model,
dtype,
max_tokens,
chunked_prefill_token_size,
enforce_eager,
1,
attention_backend,
monkeypatch,
)


@pytest.mark.parametrize("max_tokens", [16])
@pytest.mark.parametrize("enforce_eager", [False])
@pytest.mark.parametrize("chunk_size", [30, 32])
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
def test_with_prefix_caching_cpu(
vllm_runner,
max_tokens: int,
enforce_eager: bool,
chunk_size: int,
dtype: str,
) -> None:
test_with_prefix_caching(
vllm_runner,
max_tokens,
enforce_eager,
chunk_size,
1,
dtype,
)
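The new test_with_prefix_caching_cpu case reuses test_with_prefix_caching with bfloat16 and chunk sizes of 30 and 32 tokens. A rough sketch of the same comparison outside the test harness — run identical greedy decodes with prefix caching on and off under chunked prefill and check the outputs match; the prompt, chunk size, and in-process re-instantiation of the engine are assumptions for illustration, not the test's exact setup:

    from vllm import LLM, SamplingParams

    prompts = ["The quick brown fox jumps over the lazy dog. " * 8 + "Summarize:"]
    params = SamplingParams(temperature=0.0, max_tokens=16)  # greedy, exact-match friendly

    results = {}
    for enable in (True, False):
        llm = LLM(
            model="facebook/opt-125m",
            dtype="bfloat16",
            enable_chunked_prefill=True,
            enable_prefix_caching=enable,
            max_num_batched_tokens=30,   # mirrors one chunk_size from the parametrization
            max_num_seqs=1,
        )
        results[enable] = [o.outputs[0].text for o in llm.generate(prompts, params)]
        del llm                          # drop the engine before building the next one

    # Prefix caching should not change greedy decode results.
    assert results[True] == results[False], "prefix caching changed the decode output"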
