Merge remote-tracking branch 'upstream/main' into skip-lm-head
* upstream/main:
  [Build] Temporarily Disable Kernels and LoRA tests (vllm-project#6961)
  [core][misc] improve free_finished_seq_groups (vllm-project#6865)
  [Kernel] Remove scaled_fp8_quant kernel padding footgun (vllm-project#6842)
  [Bugfix] Fix tensorizer memory profiling bug during testing (vllm-project#6881)
  [OpenVINO] Updated OpenVINO requirements and build docs (vllm-project#6948)
  [Kernel] Squash a few more warnings (vllm-project#6914)
  [BugFix] Fix use of per-request seed with pipeline parallel (vllm-project#6698)
  [Doc] Super tiny fix doc typo (vllm-project#6949)
tjohnson31415 committed Jul 30, 2024
2 parents fd11ac1 + 40c27a7 commit ee83917
Showing 36 changed files with 422 additions and 284 deletions.
40 changes: 20 additions & 20 deletions .buildkite/test-pipeline.yaml
@@ -155,12 +155,12 @@ steps:
- pytest -v -s test_inputs.py
- pytest -v -s multimodal

-- label: Kernels Test %N
-  #mirror_hardwares: [amd]
-  commands:
-  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
-  - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
-  parallelism: 4
+# - label: Kernels Test %N
+#   #mirror_hardwares: [amd]
+#   commands:
+#   - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
+#   - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+#   parallelism: 4

- label: Models Test
#mirror_hardwares: [amd]
@@ -202,20 +202,20 @@ steps:
- export VLLM_ATTENTION_BACKEND=XFORMERS
- pytest -v -s spec_decode

-- label: LoRA Test %N
-  #mirror_hardwares: [amd]
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
-  parallelism: 4
-
-- label: LoRA Long Context (Distributed)
-  #mirror_hardwares: [amd]
-  num_gpus: 4
-  # This test runs llama 13B, so it is required to run on 4 GPUs.
-  commands:
-  # FIXIT: find out which code initialize cuda before running the test
-  # before the fix, we need to use spawn to test it
-  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s -x lora/test_long_context.py
+# - label: LoRA Test %N
+#   #mirror_hardwares: [amd]
+#   command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
+#   parallelism: 4
+
+# - label: LoRA Long Context (Distributed)
+#   #mirror_hardwares: [amd]
+#   num_gpus: 4
+#   # This test runs llama 13B, so it is required to run on 4 GPUs.
+#   commands:
+#   # FIXIT: find out which code initialize cuda before running the test
+#   # before the fix, we need to use spawn to test it
+#   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+#   - pytest -v -s -x lora/test_long_context.py

- label: Tensorizer Test
#mirror_hardwares: [amd]
4 changes: 2 additions & 2 deletions Dockerfile.openvino
@@ -1,7 +1,7 @@
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
# to run the OpenAI compatible server.

-FROM ubuntu:20.04 AS dev
+FROM ubuntu:22.04 AS dev

RUN apt-get update -y && \
apt-get install -y python3-pip git
@@ -18,7 +18,7 @@ COPY setup.py /workspace/vllm/
# install build requirements
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
# build vLLM with OpenVINO backend
-RUN PIP_PRE=1 PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly/" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/
+RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/pre-release" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/

COPY examples/ /workspace/vllm/examples
COPY benchmarks/ /workspace/vllm/benchmarks
4 changes: 2 additions & 2 deletions csrc/attention/attention_kernels.cu
@@ -706,7 +706,7 @@ void paged_attention_v1_launcher(
int kv_block_stride = key_cache.stride(0);
int kv_head_stride = key_cache.stride(1);

-  int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
+  [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
assert(head_size % thread_group_size == 0);

// NOTE: alibi_slopes is optional.
@@ -865,7 +865,7 @@ void paged_attention_v2_launcher(
int kv_block_stride = key_cache.stride(0);
int kv_head_stride = key_cache.stride(1);

-  int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
+  [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
assert(head_size % thread_group_size == 0);

// NOTE: alibi_slopes is optional.
2 changes: 0 additions & 2 deletions csrc/quantization/aqlm/gemm_kernels.cu
@@ -273,8 +273,6 @@ __global__ void Code2x8Dequant(
}
__syncthreads();

-  float res = 0;
-
int iters = (prob_k / 8 - 1) / (8 * 32) + 1;
while (iters--) {
if (pred && a_gl_rd < a_gl_end) {
2 changes: 2 additions & 0 deletions csrc/quantization/fp8/amd/quant_utils.cuh
@@ -526,6 +526,7 @@ __inline__ __device__ Tout convert(const Tin& x) {
}
#endif
assert(false);
+  return {}; // Squash missing return statement warning
}

template <typename Tout, typename Tin, Fp8KVCacheDataType kv_dt>
@@ -536,6 +537,7 @@ __inline__ __device__ Tout scaled_convert(const Tin& x, const float scale) {
}
#endif
assert(false);
+  return {}; // Squash missing return statement warning
}

// The following macro is used to dispatch the conversion function based on
2 changes: 2 additions & 0 deletions csrc/quantization/fp8/nvidia/quant_utils.cuh
@@ -508,6 +508,7 @@ __inline__ __device__ Tout convert(const Tin& x) {
}
#endif
assert(false);
+  return {}; // Squash missing return statement warning
}

template <typename Tout, typename Tin, Fp8KVCacheDataType kv_dt>
@@ -520,6 +521,7 @@ __inline__ __device__ Tout scaled_convert(const Tin& x, const float scale) {
}
#endif
assert(false);
+  return {}; // Squash missing return statement warning
}

// The following macro is used to dispatch the conversion function based on
3 changes: 2 additions & 1 deletion csrc/quantization/squeezellm/quant_cuda_kernel.cu
@@ -203,7 +203,8 @@ void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
#endif
mat.data_ptr<int>(),
#ifndef USE_ROCM
-      (half2*)mul.data<at::Half>(), (__half*)lookup_table.data_ptr<at::Half>(),
+      (half2*)mul.data_ptr<at::Half>(),
+      (__half*)lookup_table.data_ptr<at::Half>(),
#else
(float2*)mul.data_ptr<float>(),
(__half*)lookup_table.data_ptr<at::Half>(),
2 changes: 1 addition & 1 deletion docs/source/getting_started/openvino-installation.rst
@@ -57,7 +57,7 @@ Install from source

.. code-block:: console
-   $ PIP_PRE=1 PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly/" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
+   $ PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/pre-release" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
.. _openvino_backend_performance_tips:

28 changes: 27 additions & 1 deletion requirements-openvino.txt
@@ -1,7 +1,33 @@
# Common dependencies
--r requirements-common.txt
+# -r requirements-common.txt
+# TODO: remove temporary copy of all common dependencies once Optimum Intel will support Transformers >= 4.43.2
+cmake >= 3.21
+ninja # For faster builds.
+psutil
+sentencepiece # Required for LLaMA tokenizer.
+numpy < 2.0.0
+requests
+tqdm
+py-cpuinfo
+transformers < 4.43
+tokenizers >= 0.19.1 # Required for Llama 3.
+fastapi
+aiohttp
+openai
+uvicorn[standard]
+pydantic >= 2.0 # Required for OpenAI server.
+pillow # Required for image processing
+prometheus_client >= 0.18.0
+prometheus-fastapi-instrumentator >= 7.0.0
+tiktoken >= 0.6.0 # Required for DBRX tokenizer
+lm-format-enforcer == 0.10.3
+outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0
+typing_extensions
+filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
+pyzmq

# OpenVINO dependencies
torch >= 2.1.2
openvino ~= 2024.3.0.dev
openvino-tokenizers[transformers] ~= 2024.3.0.0.dev
+optimum-intel[openvino] >= 1.18.1
2 changes: 1 addition & 1 deletion tests/quantization/test_fp8.py
@@ -123,7 +123,7 @@ def per_tensor_dequantize(tensor, inv_scale, dtype):
assert torch.allclose(ref_y, per_tensor_dequantize(y, inv_scale, dtype))

# Padding
-    y, _ = ops.scaled_fp8_quant(x, inv_scale, batch_dim_padding=17)
+    y, _ = ops.scaled_fp8_quant(x, inv_scale, num_token_padding=17)
assert y.shape[0] == 17
assert torch.allclose(
ref_y,
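For context on the rename above, here is a minimal sketch of the new num_token_padding argument, assuming the `from vllm import _custom_ops as ops` import used by this test file and the (quantized, scale) return convention shown in the hunk; shapes and values are illustrative only:

import torch
from vllm import _custom_ops as ops

x = torch.randn(4, 16, dtype=torch.float16, device="cuda")
scale = torch.tensor([1.0], dtype=torch.float32, device="cuda")

# The output's first (token) dimension is padded up to at least 17 rows;
# the parameter was renamed from batch_dim_padding to num_token_padding.
y, _ = ops.scaled_fp8_quant(x, scale, num_token_padding=17)
assert y.shape[0] == 17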
23 changes: 8 additions & 15 deletions tests/samplers/test_rejection_sampler.py
@@ -150,10 +150,9 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
high=vocab_size,
size=(batch_size, k),
dtype=torch.int64)
-    generators = [None] * batch_size

rejection_sampler(target_probs, bonus_token_ids, draft_probs,
-                      draft_token_ids, generators)
+                      draft_token_ids)


@pytest.mark.parametrize("frac_seeded", [0.0, 0.25, 0.5, 1.0])
@@ -185,14 +184,13 @@ def test_deterministic_when_seeded(k: int, vocab_size: int, batch_size: int,

results = []
for _ in range(n_rep):
-        generators = [
-            torch.Generator(
-                device=device).manual_seed(i) if seeded_mask[i] else None
-            for i in range(batch_size)
-        ]
+        seeded_seqs = {
+            i: torch.Generator(device=device).manual_seed(i)
+            for i in range(batch_size) if seeded_mask[i]
+        }
results.append(
rejection_sampler(target_probs, bonus_token_ids, draft_probs,
-                              draft_token_ids, generators))
+                              draft_token_ids, seeded_seqs))

for i in range(batch_size):
if seeded_mask[i]:
@@ -242,11 +240,10 @@ def test_raises_when_vocab_oob(above_or_below_vocab_range: str,
raise AssertionError()

oob_token_ids[0][0] = rogue_token_id
-    generators = [None] * batch_size

with pytest.raises(AssertionError):
rejection_sampler(target_probs, bonus_token_ids, draft_probs,
-                          draft_token_ids, generators)
+                          draft_token_ids)


@pytest.mark.parametrize("draft_and_target_probs_equal", [True, False])
@@ -417,15 +414,11 @@ def _estimate_rejection_sampling_pdf(
dtype=torch.int64,
device="cuda").repeat(num_samples, 1)

-        # unseeded
-        generators = [None]
-
# Get output tokens via rejection sampling.
output_token_ids = self.rejection_sampler(target_probs.to("cuda"),
bonus_token_ids.to("cuda"),
draft_probs.to("cuda"),
draft_token_ids.to("cuda"),
generators)
draft_token_ids.to("cuda"))

# Remove bonus tokens
output_token_ids = output_token_ids[:, :-1].flatten()
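Taken together, these test edits track an interface change in the rejection sampler: the per-sequence list of optional generators becomes an optional seeded_seqs dict keyed by batch index, and unseeded calls simply omit the argument. A hedged sketch of the new call shape, assuming the RejectionSampler import path and init_gpu_tensors setup these tests use elsewhere, and that unnormalized probabilities are acceptable outside strict mode:

import torch
from vllm.model_executor.layers.rejection_sampler import RejectionSampler

k, vocab_size, batch_size = 3, 32, 4
sampler = RejectionSampler()
sampler.init_gpu_tensors(rank=0)  # allocate acceptance-count tensors on GPU 0

target_probs = torch.rand(batch_size, k, vocab_size, device="cuda")
draft_probs = torch.rand(batch_size, k, vocab_size, device="cuda")
bonus_token_ids = torch.randint(0, vocab_size, (batch_size, 1),
                                dtype=torch.int64, device="cuda")
draft_token_ids = torch.randint(0, vocab_size, (batch_size, k),
                                dtype=torch.int64, device="cuda")

# Seed only sequence 0; every other sequence uses the default RNG stream.
seeded_seqs = {0: torch.Generator(device="cuda").manual_seed(42)}
out = sampler(target_probs, bonus_token_ids, draft_probs,
              draft_token_ids, seeded_seqs)

# An unseeded call now just drops the trailing argument.
out = sampler(target_probs, bonus_token_ids, draft_probs, draft_token_ids)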
5 changes: 4 additions & 1 deletion tests/samplers/test_sampler.py
@@ -510,13 +510,16 @@ def test_sampler_mixed(seed: int, device: str):
))
seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())

+    generators: Dict[str, torch.Generator] = {}
+
def test_sampling():
sampling_metadata = SamplingMetadata.prepare(
seq_group_metadata_list,
seq_lens,
query_lens=seq_lens,
device=device,
-            pin_memory=is_pin_memory_available())
+            pin_memory=is_pin_memory_available(),
+            generators=generators)
sampler_output = sampler(logits=fake_logits,
sampling_metadata=sampling_metadata)

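The generators dict added above is keyed by request id, letting a seeded request reuse one torch.Generator across decode steps. A self-contained sketch of that pattern (the helper name and request id are hypothetical, not vLLM API):

from typing import Dict, Optional

import torch

generators: Dict[str, torch.Generator] = {}

def get_generator(request_id: str, seed: Optional[int],
                  device: str = "cpu") -> Optional[torch.Generator]:
    # Unseeded requests return None and fall back to the global RNG stream.
    if seed is None:
        return None
    # A seeded request gets one generator, created on first use and reused
    # on every later step, so its sampling stays deterministic end to end.
    if request_id not in generators:
        generators[request_id] = torch.Generator(device=device).manual_seed(seed)
    return generators[request_id]

gen = get_generator("req-0", seed=1234)
tokens = torch.multinomial(torch.ones(8), num_samples=2, generator=gen)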
54 changes: 53 additions & 1 deletion tests/spec_decode/e2e/test_mlp_correctness.py
@@ -21,7 +21,8 @@

import pytest

-from .conftest import run_greedy_equality_correctness_test
+from .conftest import (run_equality_correctness_test,
+                       run_greedy_equality_correctness_test)

# main model
MAIN_MODEL = "JackFram/llama-160m"
@@ -77,6 +78,57 @@ def test_mlp_e2e_greedy_correctness(baseline_llm_generator, test_llm_generator,
force_output_len=True)


+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+
+        # Print spec metrics.
+        "disable_log_stats": False,
+
+        # Precision
+        "dtype": PRECISION,
+
+        # Main model
+        "model": MAIN_MODEL,
+
+        # Speculative model
+        "speculative_model": SPEC_MODEL,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{"seed": 1}])
+@pytest.mark.parametrize("test_llm_kwargs", [{"seed": 5}])
+@pytest.mark.parametrize("output_len", [64])
+@pytest.mark.parametrize("batch_size", [1, 32])
+@pytest.mark.parametrize("temperature", [0.1, 1.0])
+@pytest.mark.parametrize("seed", [None])
+def test_mlp_e2e_seeded_correctness(baseline_llm_generator, test_llm_generator,
+                                    batch_size: int, output_len: int,
+                                    temperature: float):
+    """Verify seeded runs produce the same output."""
+    run_equality_correctness_test(baseline_llm_generator,
+                                  test_llm_generator,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  temperature=temperature,
+                                  seeded=True,
+                                  force_output_len=True)
+
+    # Ensure this same test does fail if we _don't_ include per-request seeds
+    with pytest.raises(AssertionError):
+        run_equality_correctness_test(baseline_llm_generator,
+                                      test_llm_generator,
+                                      batch_size,
+                                      max_output_len=output_len,
+                                      temperature=temperature,
+                                      seeded=False,
+                                      force_output_len=True)
+

@pytest.mark.parametrize(
"common_llm_kwargs",
[{
2 changes: 1 addition & 1 deletion tests/spec_decode/e2e/test_seed.py
@@ -29,7 +29,7 @@
"output_len",
[
# Use smaller output len for fast test.
-        10,
+        20,
])
@pytest.mark.parametrize("seed", [None])
def test_seeded_consistency(baseline_llm_generator, test_llm_generator,
1 change: 1 addition & 0 deletions tests/spec_decode/test_batch_expansion.py
@@ -86,6 +86,7 @@ def test_create_single_target_seq_group_metadata(k: int):
input_seq_id,
target_seq_id,
token_ids,
+        input_seq_group_metadata.sampling_params,
)

assert output.request_id == input_seq_group_metadata.request_id
