Merge branch 'upstream_main'
flaviabeo committed Oct 31, 2024
2 parents 0da7979 + 77f7ef2 commit 4531c33
Showing 84 changed files with 3,128 additions and 3,576 deletions.
4 changes: 2 additions & 2 deletions .buildkite/run-amd-test.sh
@@ -31,8 +31,8 @@ cleanup_docker() {
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
# Remove unused volumes
docker volume prune -f
# Remove unused volumes / force the system prune for old images as well.
docker volume prune -f && docker system prune --force --filter "until=72h" --all
echo "Docker images and volumes cleanup completed."
else
echo "Disk usage is below $threshold%. No cleanup needed."
22 changes: 18 additions & 4 deletions .buildkite/test-pipeline.yaml
@@ -9,6 +9,7 @@
# label(str): the name of the test. emoji allowed.
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
# fast_check_only(bool): run this test on fastcheck pipeline only
# nightly(bool): run this test in nightly pipeline only
# optional(bool): never run this test by default (i.e. need to unblock manually)
# command(str): the single command to run for tests. incompatible with commands.
# commands(list): the list of commands to run for test. incompatible with command.
@@ -330,15 +331,28 @@ steps:
commands:
- pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py --ignore=models/decoder_only/language/test_big_models.py

- label: Decoder-only Multi-Modal Models Test # 1h31min
- label: Decoder-only Multi-Modal Models Test (Standard)
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/models/decoder_only/audio_language
- tests/models/decoder_only/vision_language
commands:
- pytest -v -s models/decoder_only/audio_language
- pytest -v -s models/decoder_only/vision_language
- pytest -v -s models/decoder_only/audio_language -m core_model
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m core_model

- label: Decoder-only Multi-Modal Models Test (Extended)
nightly: true
source_file_dependencies:
- vllm/
- tests/models/decoder_only/audio_language
- tests/models/decoder_only/vision_language
commands:
- pytest -v -s models/decoder_only/audio_language -m 'not core_model'
# HACK - run phi3v tests separately to sidestep this transformers bug
# https://github.com/huggingface/transformers/issues/34307
- pytest -v -s models/decoder_only/vision_language/test_phi3v.py
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model'

- label: Other Models Test # 6min
#mirror_hardwares: [amd]
@@ -413,7 +427,7 @@ steps:
# Avoid importing model tests that cause CUDA reinitialization error
- pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
- pytest models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
- pytest models/decoder_only/vision_language/test_models.py -v -s -m distributed_2_gpus
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s distributed/test_distributed_oot.py
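The `-m core_model` / `-m 'not core_model'` filters above select tests by pytest marker, splitting the suite between the standard and nightly pipelines. As a hedged sketch (the test name and body are hypothetical; only the marker name comes from the commands above), a test opts into the standard pipeline like so:

```python
import pytest

@pytest.mark.core_model  # picked up by `-m core_model`, skipped by `-m 'not core_model'`
def test_vision_language_smoke():
    # Hypothetical minimal check; the real tests live under
    # tests/models/decoder_only/vision_language/.
    assert True
```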
2 changes: 1 addition & 1 deletion Dockerfile.neuron
@@ -36,6 +36,6 @@ RUN python3 -m pip install -U \

ENV VLLM_TARGET_DEVICE neuron
RUN --mount=type=bind,source=.git,target=.git \
pip install --no-build-isolation -v -e . \
pip install --no-build-isolation -v -e .

CMD ["/bin/bash"]
2 changes: 1 addition & 1 deletion benchmarks/backend_request_func.py
@@ -324,7 +324,7 @@ async def async_request_openai_chat_completions(
},
],
"temperature": 0.0,
"max_tokens": request_func_input.output_len,
"max_completion_tokens": request_func_input.output_len,
"stream": True,
"ignore_eos": request_func_input.ignore_eos,
}
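For context, this function builds an OpenAI-style chat request; a minimal sketch of the resulting body after this change (values are illustrative):

```python
# Illustrative request body; `max_completion_tokens` supersedes the
# deprecated `max_tokens` field for chat completions.
payload = {
    "model": "meta-llama/Meta-Llama-3-8B-Instruct",
    "messages": [{"role": "user", "content": "Hello world!"}],
    "temperature": 0.0,
    "max_completion_tokens": 128,  # illustrative output length
    "stream": True,
    "ignore_eos": True,
}
```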
7 changes: 5 additions & 2 deletions docs/source/models/supported_models.rst
@@ -277,7 +277,7 @@ Text Generation
* - :code:`QWenLMHeadModel`
- Qwen
- :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc.
-
- ✅︎
- ✅︎
* - :code:`Qwen2ForCausalLM`
- Qwen2
@@ -516,7 +516,7 @@ Text Generation
- Qwen-VL
- T + I\ :sup:`E+`
- :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc.
-
- ✅︎
- ✅︎
* - :code:`Qwen2AudioForConditionalGeneration`
- Qwen2-Audio
@@ -540,6 +540,9 @@ Text Generation
| :sup:`E` Pre-computed embeddings can be inputted for this modality.
| :sup:`+` Multiple items can be inputted per text prompt for this modality.
.. note::
vLLM currently only supports adding LoRA to the language backbone of multimodal models.
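A minimal sketch of attaching a LoRA adapter under that constraint (model, adapter name, and adapter path are placeholders):

```python
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

# The adapter applies to the language backbone only.
llm = LLM(model="llava-hf/llava-1.5-7b-hf", enable_lora=True)  # placeholder model
outputs = llm.generate(
    "Describe the image.",
    SamplingParams(max_tokens=64),
    lora_request=LoRARequest("my_adapter", 1, "/path/to/adapter"),  # placeholder path
)
```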

.. note::
For :code:`openbmb/MiniCPM-V-2`, the official repo doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now.
For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630
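A short sketch of pointing vLLM at the fork in the meantime (assumes :code:`trust_remote_code` is required, as with most MiniCPM checkpoints):

```python
from vllm import LLM

# Load the fork until the official openbmb/MiniCPM-V-2 repo works.
llm = LLM(model="HwwwH/MiniCPM-V-2", trust_remote_code=True)
```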
2 changes: 1 addition & 1 deletion docs/source/serving/compatibility_matrix.rst
@@ -283,7 +283,7 @@ Feature x Feature
- ✅
- ✅
- ✅
-
- `<https://github.com/vllm-project/vllm/issues/8985>`__
- ?
- ✅
- ✅
5 changes: 1 addition & 4 deletions docs/source/serving/distributed_serving.rst
@@ -22,7 +22,7 @@ After adding enough GPUs and nodes to hold the model, you can run vLLM first, wh
Details for Distributed Inference and Serving
----------------------------------------------

vLLM supports distributed tensor-parallel inference and serving. Currently, we support `Megatron-LM's tensor parallel algorithm <https://arxiv.org/pdf/1909.08053.pdf>`_. We also support pipeline parallel as a beta feature for online serving. We manage the distributed runtime with either `Ray <https://github.com/ray-project/ray>`_ or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inferencing currently requires Ray.
vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support `Megatron-LM's tensor parallel algorithm <https://arxiv.org/pdf/1909.08053.pdf>`_. We manage the distributed runtime with either `Ray <https://github.com/ray-project/ray>`_ or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inferencing currently requires Ray.

Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured :code:`tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the :code:`LLM` class :code:`distributed-executor-backend` argument or :code:`--distributed-executor-backend` API server argument. Set it to :code:`mp` for multiprocessing or :code:`ray` for Ray. It's not required for Ray to be installed for the multiprocessing case.
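For instance, a single-node tensor-parallel run that pins the multiprocessing backend might look like this (model name illustrative):

```python
from vllm import LLM

llm = LLM(
    model="facebook/opt-13b",            # illustrative model
    tensor_parallel_size=4,              # shard across 4 GPUs on one node
    distributed_executor_backend="mp",   # force multiprocessing instead of Ray
)
```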

@@ -49,9 +49,6 @@ You can also additionally specify :code:`--pipeline-parallel-size` to enable pip
$ --tensor-parallel-size 4 \
$ --pipeline-parallel-size 2
.. note::
Pipeline parallel is a beta feature. It is only supported for online serving as well as LLaMa, GPT2, Mixtral, Qwen, Qwen2, and Nemotron style models.

Multi-Node Inference and Serving
--------------------------------

6 changes: 3 additions & 3 deletions docs/source/serving/run_on_sky.rst
@@ -109,7 +109,7 @@ SkyPilot can scale up the service to multiple service replicas with built-in au
messages:
- role: user
content: Hello! What is your name?
max_tokens: 1
max_completion_tokens: 1
.. raw:: html

@@ -129,7 +129,7 @@ SkyPilot can scale up the service to multiple service replicas with built-in au
messages:
- role: user
content: Hello! What is your name?
max_tokens: 1
max_completion_tokens: 1
resources:
accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
@@ -255,7 +255,7 @@ This will scale the service up to when the QPS exceeds 2 for each replica.
messages:
- role: user
content: Hello! What is your name?
max_tokens: 1
max_completion_tokens: 1
resources:
accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
8 changes: 4 additions & 4 deletions examples/offline_inference_openai.md
@@ -35,8 +35,8 @@

```
$ cat openai_example_batch.jsonl
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
```
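If you prefer to generate the file programmatically, here is a short sketch (contents mirror the two lines above):

```python
import json

system_prompts = ["You are a helpful assistant.", "You are an unhelpful assistant."]
with open("openai_example_batch.jsonl", "w") as f:
    for i, sys_msg in enumerate(system_prompts, start=1):
        request = {
            "custom_id": f"request-{i}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "meta-llama/Meta-Llama-3-8B-Instruct",
                "messages": [{"role": "system", "content": sys_msg},
                             {"role": "user", "content": "Hello world!"}],
                "max_completion_tokens": 1000,
            },
        }
        f.write(json.dumps(request) + "\n")
```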

### Step 2: Run the batch
@@ -94,8 +94,8 @@ To follow along with this example, you can download the example batch, or create

```
$ cat openai_example_batch.jsonl
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
```

Now upload your batch file to your S3 bucket.
3 changes: 1 addition & 2 deletions examples/offline_inference_vision_language.py
@@ -262,10 +262,9 @@ def run_qwen2_vl(question: str, modality: str):

model_name = "Qwen/Qwen2-VL-7B-Instruct"

# Tested on L40
llm = LLM(
model=model_name,
max_model_len=8192,
max_model_len=4096,
max_num_seqs=5,
# Note - mm_processor_kwargs can also be passed to generate/chat calls
mm_processor_kwargs={
12 changes: 6 additions & 6 deletions examples/openai_api_client_for_multimodal.py
@@ -53,7 +53,7 @@ def run_text_only() -> None:
"content": "What's the capital of France?"
}],
model=model,
max_tokens=64,
max_completion_tokens=64,
)

result = chat_completion.choices[0].message.content
@@ -83,7 +83,7 @@ def run_single_image() -> None:
],
}],
model=model,
max_tokens=64,
max_completion_tokens=64,
)

result = chat_completion_from_url.choices[0].message.content
@@ -109,7 +109,7 @@ def run_single_image() -> None:
],
}],
model=model,
max_tokens=64,
max_completion_tokens=64,
)

result = chat_completion_from_base64.choices[0].message.content
@@ -144,7 +144,7 @@ def run_multi_image() -> None:
],
}],
model=model,
max_tokens=64,
max_completion_tokens=64,
)

result = chat_completion_from_url.choices[0].message.content
@@ -175,7 +175,7 @@ def run_audio() -> None:
],
}],
model=model,
max_tokens=64,
max_completion_tokens=64,
)

result = chat_completion_from_url.choices[0].message.content
@@ -201,7 +201,7 @@ def run_audio() -> None:
],
}],
model=model,
max_tokens=64,
max_completion_tokens=64,
)

result = chat_completion_from_base64.choices[0].message.content
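The `*_from_base64` calls above assume the media is embedded as a data URL; a minimal sketch of building one from a local file (file path and MIME type are placeholders):

```python
import base64

def encode_media_base64(path: str, mime_type: str) -> str:
    # Return a data URL usable in image_url/audio_url content parts.
    with open(path, "rb") as f:
        payload = base64.b64encode(f.read()).decode("utf-8")
    return f"data:{mime_type};base64,{payload}"

audio_url = encode_media_base64("sample.wav", "audio/wav")  # placeholder file
```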
4 changes: 2 additions & 2 deletions examples/openai_example_batch.jsonl
@@ -1,2 +1,2 @@
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
2 changes: 1 addition & 1 deletion requirements-common.txt
@@ -10,7 +10,7 @@ protobuf # Required by LlamaTokenizer.
fastapi >= 0.107.0, < 0.113.0; python_version < '3.9'
fastapi >= 0.107.0, != 0.113.*, != 0.114.0; python_version >= '3.9'
aiohttp
openai >= 1.40.0 # Ensure modern openai package (ensure types module present)
openai >= 1.45.0 # Ensure modern openai package (ensure types module present and max_completion_tokens field support)
uvicorn[standard]
pydantic >= 2.9 # Required for fastapi >= 0.113.0
pillow # Required for image processing
12 changes: 12 additions & 0 deletions tests/basic_correctness/test_chunked_prefill.py
@@ -11,6 +11,8 @@

import pytest

from tests.kernels.utils import override_backend_env_variable

from ..models.utils import check_logprobs_close, check_outputs_equal
from ..utils import multi_gpu_test

@@ -28,6 +30,7 @@
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
@pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
def test_models(
hf_runner,
vllm_runner,
@@ -38,11 +41,15 @@ def test_models(
chunked_prefill_token_size: int,
enforce_eager: bool,
tensor_parallel_size: int,
attention_backend: str,
monkeypatch,
) -> None:
"""
Checks exact match decode between huggingface model and vllm runner with
chunked prefill.
"""
override_backend_env_variable(monkeypatch, attention_backend)

max_num_seqs = chunked_prefill_token_size
max_num_batched_tokens = chunked_prefill_token_size

@@ -71,13 +78,18 @@ def test_models(
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
def test_models_distributed(
hf_runner,
vllm_runner,
example_prompts,
model: str,
distributed_executor_backend: str,
attention_backend: str,
monkeypatch,
) -> None:
override_backend_env_variable(monkeypatch, attention_backend)

if (model == "meta-llama/Llama-2-7b-hf"
and distributed_executor_backend == "ray"):
# test ray adag
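Both tests now route the backend choice through `override_backend_env_variable`; a plausible sketch of that helper (the real implementation lives in `tests/kernels/utils.py`):

```python
def override_backend_env_variable(monkeypatch, backend_name: str) -> None:
    # Pin the attention backend for the duration of a single test.
    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend_name)
```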
26 changes: 14 additions & 12 deletions tests/compile/test_basic_correctness.py
@@ -30,18 +30,20 @@ def test_compile_correctness(model, model_args, pp_size, tp_size, attn_backend,
pytest.skip("Not correct CUDA devices for the test.")
import os
os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend
if not fullgraph:
os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0"
all_args = [["--enforce-eager"] + model_args + ["--max_model_len", "1024"]
+ ["-pp", str(pp_size)] + ["-tp", str(tp_size)]] * 3
all_args = [["--enforce-eager"] + model_args + ["-pp", str(pp_size)] +
["-tp", str(tp_size)]] * 3
# don't test VLLM_TORCH_COMPILE_LEVEL == 3 case
# inductor will change the output, so we cannot compare them.
all_envs: List[Optional[Dict[str, str]]] = [{
"VLLM_TORCH_COMPILE_LEVEL":
str(level)
} for level in [
CompilationLevel.NO_COMPILATION,
CompilationLevel.DYNAMO_AS_IS,
CompilationLevel.DYNAMO_ONCE,
]]
all_envs: List[Optional[Dict[str, str]]] = []
for level in [
CompilationLevel.NO_COMPILATION,
CompilationLevel.DYNAMO_AS_IS,
CompilationLevel.DYNAMO_ONCE,
]:
all_envs.append({"VLLM_TORCH_COMPILE_LEVEL": str(level)})
if level != CompilationLevel.DYNAMO_ONCE and not fullgraph:
# "DYNAMO_ONCE" will always use fullgraph
all_envs[-1][
"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore

compare_all_settings(model, all_args, all_envs, method=method)
6 changes: 3 additions & 3 deletions tests/conftest.py
@@ -259,8 +259,7 @@ def __init__(
is_sentence_transformer: bool = False,
skip_tokenizer_init: bool = False,
auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM,
postprocess_inputs: Callable[[BatchEncoding],
BatchEncoding] = identity,
postprocess_inputs: Callable[..., BatchEncoding] = identity,
) -> None:
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]

@@ -303,6 +302,7 @@ def __init__(
if skip_tokenizer_init:
self.tokenizer = self.processor.tokenizer

self.dtype = dtype
self.postprocess_inputs = postprocess_inputs

def get_inputs(
Expand Down Expand Up @@ -337,7 +337,7 @@ def get_inputs(
processor_kwargs["sampling_rate"] = sr

inputs = self.processor(**processor_kwargs)
inputs = self.postprocess_inputs(inputs)
inputs = self.postprocess_inputs(inputs, dtype=self.dtype)

all_inputs.append(inputs)

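With the widened `Callable[..., BatchEncoding]` signature, `postprocess_inputs` hooks now also receive the runner's dtype string; a hedged sketch of a conforming hook (the cast logic is illustrative):

```python
import torch
from transformers import BatchEncoding

def cast_pixel_values(inputs: BatchEncoding, dtype: str) -> BatchEncoding:
    # Illustrative mapping; mirrors STR_DTYPE_TO_TORCH_DTYPE in spirit.
    torch_dtype = {"half": torch.float16, "bfloat16": torch.bfloat16,
                   "float": torch.float32}[dtype]
    # Cast floating-point tensors (e.g. pixel_values) to the model dtype.
    if "pixel_values" in inputs:
        inputs["pixel_values"] = inputs["pixel_values"].to(torch_dtype)
    return inputs
```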