[Model] Add user-configurable task for models that support both generation and embedding (#9424)
DarkLight1337 authored Oct 18, 2024
1 parent 7dbe738 commit 051eaf6
Showing 33 changed files with 451 additions and 201 deletions.
8 changes: 8 additions & 0 deletions docs/source/models/supported_models.rst
@@ -294,6 +294,10 @@ Text Embedding
-
- ✅︎

+ .. important::
+    Some model architectures support both generation and embedding tasks.
+    In this case, you have to pass :code:`--task embedding` to run the model in embedding mode.

Reward Modeling
---------------

@@ -482,6 +486,10 @@ Multimodal Embedding
- 🚧
- ✅︎

+ .. important::
+    Some model architectures support both generation and embedding tasks.
+    In this case, you have to pass :code:`--task embedding` to run the model in embedding mode.

----

If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
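For architectures that appear in both the generation and embedding tables, this new ``task`` option is what selects the mode at load time. A minimal offline sketch (not part of the diff; the model name is a placeholder and ``LLM.encode()`` is the pre-existing embedding entry point):

.. code-block:: python

    from vllm import LLM

    # Placeholder checkpoint: substitute any architecture listed in both the
    # generation and embedding tables above.
    llm = LLM(model="your-org/dual-capability-model", task="embedding")

    # encode() returns one EmbeddingRequestOutput per prompt; the vector is
    # exposed as .outputs.embedding (a plain list of floats).
    outputs = llm.encode(["Hello, my name is", "The capital of France is"])
    for output in outputs:
        print(len(output.outputs.embedding))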
4 changes: 2 additions & 2 deletions docs/source/models/vlm.rst
@@ -181,8 +181,8 @@ Below is an example on how to launch the same ``microsoft/Phi-3.5-vision-instruc

.. code-block:: bash
- vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \
-     --trust-remote-code --limit-mm-per-prompt image=2
+ vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
+     --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2
.. important::
Since OpenAI Vision API is based on `Chat Completions <https://platform.openai.com/docs/api-reference/chat>`_ API,
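Once the server is launched with ``--task generate`` as above, requests still go through the OpenAI-compatible Chat Completions route. A hedged client sketch (the base URL, image URL, and prompt are illustrative, not taken from the diff):

.. code-block:: python

    from openai import OpenAI

    # vllm serve listens on port 8000 by default; the API key is unused here.
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

    response = client.chat.completions.create(
        model="microsoft/Phi-3.5-vision-instruct",
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": "What is shown in this image?"},
                # Illustrative URL; replace with an image the server can fetch.
                {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}},
            ],
        }],
    )
    print(response.choices[0].message.content)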
1 change: 1 addition & 0 deletions examples/offline_inference_vision_language_embedding.py
@@ -7,6 +7,7 @@
# Create an LLM.
llm = LLM(
model="TIGER-Lab/VLM2Vec-Full",
task="embedding",
trust_remote_code=True,
max_model_len=4096,
max_num_seqs=2,
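The remainder of this example (collapsed above) passes an image together with an instruction prompt to the embedding model. A sketch of how that call typically looks, assuming a local image file and an illustrative VLM2Vec-style prompt (the exact prompt template is model-specific):

.. code-block:: python

    from PIL import Image

    # Illustrative inputs; adjust the prompt template to the model's format.
    prompt = "<|image_1|> Represent the given image with the following question: What is in the image"
    image = Image.open("cherry_blossom.jpg")

    # Multimodal inputs are passed as a dict of prompt text plus raw image data.
    outputs = llm.encode({"prompt": prompt, "multi_modal_data": {"image": image}})
    print(len(outputs[0].outputs.embedding))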
4 changes: 2 additions & 2 deletions examples/openai_api_client_for_multimodal.py
@@ -7,8 +7,8 @@
vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
(multi-image inference with Phi-3.5-vision-instruct)
- vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \
-     --trust-remote-code --limit-mm-per-prompt image=2
+ vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
+     --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2
(audio inference with Ultravox)
vllm serve fixie-ai/ultravox-v0_3 --max-model-len 4096
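The same ``--task`` switch also applies when a dual-capability model should be served for embeddings instead of generation, in which case the OpenAI-compatible server exposes its embeddings route rather than chat completions. A hedged sketch (model name, port, and input are illustrative; support for any particular checkpoint over this route is an assumption):

.. code-block:: python

    # Assumed launch command, e.g.:
    #   vllm serve your-org/dual-capability-model --task embedding
    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
    result = client.embeddings.create(
        model="your-org/dual-capability-model",
        input=["A photo of cherry blossoms in spring"],
    )
    print(len(result.data[0].embedding))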
4 changes: 3 additions & 1 deletion tests/conftest.py
@@ -25,7 +25,7 @@
from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
- from vllm.config import TokenizerPoolConfig
+ from vllm.config import TaskOption, TokenizerPoolConfig
from vllm.connections import global_http_connection
from vllm.distributed import (destroy_distributed_environment,
destroy_model_parallel,
@@ -619,6 +619,7 @@ class VllmRunner:
def __init__(
self,
model_name: str,
+ task: TaskOption = "auto",
tokenizer_name: Optional[str] = None,
# Use smaller max model length, otherwise bigger model cannot run due
# to kv cache size limit.
@@ -634,6 +635,7 @@ def __init__(
) -> None:
self.model = LLM(
model=model_name,
+ task=task,
tokenizer=tokenizer_name,
trust_remote_code=True,
dtype=dtype,
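With the new ``task`` parameter plumbed through ``VllmRunner``, a test can exercise either mode of the same checkpoint. A hypothetical test body (not part of the diff), assuming the runner's existing context-manager and ``encode`` helpers and the ``example_prompts`` fixture:

.. code-block:: python

    def test_dual_capability_embedding(example_prompts):
        # Model name is illustrative; any dual-capability checkpoint works.
        with VllmRunner("your-org/dual-capability-model", task="embedding",
                        dtype="half") as vllm_model:
            embeddings = vllm_model.encode(example_prompts)
            assert len(embeddings) == len(example_prompts)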
15 changes: 13 additions & 2 deletions tests/core/test_chunked_prefill_scheduler.py
@@ -33,7 +33,8 @@ def test_simple():
num_seq_group = 4
max_model_len = 16
max_num_batched_tokens = 64
- scheduler_config = SchedulerConfig(max_num_batched_tokens,
+ scheduler_config = SchedulerConfig("generate",
+ max_num_batched_tokens,
num_seq_group,
max_model_len,
enable_chunked_prefill=True)
@@ -78,6 +79,7 @@ def test_chunk():
max_model_len = 80
max_num_batched_tokens = 64
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
@@ -126,6 +128,7 @@ def test_complex():
max_model_len = 80
max_num_batched_tokens = 64
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
@@ -196,6 +199,7 @@ def test_maximal_decoding():
max_model_len = 8
max_num_batched_tokens = 2
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
@@ -289,6 +293,7 @@ def test_prompt_limit():
max_model_len = 64
max_num_batched_tokens = 32
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
@@ -321,7 +326,8 @@ def test_prompt_limit_exceed():
max_seqs = 64
max_model_len = 32
max_num_batched_tokens = 64
- scheduler_config = SchedulerConfig(max_num_batched_tokens,
+ scheduler_config = SchedulerConfig("generate",
+ max_num_batched_tokens,
max_seqs,
max_model_len,
enable_chunked_prefill=True)
@@ -348,6 +354,7 @@ def test_swap():
max_model_len = 200
max_num_batched_tokens = 30
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
@@ -404,6 +411,7 @@ def test_running_prefill_prioritized_over_swap():
max_model_len = 200
max_num_batched_tokens = 30
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
@@ -498,6 +506,7 @@ def test_chunked_prefill_preempt():
max_model_len = 200
max_num_batched_tokens = 30
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
@@ -563,6 +572,7 @@ def test_chunked_prefill_max_seqs():
max_model_len = 80
max_num_batched_tokens = 64
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
@@ -617,6 +627,7 @@ def test_perfix_caching():
max_model_len = 80
max_num_batched_tokens = 64
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
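The pattern repeats at every call site in this file: the task string now comes first, ahead of the batching limits. A consolidated sketch of one updated construction, assembled from the hunks above (not an authoritative signature):

.. code-block:: python

    scheduler_config = SchedulerConfig(
        "generate",               # the task now leads the argument list
        max_num_batched_tokens,   # per-step token budget
        max_seqs,                 # maximum concurrently scheduled sequences
        max_model_len,
        enable_chunked_prefill=True,
    )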
56 changes: 32 additions & 24 deletions tests/core/test_scheduler.py
@@ -20,9 +20,10 @@
def test_scheduler_add_seq_group():
block_size = 4
scheduler_config = SchedulerConfig(
- 100,
- 64,
- 1,
+ "generate",
+ max_num_batched_tokens=100,
+ max_num_seqs=64,
+ max_model_len=1,
)
cache_config = CacheConfig(block_size, 1.0, 1, cache_dtype="auto")
cache_config.num_cpu_blocks = 4
@@ -42,9 +43,10 @@ def test_scheduler_abort_seq_group():
def test_scheduler_abort_seq_group():
block_size = 4
scheduler_config = SchedulerConfig(
- 100,
- 64,
- 1,
+ "generate",
+ max_num_batched_tokens=100,
+ max_num_seqs=64,
+ max_model_len=1,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 4
@@ -70,9 +72,10 @@ def test_scheduler_schedule_simple():
num_seq_group = 4
max_model_len = 16
scheduler_config = SchedulerConfig(
- 64,
- num_seq_group,
- max_model_len,
+ "generate",
+ max_num_batched_tokens=64,
+ max_num_seqs=num_seq_group,
+ max_model_len=max_model_len,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8
@@ -114,9 +117,10 @@ def test_scheduler_prefill_prioritized():
max_model_len = 30
max_batched_num_tokens = 30
scheduler_config = SchedulerConfig(
- max_batched_num_tokens,
- 2,
- max_model_len,
+ "generate",
+ max_num_batched_tokens=max_batched_num_tokens,
+ max_num_seqs=2,
+ max_model_len=max_model_len,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 16
@@ -145,9 +149,10 @@ def test_scheduler_schedule_preempt_abort():
block_size = 4
max_model_len = 16
scheduler_config = SchedulerConfig(
- 64,
- 2,
- max_model_len,
+ "generate",
+ max_num_batched_tokens=64,
+ max_num_seqs=2,
+ max_model_len=max_model_len,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 2
@@ -204,9 +209,10 @@ def test_scheduler_max_seqs():
max_seq_group = 2
max_model_len = 16
scheduler_config = SchedulerConfig(
- 64,
- max_seq_group,
- max_model_len,
+ "generate",
+ max_num_batched_tokens=64,
+ max_num_seqs=max_seq_group,
+ max_model_len=max_model_len,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8
@@ -248,9 +254,10 @@ def test_scheduler_max_seqs():
def test_scheduler_delay_factor():
block_size = 4
scheduler_config = SchedulerConfig(
- 100,
- 64,
- 16,
+ "generate",
+ max_num_batched_tokens=100,
+ max_num_seqs=64,
+ max_model_len=16,
delay_factor=0.5,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
@@ -350,9 +357,10 @@ def initialize_scheduler(
):
block_size = block_size
scheduler_config = SchedulerConfig(
- max_token_budget,
- max_num_seqs,
- max_model_len,
+ "generate",
+ max_num_batched_tokens=max_token_budget,
+ max_num_seqs=max_num_seqs,
+ max_model_len=max_model_len,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = num_cpu_blocks
7 changes: 6 additions & 1 deletion tests/core/test_scheduler_encoder_decoder.py
@@ -36,7 +36,12 @@ def test_scheduler_schedule_simple_encoder_decoder():
block_size = 4
num_seq_group = 4
max_model_len = 16
- scheduler_config = SchedulerConfig(64, num_seq_group, max_model_len)
+ scheduler_config = SchedulerConfig(
+ task="generate",
+ max_num_batched_tokens=64,
+ max_num_seqs=num_seq_group,
+ max_model_len=max_model_len,
+ )
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 16 # enc and dec prompts per seq_group
cache_config.num_gpu_blocks = 16 # enc and dec prompts per seq_group