[vlm] Remove vision language config. #6089

Merged
merged 11 commits on Jul 3, 2024
7 changes: 1 addition & 6 deletions docs/source/models/vlm.rst
@@ -33,12 +33,7 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM``

.. code-block:: python

-    llm = LLM(
-        model="llava-hf/llava-1.5-7b-hf",
-        image_token_id=32000,
-        image_input_shape="1,3,336,336",
-        image_feature_size=576,
-    )
+    llm = LLM(model="llava-hf/llava-1.5-7b-hf")

.. important::
Currently, you have to specify ``image_feature_size`` to support memory profiling.
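After this change, the one-liner above is all the documentation needs: the vision-specific constructor flags are gone. Below is a minimal, hedged sketch of how the simplified API could be exercised end to end; the `generate` call with a `multi_modal_data` dict and the local image path are assumptions based on vLLM's multimodal examples of this period, not part of this diff.

```python
# Hedged sketch (not from this diff): construct a LLaVA model without the
# removed vision-language flags and run a single image prompt.
from PIL import Image

from vllm import LLM

llm = LLM(model="llava-hf/llava-1.5-7b-hf")

prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
image = Image.open("images/cherry_blossom.jpg")  # any local RGB image

# Assumption: the multi_modal_data dict interface; check the docs for the
# exact generate() signature in your vLLM version.
outputs = llm.generate({
    "prompt": prompt,
    "multi_modal_data": {"image": image},
})
print(outputs[0].outputs[0].text)
```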
7 changes: 1 addition & 6 deletions examples/llava_example.py
@@ -10,12 +10,7 @@


def run_llava():
-    llm = LLM(
-        model="llava-hf/llava-1.5-7b-hf",
-        image_token_id=32000,
-        image_input_shape="1,3,336,336",
-        image_feature_size=576,
-    )
+    llm = LLM(model="llava-hf/llava-1.5-7b-hf")

prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"

8 changes: 1 addition & 7 deletions examples/llava_next_example.py
@@ -7,13 +7,7 @@


def run_llava_next():
-    llm = LLM(
-        model="llava-hf/llava-v1.6-mistral-7b-hf",
-        image_token_id=32000,
-        image_input_shape="1,3,336,336",
-        # Use the maximum possible value for memory profiling
-        image_feature_size=2928,
-    )
+    llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf")

prompt = "[INST] <image>\nWhat is shown in this image? [/INST]"
url = "https://h2o-release.s3.amazonaws.com/h2ogpt/bigben.jpg"
3 changes: 0 additions & 3 deletions examples/openai_vision_api_client.py
@@ -3,9 +3,6 @@
Launch the vLLM server with the following command:
python -m vllm.entrypoints.openai.api_server \
--model llava-hf/llava-1.5-7b-hf \
-    --image-token-id 32000 \
-    --image-input-shape 1,3,336,336 \
-    --image-feature-size 576 \
--chat-template template_llava.jinja
"""
import base64
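For completeness, here is a hedged sketch of how a client might query the server launched with the simplified command above, using the OpenAI Python SDK against vLLM's OpenAI-compatible endpoint. The base URL, dummy API key, and port are assumptions (vLLM's defaults), not taken from this diff; the image URL is the one used elsewhere in these examples.

```python
# Hedged sketch: query the vLLM OpenAI-compatible server started above.
from openai import OpenAI

# Assumptions: default host/port and a placeholder API key.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

chat_response = client.chat.completions.create(
    model="llava-hf/llava-1.5-7b-hf",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What's in this image?"},
            {
                "type": "image_url",
                "image_url": {
                    "url": "https://h2o-release.s3.amazonaws.com/h2ogpt/bigben.jpg"
                },
            },
        ],
    }],
)
print(chat_response.choices[0].message.content)
```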
6 changes: 1 addition & 5 deletions examples/phi3v_example.py
@@ -19,11 +19,7 @@ def run_phi3v():
llm = LLM(
model=model_path,
trust_remote_code=True,
-        image_token_id=32044,
-        image_input_shape="1,3,1008,1344",
-        # Use the maximum possible value for memory profiling
-        image_feature_size=2653,
-        max_num_seqs=5,
+        max_num_seqs=1,
)

image = Image.open("images/cherry_blossom.jpg")
6 changes: 3 additions & 3 deletions tests/distributed/test_multimodal_broadcast.py
@@ -20,9 +20,9 @@
model = os.environ["TEST_DIST_MODEL"]

if model.startswith("llava-hf/llava"):
-    from ..models.test_llava import model_and_vl_config, run_test
+    from ..models.test_llava import models, run_test
elif model.startswith("microsoft/Phi-3-vision"):
-    from ..models.test_phi3v import model_and_vl_config, run_test
+    from ..models.test_phi3v import models, run_test
else:
raise NotImplementedError(f"Unsupported model: {model}")

@@ -44,7 +44,7 @@ def test_models(hf_runner, vllm_runner, image_assets,
hf_runner,
vllm_runner,
image_assets,
-        model_and_config=model_and_vl_config[0],
+        model=models[0],
size_factors=[1.0],
dtype=dtype,
max_tokens=max_tokens,
60 changes: 17 additions & 43 deletions tests/models/test_llava.py
@@ -3,7 +3,6 @@
import pytest
from transformers import AutoTokenizer

-from vllm.config import VisionLanguageConfig
from vllm.multimodal.utils import rescale_image_size
from vllm.sequence import SampleLogprobs

@@ -21,49 +20,27 @@
"USER: <image>\nWhat's in this image?\nASSISTANT:",
})

+IMAGE_TOKEN_ID = 32000

-def iter_llava_configs(model_name: str):
-    image_hw_to_feature_size = {
-        (336, 336): 576,
-    }
-
-    for (h, w), f in image_hw_to_feature_size.items():
-        input_shape = (1, 3, h, w)
-        yield (model_name,
-               VisionLanguageConfig(image_feature_size=f,
-                                    image_token_id=32000,
-                                    image_input_shape=input_shape))
-
-
-model_and_vl_config = [
-    *iter_llava_configs("llava-hf/llava-1.5-7b-hf"),
-]
+models = ["llava-hf/llava-1.5-7b-hf"]


def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
Optional[SampleLogprobs]],
-                      vlm_config: VisionLanguageConfig, model_id: str):
-    """Sanitize vllm output to be comparable with hf output.
-    The function reduces `input_ids` from 1, 32000, 32000, ..., 32000,
-    x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
-    It also reduces `output_str` from "<image><image>bla" to "bla".
-    """
+                      model: str):
+    """Sanitize vllm output to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
-    image_token_id = vlm_config.image_token_id

-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    image_token_str = tokenizer.decode(image_token_id)
+    tokenizer = AutoTokenizer.from_pretrained(model)
eos_token_id = tokenizer.eos_token_id

hf_output_ids = [
token_id for idx, token_id in enumerate(output_ids)
-        if token_id != image_token_id or output_ids[idx - 1] != image_token_id
+        if token_id != IMAGE_TOKEN_ID or output_ids[idx - 1] != IMAGE_TOKEN_ID
]

-    hf_output_str = output_str \
-        .replace(image_token_str * vlm_config.image_feature_size, "")
-    assert hf_output_str[0] == " "
-    hf_output_str = hf_output_str[1:]
+    assert output_str[0] == " "
+    hf_output_str = output_str[1:]
if hf_output_ids[-1] == eos_token_id:
hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)

@@ -74,7 +51,7 @@ def run_test(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
image_assets: _ImageAssets,
-    model_and_config: Tuple[str, VisionLanguageConfig],
+    model,
*,
size_factors: List[float],
dtype: str,
@@ -92,7 +69,6 @@ def run_test(
Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf.
"""
-    model_id, vlm_config = model_and_config
images = [asset.pil_image for asset in image_assets]

inputs_per_image = [(
@@ -106,12 +82,11 @@ def run_test(
# will hurt multiprocessing backend with fork method (the default method).

# max_model_len should be greater than image_feature_size
-    with vllm_runner(model_id,
+    with vllm_runner(model,
dtype=dtype,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
-                     enforce_eager=True,
-                     **vlm_config.as_cli_args_dict()) as vllm_model:
+                     enforce_eager=True) as vllm_model:
vllm_outputs_per_image = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
@@ -120,7 +95,7 @@ def run_test(
for prompts, images in inputs_per_image
]

-    with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
+    with hf_runner(model, dtype=dtype, is_vision_model=True) as hf_model:
hf_outputs_per_image = [
hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens,
@@ -136,15 +111,15 @@ def run_test(
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=[
-            vllm_to_hf_output(vllm_output, vlm_config, model_id)
+            vllm_to_hf_output(vllm_output, model)
for vllm_output in vllm_outputs
],
name_0="hf",
name_1="vllm",
)


@pytest.mark.parametrize("model_and_config", model_and_vl_config)
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
"size_factors",
[
@@ -161,14 +136,13 @@ def run_test(
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
-def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
-                size_factors, dtype: str, max_tokens: int,
-                num_logprobs: int) -> None:
+def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
+                dtype: str, max_tokens: int, num_logprobs: int) -> None:
run_test(
hf_runner,
vllm_runner,
image_assets,
-        model_and_config,
+        model,
size_factors=size_factors,
dtype=dtype,
max_tokens=max_tokens,
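The sanitization in this file hinges on collapsing the run of image placeholder tokens that vLLM keeps in its output ids. A small self-contained illustration of that rule (not part of the diff; the token values are invented except for the 32000 placeholder id used in the test):

```python
# Keep only the first token of each consecutive IMAGE_TOKEN_ID run, mirroring
# the comprehension in vllm_to_hf_output above.
IMAGE_TOKEN_ID = 32000


def collapse_image_tokens(output_ids):
    return [
        tok for idx, tok in enumerate(output_ids)
        if tok != IMAGE_TOKEN_ID or output_ids[idx - 1] != IMAGE_TOKEN_ID
    ]


# <bos>, three image placeholders, then ordinary text tokens
ids = [1, IMAGE_TOKEN_ID, IMAGE_TOKEN_ID, IMAGE_TOKEN_ID, 319, 9215]
assert collapse_image_tokens(ids) == [1, IMAGE_TOKEN_ID, 319, 9215]
```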
54 changes: 13 additions & 41 deletions tests/models/test_llava_next.py
@@ -4,7 +4,6 @@
import pytest
from transformers import AutoTokenizer

-from vllm.config import VisionLanguageConfig
from vllm.multimodal.utils import rescale_image_size
from vllm.sequence import SampleLogprobs

@@ -27,46 +26,22 @@
f"{_PREFACE} USER: <image>\nWhat's in this image? ASSISTANT:",
})


-def iter_llava_next_configs(model_name: str):
-    # Need to use the max possible feature size for profile_run
-    image_hw_to_feature_size = {
-        (336, 336): 2928,
-    }
-
-    for (h, w), f in image_hw_to_feature_size.items():
-        input_shape = (1, 3, h, w)
-        yield (model_name,
-               VisionLanguageConfig(
-                   image_feature_size=f,
-                   image_token_id=32000,
-                   image_input_shape=input_shape,
-               ))
-
-
-model_and_vl_config = [
-    *iter_llava_next_configs("llava-hf/llava-v1.6-vicuna-7b-hf"),
-]
+IMAGE_TOKEN_ID = 32000


def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
Optional[SampleLogprobs]],
-                      vlm_config: VisionLanguageConfig, model_id: str):
-    """Sanitize vllm output to be comparable with hf output.
-    The function reduces `input_ids` from 1, 32000, 32000, ..., 32000,
-    x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
-    It also reduces `output_str` from "<image><image>bla" to "bla".
-    """
+                      model: str):
+    """Sanitize vllm output to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
-    image_token_id = vlm_config.image_token_id

-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    image_token_str = tokenizer.decode(image_token_id)
+    tokenizer = AutoTokenizer.from_pretrained(model)
+    image_token_str = tokenizer.decode(IMAGE_TOKEN_ID)
eos_token_id = tokenizer.eos_token_id

hf_output_ids = [
token_id for idx, token_id in enumerate(output_ids)
-        if token_id != image_token_id or output_ids[idx - 1] != image_token_id
+        if token_id != IMAGE_TOKEN_ID or output_ids[idx - 1] != IMAGE_TOKEN_ID
]

hf_output_str = re.sub(fr"({image_token_str})+", "", output_str)
@@ -78,7 +53,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
return hf_output_ids, hf_output_str, out_logprobs


@pytest.mark.parametrize("model_and_config", model_and_vl_config)
@pytest.mark.parametrize("model", ["llava-hf/llava-v1.6-vicuna-7b-hf"])
@pytest.mark.parametrize(
"size_factors",
[
@@ -95,9 +70,8 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
-def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
-                size_factors, dtype: str, max_tokens: int,
-                num_logprobs: int) -> None:
+def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
+                dtype: str, max_tokens: int, num_logprobs: int) -> None:
"""Inference result should be the same between hf and vllm.

All the image fixtures for the test is under tests/images.
@@ -107,7 +81,6 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf.
"""
-    model_id, vlm_config = model_and_config
images = [asset.pil_image for asset in image_assets]

inputs_per_image = [(
@@ -116,11 +89,10 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]

# max_model_len should be greater than image_feature_size
-    with vllm_runner(model_id,
+    with vllm_runner(model,
dtype=dtype,
max_model_len=4096,
-                     enforce_eager=True,
-                     **vlm_config.as_cli_args_dict()) as vllm_model:
+                     enforce_eager=True) as vllm_model:
vllm_outputs_per_image = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
@@ -129,7 +101,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
for prompts, images in inputs_per_image
]

-    with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
+    with hf_runner(model, dtype=dtype, is_vision_model=True) as hf_model:
hf_outputs_per_image = [
hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens,
@@ -145,7 +117,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=[
-            vllm_to_hf_output(vllm_output, vlm_config, model_id)
+            vllm_to_hf_output(vllm_output, model)
for vllm_output in vllm_outputs
],
name_0="hf",
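test_llava_next.py keeps the same id-level dedup but cleans the decoded string with a regex instead of a fixed-length replace. A standalone illustration of that regex step (not from the diff; the sample string is invented, and re.escape is added for safety even though "<image>" contains no regex metacharacters):

```python
import re

# Strip runs of the decoded image placeholder before comparing against HF output.
image_token_str = "<image>"
output_str = "<image><image><image> The image shows Big Ben at dusk."

cleaned = re.sub(fr"({re.escape(image_token_str)})+", "", output_str)
assert cleaned == " The image shows Big Ben at dusk."
```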