Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Model] Extend Ultravox to accept audio longer than 30s #13631

Open
wants to merge 29 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
cc2df56
update ultravox to accept more than 30s audio
farzadab Feb 20, 2025
c7e0329
temporarily use model with updated processor for tests
farzadab Feb 20, 2025
0c5363e
fix collation
farzadab Feb 21, 2025
189f5cc
revert audio_replacement -> audio_token_replacement
farzadab Feb 25, 2025
bc3ba8c
increase max mm tokens
farzadab Feb 25, 2025
618e752
increase max mm tokens
farzadab Feb 25, 2025
0e62945
reduce max mm tokens
farzadab Feb 25, 2025
69278e2
revert increasing max mm tokens
farzadab Feb 25, 2025
788fd59
Merge remote-tracking branch 'upstream/main' into farzad-long-audio
farzadab Feb 25, 2025
75c138b
fix <|begin_of_text|> not being included
farzadab Feb 25, 2025
3b0e237
batching for whisper to avoid oom
farzadab Feb 26, 2025
97f6f5b
add comment
farzadab Feb 26, 2025
bea5a31
use flat_from_sizes for ultravox mm_fields_config
farzadab Feb 26, 2025
28f16ce
revert ultravox test model id
farzadab Feb 26, 2025
48c359b
improve documentation for double bos_id case
farzadab Feb 26, 2025
e920ab9
Merge remote-tracking branch 'upstream/main' into farzad-long-audio
farzadab Feb 26, 2025
4a54ea1
do not use vocab in get_hf_processor
farzadab Feb 26, 2025
347ada8
revert tests to use v0_5
farzadab Feb 26, 2025
e829dac
Merge remote-tracking branch 'upstream/main' into farzad-long-audio
farzadab Mar 1, 2025
b04878e
revert tests to use v0_5
farzadab Mar 1, 2025
631487f
adding tests for both ultravox v0.4 and v0.5
farzadab Mar 1, 2025
a9828ea
handle audio_num_chunks when no audio is passed
farzadab Mar 3, 2025
33a9cf0
drop test for ultravox v0_4
farzadab Mar 3, 2025
7ca61cf
drop matching Ultravox audio_features with cache
farzadab Mar 4, 2025
48f7da3
ignore exact match for audio_features in _items_by_modality
farzadab Mar 5, 2025
2813a47
fix type hint
farzadab Mar 5, 2025
66c10e4
Merge remote-tracking branch 'vllm-base/main' into farzad-long-audio
farzadab Mar 5, 2025
11ff27f
debug logs for ci
farzadab Mar 10, 2025
2776a31
if all else fails just stack?
farzadab Mar 10, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from ....utils import RemoteOpenAIServer
from ...utils import check_logprobs_close

MODEL_NAME = "fixie-ai/ultravox-v0_4"
MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b"

AudioTuple = tuple[np.ndarray, int]

Expand Down
57 changes: 50 additions & 7 deletions tests/models/multimodal/processing/test_common.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# SPDX-License-Identifier: Apache-2.0

import copy
from functools import partial
from typing import Optional

import numpy as np
import pytest
Expand All @@ -21,6 +23,7 @@ def _test_processing_correctness(
hit_rate: float,
num_batches: int,
simplify_rate: float,
ignore_mm_keys: Optional[list[str]] = None,
):
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
model_info.check_available_online(on_fail="skip")
Expand Down Expand Up @@ -123,26 +126,32 @@ def _test_processing_correctness(
hf_processor_mm_kwargs={},
)

assert baseline_result == cached_result, (
f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
assert _drop_mm_kwargs_keys(
baseline_result, ignore_mm_keys) == _drop_mm_kwargs_keys(
cached_result, ignore_mm_keys), (
f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")

baseline_tokenized_result = baseline_processor.apply(
tokenizer.encode(prompt, **tokenizer_encode_kwargs),
mm_data=mm_data,
hf_processor_mm_kwargs={},
)

assert baseline_result == baseline_tokenized_result, (
f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
assert _drop_mm_kwargs_keys(
baseline_result, ignore_mm_keys) == _drop_mm_kwargs_keys(
baseline_tokenized_result, ignore_mm_keys), (
f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")

cached_tokenized_result = cached_processor.apply(
tokenizer.encode(prompt, **tokenizer_encode_kwargs),
mm_data=mm_data,
hf_processor_mm_kwargs={},
)

assert cached_result == cached_tokenized_result, (
f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
assert _drop_mm_kwargs_keys(
cached_result, ignore_mm_keys) == _drop_mm_kwargs_keys(
cached_tokenized_result, ignore_mm_keys), (
f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")


# yapf: disable
Expand Down Expand Up @@ -173,7 +182,7 @@ def _test_processing_correctness(
"Qwen/Qwen2-VL-2B-Instruct",
"Qwen/Qwen2.5-VL-3B-Instruct",
"Qwen/Qwen2-Audio-7B-Instruct",
"fixie-ai/ultravox-v0_4",
"fixie-ai/ultravox-v0_5-llama-3_2-1b",
"openai/whisper-large-v3",
])
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
Expand All @@ -186,11 +195,19 @@ def test_processing_correctness(
num_batches: int,
simplify_rate: float,
):
ignore_mm_keys = None
if 'ultravox' in model_id:
# In Ultravox, the audio_features can be different depending on padding
# The slight difference should not be a problem though, since
# attention_mask lets us ignore the difference.
ignore_mm_keys = ['audio_features']

_test_processing_correctness(
model_id,
hit_rate=hit_rate,
num_batches=num_batches,
simplify_rate=simplify_rate,
ignore_mm_keys=ignore_mm_keys,
)


Expand Down Expand Up @@ -219,3 +236,29 @@ def test_processing_correctness_phi3v(
num_batches=num_batches,
simplify_rate=simplify_rate,
)


def _drop_mm_kwargs_keys(result: dict,
ignore_mm_keys: Optional[list[str]] = None) -> dict:
"""Drop specified keys from result['mm_kwargs'].

This is mainly to avoid doing exact match of audio_features in ultravox.

Args:
result: Result to drop keys from
ignore_mm_keys: List of keys to ignore, e.g. ['audio_features']
"""
if not ignore_mm_keys:
return result

if 'mm_kwargs' in result:
result = copy.deepcopy(result)
mm_kwargs = result['mm_kwargs']
for key in ignore_mm_keys:
mm_kwargs.pop(key, None)
for items in mm_kwargs._items_by_modality.values():
for item in items:
for key in ignore_mm_keys:
item.pop(key, None)

return result
3 changes: 1 addition & 2 deletions tests/models/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,8 +284,7 @@ def check_available_online(
"Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501
"Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct", # noqa: E501
min_transformers_version="4.49"), # noqa: E501
"UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_4",
extras={"v0.5": "fixie-ai/ultravox-v0_5-llama-3_2-1b"}, # noqa: E501
"UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b", # noqa: E501
trust_remote_code=True),
# [Encoder-decoder]
# Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
Expand Down
Loading