[WIP][Model] Add support for multiple audio chunks/audio URLs #7826

Closed · wants to merge 1 commit
58 changes: 34 additions & 24 deletions examples/offline_inference_audio_language.py
@@ -11,25 +11,33 @@
 from vllm.assets.audio import AudioAsset
 from vllm.utils import FlexibleArgumentParser
 
-# Input audio and question
-audio_and_sample_rate = AudioAsset("mary_had_lamb").audio_and_sample_rate
-question = "What is recited in the audio?"
+audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
+question_per_audio_count = [
+    "What is recited in the audio?",
+    "What sport and what nursery rhyme are referenced?"
+]
 
 
 # Ultravox 0.3
-def run_ultravox(question):
+def run_ultravox(question, audio_count):
     model_name = "fixie-ai/ultravox-v0_3"
 
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     messages = [{
-        'role': 'user',
-        'content': f"<|reserved_special_token_0|>\n{question}"
+        'role':
+        'user',
+        'content':
+        "<|reserved_special_token_0|>\n" * audio_count + question
     }]
     prompt = tokenizer.apply_chat_template(messages,
                                            tokenize=False,
                                            add_generation_prompt=True)
 
-    llm = LLM(model=model_name)
+    llm = LLM(model=model_name,
+              enforce_eager=True,
+              enable_chunked_prefill=False,
+              max_model_len=8192,
+              limit_mm_per_prompt={"audio": audio_count})
     stop_token_ids = None
     return llm, prompt, stop_token_ids
 
@@ -44,7 +52,9 @@ def main(args):
     if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")
 
-    llm, prompt, stop_token_ids = model_example_map[model](question)
+    audio_count = args.num_audios
+    llm, prompt, stop_token_ids = model_example_map[model](
+        question_per_audio_count[audio_count - 1], audio_count)
 
     # We set temperature to 0.2 so that outputs can be different
     # even when all prompts are identical when running batch inference.
@@ -53,23 +63,18 @@
                                      stop_token_ids=stop_token_ids)
 
     assert args.num_prompts > 0
-    if args.num_prompts == 1:
-        # Single inference
-        inputs = {
-            "prompt": prompt,
-            "multi_modal_data": {
-                "audio": audio_and_sample_rate
-            },
-        }
-
-    else:
+    inputs = {
+        "prompt": prompt,
+        "multi_modal_data": {
+            "audio": [
+                asset.audio_and_sample_rate
+                for asset in audio_assets[:audio_count]
+            ]
+        },
+    }
+    if args.num_prompts > 1:
         # Batch inference
-        inputs = [{
-            "prompt": prompt,
-            "multi_modal_data": {
-                "audio": audio_and_sample_rate
-            },
-        } for _ in range(args.num_prompts)]
+        inputs = [inputs] * args.num_prompts
 
     outputs = llm.generate(inputs, sampling_params=sampling_params)
 
@@ -92,6 +97,11 @@ def main(args):
                         type=int,
                         default=1,
                         help='Number of prompts to run.')
+    parser.add_argument("--num-audios",
+                        type=int,
+                        default=1,
+                        choices=[1, 2],
+                        help="Number of audio items per prompt.")
 
     args = parser.parse_args()
     main(args)
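
Taken together, the changes above reduce to the following flow; this is a condensed, self-contained sketch of what the updated example does (the full script is run with `python examples/offline_inference_audio_language.py --num-audios 2`), not a separate API:

```python
from transformers import AutoTokenizer

from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset

model_name = "fixie-ai/ultravox-v0_3"
audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
audio_count = len(audio_assets)

# One <|reserved_special_token_0|> placeholder per audio clip, then the question.
tokenizer = AutoTokenizer.from_pretrained(model_name)
messages = [{
    'role': 'user',
    'content': "<|reserved_special_token_0|>\n" * audio_count +
               "What sport and what nursery rhyme are referenced?"
}]
prompt = tokenizer.apply_chat_template(messages,
                                       tokenize=False,
                                       add_generation_prompt=True)

# limit_mm_per_prompt raises the per-prompt audio cap above the default of 1.
llm = LLM(model=model_name,
          enforce_eager=True,
          enable_chunked_prefill=False,
          max_model_len=8192,
          limit_mm_per_prompt={"audio": audio_count})

# "audio" now takes a list of (numpy_array, sample_rate) tuples, one per clip.
outputs = llm.generate(
    {
        "prompt": prompt,
        "multi_modal_data": {
            "audio": [asset.audio_and_sample_rate for asset in audio_assets]
        },
    },
    sampling_params=SamplingParams(temperature=0.2, max_tokens=64))
print(outputs[0].outputs[0].text)
```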
42 changes: 37 additions & 5 deletions examples/openai_audio_api_client.py
@@ -23,9 +23,6 @@
 models = client.models.list()
 model = models.data[0].id
 
-# Any format supported by librosa is supported
-audio_url = AudioAsset("winning_call").url
-
 # Use audio url in the payload
 chat_completion_from_url = client.chat.completions.create(
     messages=[{
@@ -39,7 +36,8 @@
         {
             "type": "audio_url",
             "audio_url": {
-                "url": audio_url
+                # Any format supported by librosa is supported
+                "url": AudioAsset("winning_call").url
             },
         },
     ],
@@ -63,7 +61,8 @@ def encode_audio_base64_from_url(audio_url: str) -> str:
     return result
 
 
-audio_base64 = encode_audio_base64_from_url(audio_url=audio_url)
+audio_base64 = encode_audio_base64_from_url(
+    audio_url=AudioAsset("winning_call").url)
 chat_completion_from_base64 = client.chat.completions.create(
     messages=[{
         "role":
@@ -88,3 +87,36 @@ def encode_audio_base64_from_url(audio_url: str) -> str:
 
 result = chat_completion_from_base64.choices[0].message.content
 print(f"Chat completion output:{result}")
+
+chat_completion_from_base64 = client.chat.completions.create(
+    messages=[{
+        "role":
+        "user",
+        "content": [
+            {
+                "type": "text",
+                "text": "What sport and what nursery rhyme are referenced?"
+            },
+            {
+                "type": "audio_url",
+                "audio_url": {
+                    # Any format supported by librosa is supported
+                    "url": AudioAsset("mary_had_lamb").url
+                },
+            },
+            {
+                "type": "audio_url",
+                "audio_url": {
+                    # Any format supported by librosa is supported
+                    "url": AudioAsset("winning_call").url
+                },
+            },
+        ],
+    }],
+    model=model,
+    max_tokens=64,
+    temperature=0.0,
+)
+
+result = chat_completion_from_base64.choices[0].message.content
+print(f"Chat completion output:{result}")
72 changes: 35 additions & 37 deletions tests/entrypoints/openai/test_audio.py
@@ -11,17 +11,15 @@
 MODEL_NAME = "fixie-ai/ultravox-v0_3"
 TEST_AUDIO_URLS = [
     AudioAsset("winning_call").url,
+    AudioAsset("mary_had_lamb").url
 ]
 
 
 @pytest.fixture(scope="module")
 def server():
     args = [
-        "--dtype",
-        "bfloat16",
-        "--max-model-len",
-        "4096",
-        "--enforce-eager",
+        "--dtype", "bfloat16", "--max-model-len", "4096", "--enforce-eager",
+        "--limit-mm-per-prompt", f"audio={len(TEST_AUDIO_URLS)}"
     ]
 
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@@ -73,8 +71,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
 
     choice = chat_completion.choices[0]
     assert choice.finish_reason == "length"
-    assert chat_completion.usage == openai.types.CompletionUsage(
-        completion_tokens=10, prompt_tokens=202, total_tokens=212)
+    assert chat_completion.usage.completion_tokens == 10
 
     message = choice.message
     message = chat_completion.choices[0].message
@@ -128,8 +125,7 @@ async def test_single_chat_session_audio_base64encoded(
 
     choice = chat_completion.choices[0]
     assert choice.finish_reason == "length"
-    assert chat_completion.usage == openai.types.CompletionUsage(
-        completion_tokens=10, prompt_tokens=202, total_tokens=212)
+    assert chat_completion.usage.completion_tokens == 10
 
     message = choice.message
     message = chat_completion.choices[0].message
@@ -204,50 +200,52 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
             assert delta.content
         assert "".join(chunks) == output
 
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
-async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
-                                 audio_url: str):
-
+async def test_multiple_audio_urls(client: openai.AsyncOpenAI,
+                                   model_name: str):
     messages = [{
         "role":
         "user",
         "content": [
-            {
-                "type": "audio_url",
-                "audio_url": {
-                    "url": audio_url
-                }
-            },
-            {
+            *({
                 "type": "audio_url",
                 "audio_url": {
                     "url": audio_url
                 }
-            },
+            } for audio_url in TEST_AUDIO_URLS),
             {
                 "type": "text",
-                "text": "What's happening in this audio?"
+                "text": "What sport and what nursery rhyme are referenced?"
             },
         ],
     }]
-
-    with pytest.raises(openai.BadRequestError):  # test multi-audio input
-        await client.chat.completions.create(
-            model=model_name,
-            messages=messages,
-            max_tokens=10,
-            temperature=0.0,
-        )
+    # test single completion
+    chat_completion = await client.chat.completions.create(model=model_name,
+                                                           messages=messages,
+                                                           max_tokens=10,
+                                                           logprobs=True,
+                                                           temperature=0.0,
+                                                           top_logprobs=5)
+    assert len(chat_completion.choices) == 1
+
+    choice = chat_completion.choices[0]
+    assert choice.finish_reason == "length"
+    assert chat_completion.usage.completion_tokens == 10
+
+    message = choice.message
+    message = chat_completion.choices[0].message
+    assert message.content is not None and len(message.content) >= 10
+    assert message.role == "assistant"
+    messages.append({"role": "assistant", "content": message.content})
 
-    # the server should still work afterwards
-    completion = await client.completions.create(
+    # test multi-turn dialogue
+    messages.append({"role": "user", "content": "express your result in json"})
+    chat_completion = await client.chat.completions.create(
         model=model_name,
-        prompt=[0, 0, 0, 0, 0],
-        max_tokens=5,
-        temperature=0.0,
+        messages=messages,
+        max_tokens=10,
     )
-    completion = completion.choices[0].text
-    assert completion is not None and len(completion) >= 0
+    message = chat_completion.choices[0].message
+    assert message.content is not None and len(message.content) >= 0
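
One detail of the rewritten test worth calling out: the `*( ... )` splat unpacks a generator expression so that the `content` list gets one `audio_url` part per entry in `TEST_AUDIO_URLS`, followed by the text part. A standalone illustration (the URLs here are placeholders, not the test assets):

```python
TEST_AUDIO_URLS = [
    "https://example.com/winning_call.ogg",
    "https://example.com/mary_had_lamb.ogg",
]

content = [
    # Unpack one audio_url part per test asset into the surrounding list.
    *({
        "type": "audio_url",
        "audio_url": {
            "url": audio_url
        }
    } for audio_url in TEST_AUDIO_URLS),
    {
        "type": "text",
        "text": "What sport and what nursery rhyme are referenced?"
    },
]

assert [part["type"] for part in content] == ["audio_url", "audio_url", "text"]
```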