Skip to content

Commit

Permalink
[Model] Add Qwen2-Audio model support (vllm-project#9248)
Browse files Browse the repository at this point in the history
Co-authored-by: DarkLight1337 <[email protected]>
Signed-off-by: Tyler Michael Smith <[email protected]>
  • Loading branch information
2 people authored and tlrmchlsmth committed Nov 23, 2024
1 parent cdee3c1 commit 17b1175
Show file tree
Hide file tree
Showing 7 changed files with 515 additions and 17 deletions.
6 changes: 6 additions & 0 deletions docs/source/models/supported_models.rst
Original file line number Diff line number Diff line change
Expand Up @@ -459,6 +459,12 @@ Text Generation
- :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc.
-
- ✅︎
* - :code:`Qwen2AudioForConditionalGeneration`
- Qwen2-Audio
- T + A\ :sup:`+`
- :code:`Qwen/Qwen2-Audio-7B-Instruct`
-
- ✅︎
* - :code:`Qwen2VLForConditionalGeneration`
- Qwen2-VL
- T + I\ :sup:`E+` + V\ :sup:`+`
Expand Down
54 changes: 38 additions & 16 deletions examples/offline_inference_audio_language.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,15 @@
from vllm.utils import FlexibleArgumentParser

audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
question_per_audio_count = [
"What is recited in the audio?",
"What sport and what nursery rhyme are referenced?"
]
question_per_audio_count = {
0: "What is 1+1?",
1: "What is recited in the audio?",
2: "What sport and what nursery rhyme are referenced?"
}


# Ultravox 0.3
def run_ultravox(question, audio_count):
def run_ultravox(question: str, audio_count: int):
model_name = "fixie-ai/ultravox-v0_3"

tokenizer = AutoTokenizer.from_pretrained(model_name)
Expand All @@ -42,9 +43,29 @@ def run_ultravox(question, audio_count):
return llm, prompt, stop_token_ids


model_example_map = {
"ultravox": run_ultravox,
}
# Qwen2-Audio
def run_qwen2_audio(question: str, audio_count: int):
    """Create the engine and chat prompt for the Qwen2-Audio example.

    Args:
        question: Text placed after the audio placeholders in the user turn.
        audio_count: Number of audio placeholders to emit; the same value is
            passed to ``LLM`` as the per-prompt ``"audio"`` limit.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple. ``stop_token_ids`` is
        always ``None`` for this model.
    """
    engine = LLM(
        model="Qwen/Qwen2-Audio-7B-Instruct",
        max_model_len=4096,
        max_num_seqs=5,
        limit_mm_per_prompt={"audio": audio_count},
    )

    # One numbered placeholder line per audio clip; numbering is 1-based.
    placeholder_lines = []
    for position in range(audio_count):
        placeholder_lines.append(f"Audio {position+1}: "
                                 f"<|audio_bos|><|AUDIO|><|audio_eos|>\n")
    audio_block = "".join(placeholder_lines)

    # System turn, user turn (audio placeholders followed by the question),
    # then an open assistant turn for the model to complete.
    system_turn = ("<|im_start|>system\nYou are a helpful assistant."
                   "<|im_end|>\n")
    user_turn = ("<|im_start|>user\n"
                 f"{audio_block}{question}<|im_end|>\n")
    assistant_open = "<|im_start|>assistant\n"
    chat_prompt = system_turn + user_turn + assistant_open

    return engine, chat_prompt, None


model_example_map = {"ultravox": run_ultravox, "qwen2_audio": run_qwen2_audio}


def main(args):
Expand All @@ -54,24 +75,25 @@ def main(args):

audio_count = args.num_audios
llm, prompt, stop_token_ids = model_example_map[model](
question_per_audio_count[audio_count - 1], audio_count)
question_per_audio_count[audio_count], audio_count)

# We set temperature to 0.2 so that outputs can be different
# even when all prompts are identical when running batch inference.
sampling_params = SamplingParams(temperature=0.2,
max_tokens=64,
stop_token_ids=stop_token_ids)

assert args.num_prompts > 0
inputs = {
"prompt": prompt,
"multi_modal_data": {
mm_data = {}
if audio_count > 0:
mm_data = {
"audio": [
asset.audio_and_sample_rate
for asset in audio_assets[:audio_count]
]
},
}
}

assert args.num_prompts > 0
inputs = {"prompt": prompt, "multi_modal_data": mm_data}
if args.num_prompts > 1:
# Batch inference
inputs = [inputs] * args.num_prompts
Expand Down Expand Up @@ -100,7 +122,7 @@ def main(args):
parser.add_argument("--num-audios",
type=int,
default=1,
choices=[1, 2],
choices=[0, 1, 2],
help="Number of audio items per prompt.")

args = parser.parse_args()
Expand Down
1 change: 1 addition & 0 deletions tests/distributed/test_pipeline_parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,7 @@ def iter_params(self, model_name: str):
"microsoft/Phi-3-vision-128k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"mistralai/Pixtral-12B-2409": PPTestSettings.fast(tp_base=2, tokenizer_mode="mistral"), # noqa: E501
"Qwen/Qwen-VL-Chat": PPTestSettings.fast(trust_remote_code=True),
"Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(),
"Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
"fixie-ai/ultravox-v0_3": PPTestSettings.fast(),
}
Expand Down
5 changes: 4 additions & 1 deletion vllm/entrypoints/chat_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,10 @@ def _placeholder_str(self, modality: ModalityStr,
elif modality == "audio":
if model_type == "ultravox":
return "<|reserved_special_token_0|>"
raise TypeError(f"Unknown {modality} model type: {model_type}")
if model_type == "qwen2_audio":
return (f"Audio {current_count}: "
f"<|audio_bos|><|AUDIO|><|audio_eos|>")
raise TypeError(f"Unknown model type: {model_type}")
elif modality == "video":
if model_type == "qwen2_vl":
return "<|vision_start|><|video_pad|><|vision_end|>"
Expand Down
Loading

0 comments on commit 17b1175

Please sign in to comment.