From 5f8d8075f957d5376b2f1cc451e35a2a757e95a5 Mon Sep 17 00:00:00 2001
From: litianjian <45817262+litianjian@users.noreply.github.com>
Date: Tue, 29 Oct 2024 02:04:10 +0800
Subject: [PATCH] [Model][VLM] Add multi-video support for LLaVA-Onevision (#8905)

Co-authored-by: litianjian
Co-authored-by: DarkLight1337
---
 .../vision_language/test_llava_onevision.py   | 173 +++++-------------
 vllm/model_executor/models/clip.py            |   4 +-
 vllm/model_executor/models/llava_onevision.py |  94 +++++++---
 vllm/model_executor/models/siglip.py          |   4 +-
 vllm/multimodal/video.py                      |  10 +-
 5 files changed, 123 insertions(+), 162 deletions(-)

diff --git a/tests/models/decoder_only/vision_language/test_llava_onevision.py b/tests/models/decoder_only/vision_language/test_llava_onevision.py
index 367f25f446279..1616fd299b9aa 100644
--- a/tests/models/decoder_only/vision_language/test_llava_onevision.py
+++ b/tests/models/decoder_only/vision_language/test_llava_onevision.py
@@ -1,4 +1,4 @@
-from typing import List, Optional, Tuple, Type, overload
+from typing import List, Optional, Tuple, Type
 
 import pytest
 from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
@@ -9,9 +9,8 @@
 from vllm.sequence import SampleLogprobs
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
 
-from ....conftest import (VIDEO_ASSETS, HfRunner, PromptImageInput, VllmRunner,
-                          _VideoAssets)
-from ....utils import large_gpu_test
+from ....conftest import (VIDEO_ASSETS, HfRunner, PromptImageInput,
+                          PromptVideoInput, VllmRunner)
 from ...utils import check_logprobs_close
 
 # Video test
@@ -20,7 +19,7 @@ "<|im_start|>user\n