diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 83e869efa4712..b005d83c17f90 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -273,7 +273,8 @@ def __init__(self, config.projector_hidden_act = "gelu" # TODO: Optionally initializes this for supporting embeddings. - self.vision_tower = init_vision_tower_for_llava(config, quant_config) + self.vision_tower = init_vision_tower_for_llava( + config, quant_config, require_post_norm=False) self.multi_modal_projector = LlavaMultiModalProjector( vision_hidden_size=config.vision_config.hidden_size, text_hidden_size=config.text_config.hidden_size, diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index d33d4ac5bfaed..9466e72ecc639 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -277,7 +277,8 @@ def __init__(self, self.multimodal_config = multimodal_config # TODO: Optionally initializes this for supporting embeddings. - self.vision_tower = init_vision_tower_for_llava(config, quant_config) + self.vision_tower = init_vision_tower_for_llava( + config, quant_config, require_post_norm=False) self.image_newline = nn.Parameter( torch.empty(config.text_config.hidden_size)) self.multi_modal_projector = LlavaMultiModalProjector( diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index d02cf9044dfc0..43eec43d56643 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -256,7 +256,8 @@ def __init__(self, self.multimodal_config = multimodal_config # Initialize the vision tower only up to the required feature layer - self.vision_tower = init_vision_tower_for_llava(config, quant_config) + self.vision_tower = init_vision_tower_for_llava( + config, quant_config, require_post_norm=False) self.vision_resampler = LlavaNextVideoPooler(config) self.multi_modal_projector = LlavaNextMultiModalProjector( vision_hidden_size=config.vision_config.hidden_size, diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 10aa8049a2347..47e62409072e5 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -400,7 +400,8 @@ def __init__(self, self.multimodal_config = multimodal_config # Initialize the vision tower only up to the required feature layer - self.vision_tower = init_vision_tower_for_llava(config, quant_config) + self.vision_tower = init_vision_tower_for_llava( + config, quant_config, require_post_norm=False) self.multi_modal_projector = LlavaOnevisionMultiModalProjector(config) self.language_model = init_vllm_registered_model( config.text_config, cache_config, quant_config)