[Bugfix] Clean up MiniCPM-V #6939

Merged Jul 31, 2024 · 25 commits
Changes from 11 commits
6 changes: 5 additions & 1 deletion docs/source/models/supported_models.rst
@@ -222,9 +222,13 @@ Vision Language Models
     -
   * - :code:`MiniCPM-V`
     - MiniCPM-V
-    - :code:`openbmb/MiniCPM-V-2`, :code:`openbmb/MiniCPM-Llama3-V-2_5`, etc.
+    - :code:`openbmb/MiniCPM-V-2(Incoming...)`, :code:`openbmb/MiniCPM-Llama3-V-2_5`, etc.
     -
 
+.. note::
+  For :code:`openbmb/MiniCPM-V-2`, the official repo doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now.
+  For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630
+
 ----
 
 If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
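Regarding the note added above: until the official checkpoint is fixed, inference goes through the fork. A minimal offline-inference sketch, assuming vLLM's Python `LLM` API and that the fork requires `trust_remote_code`; the prompt and sampling settings are illustrative, not taken from this PR:

    from vllm import LLM, SamplingParams

    # Load the forked checkpoint named in the note; the official
    # openbmb/MiniCPM-V-2 repo doesn't work with vLLM yet.
    llm = LLM(model="HwwwH/MiniCPM-V-2", trust_remote_code=True)

    # Text-only call for brevity; image inputs would go through
    # vLLM's multi-modal input path.
    outputs = llm.generate("Describe a cat.",
                           SamplingParams(max_tokens=64))
    print(outputs[0].outputs[0].text)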
4 changes: 1 addition & 3 deletions vllm/model_executor/models/llama.py
@@ -418,11 +418,9 @@ def forward(
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
         intermediate_tensors: Optional[IntermediateTensors] = None,
-        input_embeds: Optional[torch.Tensor] = None
     ) -> Union[torch.Tensor, IntermediateTensors]:
         model_output = self.model(input_ids, positions, kv_caches,
-                                  attn_metadata, intermediate_tensors,
-                                  input_embeds)
+                                  attn_metadata, intermediate_tensors)
         return model_output
 
     def compute_logits(self, hidden_states: torch.Tensor,
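A side effect of this hunk worth noting: a stale call site that still passes `input_embeds` as a sixth positional argument now raises a `TypeError`, while a stale fifth positional argument would silently bind to `intermediate_tensors`. Passing the trailing optionals by keyword avoids both failure modes; a hedged sketch of a call site (the surrounding argument construction is assumed, not code from this PR):

    # Hypothetical call site; the four leading arguments are assumed
    # to be constructed elsewhere in the engine.
    model_output = model(
        input_ids,
        positions,
        kv_caches,
        attn_metadata,
        intermediate_tensors=None,  # keyword: cannot silently rebind
    )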
4 changes: 2 additions & 2 deletions vllm/model_executor/models/minicpm.py
@@ -370,6 +370,7 @@ def forward(
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
         inputs_embeds: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         if inputs_embeds is not None:
@@ -463,11 +464,10 @@ def forward(
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
-        input_embeds: Optional[torch.Tensor] = None,
         intermediate_tensors: Optional[IntermediateTensors] = None,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, kv_caches,
-                                   attn_metadata, input_embeds)
+                                   attn_metadata, intermediate_tensors)
         return hidden_states
 
     def compute_logits(self, hidden_states: torch.Tensor,
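The second hunk is the actual fix: once the first hunk gives `MiniCPMModel.forward` an `intermediate_tensors` parameter ahead of `inputs_embeds`, the old positional call `self.model(..., attn_metadata, input_embeds)` would bind the embeddings to `intermediate_tensors`. A self-contained illustration of that rebinding hazard, using hypothetical stand-in values rather than vLLM code:

    from typing import Optional

    def forward(input_ids, positions, kv_caches, attn_metadata,
                intermediate_tensors: Optional[str] = None,
                inputs_embeds: Optional[str] = None):
        # Whatever arrives fifth binds to intermediate_tensors,
        # regardless of what the caller meant by it.
        return intermediate_tensors, inputs_embeds

    # A caller that still means "embeddings" by its fifth argument
    # misroutes it silently -- no error is raised:
    tensors, embeds = forward("ids", "pos", "kv", "meta", "embeds")
    assert tensors == "embeds" and embeds is None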