diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 00781ae4eb2c5..f3c3fe31c68a3 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -100,7 +100,11 @@ _ROCM_SWA_REASON, "PaliGemmaForConditionalGeneration": ("ROCm flash attention does not yet " - "fully support 32-bit precision on PaliGemma") + "fully support 32-bit precision on PaliGemma"), + "Phi3VForCausalLM": + ("ROCm Triton flash attention may run into compilation errors due to " + "excessive use of shared memory. If this happens, disable Triton FA " + "by setting `VLLM_USE_TRITON_FLASH_ATTN=0`") }