From f9ecbb18bf03338a4272c933a49a87021363b048 Mon Sep 17 00:00:00 2001
From: Isotr0py
Date: Tue, 17 Dec 2024 16:37:04 +0800
Subject: [PATCH] [Misc] Allow passing logits_soft_cap for xformers backend (#11252)

Signed-off-by: Isotr0py <2037008807@qq.com>
---
 vllm/attention/backends/xformers.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py
index e2e989efb020c..3e59b3603d2c6 100644
--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -17,9 +17,7 @@
     is_all_cross_attn_metadata_set, is_all_encoder_attn_metadata_set)
 from vllm.attention.ops.paged_attn import (PagedAttention,
                                            PagedAttentionMetadata)
-from vllm.logger import init_logger
-
-logger = init_logger(__name__)
+from vllm.utils import print_warning_once
 
 
 class XFormersBackend(AttentionBackend):
@@ -386,8 +384,8 @@ def __init__(
             raise ValueError(
                 "XFormers does not support block-sparse attention.")
         if logits_soft_cap is not None:
-            raise ValueError(
-                "XFormers does not support attention logits soft capping.")
+            print_warning_once("XFormers does not support logits soft cap. "
+                               "Outputs may be slightly off.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
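
For context, a minimal standalone sketch of the behavior change (illustrative only; the local print_warning_once stand-in and XFormersImplSketch are placeholders, not vLLM code): with this patch, constructing the xFormers attention impl with a non-None logits_soft_cap warns once and continues instead of raising ValueError.

# Illustrative sketch only -- mirrors the patched __init__ logic; these names
# are placeholders, not the actual vLLM implementation.
from typing import Optional
import warnings

_seen_warnings: set = set()


def print_warning_once(msg: str) -> None:
    # Stand-in for vllm.utils.print_warning_once: emit each message at most once.
    if msg not in _seen_warnings:
        _seen_warnings.add(msg)
        warnings.warn(msg, stacklevel=2)


class XFormersImplSketch:
    def __init__(self, logits_soft_cap: Optional[float] = None) -> None:
        if logits_soft_cap is not None:
            # Patched behavior: warn instead of raising ValueError.
            print_warning_once("XFormers does not support logits soft cap. "
                               "Outputs may be slightly off.")
        self.logits_soft_cap = logits_soft_cap


# Previously this would raise; now it constructs and only warns once, which is
# what lets models that set a logits soft cap fall back to the xFormers backend.
impl = XFormersImplSketch(logits_soft_cap=30.0)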