[V1] Support sliding window attention (vllm-project#9679)
Signed-off-by: Woosuk Kwon <[email protected]>
Signed-off-by: Shanshan Wang <[email protected]>
WoosukKwon authored and cooleel committed Oct 28, 2024
1 parent f9c7c75 commit 0219172
Showing 1 changed file with 4 additions and 8 deletions.
12 changes: 4 additions & 8 deletions vllm/v1/attention/backends/flash_attn.py
@@ -82,8 +82,10 @@ def __init__(
         if alibi_slopes is not None:
             alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32)
         self.alibi_slopes = alibi_slopes
-        self.sliding_window = ((sliding_window, sliding_window)
-                               if sliding_window is not None else (-1, -1))
+        if sliding_window is None:
+            self.sliding_window = (-1, -1)
+        else:
+            self.sliding_window = (sliding_window - 1, 0)
         self.kv_cache_dtype = kv_cache_dtype
         if logits_soft_cap is None:
             # In flash-attn, setting logits_soft_cap as 0 means no soft cap.
@@ -93,12 +95,6 @@ def __init__(
         assert self.num_heads % self.num_kv_heads == 0
         self.num_queries_per_kv = self.num_heads // self.num_kv_heads
 
-        if sliding_window is not None:
-            # NOTE(woosuk): flash-attn's sliding window does not work with
-            # paged KV cache.
-            raise ValueError(
-                "Sliding window is not supported in FlashAttention.")
-
         support_head_sizes = FlashAttentionBackend.get_supported_head_sizes()
         if head_size not in support_head_sizes:
             raise ValueError(
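Note on the new window convention (not part of the commit itself): flash-attn takes a (left, right) window pair, where (-1, -1) disables windowing. Storing (sliding_window - 1, 0) lets each query attend to itself plus the previous sliding_window - 1 tokens and nothing ahead. Below is a minimal, illustrative sketch of that masking rule; the function build_sliding_window_mask and its arguments are hypothetical and are not part of vLLM or flash-attn.

import torch

def build_sliding_window_mask(seq_len: int, window: tuple[int, int]) -> torch.Tensor:
    """Illustrative only: True where query i may attend to key j.

    window = (left, right); (-1, -1) means no windowing, matching the
    convention in the diff above. Causal decoding is assumed.
    """
    left, right = window
    q = torch.arange(seq_len).unsqueeze(1)  # query positions as a column
    k = torch.arange(seq_len).unsqueeze(0)  # key positions as a row
    mask = k <= q                           # causal: no attending to the future
    if left >= 0:
        mask = mask & (k >= q - left)       # look back at most `left` tokens
    if right >= 0:
        mask = mask & (k <= q + right)      # look ahead at most `right` tokens
    return mask

# With sliding_window = 4 the commit stores (3, 0): each query sees itself
# plus the 3 preceding tokens, i.e. a window of 4 keys in total.
print(build_sliding_window_mask(6, (3, 0)).int())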
