From efb04327a7bcf5c88bb939835632de6e123e3667 Mon Sep 17 00:00:00 2001
From: Aleksandr Malyshev <164964928+maleksan85@users.noreply.github.com>
Date: Wed, 13 Nov 2024 10:22:45 -0800
Subject: [PATCH 1/4] corrected types for strides in triton FA (#274) (#276)

Co-authored-by: Aleksandr Malyshev
(cherry picked from commit 9a46e97c1e63cbb5223a10a86705063b00e55576)
---
 vllm/attention/backends/rocm_flash_attn.py   |  3 +-
 vllm/attention/ops/triton_flash_attention.py | 40 ++++++++++----------
 2 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index 7d2d87176800c..e5df445d8449b 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -619,7 +619,8 @@ def forward(
 
         # QKV for prefill.
         query = query[:num_prefill_tokens]
-        if key is not None and value is not None:
+        if key is not None and value is not None \
+                and attn_type != AttentionType.ENCODER_DECODER:
             key = key[:num_prefill_tokens]
             value = value[:num_prefill_tokens]
 
diff --git a/vllm/attention/ops/triton_flash_attention.py b/vllm/attention/ops/triton_flash_attention.py
index f94211116a746..2019ed184e5a1 100644
--- a/vllm/attention/ops/triton_flash_attention.py
+++ b/vllm/attention/ops/triton_flash_attention.py
@@ -314,26 +314,26 @@ def attn_fwd(
     sm_scale,
     L,
     Out,
-    stride_qz,
-    stride_qh,
-    stride_qm,
-    stride_qk,
-    stride_kz,
-    stride_kh,
-    stride_kn,
-    stride_kk,
-    stride_vz,
-    stride_vh,
-    stride_vk,
-    stride_vn,
-    stride_oz,
-    stride_oh,
-    stride_om,
-    stride_on,
-    stride_bz,
-    stride_bh,
-    stride_bm,
-    stride_bn,
+    stride_qz: tl.int64,
+    stride_qh: tl.int64,
+    stride_qm: tl.int64,
+    stride_qk: tl.int64,
+    stride_kz: tl.int64,
+    stride_kh: tl.int64,
+    stride_kn: tl.int64,
+    stride_kk: tl.int64,
+    stride_vz: tl.int64,
+    stride_vh: tl.int64,
+    stride_vk: tl.int64,
+    stride_vn: tl.int64,
+    stride_oz: tl.int64,
+    stride_oh: tl.int64,
+    stride_om: tl.int64,
+    stride_on: tl.int64,
+    stride_bz: tl.int64,
+    stride_bh: tl.int64,
+    stride_bm: tl.int64,
+    stride_bn: tl.int64,
     cu_seqlens_q,
     cu_seqlens_k,
     dropout_p,

From d291770df08a29e14b616d9ce1538b00ba09a432 Mon Sep 17 00:00:00 2001
From: dhonnappa-amd
Date: Wed, 4 Dec 2024 15:46:32 -0600
Subject: [PATCH 2/4] Update test-template.j2 (#283)

Add a build-only k8s node and update the queue names.
---
 .buildkite/test-template.j2 | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2
index e7b24268ba398..ce448836a8278 100644
--- a/.buildkite/test-template.j2
+++ b/.buildkite/test-template.j2
@@ -19,7 +19,7 @@ steps:
       - exit_status: -10  # Agent was lost
         limit: 5
     agents:
-      queue: amd
+      queue: amd-cpu
 
  {% for step in steps %}
  {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
@@ -27,7 +27,7 @@ steps:
     depends_on:
       - "amd-build"
     agents:
-      queue: amd
+      queue: amd_gpu
    commands:
      - bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}"
    env:

From 679a15cbdb64d1806286ecc0dd317768c58cdbf9 Mon Sep 17 00:00:00 2001
From: Hosang <156028780+hyoon1@users.noreply.github.com>
Date: Mon, 9 Dec 2024 12:30:40 -0500
Subject: [PATCH 3/4] Fix max_seqlens_q/k initialization for Navi GPUs (#310)

- The max_seqlens_q/k variables were not correctly initialized for Navi
  GPUs, leading to incorrect outputs.
- Ensure that the correct values are passed to the attn_fwd kernel based
  on the GPU type (see the sketch below).
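A minimal sketch of the selection this patch introduces, in standalone
form for illustration only (select_kernel_seqlens is a hypothetical
helper; is_navi() is the detection function already defined in
triton_flash_attention.py):

    def select_kernel_seqlens(max_seqlens_q: int, max_seqlens_k: int,
                              navi: bool) -> tuple[int, int]:
        # Navi passes 0 for both maxima, as this fix requires; other
        # GPUs forward the computed values unchanged. Unlike the old
        # code, the originals are never overwritten, so they remain
        # valid for any use after the kernel launch.
        if navi:
            return 0, 0
        return max_seqlens_q, max_seqlens_k

    arg_max_seqlens_q, arg_max_seqlens_k = select_kernel_seqlens(
        max_seqlens_q, max_seqlens_k, is_navi())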
---
 vllm/attention/ops/triton_flash_attention.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/vllm/attention/ops/triton_flash_attention.py b/vllm/attention/ops/triton_flash_attention.py
index a49df831b46ea..3671c2f91e3b7 100644
--- a/vllm/attention/ops/triton_flash_attention.py
+++ b/vllm/attention/ops/triton_flash_attention.py
@@ -912,9 +912,8 @@ def check_and_convert(t, scale):
         p_descale = 1.0 / p_scale
         o_descale = 1.0 / o_scale
 
-    if is_navi():
-        max_seqlens_q = 0
-        max_seqlens_k = 0
+    arg_max_seqlens_q = 0 if is_navi() else max_seqlens_q
+    arg_max_seqlens_k = 0 if is_navi() else max_seqlens_k
 
     attn_fwd[grid](
         q,
@@ -944,8 +943,8 @@ def check_and_convert(t, scale):
         HQ=nheads_q,
         HK=nheads_k,
         ACTUAL_BLOCK_DMODEL=head_size,
-        MAX_SEQLENS_Q=max_seqlens_q,
-        MAX_SEQLENS_K=max_seqlens_k,
+        MAX_SEQLENS_Q=arg_max_seqlens_q,
+        MAX_SEQLENS_K=arg_max_seqlens_k,
         IS_CAUSAL=causal,
         VARLEN=True,
         BLOCK_DMODEL=padded_d_model,

From 22f9066285861cc7cdb49d5caad995582ae3cd36 Mon Sep 17 00:00:00 2001
From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com>
Date: Mon, 9 Dec 2024 17:22:28 -0500
Subject: [PATCH 4/4] Setting the value for the speculative decoding worker
 class on the ROCm platform (#313)

Signed-off-by: Gregory Shtrasberg
---
 vllm/platforms/rocm.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 00efa056f7ef0..d2f7cd40e25b2 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -150,6 +150,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         elif vllm_config.speculative_config:
             parallel_config.worker_cls = \
                 "vllm.spec_decode.spec_decode_worker.create_spec_worker"
+            parallel_config.sd_worker_cls = \
+                "vllm.worker.worker.Worker"
         else:
             parallel_config.worker_cls = "vllm.worker.worker.Worker"
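A note on the tl.int64 annotations in PATCH 1/4: Triton otherwise
specializes integer kernel arguments as 32-bit, so pointer-offset
products such as batch * stride_qz can overflow once a tensor grows
past 2**31 elements. A minimal sketch of the same technique on a toy
kernel (illustrative only; copy_rows and its arguments are not part of
the patch):

    import torch
    import triton
    import triton.language as tl

    @triton.jit
    def copy_rows(src_ptr, dst_ptr, stride_m: tl.int64, n_cols,
                  BLOCK: tl.constexpr):
        row = tl.program_id(0)
        cols = tl.arange(0, BLOCK)
        mask = cols < n_cols
        # With the tl.int64 annotation, row * stride_m is computed in
        # 64-bit arithmetic instead of wrapping around in 32 bits.
        base = row * stride_m
        vals = tl.load(src_ptr + base + cols, mask=mask)
        tl.store(dst_ptr + base + cols, vals, mask=mask)

    x = torch.randn(4, 8, device="cuda")
    y = torch.empty_like(x)
    copy_rows[(x.shape[0], )](x, y, x.stride(0), x.shape[1], BLOCK=8)
    assert torch.equal(x, y)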