From efb04327a7bcf5c88bb939835632de6e123e3667 Mon Sep 17 00:00:00 2001
From: Aleksandr Malyshev <164964928+maleksan85@users.noreply.github.com>
Date: Wed, 13 Nov 2024 10:22:45 -0800
Subject: [PATCH 1/4] corrected types for strides in triton FA (#274) (#276)

Co-authored-by: Aleksandr Malyshev
(cherry picked from commit 9a46e97c1e63cbb5223a10a86705063b00e55576)
---
 vllm/attention/backends/rocm_flash_attn.py   |  3 +-
 vllm/attention/ops/triton_flash_attention.py | 40 ++++++++++----------
 2 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index 7d2d87176800c..e5df445d8449b 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -619,7 +619,8 @@ def forward(
 
         # QKV for prefill.
         query = query[:num_prefill_tokens]
-        if key is not None and value is not None:
+        if key is not None and value is not None \
+                and attn_type != AttentionType.ENCODER_DECODER:
             key = key[:num_prefill_tokens]
             value = value[:num_prefill_tokens]
 
diff --git a/vllm/attention/ops/triton_flash_attention.py b/vllm/attention/ops/triton_flash_attention.py
index f94211116a746..2019ed184e5a1 100644
--- a/vllm/attention/ops/triton_flash_attention.py
+++ b/vllm/attention/ops/triton_flash_attention.py
@@ -314,26 +314,26 @@ def attn_fwd(
     sm_scale,
     L,
     Out,
-    stride_qz,
-    stride_qh,
-    stride_qm,
-    stride_qk,
-    stride_kz,
-    stride_kh,
-    stride_kn,
-    stride_kk,
-    stride_vz,
-    stride_vh,
-    stride_vk,
-    stride_vn,
-    stride_oz,
-    stride_oh,
-    stride_om,
-    stride_on,
-    stride_bz,
-    stride_bh,
-    stride_bm,
-    stride_bn,
+    stride_qz: tl.int64,
+    stride_qh: tl.int64,
+    stride_qm: tl.int64,
+    stride_qk: tl.int64,
+    stride_kz: tl.int64,
+    stride_kh: tl.int64,
+    stride_kn: tl.int64,
+    stride_kk: tl.int64,
+    stride_vz: tl.int64,
+    stride_vh: tl.int64,
+    stride_vk: tl.int64,
+    stride_vn: tl.int64,
+    stride_oz: tl.int64,
+    stride_oh: tl.int64,
+    stride_om: tl.int64,
+    stride_on: tl.int64,
+    stride_bz: tl.int64,
+    stride_bh: tl.int64,
+    stride_bm: tl.int64,
+    stride_bn: tl.int64,
     cu_seqlens_q,
     cu_seqlens_k,
     dropout_p,

From d291770df08a29e14b616d9ce1538b00ba09a432 Mon Sep 17 00:00:00 2001
From: dhonnappa-amd
Date: Wed, 4 Dec 2024 15:46:32 -0600
Subject: [PATCH 2/4] Update test-template.j2 (#283)

Add a build-only k8s node and update the queue names.
---
 .buildkite/test-template.j2 | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2
index e7b24268ba398..ce448836a8278 100644
--- a/.buildkite/test-template.j2
+++ b/.buildkite/test-template.j2
@@ -19,7 +19,7 @@ steps:
       - exit_status: -10  # Agent was lost
         limit: 5
     agents:
-      queue: amd
+      queue: amd-cpu
 
  {% for step in steps %}
  {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
@@ -27,7 +27,7 @@ steps:
     depends_on:
       - "amd-build"
     agents:
-      queue: amd
+      queue: amd_gpu
    commands:
      - bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}"
    env:

From 679a15cbdb64d1806286ecc0dd317768c58cdbf9 Mon Sep 17 00:00:00 2001
From: Hosang <156028780+hyoon1@users.noreply.github.com>
Date: Mon, 9 Dec 2024 12:30:40 -0500
Subject: [PATCH 3/4] Fix max_seqlens_q/k initialization for Navi GPUs (#310)

- The max_seqlens_q/k variables were not correctly initialized for Navi
  GPUs, leading to incorrect outputs.
- Ensure that the correct values are passed to the attn_fwd kernel based
  on the GPU type (see the sketch below).
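A minimal sketch of the selection this patch introduces, in standalone
form for illustration only (select_kernel_seqlens is a hypothetical
helper; is_navi() is the detection function already defined in
triton_flash_attention.py):

    def select_kernel_seqlens(max_seqlens_q: int, max_seqlens_k: int,
                              navi: bool) -> tuple[int, int]:
        # Navi passes 0 for both maxima, as this fix requires; other
        # GPUs forward the computed values unchanged. Unlike the old
        # code, the originals are never overwritten, so they remain
        # valid for any use after the kernel launch.
        if navi:
            return 0, 0
        return max_seqlens_q, max_seqlens_k

    arg_max_seqlens_q, arg_max_seqlens_k = select_kernel_seqlens(
        max_seqlens_q, max_seqlens_k, is_navi())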
---
 vllm/attention/ops/triton_flash_attention.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/vllm/attention/ops/triton_flash_attention.py b/vllm/attention/ops/triton_flash_attention.py
index a49df831b46ea..3671c2f91e3b7 100644
--- a/vllm/attention/ops/triton_flash_attention.py
+++ b/vllm/attention/ops/triton_flash_attention.py
@@ -912,9 +912,8 @@ def check_and_convert(t, scale):
         p_descale = 1.0 / p_scale
         o_descale = 1.0 / o_scale
 
-    if is_navi():
-        max_seqlens_q = 0
-        max_seqlens_k = 0
+    arg_max_seqlens_q = 0 if is_navi() else max_seqlens_q
+    arg_max_seqlens_k = 0 if is_navi() else max_seqlens_k
 
     attn_fwd[grid](
         q,
@@ -944,8 +943,8 @@ def check_and_convert(t, scale):
         HQ=nheads_q,
         HK=nheads_k,
         ACTUAL_BLOCK_DMODEL=head_size,
-        MAX_SEQLENS_Q=max_seqlens_q,
-        MAX_SEQLENS_K=max_seqlens_k,
+        MAX_SEQLENS_Q=arg_max_seqlens_q,
+        MAX_SEQLENS_K=arg_max_seqlens_k,
         IS_CAUSAL=causal,
         VARLEN=True,
         BLOCK_DMODEL=padded_d_model,

From 22f9066285861cc7cdb49d5caad995582ae3cd36 Mon Sep 17 00:00:00 2001
From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com>
Date: Mon, 9 Dec 2024 17:22:28 -0500
Subject: [PATCH 4/4] Setting the value for the speculative decoding worker
 class on the ROCm platform (#313)

Signed-off-by: Gregory Shtrasberg
---
 vllm/platforms/rocm.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 00efa056f7ef0..d2f7cd40e25b2 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -150,6 +150,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         elif vllm_config.speculative_config:
             parallel_config.worker_cls = \
                 "vllm.spec_decode.spec_decode_worker.create_spec_worker"
+            parallel_config.sd_worker_cls = \
+                "vllm.worker.worker.Worker"
         else:
             parallel_config.worker_cls = "vllm.worker.worker.Worker"
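A note on the tl.int64 annotations in PATCH 1/4: Triton otherwise
specializes integer kernel arguments as 32-bit, so pointer-offset
products such as batch * stride_qz can overflow once a tensor grows
past 2**31 elements. A minimal sketch of the same technique on a toy
kernel (illustrative only; copy_rows and its arguments are not part of
the patch):

    import torch
    import triton
    import triton.language as tl

    @triton.jit
    def copy_rows(src_ptr, dst_ptr, stride_m: tl.int64, n_cols,
                  BLOCK: tl.constexpr):
        row = tl.program_id(0)
        cols = tl.arange(0, BLOCK)
        mask = cols < n_cols
        # With the tl.int64 annotation, row * stride_m is computed in
        # 64-bit arithmetic instead of wrapping around in 32 bits.
        base = row * stride_m
        vals = tl.load(src_ptr + base + cols, mask=mask)
        tl.store(dst_ptr + base + cols, vals, mask=mask)

    x = torch.randn(4, 8, device="cuda")
    y = torch.empty_like(x)
    copy_rows[(x.shape[0], )](x, y, x.stride(0), x.shape[1], BLOCK=8)
    assert torch.equal(x, y)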