diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 index e7b24268ba398..ce448836a8278 100644 --- a/.buildkite/test-template.j2 +++ b/.buildkite/test-template.j2 @@ -19,7 +19,7 @@ steps: - exit_status: -10 # Agent was lost limit: 5 agents: - queue: amd + queue: amd-cpu {% for step in steps %} {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %} @@ -27,7 +27,7 @@ steps: depends_on: - "amd-build" agents: - queue: amd + queue: amd_gpu commands: - bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}" env: diff --git a/vllm/attention/ops/triton_flash_attention.py b/vllm/attention/ops/triton_flash_attention.py index a49df831b46ea..3671c2f91e3b7 100644 --- a/vllm/attention/ops/triton_flash_attention.py +++ b/vllm/attention/ops/triton_flash_attention.py @@ -912,9 +912,8 @@ def check_and_convert(t, scale): p_descale = 1.0 / p_scale o_descale = 1.0 / o_scale - if is_navi(): - max_seqlens_q = 0 - max_seqlens_k = 0 + arg_max_seqlens_q = 0 if is_navi() else max_seqlens_q + arg_max_seqlens_k = 0 if is_navi() else max_seqlens_k attn_fwd[grid]( q, @@ -944,8 +943,8 @@ def check_and_convert(t, scale): HQ=nheads_q, HK=nheads_k, ACTUAL_BLOCK_DMODEL=head_size, - MAX_SEQLENS_Q=max_seqlens_q, - MAX_SEQLENS_K=max_seqlens_k, + MAX_SEQLENS_Q=arg_max_seqlens_q, + MAX_SEQLENS_K=arg_max_seqlens_k, IS_CAUSAL=causal, VARLEN=True, BLOCK_DMODEL=padded_d_model, diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 00efa056f7ef0..d2f7cd40e25b2 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -150,6 +150,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: elif vllm_config.speculative_config: parallel_config.worker_cls = \ "vllm.spec_decode.spec_decode_worker.create_spec_worker" + parallel_config.sd_worker_cls = \ + "vllm.worker.worker.Worker" else: parallel_config.worker_cls = "vllm.worker.worker.Worker"