Commit

[Bugfix][CI/Build][Hardware][AMD] Shard ID parameters in AMD tests running parallel jobs (vllm-project#9279)

Signed-off-by: Hissu Hyvarinen <[email protected]>
Signed-off-by: Sumit Dubey <[email protected]>
hissu-hyvarinen authored and sumitd2 committed Nov 14, 2024
1 parent dc46c9a commit 4e6fe2f
Showing 2 changed files with 12 additions and 6 deletions.
.buildkite/run-amd-test.sh (11 changes: 6 additions & 5 deletions)
@@ -107,11 +107,12 @@ fi
 PARALLEL_JOB_COUNT=8
 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
 if [[ $commands == *"--shard-id="* ]]; then
+  # assign job count as the number of shards used
+  commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
   for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
-    #replace shard arguments
-    commands=${commands//"--shard-id= "/"--shard-id=${GPU} "}
-    commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
-    echo "Shard ${GPU} commands:$commands"
+    # assign shard-id for each shard
+    commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
+    echo "Shard ${GPU} commands:$commands_gpu"
     docker run \
         --device /dev/kfd --device /dev/dri \
         --network host \
@@ -123,7 +124,7 @@ if [[ $commands == *"--shard-id="* ]]; then
         -e HF_HOME=${HF_MOUNT} \
         --name ${container_name}_${GPU} \
         ${image_name} \
-        /bin/bash -c "${commands}" \
+        /bin/bash -c "${commands_gpu}" \
         |& while read -r line; do echo ">>Shard $GPU: $line"; done &
     PIDS+=($!)
   done
tests/lora/test_minicpmv.py (7 changes: 6 additions & 1 deletion)
@@ -1,8 +1,11 @@
 from typing import List

+import pytest
+
 import vllm
 from vllm.assets.image import ImageAsset
 from vllm.lora.request import LoRARequest
+from vllm.platforms import current_platform

 MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5"

@@ -53,6 +56,9 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
     return generated_texts


+@pytest.mark.xfail(
+    current_platform.is_rocm(),
+    reason="MiniCPM-V dependency xformers incompatible with ROCm")
 def test_minicpmv_lora(minicpmv_lora_files):
     llm = vllm.LLM(
         MODEL_PATH,
@@ -63,7 +69,6 @@ def test_minicpmv_lora(minicpmv_lora_files):
         trust_remote_code=True,
         gpu_memory_utilization=0.97  # This model is pretty big for CI gpus
     )
-
     output1 = do_sample(llm, minicpmv_lora_files, lora_id=1)
     for i in range(len(EXPECTED_OUTPUT)):
         assert EXPECTED_OUTPUT[i].startswith(output1[i])
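
A note on the new marker (editorial, not part of the commit): when pytest.mark.xfail is given a boolean condition as its first argument, it applies only where the condition holds, so this test still runs and must pass on non-ROCm platforms. On ROCm the test is still executed, but a failure is reported as XFAIL rather than failing the build, and an unexpected pass is reported as XPASS.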
