From b19bd0326657e89b49c24dfb6771b759175c520f Mon Sep 17 00:00:00 2001 From: alexeykondrat <143633163+alexeykondrat@users.noreply.github.com> Date: Wed, 4 Sep 2024 14:57:54 -0400 Subject: [PATCH] [CI/Build][ROCm] Enabling LoRA tests on ROCm (#7369) Co-authored-by: Simon Mo --- .buildkite/run-amd-test.sh | 47 +++++++++++++++++++++++++++++----- .buildkite/test-pipeline.yaml | 3 +-- tests/lora/test_gemma.py | 4 +++ tests/lora/test_quant_model.py | 24 ++++++++++++----- 4 files changed, 64 insertions(+), 14 deletions(-) mode change 100644 => 100755 .buildkite/run-amd-test.sh diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh old mode 100644 new mode 100755 index 5548071390aff..972c62a091aea --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -1,5 +1,5 @@ # This script runs test inside the corresponding ROCm docker container. -set -ex +set -o pipefail # Print ROCm version echo "--- Confirming Clean Initial State" @@ -70,16 +70,51 @@ HF_CACHE="$(realpath ~)/huggingface" mkdir -p ${HF_CACHE} HF_MOUNT="/root/.cache/huggingface" -docker run \ +commands=$@ +PARALLEL_JOB_COUNT=8 +# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. +if [[ $commands == *"--shard-id="* ]]; then + for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do + #replace shard arguments + commands=${@//"--shard-id= "/"--shard-id=${GPU} "} + commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "} + docker run \ --device /dev/kfd --device /dev/dri \ --network host \ --shm-size=16gb \ --rm \ - -e HIP_VISIBLE_DEVICES=0 \ + -e HIP_VISIBLE_DEVICES=${GPU} \ -e HF_TOKEN \ -v ${HF_CACHE}:${HF_MOUNT} \ -e HF_HOME=${HF_MOUNT} \ - --name ${container_name} \ + --name ${container_name}_${GPU} \ ${image_name} \ - /bin/bash -c "${@}" - + /bin/bash -c "${commands}" \ + |& while read -r line; do echo ">>Shard $GPU: $line"; done & + PIDS+=($!) + done + #wait for all processes to finish and collect exit codes + for pid in ${PIDS[@]}; do + wait ${pid} + STATUS+=($?) + done + for st in ${STATUS[@]}; do + if [[ ${st} -ne 0 ]]; then + echo "One of the processes failed with $st" + exit ${st} + fi + done +else + docker run \ + --device /dev/kfd --device /dev/dri \ + --network host \ + --shm-size=16gb \ + --rm \ + -e HIP_VISIBLE_DEVICES=0 \ + -e HF_TOKEN \ + -v ${HF_CACHE}:${HF_MOUNT} \ + -e HF_HOME=${HF_MOUNT} \ + --name ${container_name} \ + ${image_name} \ + /bin/bash -c "${commands}" +fi diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 86eddb576c42a..65e1862ce8181 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -218,9 +218,9 @@ steps: - pytest -v -s spec_decode - label: LoRA Test %N # 30min each + mirror_hardwares: [amd] source_file_dependencies: - vllm/lora - - csrc/punica - tests/lora command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py parallelism: 4 @@ -360,7 +360,6 @@ steps: num_gpus: 4 source_file_dependencies: - vllm/lora - - csrc/punica - tests/lora/test_long_context commands: # FIXIT: find out which code initialize cuda before running the test diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py index 709246179bfe4..58cac3156c9c1 100644 --- a/tests/lora/test_gemma.py +++ b/tests/lora/test_gemma.py @@ -1,7 +1,10 @@ from typing import List +import pytest + import vllm from vllm.lora.request import LoRARequest +from vllm.utils import is_hip MODEL_PATH = "google/gemma-7b" @@ -28,6 +31,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: return generated_texts +@pytest.mark.xfail(is_hip(), reason="There can be output mismatch on ROCm") def test_gemma_lora(gemma_lora_files): llm = vllm.LLM(MODEL_PATH, max_model_len=1024, diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index 2370c693e9534..133e0d4514a6d 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -7,6 +7,7 @@ import vllm from vllm.lora.request import LoRARequest +from vllm.utils import is_hip from .conftest import cleanup @@ -17,12 +18,23 @@ class ModelWithQuantization: quantization: str -MODELS: List[ModelWithQuantization] = [ - ModelWithQuantization(model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", - quantization="AWQ"), - ModelWithQuantization(model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", - quantization="GPTQ"), -] +MODELS: List[ModelWithQuantization] +#AWQ quantization is currently not supported in ROCm. +if is_hip(): + MODELS = [ + ModelWithQuantization( + model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", + quantization="GPTQ"), + ] +else: + MODELS = [ + ModelWithQuantization( + model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", + quantization="AWQ"), + ModelWithQuantization( + model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", + quantization="GPTQ"), + ] def do_sample(llm: vllm.LLM,