[CI/Build][ROCm] Enabling LoRA tests on ROCm (vllm-project#7369)
Co-authored-by: Simon Mo <[email protected]>
alexeykondrat and simon-mo authored Sep 4, 2024
1 parent 13f6412 commit b19bd03
Showing 4 changed files with 64 additions and 14 deletions.
47 changes: 41 additions & 6 deletions .buildkite/run-amd-test.sh
mode change: 100644 → 100755
@@ -1,5 +1,5 @@
 # This script runs test inside the corresponding ROCm docker container.
-set -ex
+set -o pipefail

 # Print ROCm version
 echo "--- Confirming Clean Initial State"
@@ -70,16 +70,51 @@ HF_CACHE="$(realpath ~)/huggingface"
 mkdir -p ${HF_CACHE}
 HF_MOUNT="/root/.cache/huggingface"

-docker run \
+commands=$@
+PARALLEL_JOB_COUNT=8
+# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
+if [[ $commands == *"--shard-id="* ]]; then
+  for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
+    #replace shard arguments
+    commands=${@//"--shard-id= "/"--shard-id=${GPU} "}
+    commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
+    docker run \
         --device /dev/kfd --device /dev/dri \
         --network host \
         --shm-size=16gb \
         --rm \
-        -e HIP_VISIBLE_DEVICES=0 \
+        -e HIP_VISIBLE_DEVICES=${GPU} \
         -e HF_TOKEN \
         -v ${HF_CACHE}:${HF_MOUNT} \
         -e HF_HOME=${HF_MOUNT} \
-        --name ${container_name} \
+        --name ${container_name}_${GPU} \
         ${image_name} \
-        /bin/bash -c "${@}"
-
+        /bin/bash -c "${commands}" \
+        |& while read -r line; do echo ">>Shard $GPU: $line"; done &
+    PIDS+=($!)
+  done
+  #wait for all processes to finish and collect exit codes
+  for pid in ${PIDS[@]}; do
+    wait ${pid}
+    STATUS+=($?)
+  done
+  for st in ${STATUS[@]}; do
+    if [[ ${st} -ne 0 ]]; then
+      echo "One of the processes failed with $st"
+      exit ${st}
+    fi
+  done
+else
+  docker run \
+        --device /dev/kfd --device /dev/dri \
+        --network host \
+        --shm-size=16gb \
+        --rm \
+        -e HIP_VISIBLE_DEVICES=0 \
+        -e HF_TOKEN \
+        -v ${HF_CACHE}:${HF_MOUNT} \
+        -e HF_HOME=${HF_MOUNT} \
+        --name ${container_name} \
+        ${image_name} \
+        /bin/bash -c "${commands}"
+fi
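
The core of the new logic: when the incoming test command carries a --shard-id= flag, the script launches one container per GPU (the host has 8), rewrites the shard arguments for that GPU, prefixes each shard's output with ">>Shard N:", and fails the build if any shard exits non-zero; set -o pipefail ensures the recorded status comes from docker run rather than from the while loop it is piped into. Below is a minimal Python sketch of the same fan-out-and-collect pattern, purely illustrative: the pytest command is a placeholder, and the docker wrapper and output prefixing are omitted.

# Illustrative sketch (not part of the commit): fan one sharded pytest command
# out across 8 GPUs and propagate the first non-zero exit code, mirroring the
# PIDS/STATUS loops in run-amd-test.sh. The command string is a placeholder,
# and the docker wrapper plus ">>Shard N:" output prefixing are omitted.
import os
import subprocess
import sys

PARALLEL_JOB_COUNT = 8
CMD = "pytest -v -s lora --shard-id={gpu} --num-shards={total}"

procs = []
for gpu in range(PARALLEL_JOB_COUNT):
    env = dict(os.environ, HIP_VISIBLE_DEVICES=str(gpu))  # one visible GPU per shard
    procs.append(subprocess.Popen(
        CMD.format(gpu=gpu, total=PARALLEL_JOB_COUNT), shell=True, env=env))

statuses = [p.wait() for p in procs]  # wait for every shard to finish
for st in statuses:
    if st != 0:
        print(f"One of the processes failed with {st}")
        sys.exit(st)
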
3 changes: 1 addition & 2 deletions .buildkite/test-pipeline.yaml
@@ -218,9 +218,9 @@ steps:
   - pytest -v -s spec_decode

 - label: LoRA Test %N # 30min each
+  mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/lora
-  - csrc/punica
   - tests/lora
   command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
   parallelism: 4
@@ -360,7 +360,6 @@ steps:
   num_gpus: 4
   source_file_dependencies:
   - vllm/lora
-  - csrc/punica
   - tests/lora/test_long_context
   commands:
     # FIXIT: find out which code initialize cuda before running the test
4 changes: 4 additions & 0 deletions tests/lora/test_gemma.py
@@ -1,7 +1,10 @@
 from typing import List

+import pytest
+
 import vllm
 from vllm.lora.request import LoRARequest
+from vllm.utils import is_hip

 MODEL_PATH = "google/gemma-7b"

@@ -28,6 +31,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
     return generated_texts


+@pytest.mark.xfail(is_hip(), reason="There can be output mismatch on ROCm")
 def test_gemma_lora(gemma_lora_files):
     llm = vllm.LLM(MODEL_PATH,
                    max_model_len=1024,
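
Note that the Gemma LoRA test is not skipped on ROCm: pytest.mark.xfail with a condition still runs the test and only softens the verdict, so the output mismatch stays visible in the logs without failing the build. A self-contained sketch of that behavior, using a hypothetical on_rocm() helper in place of vllm.utils.is_hip():

# Self-contained sketch; on_rocm() is a hypothetical stand-in for vllm.utils.is_hip().
import pytest


def on_rocm() -> bool:
    return False  # flip to True to see the marker take effect


@pytest.mark.xfail(on_rocm(), reason="There can be output mismatch on ROCm")
def test_output_matches_reference():
    # The test body still executes when the condition is true: a failure is
    # reported as XFAIL instead of failing the run, and a pass shows up as XPASS.
    assert "expected" == "expected"

Unlike pytest.mark.skipif, the model is still exercised on AMD hardware; only the reporting changes.
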
24 changes: 18 additions & 6 deletions tests/lora/test_quant_model.py
@@ -7,6 +7,7 @@

 import vllm
 from vllm.lora.request import LoRARequest
+from vllm.utils import is_hip

 from .conftest import cleanup

@@ -17,12 +18,23 @@ class ModelWithQuantization:
     quantization: str


-MODELS: List[ModelWithQuantization] = [
-    ModelWithQuantization(model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
-                          quantization="AWQ"),
-    ModelWithQuantization(model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
-                          quantization="GPTQ"),
-]
+MODELS: List[ModelWithQuantization]
+#AWQ quantization is currently not supported in ROCm.
+if is_hip():
+    MODELS = [
+        ModelWithQuantization(
+            model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
+            quantization="GPTQ"),
+    ]
+else:
+    MODELS = [
+        ModelWithQuantization(
+            model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
+            quantization="AWQ"),
+        ModelWithQuantization(
+            model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
+            quantization="GPTQ"),
+    ]


 def do_sample(llm: vllm.LLM,
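
Since AWQ quantization is not currently supported on ROCm, MODELS is now built per platform rather than hard-coding both backends, and every test parametrized over it automatically picks up the right set. The sketch below shows that pattern in isolation; it is illustrative only, with a hypothetical on_rocm() check in place of is_hip() and a trivial test body rather than the real LoRA sampling tests.

# Illustrative sketch: build the model list per platform, then parametrize over it.
# on_rocm() is a hypothetical stand-in for vllm.utils.is_hip().
from dataclasses import dataclass
from typing import List

import pytest


def on_rocm() -> bool:
    return False  # assume a CUDA host for this sketch


@dataclass
class ModelWithQuantization:
    model_path: str
    quantization: str


MODELS: List[ModelWithQuantization] = [
    ModelWithQuantization("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", "GPTQ"),
]
if not on_rocm():
    # AWQ is only exercised off ROCm, matching the diff above.
    MODELS.append(
        ModelWithQuantization("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", "AWQ"))


@pytest.mark.parametrize("model", MODELS, ids=lambda m: m.quantization)
def test_quantization_label(model: ModelWithQuantization):
    assert model.quantization in ("AWQ", "GPTQ")
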
