Merge branch 'main' into eagle
abhigoyal1997 authored Aug 20, 2024
2 parents f906cef + c4be16e commit 6af6665
Showing 75 changed files with 1,440 additions and 165 deletions.
9 changes: 5 additions & 4 deletions .buildkite/nightly-benchmarks/README.md
@@ -34,17 +34,18 @@ See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performan

Performance benchmark will be triggered when:
- A PR is merged into vllm.
- - Every commit for those PRs with `perf-benchmarks` label.
+ - Every commit for those PRs with `perf-benchmarks` label AND `ready` label.

Nightly benchmark will be triggered when:
- - Every commit for those PRs with `nightly-benchmarks` label.
+ - Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.




## Performance benchmark details

- See [descriptions.md](tests/descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
+ See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.


#### Latency test
@@ -68,7 +69,7 @@ Here is an example of one test inside `latency-tests.json`:

In this example:
- The `test_name` attribute is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
- - The `parameters` attribute controls the command-line arguments used for `benchmark_latency.py`. Note: use underscores (`_`) instead of dashes (`-`) when specifying the arguments; `run-benchmarks-suite.sh` converts the underscores to dashes when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
+ - The `parameters` attribute controls the command-line arguments used for `benchmark_latency.py`. Note: use underscores (`_`) instead of dashes (`-`) when specifying the arguments; `run-performance-benchmarks.sh` converts the underscores to dashes when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`

Note that the performance numbers are highly sensitive to the parameter values, so please make sure they are set correctly.
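For illustration, the mapping from a `latency-tests.json` entry to the final command line can be sketched in a few lines of Python. This is a hypothetical rendering of the conversion described above: the real logic lives in `run-performance-benchmarks.sh`, and the `test_name` value below is made up.

```python
# Hypothetical sketch of one latency-tests.json entry; the parameter values
# mirror the example command line quoted above, the test name is illustrative.
test = {
    "test_name": "latency_llama8B_tp1",  # must start with "latency_"
    "parameters": {
        "model": "meta-llama/Meta-Llama-3-8B",
        "tensor_parallel_size": 1,
        "load_format": "dummy",
        "num_iters_warmup": 5,
        "num_iters": 15,
    },
}

# Underscores in parameter names become dashes on the command line.
args = " ".join(f"--{name.replace('_', '-')} {value}"
                for name, value in test["parameters"].items())
print(f"python3 benchmark_latency.py {args}")
# python3 benchmark_latency.py --model meta-llama/Meta-Llama-3-8B \
#   --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15
```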

2 changes: 1 addition & 1 deletion .buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@@ -21,7 +21,7 @@ steps:
      containers:
        - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
          command:
-           - bash .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
+           - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
          resources:
            limits:
              nvidia.com/gpu: 8
.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -174,8 +174,8 @@ def results_to_json(latency, throughput, serving):
    # document the result
    with open(results_folder / "benchmark_results.md", "w") as f:

-       results = read_markdown(
-           "../.buildkite/nightly-benchmarks/tests/descriptions.md")
+       results = read_markdown("../.buildkite/nightly-benchmarks/" +
+                               "performance-benchmarks-descriptions.md")
        results = results.format(
            latency_tests_markdown_table=latency_md_table,
            throughput_tests_markdown_table=throughput_md_table,
.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@@ -37,9 +37,9 @@ check_hf_token() {
ensure_sharegpt_downloaded() {
  local FILE=ShareGPT_V3_unfiltered_cleaned_split.json
  if [ ! -f "$FILE" ]; then
    wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$FILE
  else
    echo "$FILE already exists."
  fi
}

@@ -68,11 +68,29 @@ wait_for_server() {
  done' && return 0 || return 1
}

+ kill_processes_launched_by_current_bash() {
+   # Kill all python processes launched from current bash script
+   current_shell_pid=$$
+   processes=$(ps -eo pid,ppid,command | awk -v ppid="$current_shell_pid" -v proc="$1" '$2 == ppid && $3 ~ proc {print $1}')
+   if [ -n "$processes" ]; then
+     echo "Killing the following processes matching '$1':"
+     echo "$processes"
+     echo "$processes" | xargs kill -9
+   else
+     echo "No processes found matching '$1'."
+   fi
+ }

kill_gpu_processes() {
  # kill all processes on GPU.

-   ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9
-   ps -e | grep pt_main_thread | awk '{print $1}' | xargs kill -9
+   ps -aux
+   lsof -t -i:8000 | xargs -r kill -9
+   pkill -f pt_main_thread
+   # this line doesn't work now
+   # ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9
+   pkill -f python3
+   pkill -f /usr/bin/python3

  # wait until GPU memory usage smaller than 1GB
  while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do
@@ -82,11 +100,6 @@ kill_gpu_processes() {
  # remove vllm config file
  rm -rf ~/.config/vllm

-   # Print the GPU memory usage
-   # so that we know if all GPU processes are killed.
-   gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
-   # The memory usage should be 0 MB.
-   echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}

upload_to_buildkite() {
@@ -104,7 +117,7 @@ upload_to_buildkite() {
  fi

  # Use the determined command to annotate and upload artifacts
-   $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < $RESULTS_FOLDER/benchmark_results.md
+   $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" <$RESULTS_FOLDER/benchmark_results.md
  $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
}

@@ -156,7 +169,7 @@ run_latency_tests() {
      latency_command: $latency,
      gpu_type: $gpu
    }')
-   echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"
+   echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"

    # run the benchmark
    eval "$latency_command"
@@ -166,7 +179,6 @@
  done
}

run_throughput_tests() {
  # run throughput tests using `benchmark_throughput.py`
  # $1: a json file specifying throughput test cases
@@ -214,7 +226,7 @@ run_throughput_tests() {
      throughput_command: $command,
      gpu_type: $gpu
    }')
-   echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"
+   echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"

    # run the benchmark
    eval "$throughput_command"
@@ -246,7 +258,6 @@ run_serving_tests() {
      continue
    fi

    # get client and server arguments
    server_params=$(echo "$params" | jq -r '.server_parameters')
    client_params=$(echo "$params" | jq -r '.client_parameters')
@@ -324,7 +335,7 @@ run_serving_tests() {
      client_command: $client,
      gpu_type: $gpu
    }')
-   echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"
+   echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"

  done

@@ -341,6 +352,7 @@ main() {
  # dependencies
  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
  (which jq) || (apt-get update && apt-get -y install jq)
+   (which lsof) || (apt-get update && apt-get install -y lsof)

  # get the current IP address, required by benchmark_serving.py
  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
@@ -359,7 +371,6 @@
  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json

  # postprocess benchmarking results
  pip install tabulate pandas
  python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
9 changes: 9 additions & 0 deletions .buildkite/test-pipeline.yaml
@@ -311,6 +311,15 @@ steps:
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py

+ - label: Multi-step Tests (4 GPUs) # 10min
+   working_dir: "/vllm-workspace/tests"
+   num_gpus: 4
+   source_file_dependencies:
+   - vllm/
+   - tests/multi_step/test_correctness.py
+   commands:
+   - pytest -v -s multi_step/test_correctness.py

- label: Pipeline Parallelism Test # 23min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
4 changes: 4 additions & 0 deletions collect_env.py
@@ -66,6 +66,8 @@
"nccl",
"transformers",
"zmq",
"nvidia",
"pynvml",
}

DEFAULT_PIP_PATTERNS = {
@@ -79,6 +81,8 @@
"nccl",
"transformers",
"zmq",
"nvidia",
"pynvml",
}


1 change: 1 addition & 0 deletions docs/requirements-docs.txt
@@ -3,6 +3,7 @@ sphinx-book-theme==1.0.1
sphinx-copybutton==0.5.2
myst-parser==2.0.0
sphinx-argparse==0.4.0
+ msgspec

# packages to install to build the documentation
pydantic
3 changes: 2 additions & 1 deletion requirements-test.txt
@@ -29,4 +29,5 @@ matplotlib # required for qwen-vl test
aiohttp

# quantization
bitsandbytes==0.42.0
+ buildkite-test-collector==0.1.8
2 changes: 1 addition & 1 deletion setup.py
@@ -279,7 +279,7 @@ def _build_custom_ops() -> bool:


def _build_core_ext() -> bool:
-     return not _is_neuron() and not _is_tpu() and not _is_openvino()
+     return not (_is_neuron() or _is_tpu() or _is_openvino() or _is_xpu())


def get_hipcc_rocm_version():
Expand Down
26 changes: 26 additions & 0 deletions tests/core/block/test_prefix_caching_block.py
@@ -682,6 +682,32 @@ def test_eviction_order(num_blocks: int, block_size: int, seed: int):

        assert new_block[0].block_id == last_block_id

+     # Test case for cache metrics
+     @staticmethod
+     def test_metric():
+         block_size = 16
+         allocator = PrefixCachingBlockAllocator(num_blocks=4,
+                                                 block_size=block_size)
+         # Test when no query (0/0)
+         assert allocator.get_prefix_cache_hit_rate() == 0.0
+
+         token_ids = list(range(block_size))
+         allocator.allocate_immutable_block(prev_block=None,
+                                            token_ids=token_ids)
+         # Test 0/1 hit rate
+         assert allocator.get_prefix_cache_hit_rate() == 0.0
+
+         allocator.allocate_immutable_block(prev_block=None,
+                                            token_ids=token_ids)
+         # Test 1/2 hit rate
+         assert allocator.get_prefix_cache_hit_rate() == 0.5
+
+         # Test more than one block
+         for _ in range(2, 1005):
+             allocator.allocate_immutable_block(prev_block=None,
+                                                token_ids=token_ids)
+         assert allocator.get_prefix_cache_hit_rate() > 0.99
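Aside: the expected values in this test are consistent with a hit rate defined as hits divided by total queries, returning 0.0 before any query. That definition is an inference from the comments above, not necessarily vLLM's exact implementation; the arithmetic checks out as follows:

```python
# Sketch of the arithmetic behind the assertions above, assuming
# hit_rate = hits / queries, with 0.0 when no queries have been made.
hits = 0
queries = 0

def hit_rate() -> float:
    return hits / queries if queries else 0.0

assert hit_rate() == 0.0      # no query yet: defined as 0.0
queries += 1                  # first allocation misses the cache
assert hit_rate() == 0.0      # 0/1
queries += 1; hits += 1       # same token_ids again: a hit
assert hit_rate() == 0.5      # 1/2
for _ in range(2, 1005):      # 1003 further hits on the cached block
    queries += 1; hits += 1
assert hit_rate() > 0.99      # 1004/1005 ~= 0.999
```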

    @staticmethod
    def create_immutable_chain(
        block_size: int,
24 changes: 19 additions & 5 deletions tests/models/test_gguf.py
@@ -7,6 +7,7 @@

import pytest
from huggingface_hub import hf_hub_download
+ from transformers import AutoTokenizer

from tests.quantization.utils import is_quant_method_supported

@@ -20,7 +21,7 @@
MODELS = [
    ("TinyLlama/TinyLlama-1.1B-Chat-v1.0",
     hf_hub_download("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
-                      filename="tinyllama-1.1b-chat-v1.0.Q4_0.gguf")),
+                      filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf")),
    ("TinyLlama/TinyLlama-1.1B-Chat-v1.0",
     hf_hub_download("duyntnet/TinyLlama-1.1B-Chat-v1.0-imatrix-GGUF",
                     filename="TinyLlama-1.1B-Chat-v1.0-IQ4_XS.gguf")),
@@ -39,22 +40,36 @@
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
+ @pytest.mark.parametrize("tp_size", [1, 2])
def test_models(
+     num_gpus_available,
    vllm_runner,
    example_prompts,
    model,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
+     tp_size: int,
) -> None:
+     if num_gpus_available < tp_size:
+         pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
+
    original_model, gguf_model = model

+     tokenizer = AutoTokenizer.from_pretrained(original_model)
+     messages = [[{
+         'role': 'user',
+         'content': prompt
+     }] for prompt in example_prompts]
+     example_prompts = tokenizer.apply_chat_template(messages,
+                                                     tokenize=False,
+                                                     add_generation_prompt=True)
+
    # Run unquantized model.
    with vllm_runner(model_name=original_model,
                     dtype=dtype,
                     max_model_len=MAX_MODEL_LEN,
                     enforce_eager=True,
-                      tensor_parallel_size=1) as original_model:
+                      tensor_parallel_size=tp_size) as original_model:

        original_outputs = original_model.generate_greedy_logprobs(
            example_prompts[:-1], max_tokens, num_logprobs)
@@ -63,8 +78,7 @@ def test_models(
    with vllm_runner(model_name=gguf_model,
                     dtype=dtype,
                     max_model_len=MAX_MODEL_LEN,
-                      enforce_eager=True,
-                      tensor_parallel_size=1) as gguf_model:
+                      tensor_parallel_size=tp_size) as gguf_model:
        gguf_outputs = gguf_model.generate_greedy_logprobs(
            example_prompts[:-1], max_tokens, num_logprobs)

Empty file added: tests/multi_step/__init__.py
