
Commit

Merge branch 'vllm-project:main' into hidden_states_fix
abhigoyal1997 authored Aug 17, 2024
2 parents 6954ead + d95cc0a, commit 08b3cd5
Showing 92 changed files with 5,539 additions and 566 deletions.
@@ -1,11 +1,11 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nvidia/Minitron-4B-Base -b auto -l 1000 -f 5 -t 1
-model_name: "nvidia/Minitron-4B-Base"
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
+model_name: "mgoin/Minitron-4B-Base-FP8"
 tasks:
 - name: "gsm8k"
   metrics:
   - name: "exact_match,strict-match"
-    value: 0.252
+    value: 0.233
   - name: "exact_match,flexible-extract"
-    value: 0.252
+    value: 0.236
 limit: 1000
 num_fewshot: 5
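Aside on these baseline configs: the values above are reference GSM8K scores produced by the run-lm-eval-gsm-vllm-baseline.sh command shown in the comment, and the run-tests.sh CI step (see the test-pipeline.yaml diff below) re-evaluates the model and compares the fresh scores against them. The snippet here is only a minimal, hypothetical sketch of such a comparison; the repository's actual checker script and tolerance may differ, and the config path is a placeholder.

# Hypothetical checker: compare measured lm-eval metrics against the expected
# values stored in a baseline YAML like the one above. Not the repo's actual script.
import yaml

RTOL = 0.05  # assumed relative tolerance; the real harness may use a different value


def check_baseline(config_path, measured):
    with open(config_path) as f:
        config = yaml.safe_load(f)
    ok = True
    for task in config["tasks"]:
        for metric in task["metrics"]:
            expected = metric["value"]
            got = measured[task["name"]][metric["name"]]
            if abs(got - expected) > RTOL * expected:
                print(f"{task['name']} {metric['name']}: got {got:.3f}, expected {expected:.3f}")
                ok = False
    return ok


# Example with made-up measured numbers:
measured = {"gsm8k": {"exact_match,strict-match": 0.231,
                      "exact_match,flexible-extract": 0.238}}
print(check_baseline("Minitron-4B-Base-FP8.yaml", measured))  # placeholder path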
2 changes: 1 addition & 1 deletion .buildkite/lm-eval-harness/configs/models-small.txt
@@ -4,7 +4,7 @@ Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
-Minitron-4B-Base.yaml
+Minitron-4B-Base-FP8.yaml
 Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
 Qwen2-1.5B-Instruct-FP8W8.yaml
 Meta-Llama-3-8B-QQQ.yaml
18 changes: 4 additions & 14 deletions .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
@@ -70,23 +70,13 @@ wait_for_server() {

 kill_gpu_processes() {
   # kill all processes on GPU.
-  pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)
-  if [ -z "$pids" ]; then
-    echo "No GPU processes found."
-  else
-    for pid in $pids; do
-      kill -9 "$pid"
-      echo "Killed process with PID: $pid"
-    done
-
-    echo "All GPU processes have been killed."
-  fi
+  ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9
+  ps -e | grep pt_main_thread | awk '{print $1}' | xargs kill -9
 
-  # waiting for GPU processes to be fully killed
-  # loop while nvidia-smi returns any processes
-  while [ -n "$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)" ]; do
+  # wait until GPU memory usage smaller than 1GB
+  while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do
     sleep 1
     echo "Waiting for GPU processes to be killed"
   done
 
   # remove vllm config file
19 changes: 7 additions & 12 deletions .buildkite/nightly-benchmarks/tests/descriptions.md
@@ -1,47 +1,42 @@

 ## Latency tests
 
 This test suite aims to test vllm's end-to-end latency under a controlled setup.
 
 - Input length: 32 tokens.
 - Output length: 128 tokens.
 - Batch size: fixed (8).
-- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - Evaluation metrics: end-to-end latency (mean, median, p99).
 
 ### Latency benchmarking results
 
 {latency_tests_markdown_table}
 
-## Throughput tests
-
-This test suite aims to test vllm's throughput.
+## Throughput tests
 
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm to achieve maximum throughput.
-- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - Evaluation metrics: throughput.
 
 ### Throughput benchmarking results
 
 {throughput_tests_markdown_table}
 
-## Serving tests
-
-This test suite aims to test vllm's real serving metrics.
+## Serving tests
 
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm and the arrival pattern of the requests.
 - **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
-- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - We also added a speculative decoding test for llama-3 70B, under QPS 2
 - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
 
 ### Serving benchmarking results
 
 {serving_tests_markdown_table}
 
 
 ## json version of the benchmarking tables
 
 This section contains the data of the markdown tables above in JSON format.
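Aside: the serving-test description above states that, for finite QPS values, request arrival times are drawn from a Poisson process with a fixed random seed. The sketch below illustrates only the underlying idea (it is not the benchmark suite's code; the function name and seed value are assumptions): a Poisson process with rate qps has exponentially distributed inter-arrival gaps with mean 1/qps.

# Illustrative sketch: generate Poisson request arrival times for a given QPS.
# Not the actual vllm benchmark code; names and seed are assumptions.
import numpy as np


def poisson_arrival_times(num_requests, qps, seed=0):
    """Return cumulative arrival times (seconds) for a Poisson process with rate qps.

    qps == float("inf") means every request arrives at t=0 (the "inf" case in the tests).
    """
    if qps == float("inf"):
        return np.zeros(num_requests)
    rng = np.random.default_rng(seed)                 # fixed seed -> reproducible arrival pattern
    gaps = rng.exponential(1.0 / qps, num_requests)   # exponential inter-arrival gaps
    return np.cumsum(gaps)


print(poisson_arrival_times(5, qps=4.0))  # about 4 requests per second on average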
4 changes: 2 additions & 2 deletions .buildkite/nightly-benchmarks/tests/latency-tests.json
@@ -2,7 +2,7 @@
   {
     "test_name": "latency_llama8B_tp1",
     "parameters": {
-      "model": "meta-llama/Meta-Llama-3-8B",
+      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
       "tensor_parallel_size": 1,
       "load_format": "dummy",
       "num_iters_warmup": 5,
@@ -12,7 +12,7 @@
   {
     "test_name": "latency_llama70B_tp4",
     "parameters": {
-      "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
       "tensor_parallel_size": 4,
       "load_format": "dummy",
       "num-iters-warmup": 5,
12 changes: 6 additions & 6 deletions .buildkite/nightly-benchmarks/tests/serving-tests.json
@@ -3,15 +3,15 @@
"test_name": "serving_llama8B_tp1_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"server_parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@@ -22,15 +22,15 @@
"test_name": "serving_llama70B_tp4_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"server_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@@ -60,7 +60,7 @@
"test_name": "serving_llama70B_tp4_sharegpt_specdecode",
"qps_list": [2],
"server_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"disable_log_requests": "",
"tensor_parallel_size": 4,
"swap_space": 16,
@@ -70,7 +70,7 @@
"use_v2_block_manager": ""
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
4 changes: 2 additions & 2 deletions .buildkite/nightly-benchmarks/tests/throughput-tests.json
@@ -2,7 +2,7 @@
   {
     "test_name": "throughput_llama8B_tp1",
     "parameters": {
-      "model": "meta-llama/Meta-Llama-3-8B",
+      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
       "tensor_parallel_size": 1,
       "load_format": "dummy",
       "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@@ -13,7 +13,7 @@
   {
     "test_name": "throughput_llama70B_tp4",
     "parameters": {
-      "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
       "tensor_parallel_size": 4,
       "load_format": "dummy",
       "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
25 changes: 13 additions & 12 deletions .buildkite/test-pipeline.yaml
@@ -56,7 +56,7 @@ steps:
   - pytest -v -s worker # Worker
 
 - label: Basic Correctness Test # 30min
-  mirror_hardwares: [amd]
+  #mirror_hardwares: [amd]
   fast_check: true
   source_file_dependencies:
   - vllm/
@@ -81,7 +81,7 @@
 - label: Entrypoints Test # 20min
   working_dir: "/vllm-workspace/tests"
   fast_check: true
-  mirror_hardwares: [amd]
+  #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   commands:
@@ -101,10 +101,9 @@
   - pytest -v -s distributed/test_pynccl.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
 
-##### fast check tests #####
-##### 1 GPU test #####
-
 - label: Metrics, Tracing Test # 10min
+  num_gpus: 2
+  fast_check: true
   source_file_dependencies:
   - vllm/
   - tests/metrics
@@ -118,6 +117,9 @@
opentelemetry-semantic-conventions-ai"
- pytest -v -s tracing

##### fast check tests #####
##### 1 GPU test #####

- label: Regression Test # 5min
mirror_hardwares: [amd]
source_file_dependencies:
@@ -139,7 +141,7 @@

 - label: Examples Test # 12min
   working_dir: "/vllm-workspace/examples"
-  mirror_hardwares: [amd]
+  #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/entrypoints
   - examples/
@@ -171,14 +173,14 @@


 - label: Vision Language Models Test # 42min
-  mirror_hardwares: [amd]
+  #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   commands:
   - pytest -v -s models -m vlm
 
 - label: Prefix Caching Test # 7min
-  mirror_hardwares: [amd]
+  #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   - tests/prefix_caching
@@ -261,7 +263,6 @@ steps:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - bash ./run-tests.sh -c configs/models-small.txt -t 1
 
-
 ##### 1 GPU test #####
 ##### multi gpus test #####
 
@@ -290,7 +291,7 @@ steps:
   - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
 
 - label: Distributed Tests (2 GPUs) # 28min
-  mirror_hardwares: [amd]
+  #mirror_hardwares: [amd]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   source_file_dependencies:
@@ -313,11 +314,11 @@
   num_gpus: 4
   source_file_dependencies:
   - vllm/
-  - tests/distributed/test_pipeline_parallel
   - tests/distributed/test_pp_cudagraph.py
+  - tests/distributed/test_pipeline_parallel
   commands:
-  - pytest -v -s distributed/test_pipeline_parallel.py
   - pytest -v -s distributed/test_pp_cudagraph.py
+  - pytest -v -s distributed/test_pipeline_parallel.py
 
 - label: LoRA Long Context (Distributed) # 11min
   # This test runs llama 13B, so it is required to run on 4 GPUs.
2 changes: 1 addition & 1 deletion .gitignore
@@ -189,4 +189,4 @@ _build/
 hip_compat.h
 
 # Benchmark dataset
-*.json
+benchmarks/*.json