Merge branch 'habana_main' into SW-201504_1
RonBenMosheHabana authored Dec 18, 2024
2 parents c008a9c + 88ef381 commit 044acd2
Showing 176 changed files with 6,930 additions and 3,471 deletions.
16 changes: 16 additions & 0 deletions .buildkite/release-pipeline.yaml
@@ -39,3 +39,19 @@ steps:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"

- label: "Build and publish TPU release image"
depends_on: ~
if: build.env("NIGHTLY") == "1"
agents:
queue: tpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
- "docker push vllm/vllm-tpu:nightly"
- "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
plugins:
- docker-login#v3.0.0:
username: vllm
password-env: DOCKERHUB_TOKEN
env:
DOCKER_BUILDKIT: "1"
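For reference, the two tags pushed by this step can be pulled directly; a small usage sketch (the `<commit-sha>` placeholder stands for the value of `$BUILDKITE_COMMIT`):

    docker pull vllm/vllm-tpu:nightly        # rolling nightly tag
    docker pull vllm/vllm-tpu:<commit-sha>   # build pinned to one commit
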
4 changes: 3 additions & 1 deletion .buildkite/test-pipeline.yaml
@@ -237,7 +237,7 @@ steps:
  source_file_dependencies:
  - vllm/lora
  - tests/lora
  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore lora/test_long_context.py lora/test_chatglm3_tp.py lora/test_llama_tp.py
  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
  parallelism: 4

- label: "PyTorch Fullgraph Smoke Test" # 9min
@@ -362,6 +362,7 @@ steps:
  - tests/models/embedding/vision_language
  - tests/models/encoder_decoder/vision_language
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
  - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
  - pytest -v -s models/embedding/vision_language -m core_model
@@ -377,6 +378,7 @@ steps:
  - tests/models/embedding/vision_language
  - tests/models/encoder_decoder/vision_language
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
  # HACK - run phi3v tests separately to sidestep this transformers bug
  # https://github.com/huggingface/transformers/issues/34307
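A note on the LoRA test command fix above: pytest's `--ignore` accepts exactly one path per occurrence, and any bare paths that follow are collected as extra test targets instead of exclusions, so the flag must be repeated. A minimal sketch of the difference:

    # Broken: only test_long_context.py is excluded; the other two files are run as tests
    pytest -v -s lora --ignore lora/test_long_context.py lora/test_chatglm3_tp.py lora/test_llama_tp.py
    # Fixed: one --ignore= per excluded file
    pytest -v -s lora --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
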
31 changes: 1 addition & 30 deletions .github/CODEOWNERS
@@ -1,33 +1,4 @@
# See https://help.github.com/articles/about-codeowners/
# for more info about CODEOWNERS file

# This lists cover the "core" components of vLLM that require careful review
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/core @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
CMakeLists.txt @tlrmchlsmth

# vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-neuralmagic @njhill @ywang96 @comaniac @alexm-neuralmagic

# Test ownership
/tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/entrypoints @DarkLight1337 @robertgshaw2-neuralmagic @simon-mo
/tests/models @DarkLight1337 @ywang96
/tests/multimodal @DarkLight1337 @ywang96
/tests/prefix_caching @comaniac @KuntaiDu
/tests/spec_decode @njhill @LiuXiaoxuanPKU
/tests/kernels @tlrmchlsmth @WoosukKwon
/tests/quantization @mgoin @robertgshaw2-neuralmagic
/.buildkite/lm-eval-harness @mgoin @simon-mo
/tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao
/tests/multi_step @alexm-neuralmagic @comaniac
/tests/weight_loading @mgoin @youkaichao
/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
* @kzawora-intel @madamczykhabana @michalkuligowski @mgawarkiewicz
4 changes: 2 additions & 2 deletions .github/workflows/actionlint.yml
@@ -2,14 +2,14 @@ name: Lint GitHub Actions workflows
on:
  push:
    branches:
      - "main"
      - "habana_main"
    paths:
      - '.github/workflows/*.ya?ml'
      - '.github/workflows/actionlint.*'
      - '.github/workflows/matchers/actionlint.json'
  pull_request:
    branches:
      - "main"
      - "habana_main"
    paths:
      - '.github/workflows/*.ya?ml'
      - '.github/workflows/actionlint.*'
4 changes: 2 additions & 2 deletions .github/workflows/codespell.yml
@@ -5,7 +5,7 @@ on:
  # but only for the main branch
  push:
    branches:
      - main
      - habana_main
    paths:
      - "**/*.py"
      - "**/*.md"
@@ -15,7 +15,7 @@ on:
      - .github/workflows/codespell.yml
  pull_request:
    branches:
      - main
      - habana_main
    paths:
      - "**/*.py"
      - "**/*.md"
4 changes: 2 additions & 2 deletions .github/workflows/png-lint.yml
@@ -2,13 +2,13 @@ name: Lint PNG exports from excalidraw
on:
  push:
    branches:
      - "main"
      - "habana_main"
    paths:
      - '*.excalidraw.png'
      - '.github/workflows/png-lint.yml'
  pull_request:
    branches:
      - "main"
      - "habana_main"
    paths:
      - '*.excalidraw.png'
      - '.github/workflows/png-lint.yml'
4 changes: 2 additions & 2 deletions .github/workflows/shellcheck.yml
@@ -2,13 +2,13 @@ name: Lint shell scripts
on:
  push:
    branches:
      - "main"
      - "habana_main"
    paths:
      - '**/*.sh'
      - '.github/workflows/shellcheck.yml'
  pull_request:
    branches:
      - "main"
      - "habana_main"
    paths:
      - '**/*.sh'
      - '.github/workflows/shellcheck.yml'
4 changes: 2 additions & 2 deletions .github/workflows/sphinx-lint.yml
@@ -3,12 +3,12 @@ name: Lint documentation
on:
  push:
    branches:
      - main
      - habana_main
    paths:
      - "docs/**"
  pull_request:
    branches:
      - main
      - habana_main
    paths:
      - "docs/**"

@@ -5,10 +5,10 @@ tasks:
- name: "gsm8k_cot_llama"
metrics:
- name: "exact_match,strict-match"
value: 0.8317
value: 0.664
- name: "exact_match,flexible-extract"
value: 0.8355
limit: null
value: 0.676
limit: 250
num_fewshot: 8
dtype: "bfloat16"
fewshot_as_multiturn: true
16 changes: 16 additions & 0 deletions .jenkins/lm-eval-harness/inc_unit_scales_config.json
@@ -0,0 +1,16 @@
{
    "mode": "QUANTIZE",
    "observer": "maxabs",
    "scale_method": "unit_scale",
    "allowlist": {
        "types": [],
        "names": []
    },
    "blocklist": {
        "types": [],
        "names": [
            "lm_head"
        ]
    },
    "dump_stats_path": ""
}
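This file is, as the name suggests, an Intel Neural Compressor (INC) quantization config; INC locates it through the `QUANT_CONFIG` environment variable, which is what the `setup_fp8()` change below wires up. A hedged usage sketch:

    # Sketch: point INC at the unit-scales config before running the FP8 eval
    export QUANT_CONFIG=inc_unit_scales_config.json
    pytest -s test_lm_eval_correctness.py
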
8 changes: 5 additions & 3 deletions .jenkins/lm-eval-harness/run-tests.sh
@@ -43,14 +43,16 @@ do
  export PT_HPU_ENABLE_LAZY_COLLECTIVES=true
  export VLLM_SKIP_WARMUP=true
  RANDOM_SUFFIX=$(tr -dc A-Za-z0-9 </dev/urandom | head -c 4; echo)
  JUNIT_SUFFIX=""
  JUNIT_FAMILY=""
  JUNIT_XML=""
  if [[ -n "$TEST_RESULTS_DIR" ]]; then
    LOG_DIR=$TEST_RESULTS_DIR
    LOG_FILENAME="test_${MODEL_CONFIG}_${RANDOM_SUFFIX}.xml"
    LOG_PATH="${LOG_DIR}/${LOG_FILENAME}"
    JUNIT_SUFFIX="-o junit_family=xunit1 --junitxml=${LOG_PATH}"
    JUNIT_FAMILY="-o junit_family=xunit1"
    JUNIT_XML="--junitxml=${LOG_PATH}"
  fi
  pytest -s test_lm_eval_correctness.py "$JUNIT_SUFFIX" || LOCAL_SUCCESS=$?
  pytest -s test_lm_eval_correctness.py "$JUNIT_FAMILY" "$JUNIT_XML" || LOCAL_SUCCESS=$?

  if [[ $LOCAL_SUCCESS == 0 ]]; then
    echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
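The `JUNIT_SUFFIX` split above works around shell word-splitting: quoting a variable that holds several options hands them to pytest as a single argument. A more defensive bash idiom, shown here only as a sketch, is an argument array, which expands to exactly zero or more separate words:

    # Sketch: accumulate optional pytest arguments in an array
    JUNIT_ARGS=()
    if [[ -n "$TEST_RESULTS_DIR" ]]; then
      JUNIT_ARGS=(-o junit_family=xunit1 "--junitxml=${LOG_PATH}")
    fi
    # An empty array expands to nothing; a populated one to correctly separated words
    pytest -s test_lm_eval_correctness.py "${JUNIT_ARGS[@]}" || LOCAL_SUCCESS=$?
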
8 changes: 3 additions & 5 deletions .jenkins/lm-eval-harness/test_lm_eval_correctness.py
@@ -27,12 +27,10 @@
TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)


def setup_fp8(model_path, device_type):
    flavor = f"g{device_type[-1]}"
    normalized_model_name = Path(model_path).parts[-1].lower()
def setup_fp8():
    os.environ[
        "QUANT_CONFIG"] = \
        f"/software/data/vllm-benchmarks/inc/{normalized_model_name}/maxabs_quant_{flavor}.json"
        "inc_unit_scales_config.json"


def fail_on_exit():
@@ -147,7 +145,7 @@ def test_lm_eval_correctness(record_xml_attribute, record_property):

    # Set up environment for FP8 inference
    if eval_config.get("fp8"):
        setup_fp8(eval_config["model_name"], platform)
        setup_fp8()
    # Launch eval requests.
    start_time = time.perf_counter()
    results = launch_lm_eval(eval_config)
3 changes: 2 additions & 1 deletion Dockerfile.neuron
@@ -1,5 +1,6 @@
# default base image
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.0-ubuntu20.04"
# https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.2-ubuntu20.04"

FROM $BASE_IMAGE

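Because the base image is declared as a build `ARG`, the SDK pin above can also be overridden at build time without editing the file; a sketch (the `vllm-neuron` tag is illustrative):

    # Build with the default sdk2.20.2 base image
    docker build -f Dockerfile.neuron -t vllm-neuron .
    # Or swap in a different Neuron base image explicitly
    docker build -f Dockerfile.neuron \
      --build-arg BASE_IMAGE=public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.2-ubuntu20.04 \
      -t vllm-neuron .
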
1 change: 1 addition & 0 deletions README.md
@@ -16,6 +16,7 @@ Easy, fast, and cheap LLM serving for everyone
---

*Latest News* 🔥
- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users!
2 changes: 1 addition & 1 deletion README_GAUDI.md
@@ -247,7 +247,7 @@ INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of devi
- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL`: if `true`, will log graph compilations per each vLLM engine step, always, even if there were none. Disabled by default.
- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS`: if `true`, will log cpu fallbacks per each vLLM engine step, only when there was any. Disabled by default.
- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL`: if `true`, will log cpu fallbacks per each vLLM engine step, always, even if there were none. Disabled by default.
- `VLLM_REGIONAL_COMPILATION`: if `false`, turn off regional complation (when using torch.compile execution mode).
- `VLLM_REGIONAL_COMPILATION`: if `false`, turn off regional compilation (when using torch.compile execution mode).

**Performance tuning knobs:**

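Like the logging flags listed above it, the corrected knob is plain environment configuration; an illustrative sketch (script name hypothetical):

    # Sketch: disable regional compilation for a torch.compile execution run
    VLLM_REGIONAL_COMPILATION=false python serve_model.py
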
11 changes: 4 additions & 7 deletions csrc/attention/paged_attention_v1.cu
@@ -140,13 +140,10 @@ void paged_attention_v1_launcher(
      blocksparse_block_size, blocksparse_head_sliding_step);

#define CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
  switch (is_block_sparse) { \
    case true: \
      CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
      break; \
    case false: \
      CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
      break; \
  if (is_block_sparse) { \
    CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
  } else { \
    CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
  }

// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
11 changes: 4 additions & 7 deletions csrc/attention/paged_attention_v2.cu
@@ -147,13 +147,10 @@ void paged_attention_v2_launcher(
      blocksparse_head_sliding_step);

#define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
  switch (is_block_sparse) { \
    case true: \
      CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
      break; \
    case false: \
      CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
      break; \
  if (is_block_sparse) { \
    CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
  } else { \
    CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
  }

// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
14 changes: 12 additions & 2 deletions csrc/cache_kernels.cu
@@ -307,10 +307,20 @@ void reshape_and_cache_flash(
    torch::Tensor& key_cache,  // [num_blocks, block_size, num_heads, head_size]
    torch::Tensor&
        value_cache,  // [num_blocks, block_size, num_heads, head_size]
    torch::Tensor& slot_mapping,  // [num_tokens]
    torch::Tensor& slot_mapping,  // [num_tokens] or [num_actual_tokens]
    const std::string& kv_cache_dtype, const double k_scale,
    const double v_scale) {
  int num_tokens = key.size(0);
  // NOTE(woosuk): In vLLM V1, key.size(0) can be different from
  // slot_mapping.size(0) because of padding for CUDA graphs.
  // In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because
  // both include padding.
  // In vLLM V1, however, key.size(0) can be larger than slot_mapping.size(0)
  // since key includes padding for CUDA graphs, while slot_mapping does not.
  // In this case, slot_mapping.size(0) represents the actual number of tokens
  // before padding.
  // For compatibility with both cases, we use slot_mapping.size(0) as the
  // number of tokens.
  int num_tokens = slot_mapping.size(0);
  int num_heads = key.size(1);
  int head_size = key.size(2);
  int block_size = key_cache.size(1);
2 changes: 1 addition & 1 deletion csrc/mamba/causal_conv1d/causal_conv1d.cu
@@ -424,7 +424,7 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
  // and the one before it (chunk = n_chunks - 1 and chunk = n_chunks - 2),
  // (which occurs when `final_state_position` is a non-positivie index)
  // we load the correct data from smem_exchange from both chunks, the last chunk iteration and the one before it
  if (final_state_position < 0 && seqlen > kWidth){
  if (conv_states != nullptr && final_state_position < 0 && seqlen > kWidth){
    input_t vals_load[kNElts] = {0};
    if ((chunk == n_chunks - 2) && (tidx == kNThreads - 1)){
      // chunk = n_chunks - 2, a segment of the final state sits in the last index
1 change: 1 addition & 0 deletions docs/requirements-docs.txt
@@ -16,5 +16,6 @@ mistral_common >= 1.5.0
aiohttp
starlette
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
fastapi # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
requests
2 changes: 2 additions & 0 deletions docs/source/index.rst
@@ -82,6 +82,7 @@ Documentation
   serving/openai_compatible_server
   serving/deploying_with_docker
   serving/deploying_with_k8s
   serving/deploying_with_helm
   serving/deploying_with_nginx
   serving/distributed_serving
   serving/metrics
@@ -102,6 +103,7 @@

   usage/lora
   usage/multimodal_inputs
   usage/tool_calling
   usage/structured_outputs
   usage/spec_decode
   usage/compatibility_matrix
(Remaining file diffs not loaded.)