Merge branch 'habana_main' into SW-201504_1
RonBenMosheHabana authored Dec 18, 2024
2 parents c008a9c + 88ef381 commit 044acd2
Showing 176 changed files with 6,930 additions and 3,471 deletions.
16 changes: 16 additions & 0 deletions .buildkite/release-pipeline.yaml
@@ -39,3 +39,19 @@ steps:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"

- label: "Build and publish TPU release image"
depends_on: ~
if: build.env("NIGHTLY") == "1"
agents:
queue: tpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
- "docker push vllm/vllm-tpu:nightly"
- "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
plugins:
- docker-login#v3.0.0:
username: vllm
password-env: DOCKERHUB_TOKEN
env:
DOCKER_BUILDKIT: "1"
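For reference, the two tags pushed by this step can be pulled directly; a small usage sketch (the `<commit-sha>` placeholder stands for the value of `$BUILDKITE_COMMIT`):

    docker pull vllm/vllm-tpu:nightly        # rolling nightly tag
    docker pull vllm/vllm-tpu:<commit-sha>   # build pinned to one commit
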
4 changes: 3 additions & 1 deletion .buildkite/test-pipeline.yaml
@@ -237,7 +237,7 @@ steps:
  source_file_dependencies:
  - vllm/lora
  - tests/lora
  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore lora/test_long_context.py lora/test_chatglm3_tp.py lora/test_llama_tp.py
  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
  parallelism: 4

- label: "PyTorch Fullgraph Smoke Test" # 9min
@@ -362,6 +362,7 @@ steps:
  - tests/models/embedding/vision_language
  - tests/models/encoder_decoder/vision_language
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
  - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
  - pytest -v -s models/embedding/vision_language -m core_model
@@ -377,6 +378,7 @@ steps:
  - tests/models/embedding/vision_language
  - tests/models/encoder_decoder/vision_language
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
  # HACK - run phi3v tests separately to sidestep this transformers bug
  # https://github.com/huggingface/transformers/issues/34307
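A note on the LoRA test command fix above: pytest's `--ignore` accepts exactly one path per occurrence, and any bare paths that follow are collected as extra test targets instead of exclusions, so the flag must be repeated. A minimal sketch of the difference:

    # Broken: only test_long_context.py is excluded; the other two files are run as tests
    pytest -v -s lora --ignore lora/test_long_context.py lora/test_chatglm3_tp.py lora/test_llama_tp.py
    # Fixed: one --ignore= per excluded file
    pytest -v -s lora --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
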
31 changes: 1 addition & 30 deletions .github/CODEOWNERS
@@ -1,33 +1,4 @@
# See https://help.github.com/articles/about-codeowners/
# for more info about CODEOWNERS file

# This lists cover the "core" components of vLLM that require careful review
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/core @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
CMakeLists.txt @tlrmchlsmth

# vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-neuralmagic @njhill @ywang96 @comaniac @alexm-neuralmagic

# Test ownership
/tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/entrypoints @DarkLight1337 @robertgshaw2-neuralmagic @simon-mo
/tests/models @DarkLight1337 @ywang96
/tests/multimodal @DarkLight1337 @ywang96
/tests/prefix_caching @comaniac @KuntaiDu
/tests/spec_decode @njhill @LiuXiaoxuanPKU
/tests/kernels @tlrmchlsmth @WoosukKwon
/tests/quantization @mgoin @robertgshaw2-neuralmagic
/.buildkite/lm-eval-harness @mgoin @simon-mo
/tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao
/tests/multi_step @alexm-neuralmagic @comaniac
/tests/weight_loading @mgoin @youkaichao
/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
* @kzawora-intel @madamczykhabana @michalkuligowski @mgawarkiewicz
4 changes: 2 additions & 2 deletions .github/workflows/actionlint.yml
@@ -2,14 +2,14 @@ name: Lint GitHub Actions workflows
on:
  push:
    branches:
      - "main"
      - "habana_main"
    paths:
      - '.github/workflows/*.ya?ml'
      - '.github/workflows/actionlint.*'
      - '.github/workflows/matchers/actionlint.json'
  pull_request:
    branches:
      - "main"
      - "habana_main"
    paths:
      - '.github/workflows/*.ya?ml'
      - '.github/workflows/actionlint.*'
4 changes: 2 additions & 2 deletions .github/workflows/codespell.yml
@@ -5,7 +5,7 @@ on:
  # but only for the main branch
  push:
    branches:
      - main
      - habana_main
    paths:
      - "**/*.py"
      - "**/*.md"
@@ -15,7 +15,7 @@ on:
      - .github/workflows/codespell.yml
  pull_request:
    branches:
      - main
      - habana_main
    paths:
      - "**/*.py"
      - "**/*.md"
4 changes: 2 additions & 2 deletions .github/workflows/png-lint.yml
@@ -2,13 +2,13 @@ name: Lint PNG exports from excalidraw
on:
  push:
    branches:
      - "main"
      - "habana_main"
    paths:
      - '*.excalidraw.png'
      - '.github/workflows/png-lint.yml'
  pull_request:
    branches:
      - "main"
      - "habana_main"
    paths:
      - '*.excalidraw.png'
      - '.github/workflows/png-lint.yml'
4 changes: 2 additions & 2 deletions .github/workflows/shellcheck.yml
@@ -2,13 +2,13 @@ name: Lint shell scripts
on:
  push:
    branches:
      - "main"
      - "habana_main"
    paths:
      - '**/*.sh'
      - '.github/workflows/shellcheck.yml'
  pull_request:
    branches:
      - "main"
      - "habana_main"
    paths:
      - '**/*.sh'
      - '.github/workflows/shellcheck.yml'
4 changes: 2 additions & 2 deletions .github/workflows/sphinx-lint.yml
@@ -3,12 +3,12 @@ name: Lint documentation
on:
  push:
    branches:
      - main
      - habana_main
    paths:
      - "docs/**"
  pull_request:
    branches:
      - main
      - habana_main
    paths:
      - "docs/**"

@@ -5,10 +5,10 @@ tasks:
- name: "gsm8k_cot_llama"
metrics:
- name: "exact_match,strict-match"
value: 0.8317
value: 0.664
- name: "exact_match,flexible-extract"
value: 0.8355
limit: null
value: 0.676
limit: 250
num_fewshot: 8
dtype: "bfloat16"
fewshot_as_multiturn: true
16 changes: 16 additions & 0 deletions .jenkins/lm-eval-harness/inc_unit_scales_config.json
@@ -0,0 +1,16 @@
{
    "mode": "QUANTIZE",
    "observer": "maxabs",
    "scale_method": "unit_scale",
    "allowlist": {
        "types": [],
        "names": []
    },
    "blocklist": {
        "types": [],
        "names": [
            "lm_head"
        ]
    },
    "dump_stats_path": ""
}
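This file is, as the name suggests, an Intel Neural Compressor (INC) quantization config; INC locates it through the `QUANT_CONFIG` environment variable, which is what the `setup_fp8()` change below wires up. A hedged usage sketch:

    # Sketch: point INC at the unit-scales config before running the FP8 eval
    export QUANT_CONFIG=inc_unit_scales_config.json
    pytest -s test_lm_eval_correctness.py
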
8 changes: 5 additions & 3 deletions .jenkins/lm-eval-harness/run-tests.sh
@@ -43,14 +43,16 @@ do
  export PT_HPU_ENABLE_LAZY_COLLECTIVES=true
  export VLLM_SKIP_WARMUP=true
  RANDOM_SUFFIX=$(tr -dc A-Za-z0-9 </dev/urandom | head -c 4; echo)
  JUNIT_SUFFIX=""
  JUNIT_FAMILY=""
  JUNIT_XML=""
  if [[ -n "$TEST_RESULTS_DIR" ]]; then
    LOG_DIR=$TEST_RESULTS_DIR
    LOG_FILENAME="test_${MODEL_CONFIG}_${RANDOM_SUFFIX}.xml"
    LOG_PATH="${LOG_DIR}/${LOG_FILENAME}"
    JUNIT_SUFFIX="-o junit_family=xunit1 --junitxml=${LOG_PATH}"
    JUNIT_FAMILY="-o junit_family=xunit1"
    JUNIT_XML="--junitxml=${LOG_PATH}"
  fi
  pytest -s test_lm_eval_correctness.py "$JUNIT_SUFFIX" || LOCAL_SUCCESS=$?
  pytest -s test_lm_eval_correctness.py "$JUNIT_FAMILY" "$JUNIT_XML" || LOCAL_SUCCESS=$?

  if [[ $LOCAL_SUCCESS == 0 ]]; then
    echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
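The `JUNIT_SUFFIX` split above works around shell word-splitting: quoting a variable that holds several options hands them to pytest as a single argument. A more defensive bash idiom, shown here only as a sketch, is an argument array, which expands to exactly zero or more separate words:

    # Sketch: accumulate optional pytest arguments in an array
    JUNIT_ARGS=()
    if [[ -n "$TEST_RESULTS_DIR" ]]; then
      JUNIT_ARGS=(-o junit_family=xunit1 "--junitxml=${LOG_PATH}")
    fi
    # An empty array expands to nothing; a populated one to correctly separated words
    pytest -s test_lm_eval_correctness.py "${JUNIT_ARGS[@]}" || LOCAL_SUCCESS=$?
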
8 changes: 3 additions & 5 deletions .jenkins/lm-eval-harness/test_lm_eval_correctness.py
@@ -27,12 +27,10 @@
TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)


def setup_fp8(model_path, device_type):
    flavor = f"g{device_type[-1]}"
    normalized_model_name = Path(model_path).parts[-1].lower()
def setup_fp8():
    os.environ[
        "QUANT_CONFIG"] = \
        f"/software/data/vllm-benchmarks/inc/{normalized_model_name}/maxabs_quant_{flavor}.json"
        "inc_unit_scales_config.json"


def fail_on_exit():
@@ -147,7 +145,7 @@ def test_lm_eval_correctness(record_xml_attribute, record_property):

    # Set up environment for FP8 inference
    if eval_config.get("fp8"):
        setup_fp8(eval_config["model_name"], platform)
        setup_fp8()
    # Launch eval requests.
    start_time = time.perf_counter()
    results = launch_lm_eval(eval_config)
3 changes: 2 additions & 1 deletion Dockerfile.neuron
@@ -1,5 +1,6 @@
# default base image
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.0-ubuntu20.04"
# https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.2-ubuntu20.04"

FROM $BASE_IMAGE

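Because the base image is declared as a build `ARG`, the SDK pin above can also be overridden at build time without editing the file; a sketch (the `vllm-neuron` tag is illustrative):

    # Build with the default sdk2.20.2 base image
    docker build -f Dockerfile.neuron -t vllm-neuron .
    # Or swap in a different Neuron base image explicitly
    docker build -f Dockerfile.neuron \
      --build-arg BASE_IMAGE=public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.2-ubuntu20.04 \
      -t vllm-neuron .
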
1 change: 1 addition & 0 deletions README.md
@@ -16,6 +16,7 @@ Easy, fast, and cheap LLM serving for everyone
---

*Latest News* 🔥
- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users!
2 changes: 1 addition & 1 deletion README_GAUDI.md
@@ -247,7 +247,7 @@ INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of devi
- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL`: if `true`, will log graph compilations per each vLLM engine step, always, even if there were none. Disabled by default.
- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS`: if `true`, will log cpu fallbacks per each vLLM engine step, only when there was any. Disabled by default.
- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL`: if `true`, will log cpu fallbacks per each vLLM engine step, always, even if there were none. Disabled by default.
- `VLLM_REGIONAL_COMPILATION`: if `false`, turn off regional complation (when using torch.compile execution mode).
- `VLLM_REGIONAL_COMPILATION`: if `false`, turn off regional compilation (when using torch.compile execution mode).

**Performance tuning knobs:**

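Like the logging flags listed above it, the corrected knob is plain environment configuration; an illustrative sketch (script name hypothetical):

    # Sketch: disable regional compilation for a torch.compile execution run
    VLLM_REGIONAL_COMPILATION=false python serve_model.py
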
11 changes: 4 additions & 7 deletions csrc/attention/paged_attention_v1.cu
@@ -140,13 +140,10 @@ void paged_attention_v1_launcher(
      blocksparse_block_size, blocksparse_head_sliding_step);

#define CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
  switch (is_block_sparse) { \
    case true: \
      CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
      break; \
    case false: \
      CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
      break; \
  if (is_block_sparse) { \
    CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
  } else { \
    CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
  }

// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
11 changes: 4 additions & 7 deletions csrc/attention/paged_attention_v2.cu
@@ -147,13 +147,10 @@ void paged_attention_v2_launcher(
      blocksparse_head_sliding_step);

#define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
  switch (is_block_sparse) { \
    case true: \
      CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
      break; \
    case false: \
      CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
      break; \
  if (is_block_sparse) { \
    CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
  } else { \
    CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
  }

// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
14 changes: 12 additions & 2 deletions csrc/cache_kernels.cu
@@ -307,10 +307,20 @@ void reshape_and_cache_flash(
    torch::Tensor& key_cache,  // [num_blocks, block_size, num_heads, head_size]
    torch::Tensor&
        value_cache,  // [num_blocks, block_size, num_heads, head_size]
    torch::Tensor& slot_mapping,  // [num_tokens]
    torch::Tensor& slot_mapping,  // [num_tokens] or [num_actual_tokens]
    const std::string& kv_cache_dtype, const double k_scale,
    const double v_scale) {
  int num_tokens = key.size(0);
  // NOTE(woosuk): In vLLM V1, key.size(0) can be different from
  // slot_mapping.size(0) because of padding for CUDA graphs.
  // In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because
  // both include padding.
  // In vLLM V1, however, key.size(0) can be larger than slot_mapping.size(0)
  // since key includes padding for CUDA graphs, while slot_mapping does not.
  // In this case, slot_mapping.size(0) represents the actual number of tokens
  // before padding.
  // For compatibility with both cases, we use slot_mapping.size(0) as the
  // number of tokens.
  int num_tokens = slot_mapping.size(0);
  int num_heads = key.size(1);
  int head_size = key.size(2);
  int block_size = key_cache.size(1);
2 changes: 1 addition & 1 deletion csrc/mamba/causal_conv1d/causal_conv1d.cu
@@ -424,7 +424,7 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
  // and the one before it (chunk = n_chunks - 1 and chunk = n_chunks - 2),
  // (which occurs when `final_state_position` is a non-positivie index)
  // we load the correct data from smem_exchange from both chunks, the last chunk iteration and the one before it
  if (final_state_position < 0 && seqlen > kWidth){
  if (conv_states != nullptr && final_state_position < 0 && seqlen > kWidth){
    input_t vals_load[kNElts] = {0};
    if ((chunk == n_chunks - 2) && (tidx == kNThreads - 1)){
      // chunk = n_chunks - 2, a segment of the final state sits in the last index
1 change: 1 addition & 0 deletions docs/requirements-docs.txt
@@ -16,5 +16,6 @@ mistral_common >= 1.5.0
aiohttp
starlette
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
fastapi # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
requests
2 changes: 2 additions & 0 deletions docs/source/index.rst
@@ -82,6 +82,7 @@ Documentation
   serving/openai_compatible_server
   serving/deploying_with_docker
   serving/deploying_with_k8s
   serving/deploying_with_helm
   serving/deploying_with_nginx
   serving/distributed_serving
   serving/metrics
@@ -102,6 +103,7 @@

   usage/lora
   usage/multimodal_inputs
   usage/tool_calling
   usage/structured_outputs
   usage/spec_decode
   usage/compatibility_matrix
(Remaining file diffs not loaded.)