vllm-project · SlightwindSec · May 25, 2025 · May 26, 2025 · May 26, 2025 · May 27, 2025
diff --git a/.github/Dockerfile.buildwheel b/.github/Dockerfile.buildwheel
@@ -0,0 +1,48 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+ARG PY_VERSION=3.10
+FROM quay.io/ascend/cann:8.0.0-910b-ubuntu22.04-py${PY_VERSION}
+
+ARG COMPILE_CUSTOM_KERNELS=1
+
+# Define environments
+ENV DEBIAN_FRONTEND=noninteractive
+ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
+RUN apt-get update -y && \
+    apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
+    rm -rf /var/cache/apt/* && \
+    rm -rf /var/lib/apt/lists/*
+
+WORKDIR /workspace
+
+COPY . /workspace/vllm-ascend/
+
+# Install req
+RUN python3 -m pip install -r vllm-ascend/requirements.txt --extra-index https://download.pytorch.org/whl/cpu/ && \
+    python3 -m pip install twine
+
+# Install vllm-ascend
+RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
+    source /usr/local/Ascend/nnal/atb/set_env.sh && \
+    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
+    cd vllm-ascend && \
+    python3 setup.py bdist_wheel && \
+    ls -l dist && \
+    for f in dist/*.whl; do mv "$f" "$(echo "$f" | sed -e 's/-linux_x86_64\.whl$/-manylinux1_x86_64.whl/' -e 's/-linux_aarch64\.whl$/-manylinux2014_aarch64.whl/')"; done && \
+    ls -l dist
+
+CMD ["/bin/bash"]
diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml
@@ -5,3 +5,4 @@ self-hosted-runner:
     - linux-arm64-npu-2
     - linux-arm64-npu-4
     - linux-arm64-npu-static-8
+    - ubuntu-24.04-arm
diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml
@@ -47,7 +47,7 @@ jobs:
 
       - name: "Run actionlint"
         env:
-          SHELLCHECK_OPTS: --exclude=SC2046,SC2006
+          SHELLCHECK_OPTS: --exclude=SC2046,SC2006,SC2086
         run: |
           echo "::add-matcher::.github/workflows/matchers/actionlint.json"
           tools/actionlint.sh -color
diff --git a/.github/workflows/release_code.yml b/.github/workflows/release_code.yml
@@ -0,0 +1,87 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+
+name: build / sdist
+
+on:
+  pull_request:
+    branches:
+      - 'main'
+      - '*-dev'
+    paths:
+      - '.github/workflows/release_code.yml'
+      - 'vllm_ascend/**'
+      - 'setup.py'
+      - 'pyproject.toml'
+      - 'requirements.txt'
+      - 'cmake/**'
+      - 'CMakeLists.txt'
+      - 'csrc/**'
+  push:
+    branches:
+      - 'main'
+      - '*-dev'
+    tags:
+      - 'v*'
+    paths:
+      - '.github/workflows/release_code.yml'
+      - 'vllm_ascend/**'
+      - 'setup.py'
+      - 'pyproject.toml'
+      - 'requirements.txt'
+      - 'cmake/**'
+      - 'CMakeLists.txt'
+      - 'csrc/**'
+
+jobs:
+  build:
+    name: release code
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.10"]
+    steps:
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Print
+        run: |
+          lscpu
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        run: |
+          python3 -m pip install twine setuptools_scm
+
+      - name: Generate tar.gz
+        run: |
+          python3 setup.py sdist
+          ls dist
+
+      - name: Archive tar.gz
+        uses: actions/upload-artifact@v4
+        with:
+          name: vllm-ascend-src
+          path: dist/*
+
+      - name: Release
+        if: startsWith(github.ref, 'refs/tags/')
+        run: |
+          python3 -m twine upload dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}
diff --git a/.github/workflows/release_whl.yml b/.github/workflows/release_whl.yml
@@ -0,0 +1,95 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+
+name: build / wheel
+
+on:
+  pull_request:
+    branches:
+      - 'main'
+      - '*-dev'
+    paths:
+      - '.github/workflows/release_whl.yml'
+      - '.github/Dockerfile.buildwheel'
+      - 'vllm_ascend/**'
+      - 'setup.py'
+      - 'pyproject.toml'
+      - 'requirements.txt'
+      - 'cmake/**'
+      - 'CMakeLists.txt'
+      - 'csrc/**'
+  push:
+    branches:
+      - 'main'
+      - '*-dev'
+    tags:
+      - 'v*'
+    paths:
+      - '.github/workflows/release_whl.yml'
+      - '.github/Dockerfile.buildwheel'
+      - 'vllm_ascend/**'
+      - 'setup.py'
+      - 'pyproject.toml'
+      - 'requirements.txt'
+      - 'cmake/**'
+      - 'CMakeLists.txt'
+      - 'csrc/**'
+
+jobs:
+  build:
+    name: build and release wheel
+    strategy:
+      matrix:
+        os: [ubuntu-24.04, ubuntu-24.04-arm]
+        python-version: ['3.9', '3.10', '3.11']
+    runs-on: ${{ matrix.os }}
+    steps:
+    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+    - name: Print
+      run: |
+        lscpu
+
+    - name: Build wheel
+      run: |
+        ls
+        docker build -f ./.github/Dockerfile.buildwheel \
+        --build-arg PY_VERSION=${{ matrix.python-version }} \
+        -t wheel:v1 .
+        docker run --rm \
+        -v $(pwd):/outpwd \
+        wheel:v1 \
+        bash -c "cp -r /workspace/vllm-ascend/dist /outpwd"
+        ls dist
+
+    - name: Archive wheel
+      uses: actions/upload-artifact@v4
+      with:
+        name: vllm-ascend-${{ matrix.os }}-py${{ matrix.python-version }}-wheel
+        path: dist/*
+
+    - name: Set up Python ${{ matrix.python-version }}
+      if: startsWith(github.ref, 'refs/tags/')
+      uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
+      with:
+        python-version: ${{ matrix.python-version }}
+
+    - name: Release
+      if: startsWith(github.ref, 'refs/tags/')
+      run: |
+        python3 -m pip install twine
+        python3 -m twine upload --verbose dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}
diff --git a/tools/actionlint.sh b/tools/actionlint.sh
@@ -18,7 +18,7 @@
 # This file is a part of the vllm-ascend project.
 # Adapted from https://github.com/vllm-project/vllm/tree/main/tools
 #
-export SHELLCHECK_OPTS="--exclude=SC2046,SC2006"
+export SHELLCHECK_OPTS="--exclude=SC2046,SC2006,SC2086"
 
 if command -v actionlint &> /dev/null; then
     actionlint .github/workflows/*.yml .github/workflows/*.yaml

diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py
@@ -30,6 +30,7 @@
 from vllm.v1.worker.gpu_input_batch import InputBatch
 
 from vllm_ascend.ops.attention import vanilla_chunked_prefill
+from vllm_ascend.utils import vllm_version_is
 
 
 class AscendAttentionBackend(AttentionBackend):
@@ -141,8 +142,14 @@ def reorder_batch(self, input_batch: "InputBatch",
 
     def build(self, num_reqs, num_actual_tokens, max_query_len,
               common_prefix_len):
-        block_table = (
-            self.runner.input_batch.block_table.get_device_tensor()[:num_reqs])
+        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
+            block_table = (self.runner.input_batch.block_table.
+                           get_device_tensor()[:num_reqs])
+        else:
+            block_table = self.runner.input_batch.block_table[
+                0].get_device_tensor()
+            block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
+                block_table[:num_reqs])
 
         query_lens = self.runner.query_lens
         seq_lens = self.runner.seq_lens_cpu[:num_reqs]

diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py
@@ -16,6 +16,7 @@
 
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
+from vllm_ascend.utils import vllm_version_is
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
 
 if TYPE_CHECKING:
@@ -238,8 +239,14 @@ def build(self,
         # function. We should avoid GPU -> CPU sync as much as possible because
         # it blocks on all previous kernels.
         device = self.runner.device
-        block_table = (
-            self.runner.input_batch.block_table.get_device_tensor()[:num_reqs])
+        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
+            block_table = (self.runner.input_batch.block_table.
+                           get_device_tensor()[:num_reqs])
+        else:
+            block_table = self.runner.input_batch.block_table[
+                0].get_device_tensor()
+            block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
+                block_table[:num_reqs])
         slot_mapping = self.runner.slot_mapping_cpu[:num_actual_tokens].to(
             device, non_blocking=True)
         input_positions = self.runner.positions_cpu[:num_actual_tokens].to(

diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py
@@ -36,6 +36,8 @@
     lambda: bool(int(os.getenv("COMPILE_CUSTOM_KERNELS", "1"))),
     "VLLM_ENABLE_MC2":
     lambda: bool(int(os.getenv("VLLM_ENABLE_MC2", '0'))),
+    "VLLM_ENABLE_FIXED_ALL_TO_ALL_BUFFER":
+    lambda: bool(int(os.getenv("VLLM_ENABLE_FIXED_ALL_TO_ALL_BUFFER", '0'))),
     "USING_LCCL_COM":
     lambda: bool(int(os.getenv("USING_LCCL_COM", '0'))),
     "SOC_VERSION":

diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py
@@ -748,6 +748,7 @@ def __init__(
                     vllm_parallel_config=vllm_config.parallel_config))
 
             self.moe_parallel_config.ep_size = get_ep_group().world_size
+            self.moe_parallel_config.tp_size = get_etp_group().world_size
 
         self.top_k = top_k
         self.num_experts = num_experts
@@ -766,6 +767,7 @@ def __init__(
         self.e_score_correction_bias = e_score_correction_bias
         self.expert_map = None
         self.activation = activation
+        self.max_model_len = vllm_config.model_config.max_model_len
 
         if self.ep_size > 1:
             # Create a tensor of size num_experts filled with -1