[CI/Build] drop support for Python 3.8 EOL (#8464)

Signed-off-by: Aaron Pham <[email protected]>
vllm-project · Nov 6, 2024 · 21063c1 · 21063c1
1 parent 4be3a45
commit 21063c1
Show file tree

Hide file tree

Showing 115 changed files with 240 additions and 322 deletions.
diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -56,7 +56,7 @@
 
 def read_markdown(file):
     if os.path.exists(file):
-        with open(file, "r") as f:
+        with open(file) as f:
             return f.read() + "\n"
     else:
         return f"{file} not found.\n"
@@ -75,14 +75,14 @@ def results_to_json(latency, throughput, serving):
     # collect results
     for test_file in results_folder.glob("*.json"):
 
-        with open(test_file, "r") as f:
+        with open(test_file) as f:
             raw_result = json.loads(f.read())
 
         if "serving" in str(test_file):
             # this result is generated via `benchmark_serving.py`
 
             # attach the benchmarking command to raw_result
-            with open(test_file.with_suffix(".commands"), "r") as f:
+            with open(test_file.with_suffix(".commands")) as f:
                 command = json.loads(f.read())
             raw_result.update(command)
 
@@ -97,7 +97,7 @@ def results_to_json(latency, throughput, serving):
             # this result is generated via `benchmark_latency.py`
 
             # attach the benchmarking command to raw_result
-            with open(test_file.with_suffix(".commands"), "r") as f:
+            with open(test_file.with_suffix(".commands")) as f:
                 command = json.loads(f.read())
             raw_result.update(command)
 
@@ -119,7 +119,7 @@ def results_to_json(latency, throughput, serving):
             # this result is generated via `benchmark_throughput.py`
 
             # attach the benchmarking command to raw_result
-            with open(test_file.with_suffix(".commands"), "r") as f:
+            with open(test_file.with_suffix(".commands")) as f:
                 command = json.loads(f.read())
             raw_result.update(command)
 

diff --git a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
@@ -72,15 +72,15 @@ def main(args):
 
     # collect results
     for test_file in results_folder.glob("*_nightly_results.json"):
-        with open(test_file, "r") as f:
+        with open(test_file) as f:
             results = results + json.loads(f.read())
 
     # generate markdown table
     df = pd.DataFrame.from_dict(results)
 
     md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
 
-    with open(args.description, "r") as f:
+    with open(args.description) as f:
         description = f.read()
 
     description = description.format(

diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@@ -36,11 +36,11 @@
     # collect results
     for test_file in results_folder.glob("*.json"):
 
-        with open(test_file, "r") as f:
+        with open(test_file) as f:
             raw_result = json.loads(f.read())
 
         # attach the benchmarking command to raw_result
-        with open(test_file.with_suffix(".commands"), "r") as f:
+        with open(test_file.with_suffix(".commands")) as f:
             command = json.loads(f.read())
         raw_result.update(command)
 

diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml
@@ -25,7 +25,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
     steps:
     - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
     - name: Set up Python ${{ matrix.python-version }}

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
@@ -48,7 +48,7 @@ jobs:
       fail-fast: false
       matrix:
           os: ['ubuntu-20.04']
-          python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
+          python-version: ['3.9', '3.10', '3.11', '3.12']
           pytorch-version: ['2.4.0']  # Must be the most recent version that meets requirements-cuda.txt.
           cuda-version: ['11.8', '12.1']
 

diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml
@@ -29,19 +29,19 @@ jobs:
       matrix:
         python-version: ["3.12"]
     steps:
-    - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install -r requirements-lint.txt
-    - name: Analysing the code with ruff
-      run: |
-        echo "::add-matcher::.github/workflows/matchers/ruff.json"
-        ruff check --output-format github .
-    - name: Run isort
-      run: |
-        isort . --check-only
+      - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements-lint.txt
+      - name: Analysing the code with ruff
+        run: |
+          echo "::add-matcher::.github/workflows/matchers/ruff.json"
+          ruff check --output-format github .
+      - name: Run isort
+        run: |
+          isort . --check-only
diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml
@@ -23,16 +23,16 @@ jobs:
       matrix:
         python-version: ["3.12"]
     steps:
-    - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install yapf==0.32.0
-        pip install toml==0.10.2
-    - name: Running yapf
-      run: |
-        yapf --diff --recursive .
+      - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install yapf==0.32.0
+          pip install toml==0.10.2
+      - name: Running yapf
+        run: |
+          yapf --diff --recursive .
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
@@ -6,17 +6,16 @@ version: 2
 build:
   os: ubuntu-22.04
   tools:
-    python: "3.8"
+    python: '3.9'
 
 sphinx:
-   configuration: docs/source/conf.py
-   fail_on_warning: true
+  configuration: docs/source/conf.py
+  fail_on_warning: true
 
 # If using Sphinx, optionally build your docs in additional formats such as PDF
 formats: []
 
 # Optionally declare the Python requirements required to build your docs
 python:
-   install:
-   - requirements: docs/requirements-docs.txt
-
+  install:
+    - requirements: docs/requirements-docs.txt
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -128,17 +128,17 @@ endif()
 
 if(VLLM_GPU_LANG STREQUAL "CUDA")
   #
-  # For cuda we want to be able to control which architectures we compile for on 
+  # For cuda we want to be able to control which architectures we compile for on
   # a per-file basis in order to cut down on compile time. So here we extract
-  # the set of architectures we want to compile for and remove the from the 
+  # the set of architectures we want to compile for and remove the from the
   # CMAKE_CUDA_FLAGS so that they are not applied globally.
   #
   clear_cuda_arches(CUDA_ARCH_FLAGS)
   extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}")
   message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
   # Filter the target architectures by the supported supported archs
   # since for some files we will build for all CUDA_ARCHS.
-  cuda_archs_loose_intersection(CUDA_ARCHS 
+  cuda_archs_loose_intersection(CUDA_ARCHS
     "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
   message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}")
 else()
@@ -236,7 +236,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # are not supported by Machete yet.
   cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" ${CUDA_ARCHS})
   if (MARLIN_ARCHS)
-    set(MARLIN_SRCS 
+    set(MARLIN_SRCS
        "csrc/quantization/fp8/fp8_marlin.cu"
        "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
        "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
@@ -277,15 +277,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
                      "in CUDA target architectures")
     endif()
 
-    # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't 
+    # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
     # build any 3x kernels
     set(SCALED_MM_3X_ARCHS)
   endif()
 
   #
   # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
   # kernels for the remaining archs that are not already built for 3x.
-  cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS 
+  cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
     "7.5;8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
   # subtract out the archs that are already built for 3x
   list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
@@ -316,10 +316,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS)
     #
-    # For the Machete kernels we automatically generate sources for various 
+    # For the Machete kernels we automatically generate sources for various
     # preselected input type pairs and schedules.
     # Generate sources:
-    set(MACHETE_GEN_SCRIPT 
+    set(MACHETE_GEN_SCRIPT
       ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py)
     file(MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH)
 
@@ -329,8 +329,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     if (NOT DEFINED CACHE{MACHETE_GEN_SCRIPT_HASH}
         OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH})
       execute_process(
-        COMMAND ${CMAKE_COMMAND} -E env 
-        PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH 
+        COMMAND ${CMAKE_COMMAND} -E env
+        PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
           ${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
         RESULT_VARIABLE machete_generation_result
         OUTPUT_VARIABLE machete_generation_output
@@ -340,11 +340,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 
       if (NOT machete_generation_result EQUAL 0)
         message(FATAL_ERROR "Machete generation failed."
-                            " Result: \"${machete_generation_result}\"" 
+                            " Result: \"${machete_generation_result}\""
                             "\nCheck the log for details: "
                             "${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
       else()
-        set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH} 
+        set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH}
             CACHE STRING "Last run machete generate script hash" FORCE)
         message(STATUS "Machete generation completed successfully.")
       endif()
@@ -366,7 +366,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 
     message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}")
   else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
         AND MACHETE_ARCHS)
       message(STATUS "Not building Machete kernels as CUDA Compiler version is "
                      "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
@@ -392,8 +392,8 @@ define_gpu_extension_target(
   USE_SABI 3
   WITH_SOABI)
 
-# If CUTLASS is compiled on NVCC >= 12.5, it by default uses 
-# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the 
+# If CUTLASS is compiled on NVCC >= 12.5, it by default uses
+# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the
 # driver API. This causes problems when linking with earlier versions of CUDA.
 # Setting this variable sidesteps the issue by calling the driver directly.
 target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
@@ -471,9 +471,9 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
   return()
 endif ()
 
-# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target  
-# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the 
-# arches in the CUDA case (and instead set the gencodes on a per file basis) 
+# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
+# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
+# arches in the CUDA case (and instead set the gencodes on a per file basis)
 # we need to manually set VLLM_GPU_ARCHES here.
 if(VLLM_GPU_LANG STREQUAL "CUDA")
   foreach(_ARCH ${CUDA_ARCHS})

diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
@@ -79,7 +79,7 @@ async def async_request_tgi(
                         # any data, we should skip it.
                         if chunk_bytes.startswith(":"):
                             continue
-                        chunk = remove_prefix(chunk_bytes, "data:")
+                        chunk = chunk_bytes.removeprefix("data:")
 
                         data = json.loads(chunk)
                         timestamp = time.perf_counter()
@@ -144,8 +144,8 @@ async def async_request_trt_llm(
                         if not chunk_bytes:
                             continue
 
-                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
-                                              "data:")
+                        chunk = chunk_bytes.decode("utf-8").removeprefix(
+                            "data:")
 
                         data = json.loads(chunk)
                         output.generated_text += data["text_output"]
@@ -261,8 +261,8 @@ async def async_request_openai_completions(
                         if not chunk_bytes:
                             continue
 
-                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
-                                              "data: ")
+                        chunk = chunk_bytes.decode("utf-8").removeprefix(
+                            "data: ")
                         if chunk == "[DONE]":
                             latency = time.perf_counter() - st
                         else:
@@ -349,8 +349,8 @@ async def async_request_openai_chat_completions(
                         if not chunk_bytes:
                             continue
 
-                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
-                                              "data: ")
+                        chunk = chunk_bytes.decode("utf-8").removeprefix(
+                            "data: ")
                         if chunk == "[DONE]":
                             latency = time.perf_counter() - st
                         else:
@@ -389,14 +389,6 @@ async def async_request_openai_chat_completions(
     return output
 
 
-# Since vllm must support Python 3.8, we can't use str.removeprefix(prefix)
-# introduced in Python 3.9
-def remove_prefix(text: str, prefix: str) -> str:
-    if text.startswith(prefix):
-        return text[len(prefix):]
-    return text
-
-
 def get_model(pretrained_model_name_or_path: str) -> str:
     if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
         from modelscope import snapshot_download

diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py
@@ -269,10 +269,10 @@ def run_square_bench(args):
 
 
 def run_range_bench(args):
-    m_start, k_start, n_start = [int(x) for x in args.dim_start.split(",")]
-    m_end, k_end, n_end = [int(x) for x in args.dim_end.split(",")]
+    m_start, k_start, n_start = (int(x) for x in args.dim_start.split(","))
+    m_end, k_end, n_end = (int(x) for x in args.dim_end.split(","))
     m_increment, k_increment, n_increment = \
-        [int(x) for x in args.dim_increment.split(",")]
+        (int(x) for x in args.dim_increment.split(","))
     Ms = list(range(m_start, m_end + 1, m_increment))
     Ks = list(range(k_start, k_end + 1, k_increment))
     Ns = list(range(n_start, n_end + 1, n_increment))