Merge branch 'refs/heads/main' into luka/custom_ops_env
# Conflicts:
#	vllm/envs.py
#	vllm/model_executor/layers/activation.py
ProExpertProg committed Oct 17, 2024
2 parents 5201dc6 + 390be74 commit 0d42d55
Showing 164 changed files with 4,748 additions and 2,473 deletions.
4 changes: 2 additions & 2 deletions .buildkite/release-pipeline.yaml
@@ -3,7 +3,7 @@ steps:
agents:
queue: cpu_queue
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
# rename the files to change linux -> manylinux1
@@ -22,7 +22,7 @@ steps:
agents:
queue: cpu_queue
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
# rename the files to change linux -> manylinux1
2 changes: 2 additions & 0 deletions .buildkite/test-pipeline.yaml
@@ -340,10 +340,12 @@ steps:
source_file_dependencies:
- vllm/
- tests/models/embedding/language
- tests/models/embedding/vision_language
- tests/models/encoder_decoder/language
- tests/models/encoder_decoder/vision_language
commands:
- pytest -v -s models/embedding/language
- pytest -v -s models/embedding/vision_language
- pytest -v -s models/encoder_decoder/language
- pytest -v -s models/encoder_decoder/vision_language

30 changes: 29 additions & 1 deletion .dockerignore
@@ -2,5 +2,33 @@
/.venv
/build
dist
Dockerfile*
vllm/*.so

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

.mypy_cache

# Distribution / packaging
.Python
/build/
cmake-build-*/
CMakeUserPresets.json
develop-eggs/
/dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
3 changes: 1 addition & 2 deletions .github/workflows/scripts/build.sh
@@ -8,8 +8,7 @@ PATH=${cuda_home}/bin:$PATH
LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH

# Install requirements
$python_executable -m pip install wheel packaging 'setuptools-scm>=8'
$python_executable -m pip install -r requirements-cuda.txt
$python_executable -m pip install -r requirements-build.txt -r requirements-cuda.txt

# Limit the number of parallel jobs to avoid OOM
export MAX_JOBS=1
4 changes: 2 additions & 2 deletions .readthedocs.yaml
@@ -13,10 +13,10 @@ sphinx:
fail_on_warning: true

# If using Sphinx, optionally build your docs in additional formats such as PDF
formats:
- pdf
formats: []

# Optionally declare the Python requirements required to build your docs
python:
install:
- requirements: docs/requirements-docs.txt

45 changes: 25 additions & 20 deletions CMakeLists.txt
@@ -144,27 +144,32 @@ else()
endif()


#
# For cuda we want to be able to control which architectures we compile for on
# a per-file basis in order to cut down on compile time. So here we extract
# the set of architectures we want to compile for and remove them from the
# CMAKE_CUDA_FLAGS so that they are not applied globally.
#
if(VLLM_GPU_LANG STREQUAL "CUDA")
#
# For cuda we want to be able to control which architectures we compile for on
# a per-file basis in order to cut down on compile time. So here we extract
# the set of architectures we want to compile for and remove them from the
# CMAKE_CUDA_FLAGS so that they are not applied globally.
#
clear_cuda_arches(CUDA_ARCH_FLAGS)
extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}")
message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
# Filter the target architectures by the supported archs
# since for some files we will build for all CUDA_ARCHS.
cuda_archs_loose_intersection(CUDA_ARCHS
"${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}")
else()
#
# For other GPU targets override the GPU architectures detected by cmake/torch
# and filter them by the supported versions for the current language.
# The final set of arches is stored in `VLLM_GPU_ARCHES`.
#
override_gpu_arches(VLLM_GPU_ARCHES
${VLLM_GPU_LANG}
"${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")
endif()

#
# Override the GPU architectures detected by cmake/torch and filter them by
# the supported versions for the current language.
# The final set of arches is stored in `VLLM_GPU_ARCHES`.
#
override_gpu_arches(VLLM_GPU_ARCHES
${VLLM_GPU_LANG}
"${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")

#
# Query torch for additional GPU compilation flags for the given
# `VLLM_GPU_LANG`.
@@ -281,10 +286,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1")
message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
else()
# clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
# build any 3x kernels
set(SCALED_MM_3X_ARCHS)

if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
@@ -294,13 +295,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
"in CUDA target architectures")
endif()

# clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
# build any 3x kernels
set(SCALED_MM_3X_ARCHS)
endif()

#
# For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
# kernels for the remaining archs that are not already built for 3x.
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
"7.5;8.0;8.6;8.9;9.0;9.0a" "${CUDA_ARCHS}")
"7.5;8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
# subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
if (SCALED_MM_2X_ARCHS)
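For readers following the arch bookkeeping: the hunk above narrows the 2.x scaled_mm build to the intersection of the supported list and the detected CUDA archs, then removes whatever the 3.x kernels already cover. The Python below is only a hypothetical sketch of that set arithmetic; it is not vLLM's actual cuda_archs_loose_intersection macro, and the example arch lists are assumptions.

```python
# Hypothetical sketch of the arch filtering done in the CMake hunk above.
# Not the real cuda_archs_loose_intersection macro; example arch lists are made up.

def intersect_archs(supported: list[str], detected: list[str]) -> list[str]:
    """Keep detected archs that are also supported, in ascending order."""
    return sorted(set(supported) & set(detected), key=float)

cuda_archs = ["7.5", "8.0", "8.9", "9.0"]   # assumed CUDA_ARCHS detected by cmake/torch
scaled_mm_3x_archs = ["9.0"]                # assumed archs already built as 3.x kernels

scaled_mm_2x_archs = intersect_archs(
    ["7.5", "8.0", "8.6", "8.9", "9.0"],    # supported list after this commit drops 9.0a
    cuda_archs,
)
# Subtract out the archs that are already built for 3x (the REMOVE_ITEM step above).
scaled_mm_2x_archs = [a for a in scaled_mm_2x_archs if a not in scaled_mm_3x_archs]
print(scaled_mm_2x_archs)  # ['7.5', '8.0', '8.9']
```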
10 changes: 1 addition & 9 deletions Dockerfile
@@ -71,15 +71,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-build.txt

# files and directories related to build wheels
COPY csrc csrc
COPY setup.py setup.py
COPY cmake cmake
COPY CMakeLists.txt CMakeLists.txt
COPY README.md README.md
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
COPY pyproject.toml pyproject.toml
COPY vllm vllm
COPY . .

# max jobs used by Ninja to build extensions
ARG max_jobs=2
11 changes: 1 addition & 10 deletions Dockerfile.openvino
@@ -9,16 +9,7 @@ RUN apt-get update -y && \
ffmpeg libsm6 libxext6 libgl1
WORKDIR /workspace

# copy requirements
COPY requirements-build.txt /workspace/vllm/
COPY requirements-common.txt /workspace/vllm/
COPY requirements-openvino.txt /workspace/vllm/

COPY vllm/ /workspace/vllm/vllm
COPY csrc/core /workspace/vllm/csrc/core
COPY cmake/utils.cmake /workspace/vllm/cmake/
COPY CMakeLists.txt /workspace/vllm/
COPY setup.py /workspace/vllm/
COPY . .

# install build requirements
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
1 change: 1 addition & 0 deletions README.md
@@ -127,5 +127,6 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs

* For technical questions and feature requests, please use Github issues or discussions.
* For discussing with fellow users, please use Discord.
* For coordinating contributions and development, please use Slack.
* For security disclosures, please use Github's security advisory feature.
* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
38 changes: 18 additions & 20 deletions benchmarks/benchmark_serving.py
@@ -431,16 +431,15 @@ async def benchmark(

if profile:
print("Starting profiler...")
profile_input = RequestFuncInput(
model=model_id,
prompt=test_prompt,
api_url=base_url + "/start_profile",
prompt_len=test_prompt_len,
output_len=test_output_len,
logprobs=logprobs,
best_of=best_of,
multi_modal_content=test_mm_content,
)
profile_input = RequestFuncInput(model=model_id,
prompt=test_prompt,
api_url=base_url + "/start_profile",
prompt_len=test_prompt_len,
output_len=test_output_len,
logprobs=logprobs,
best_of=best_of,
multi_modal_content=test_mm_content,
ignore_eos=ignore_eos)
profile_output = await request_func(request_func_input=profile_input)
if profile_output.success:
print("Profiler started")
@@ -453,16 +452,15 @@
tasks: List[asyncio.Task] = []
async for request in get_request(input_requests, request_rate):
prompt, prompt_len, output_len, mm_content = request
request_func_input = RequestFuncInput(
model=model_id,
prompt=prompt,
api_url=api_url,
prompt_len=prompt_len,
output_len=output_len,
logprobs=logprobs,
best_of=best_of,
multi_modal_content=mm_content,
)
request_func_input = RequestFuncInput(model=model_id,
prompt=prompt,
api_url=api_url,
prompt_len=prompt_len,
output_len=output_len,
logprobs=logprobs,
best_of=best_of,
multi_modal_content=mm_content,
ignore_eos=ignore_eos)
tasks.append(
asyncio.create_task(
request_func(request_func_input=request_func_input,
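Both hunks above thread a new ignore_eos value into the RequestFuncInput construction (and reflow the call into a hanging-indent style). As a rough, hedged sketch of the shape being populated: the real dataclass lives in the benchmark's backend_request_func module and may differ; the defaults and types below are assumptions inferred only from these call sites.

```python
# Rough sketch of the request payload built above; field names come from the
# call sites in benchmark_serving.py, defaults and types are assumptions.
from dataclasses import dataclass
from typing import Optional

@dataclass
class RequestFuncInput:
    model: str
    prompt: str
    api_url: str
    prompt_len: int
    output_len: int
    logprobs: Optional[int] = None
    best_of: int = 1
    multi_modal_content: Optional[dict] = None
    ignore_eos: bool = False  # the flag newly threaded through in this diff

# Example: a profiling request that should not stop at the EOS token.
profile_input = RequestFuncInput(
    model="my-model",                                   # hypothetical model id
    prompt="Hello",
    api_url="http://localhost:8000/start_profile",      # hypothetical endpoint
    prompt_len=1,
    output_len=16,
    ignore_eos=True,
)
```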
4 changes: 2 additions & 2 deletions benchmarks/kernels/benchmark_rope.py
@@ -31,7 +31,7 @@ def benchmark_rope_kernels_multi_lora(
# batched RoPE can take multiple scaling factors
batched_rope = get_rope(head_size, rotary_dim, max_position, base,
is_neox_style, {
"type": "linear",
"rope_type": "linear",
"factor": tuple(scaling_factors)
})
# non-batched RoPE takes only one scaling factor, we create multiple
Expand All @@ -41,7 +41,7 @@ def benchmark_rope_kernels_multi_lora(
non_batched_ropes.append(
get_rope(head_size, rotary_dim, max_position, base, is_neox_style,
{
"type": "linear",
"rope_type": "linear",
"factor": (scaling_factor, )
}))

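The only functional change in this file is the scaling-config key: "type" becomes "rope_type". A minimal hedged example of the new call shape, mirroring the benchmark's positional get_rope arguments; the concrete sizes and scaling factor are arbitrary.

```python
# Minimal illustration of the renamed rope_scaling key used in the hunks above.
# Sizes and scaling factor are arbitrary example values.
from vllm.model_executor.layers.rotary_embedding import get_rope

head_size, rotary_dim, max_position, base = 128, 128, 8192, 10000
is_neox_style = True

rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style,
                {
                    "rope_type": "linear",   # previously keyed as "type"
                    "factor": (4.0, ),
                })
```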
27 changes: 10 additions & 17 deletions collect_env.py
@@ -267,23 +267,16 @@ def get_neuron_sdk_version(run_lambda):


def get_vllm_version():
version = ""
try:
import vllm
version = vllm.__version__
except Exception:
pass
commit = ""
try:
import vllm
commit = vllm.__commit__
except Exception:
pass
if version != "" and commit != "":
return f"{version}@{commit}"
if version == "" and commit == "":
return "N/A"
return version or commit
from vllm import __version__, __version_tuple__

if __version__ == "dev":
return "N/A (dev)"

if len(__version_tuple__) == 4: # dev build
git_sha = __version_tuple__[-1][1:] # type: ignore
return f"{__version__} (git sha: {git_sha}"

return __version__

def summarize_vllm_build_flags():
# This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc.
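As a quick illustration of the new reporting logic, here is a hedged standalone sketch; the example version strings and tuples are assumptions, not real vLLM releases. The sketch also closes the f-string's parenthesis, which the hunk above appears to leave unbalanced.

```python
# Hedged sketch of the new get_vllm_version() behavior; example values are assumptions.

def format_vllm_version(version: str, version_tuple: tuple) -> str:
    if version == "dev":
        return "N/A (dev)"
    if len(version_tuple) == 4:  # dev build: last element carries the git sha
        git_sha = version_tuple[-1][1:]  # strip the leading "g"
        return f"{version} (git sha: {git_sha})"
    return version

print(format_vllm_version("0.6.3", (0, 6, 3)))
# -> 0.6.3
print(format_vllm_version("0.6.3.dev42+g0d42d55", (0, 6, 3, "g0d42d55")))
# -> 0.6.3.dev42+g0d42d55 (git sha: 0d42d55)
```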