Merge remote-tracking branch 'upstream/main' into kill-the-server
joerunde committed Aug 7, 2024
2 parents 214585f + 469b3bc commit b846a86
Showing 84 changed files with 5,356 additions and 1,045 deletions.
27 changes: 20 additions & 7 deletions .buildkite/release-pipeline.yaml
@@ -1,9 +1,27 @@
steps:
- label: "Build wheel - CUDA {{matrix.cuda_version}}"
- label: "Build wheel - CUDA 12.1"
agents:
queue: cpu_queue
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --tag vllm-ci:build-image --target build --progress plain ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
# rename the files to change linux -> manylinux1
- "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
env:
DOCKER_BUILDKIT: "1"

- block: "Build CUDA 11.8 wheel"
key: block-build-cu118-wheel

- label: "Build wheel - CUDA 11.8"
depends_on: block-build-cu118-wheel
agents:
queue: cpu_queue
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
# rename the files to change linux -> manylinux1
@@ -12,8 +30,3 @@ steps:
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
env:
DOCKER_BUILDKIT: "1"
matrix:
setup:
cuda_version:
- "11.8.0"
- "12.1.0"
4 changes: 3 additions & 1 deletion .buildkite/test-pipeline.yaml
@@ -148,8 +148,9 @@ steps:
- python3 cpu_offload.py
- python3 offline_inference_with_prefix.py
- python3 llm_engine_example.py
- python3 llava_example.py
- python3 offline_inference_vision_language.py
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference_encoder_decoder.py

- label: Models Test # 1hr10min
source_file_dependencies:
@@ -289,6 +290,7 @@ steps:
commands:
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
- TARGET_TEST_SUITE=L4 pytest -v -s distributed/test_basic_distributed_correctness.py
- pytest -v -s distributed/test_basic_distributed_correctness_enc_dec.py
- pytest -v -s distributed/test_chunked_prefill_distributed.py
- pytest -v -s distributed/test_multimodal_broadcast.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
2 changes: 1 addition & 1 deletion Dockerfile.openvino
@@ -21,7 +21,7 @@ COPY setup.py /workspace/vllm/
# install build requirements
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
# build vLLM with OpenVINO backend
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/pre-release" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/

COPY examples/ /workspace/vllm/examples
COPY benchmarks/ /workspace/vllm/benchmarks
2 changes: 2 additions & 0 deletions docs/source/conf.py
@@ -112,6 +112,8 @@ def setup(app):
"tensorizer",
"pynvml",
"outlines",
"gguf",
"lark",
]

for mock_target in autodoc_mock_imports:
2 changes: 1 addition & 1 deletion docs/source/getting_started/openvino-installation.rst
@@ -57,7 +57,7 @@ Install from source

.. code-block:: console
$ PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/pre-release" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
$ PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
.. _openvino_backend_performance_tips:

66 changes: 55 additions & 11 deletions docs/source/models/spec_decode.rst
@@ -14,17 +14,17 @@ Speculative decoding is a technique which improves inter-token latency in memory
Speculating with a draft model
------------------------------

The following code configures vLLM to use speculative decoding with a draft model, speculating 5 tokens at a time.
The following code configures vLLM in offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time.

.. code-block:: python
from vllm import LLM, SamplingParams
prompts = [
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM(
model="facebook/opt-6.7b",
tensor_parallel_size=1,
@@ -33,12 +33,56 @@ The following code configures vLLM to use speculative decoding with a draft mode
use_v2_block_manager=True,
)
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
To perform the same in online mode, launch the server:

.. code-block:: bash
python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model facebook/opt-6.7b \
--seed 42 -tp 1 --speculative_model facebook/opt-125m --use-v2-block-manager \
--num_speculative_tokens 5 --gpu_memory_utilization 0.8
Then use a client:
.. code-block:: python
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
# Completion API
stream = False
completion = client.completions.create(
model=model,
prompt="The future of AI is",
echo=False,
n=1,
stream=stream,
)
print("Completion results:")
if stream:
for c in completion:
print(c)
else:
print(completion)
Speculating by matching n-grams in the prompt
---------------------------------------------

@@ -48,12 +92,12 @@ matching n-grams in the prompt. For more information read `this thread. <https:/
.. code-block:: python
from vllm import LLM, SamplingParams
prompts = [
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM(
model="facebook/opt-6.7b",
tensor_parallel_size=1,
@@ -63,7 +107,7 @@ matching n-grams in the prompt. For more information read `this thread. <https:/
use_v2_block_manager=True,
)
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
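The collapsed hunk above holds the n-gram speculator configuration. A minimal sketch of that style of setup, assuming the parameter names speculative_model="[ngram]", num_speculative_tokens, and ngram_prompt_lookup_max; verify against the full file before relying on them:

from vllm import LLM, SamplingParams

# Sketch only: the exact values live in the collapsed portion of the diff.
llm = LLM(
    model="facebook/opt-6.7b",
    tensor_parallel_size=1,
    speculative_model="[ngram]",   # assumed sentinel selecting n-gram lookup
    num_speculative_tokens=5,      # assumed
    ngram_prompt_lookup_max=4,     # assumed
    use_v2_block_manager=True,
)
outputs = llm.generate(["The future of AI is"],
                       SamplingParams(temperature=0.8, top_p=0.95))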
@@ -74,7 +118,7 @@ Speculating using MLP speculators

The following code configures vLLM to use speculative decoding where proposals are generated by
draft models that condition draft predictions on both context vectors and sampled tokens.
For more information see `this blog <https://pytorch.org/blog/hitchhikers-guide-speculative-decoding/>`_ or
`this technical report <https://arxiv.org/abs/2404.19124>`_.

.. code-block:: python
@@ -100,9 +144,9 @@ For more information see `this blog <https://pytorch.org/blog/hitchhikers-guide-
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
Note that these speculative models currently need to be run without tensor parallelism, although
it is possible to run the main model using tensor parallelism (see example above). Since the
speculative models are relatively small, we still see significant speedups. However, this
limitation will be fixed in a future release.

A variety of speculative models of this type are available on HF hub:
99 changes: 99 additions & 0 deletions examples/offline_inference_encoder_decoder.py
@@ -0,0 +1,99 @@
'''
Demonstrate prompting of text-to-text
encoder/decoder models, specifically BART
'''

from vllm import LLM, SamplingParams
from vllm.inputs import ExplicitEncoderDecoderPrompt, TextPrompt, TokensPrompt
from vllm.utils import zip_enc_dec_prompt_lists

dtype = "float"

# Create a BART encoder/decoder model instance
llm = LLM(
model="facebook/bart-large-cnn",
dtype=dtype,
)

# Get BART tokenizer
tokenizer = llm.llm_engine.get_tokenizer_group()

# Test prompts
#
# This section shows all of the valid ways to prompt an
# encoder/decoder model.
#
# - Helpers for building prompts
text_prompt_raw = "Hello, my name is"
text_prompt = TextPrompt(prompt="The president of the United States is")
tokens_prompt = TokensPrompt(prompt_token_ids=tokenizer.encode(
prompt="The capital of France is"))
# - Pass a single prompt to encoder/decoder model
# (implicitly encoder input prompt);
# decoder input prompt is assumed to be None

single_text_prompt_raw = text_prompt_raw # Pass a string directly
single_text_prompt = text_prompt # Pass a TextPrompt
single_tokens_prompt = tokens_prompt # Pass a TokensPrompt

# - Pass explicit encoder and decoder input prompts within one data structure.
# Encoder and decoder prompts can both independently be text or tokens, with
# no requirement that they be the same prompt type. Some example prompt-type
# combinations are shown below; note that these are not exhaustive.

enc_dec_prompt1 = ExplicitEncoderDecoderPrompt(
# Pass encoder prompt string directly, &
# pass decoder prompt tokens
encoder_prompt=single_text_prompt_raw,
decoder_prompt=single_tokens_prompt,
)
enc_dec_prompt2 = ExplicitEncoderDecoderPrompt(
# Pass TextPrompt to encoder, and
# pass decoder prompt string directly
encoder_prompt=single_text_prompt,
decoder_prompt=single_text_prompt_raw,
)
enc_dec_prompt3 = ExplicitEncoderDecoderPrompt(
# Pass encoder prompt tokens directly, and
# pass TextPrompt to decoder
encoder_prompt=single_tokens_prompt,
decoder_prompt=single_text_prompt,
)

# - Finally, here's a useful helper function for zipping encoder and
# decoder prompt lists together into a list of ExplicitEncoderDecoderPrompt
# instances
zipped_prompt_list = zip_enc_dec_prompt_lists(
['An encoder prompt', 'Another encoder prompt'],
['A decoder prompt', 'Another decoder prompt'])

# - Let's put all of the above example prompts together into one list
# which we will pass to the encoder/decoder LLM.
prompts = [
single_text_prompt_raw, single_text_prompt, single_tokens_prompt,
enc_dec_prompt1, enc_dec_prompt2, enc_dec_prompt3
] + zipped_prompt_list

print(prompts)

# Create a sampling params object.
sampling_params = SamplingParams(
temperature=0,
top_p=1.0,
min_tokens=0,
max_tokens=20,
)

# Generate output tokens from the prompts. The output is a list of
# RequestOutput objects that contain the prompt, generated
# text, and other information.
outputs = llm.generate(prompts, sampling_params)

# Print the outputs.
for output in outputs:
prompt = output.prompt
encoder_prompt = output.encoder_prompt
generated_text = output.outputs[0].text
print(f"Encoder prompt: {encoder_prompt!r}, "
f"Decoder prompt: {prompt!r}, "
f"Generated text: {generated_text!r}")
33 changes: 3 additions & 30 deletions requirements-openvino.txt
@@ -1,34 +1,7 @@
# Common dependencies
# -r requirements-common.txt
# TODO: remove temporary copy of all common dependencies once Optimum Intel will support Transformers >= 4.43.2
cmake >= 3.21
ninja # For faster builds.
psutil
sentencepiece # Required for LLaMA tokenizer.
numpy < 2.0.0
requests
tqdm
py-cpuinfo
transformers < 4.43
tokenizers >= 0.19.1 # Required for Llama 3.
fastapi
aiohttp
openai
uvicorn[standard]
pydantic >= 2.0 # Required for OpenAI server.
pillow # Required for image processing
prometheus_client >= 0.18.0
prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer == 0.10.3
outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0
typing_extensions
filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
pyzmq
gguf == 0.9.1
-r requirements-common.txt

# OpenVINO dependencies
torch >= 2.1.2
openvino ~= 2024.3.0.dev
openvino-tokenizers[transformers] ~= 2024.3.0.0.dev
optimum-intel[openvino] >= 1.18.1
openvino ~= 2024.3.0
optimum-intel[openvino] >= 1.18.2
2 changes: 1 addition & 1 deletion setup.py
@@ -272,7 +272,7 @@ def _build_custom_ops() -> bool:


def _build_core_ext() -> bool:
return not _is_neuron() and not _is_tpu()
return not _is_neuron() and not _is_tpu() and not _is_openvino()


def get_hipcc_rocm_version():
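For context on the setup.py change: _build_core_ext now also returns False for OpenVINO targets, so the core C extension is not compiled on that backend. A sketch of what such a target check typically looks like; the real _is_openvino is defined elsewhere in setup.py and may differ:

import os

def _is_openvino() -> bool:
    # Assumed implementation: vLLM selects its backend via the
    # VLLM_TARGET_DEVICE environment variable.
    return os.getenv("VLLM_TARGET_DEVICE", "cuda") == "openvino"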
9 changes: 5 additions & 4 deletions tests/async_engine/api_server_async_engine.py
@@ -1,5 +1,5 @@
"""vllm.entrypoints.api_server with some extra logging for testing."""
from typing import Any, Dict
from typing import Any, Dict, Iterable

import uvicorn
from fastapi.responses import JSONResponse, Response
@@ -18,9 +18,10 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._num_aborts = 0

async def abort(self, request_id: str) -> None:
await super().abort(request_id)
self._num_aborts += 1
async def _engine_abort(self, request_ids: Iterable[str]):
ids = list(request_ids)
self._num_aborts += len(ids)
await super()._engine_abort(ids)

def testing_stats(self) -> Dict[str, Any]:
return {"num_aborted_requests": self._num_aborts}
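The test helper now overrides _engine_abort, which receives an iterable of request ids rather than a single id, so one call can abort a whole batch and the counter grows by the batch size. A minimal stand-alone sketch of that accounting; the class below is a simplified stand-in, not vLLM's real engine API:

from typing import Iterable, List

class AbortCounter:
    """Stand-in mirroring the counting logic in the diff above."""

    def __init__(self) -> None:
        self._num_aborts = 0

    def engine_abort(self, request_ids: Iterable[str]) -> List[str]:
        # Materialize the iterable once so it can be both counted and
        # forwarded, as the patched _engine_abort does via list(request_ids).
        ids = list(request_ids)
        self._num_aborts += len(ids)
        return ids

counter = AbortCounter()
counter.engine_abort(["req-1", "req-2"])
assert counter._num_aborts == 2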