Merge remote-tracking branch 'upstream/main' into kill-the-server
joerunde committed Aug 7, 2024
2 parents 214585f + 469b3bc commit b846a86
Showing 84 changed files with 5,356 additions and 1,045 deletions.
27 changes: 20 additions & 7 deletions .buildkite/release-pipeline.yaml
@@ -1,9 +1,27 @@
steps:
- label: "Build wheel - CUDA {{matrix.cuda_version}}"
- label: "Build wheel - CUDA 12.1"
agents:
queue: cpu_queue
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --tag vllm-ci:build-image --target build --progress plain ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
# rename the files to change linux -> manylinux1
- "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
env:
DOCKER_BUILDKIT: "1"

- block: "Build CUDA 11.8 wheel"
key: block-build-cu118-wheel

- label: "Build wheel - CUDA 11.8"
depends_on: block-build-cu118-wheel
agents:
queue: cpu_queue
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
# rename the files to change linux -> manylinux1
@@ -12,8 +30,3 @@ steps:
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
env:
DOCKER_BUILDKIT: "1"
matrix:
setup:
cuda_version:
- "11.8.0"
- "12.1.0"
4 changes: 3 additions & 1 deletion .buildkite/test-pipeline.yaml
@@ -148,8 +148,9 @@ steps:
- python3 cpu_offload.py
- python3 offline_inference_with_prefix.py
- python3 llm_engine_example.py
- python3 llava_example.py
- python3 offline_inference_vision_language.py
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference_encoder_decoder.py

- label: Models Test # 1hr10min
source_file_dependencies:
@@ -289,6 +290,7 @@ steps:
commands:
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
- TARGET_TEST_SUITE=L4 pytest -v -s distributed/test_basic_distributed_correctness.py
- pytest -v -s distributed/test_basic_distributed_correctness_enc_dec.py
- pytest -v -s distributed/test_chunked_prefill_distributed.py
- pytest -v -s distributed/test_multimodal_broadcast.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
2 changes: 1 addition & 1 deletion Dockerfile.openvino
@@ -21,7 +21,7 @@ COPY setup.py /workspace/vllm/
# install build requirements
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
# build vLLM with OpenVINO backend
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/pre-release" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/

COPY examples/ /workspace/vllm/examples
COPY benchmarks/ /workspace/vllm/benchmarks
2 changes: 2 additions & 0 deletions docs/source/conf.py
@@ -112,6 +112,8 @@ def setup(app):
"tensorizer",
"pynvml",
"outlines",
"gguf",
"lark",
]

for mock_target in autodoc_mock_imports:
2 changes: 1 addition & 1 deletion docs/source/getting_started/openvino-installation.rst
@@ -57,7 +57,7 @@ Install from source

.. code-block:: console
$ PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/pre-release" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
$ PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
.. _openvino_backend_performance_tips:

66 changes: 55 additions & 11 deletions docs/source/models/spec_decode.rst
@@ -14,17 +14,17 @@ Speculative decoding is a technique which improves inter-token latency in memory
Speculating with a draft model
------------------------------

The following code configures vLLM to use speculative decoding with a draft model, speculating 5 tokens at a time.
The following code configures vLLM in offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time.

.. code-block:: python
from vllm import LLM, SamplingParams
prompts = [
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM(
model="facebook/opt-6.7b",
tensor_parallel_size=1,
@@ -33,12 +33,56 @@ The following code configures vLLM to use speculative decoding with a draft mode
use_v2_block_manager=True,
)
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
To perform the same in online mode, launch the server:

.. code-block:: bash
python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model facebook/opt-6.7b \
--seed 42 -tp 1 --speculative_model facebook/opt-125m --use-v2-block-manager \
--num_speculative_tokens 5 --gpu_memory_utilization 0.8
Then use a client:
.. code-block:: python
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
# Completion API
stream = False
completion = client.completions.create(
model=model,
prompt="The future of AI is",
echo=False,
n=1,
stream=stream,
)
print("Completion results:")
if stream:
for c in completion:
print(c)
else:
print(completion)
Speculating by matching n-grams in the prompt
---------------------------------------------

@@ -48,12 +92,12 @@ matching n-grams in the prompt. For more information read `this thread. <https:/
.. code-block:: python
from vllm import LLM, SamplingParams
prompts = [
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM(
model="facebook/opt-6.7b",
tensor_parallel_size=1,
@@ -63,7 +107,7 @@ matching n-grams in the prompt. For more information read `this thread. <https:/
use_v2_block_manager=True,
)
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
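The collapsed hunk above holds the n-gram speculator configuration. A minimal sketch of that style of setup, assuming the parameter names speculative_model="[ngram]", num_speculative_tokens, and ngram_prompt_lookup_max; verify against the full file before relying on them:

from vllm import LLM, SamplingParams

# Sketch only: the exact values live in the collapsed portion of the diff.
llm = LLM(
    model="facebook/opt-6.7b",
    tensor_parallel_size=1,
    speculative_model="[ngram]",   # assumed sentinel selecting n-gram lookup
    num_speculative_tokens=5,      # assumed
    ngram_prompt_lookup_max=4,     # assumed
    use_v2_block_manager=True,
)
outputs = llm.generate(["The future of AI is"],
                       SamplingParams(temperature=0.8, top_p=0.95))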
@@ -74,7 +118,7 @@ Speculating using MLP speculators

The following code configures vLLM to use speculative decoding where proposals are generated by
draft models that condition draft predictions on both context vectors and sampled tokens.
For more information see `this blog <https://pytorch.org/blog/hitchhikers-guide-speculative-decoding/>`_ or
`this technical report <https://arxiv.org/abs/2404.19124>`_.

.. code-block:: python
@@ -100,9 +144,9 @@ For more information see `this blog <https://pytorch.org/blog/hitchhikers-guide-
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
Note that these speculative models currently need to be run without tensor parallelism, although
it is possible to run the main model using tensor parallelism (see example above). Since the
speculative models are relatively small, we still see significant speedups. However, this
limitation will be fixed in a future release.

A variety of speculative models of this type are available on HF hub:
99 changes: 99 additions & 0 deletions examples/offline_inference_encoder_decoder.py
@@ -0,0 +1,99 @@
'''
Demonstrate prompting of text-to-text
encoder/decoder models, specifically BART
'''

from vllm import LLM, SamplingParams
from vllm.inputs import ExplicitEncoderDecoderPrompt, TextPrompt, TokensPrompt
from vllm.utils import zip_enc_dec_prompt_lists

dtype = "float"

# Create a BART encoder/decoder model instance
llm = LLM(
model="facebook/bart-large-cnn",
dtype=dtype,
)

# Get BART tokenizer
tokenizer = llm.llm_engine.get_tokenizer_group()

# Test prompts
#
# This section shows all of the valid ways to prompt an
# encoder/decoder model.
#
# - Helpers for building prompts
text_prompt_raw = "Hello, my name is"
text_prompt = TextPrompt(prompt="The president of the United States is")
tokens_prompt = TokensPrompt(prompt_token_ids=tokenizer.encode(
prompt="The capital of France is"))
# - Pass a single prompt to encoder/decoder model
# (implicitly encoder input prompt);
# decoder input prompt is assumed to be None

single_text_prompt_raw = text_prompt_raw # Pass a string directly
single_text_prompt = text_prompt # Pass a TextPrompt
single_tokens_prompt = tokens_prompt # Pass a TokensPrompt

# - Pass explicit encoder and decoder input prompts within one data structure.
# Encoder and decoder prompts can both independently be text or tokens, with
# no requirement that they be the same prompt type. Some example prompt-type
# combinations are shown below; note that these are not exhaustive.

enc_dec_prompt1 = ExplicitEncoderDecoderPrompt(
# Pass encoder prompt string directly, &
# pass decoder prompt tokens
encoder_prompt=single_text_prompt_raw,
decoder_prompt=single_tokens_prompt,
)
enc_dec_prompt2 = ExplicitEncoderDecoderPrompt(
# Pass TextPrompt to encoder, and
# pass decoder prompt string directly
encoder_prompt=single_text_prompt,
decoder_prompt=single_text_prompt_raw,
)
enc_dec_prompt3 = ExplicitEncoderDecoderPrompt(
# Pass encoder prompt tokens directly, and
# pass TextPrompt to decoder
encoder_prompt=single_tokens_prompt,
decoder_prompt=single_text_prompt,
)

# - Finally, here's a useful helper function for zipping encoder and
# decoder prompt lists together into a list of ExplicitEncoderDecoderPrompt
# instances
zipped_prompt_list = zip_enc_dec_prompt_lists(
['An encoder prompt', 'Another encoder prompt'],
['A decoder prompt', 'Another decoder prompt'])

# - Let's put all of the above example prompts together into one list
# which we will pass to the encoder/decoder LLM.
prompts = [
single_text_prompt_raw, single_text_prompt, single_tokens_prompt,
enc_dec_prompt1, enc_dec_prompt2, enc_dec_prompt3
] + zipped_prompt_list

print(prompts)

# Create a sampling params object.
sampling_params = SamplingParams(
temperature=0,
top_p=1.0,
min_tokens=0,
max_tokens=20,
)

# Generate output tokens from the prompts. The output is a list of
# RequestOutput objects that contain the prompt, generated
# text, and other information.
outputs = llm.generate(prompts, sampling_params)

# Print the outputs.
for output in outputs:
prompt = output.prompt
encoder_prompt = output.encoder_prompt
generated_text = output.outputs[0].text
print(f"Encoder prompt: {encoder_prompt!r}, "
f"Decoder prompt: {prompt!r}, "
f"Generated text: {generated_text!r}")
33 changes: 3 additions & 30 deletions requirements-openvino.txt
@@ -1,34 +1,7 @@
# Common dependencies
# -r requirements-common.txt
# TODO: remove temporary copy of all common dependencies once Optimum Intel will support Transformers >= 4.43.2
cmake >= 3.21
ninja # For faster builds.
psutil
sentencepiece # Required for LLaMA tokenizer.
numpy < 2.0.0
requests
tqdm
py-cpuinfo
transformers < 4.43
tokenizers >= 0.19.1 # Required for Llama 3.
fastapi
aiohttp
openai
uvicorn[standard]
pydantic >= 2.0 # Required for OpenAI server.
pillow # Required for image processing
prometheus_client >= 0.18.0
prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer == 0.10.3
outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0
typing_extensions
filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
pyzmq
gguf == 0.9.1
-r requirements-common.txt

# OpenVINO dependencies
torch >= 2.1.2
openvino ~= 2024.3.0.dev
openvino-tokenizers[transformers] ~= 2024.3.0.0.dev
optimum-intel[openvino] >= 1.18.1
openvino ~= 2024.3.0
optimum-intel[openvino] >= 1.18.2
2 changes: 1 addition & 1 deletion setup.py
@@ -272,7 +272,7 @@ def _build_custom_ops() -> bool:


def _build_core_ext() -> bool:
return not _is_neuron() and not _is_tpu()
return not _is_neuron() and not _is_tpu() and not _is_openvino()


def get_hipcc_rocm_version():
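For context on the setup.py change: _build_core_ext now also returns False for OpenVINO targets, so the core C extension is not compiled on that backend. A sketch of what such a target check typically looks like; the real _is_openvino is defined elsewhere in setup.py and may differ:

import os

def _is_openvino() -> bool:
    # Assumed implementation: vLLM selects its backend via the
    # VLLM_TARGET_DEVICE environment variable.
    return os.getenv("VLLM_TARGET_DEVICE", "cuda") == "openvino"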
9 changes: 5 additions & 4 deletions tests/async_engine/api_server_async_engine.py
@@ -1,5 +1,5 @@
"""vllm.entrypoints.api_server with some extra logging for testing."""
from typing import Any, Dict
from typing import Any, Dict, Iterable

import uvicorn
from fastapi.responses import JSONResponse, Response
@@ -18,9 +18,10 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._num_aborts = 0

async def abort(self, request_id: str) -> None:
await super().abort(request_id)
self._num_aborts += 1
async def _engine_abort(self, request_ids: Iterable[str]):
ids = list(request_ids)
self._num_aborts += len(ids)
await super()._engine_abort(ids)

def testing_stats(self) -> Dict[str, Any]:
return {"num_aborted_requests": self._num_aborts}
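The test helper now overrides _engine_abort, which receives an iterable of request ids rather than a single id, so one call can abort a whole batch and the counter grows by the batch size. A minimal stand-alone sketch of that accounting; the class below is a simplified stand-in, not vLLM's real engine API:

from typing import Iterable, List

class AbortCounter:
    """Stand-in mirroring the counting logic in the diff above."""

    def __init__(self) -> None:
        self._num_aborts = 0

    def engine_abort(self, request_ids: Iterable[str]) -> List[str]:
        # Materialize the iterable once so it can be both counted and
        # forwarded, as the patched _engine_abort does via list(request_ids).
        ids = list(request_ids)
        self._num_aborts += len(ids)
        return ids

counter = AbortCounter()
counter.engine_abort(["req-1", "req-2"])
assert counter._num_aborts == 2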