Commit bef565c
Merge branch 'vllm-project:main' into add_chart_helm_example
mfournioux authored Dec 3, 2024
2 parents bcbee96 + f6084f6
Showing 48 changed files with 937 additions and 504 deletions.
6 changes: 4 additions & 2 deletions .buildkite/run-xpu-test.sh
@@ -12,5 +12,7 @@ remove_docker_container() { docker rm -f xpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image and launch offline inference
docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test python3 examples/offline_inference.py
# Run the image and test offline inference/tensor parallel
docker run -it -d --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test /bin/bash
docker exec xpu-test bash -c "python3 examples/offline_inference.py"
docker exec xpu-test bash -c "python3 examples/offline_inference_cli.py -tp 2"
1 change: 0 additions & 1 deletion .buildkite/test-pipeline.yaml
@@ -481,7 +481,6 @@ steps:

- label: LoRA TP Test (Distributed)
num_gpus: 4
soft_fail: true
source_file_dependencies:
- vllm/lora
- tests/lora
47 changes: 30 additions & 17 deletions benchmarks/benchmark_throughput.py
@@ -294,23 +294,36 @@ def main(args: argparse.Namespace):
tokenizer = AutoTokenizer.from_pretrained(
args.tokenizer, trust_remote_code=args.trust_remote_code)
if args.dataset is None:
# Synthesize a prompt with the given input length.
# As tokenizer may add additional tokens like BOS, we need to try
# different lengths to get the desired input length.
for i in range(-10, 10):
prompt = "hi " * (args.input_len + i)
tokenized_prompt = tokenizer(prompt).input_ids
if len(tokenized_prompt) == args.input_len:
break
else:
raise ValueError(
f"Failed to synthesize a prompt with {args.input_len} tokens.")
requests = [
SampleRequest(prompt=prompt,
prompt_len=args.input_len,
expected_output_len=args.output_len)
for _ in range(args.num_prompts)
]
vocab_size = tokenizer.vocab_size
requests = []
for _ in range(args.num_prompts):
# Synthesize a prompt with the given input length.
candidate_ids = [
random.randint(0, vocab_size - 1)
for _ in range(args.input_len)
]
# As tokenizer may add additional tokens like BOS, we need to try
# different lengths to get the desired input length.
for _ in range(5): # Max attempts to correct
candidate_prompt = tokenizer.decode(candidate_ids)
tokenized_len = len(tokenizer.encode(candidate_prompt))

if tokenized_len == args.input_len:
break

# Adjust length based on difference
diff = args.input_len - tokenized_len
if diff > 0:
candidate_ids.extend([
random.randint(100, vocab_size - 100)
for _ in range(diff)
])
else:
candidate_ids = candidate_ids[:diff]
requests.append(
SampleRequest(prompt=candidate_prompt,
prompt_len=args.input_len,
expected_output_len=args.output_len))
else:
requests = sample_requests(tokenizer, args)

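The new synthesis loop above no longer repeats "hi " but draws random token ids, then decodes and re-encodes until the round-tripped prompt hits the requested length. A standalone sketch of the same idea (not part of this diff), assuming a Hugging Face tokenizer; the "gpt2" name and the target length are placeholders:

import random

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder model name
target_len = 128

# Start from random token ids, then decode/re-encode and adjust, because the
# tokenizer may add or merge tokens (e.g. BOS) and change the length.
candidate_ids = [
    random.randint(0, tokenizer.vocab_size - 1) for _ in range(target_len)
]
for _ in range(5):  # bounded number of correction attempts
    prompt = tokenizer.decode(candidate_ids)
    tokenized_len = len(tokenizer.encode(prompt))
    if tokenized_len == target_len:
        break
    diff = target_len - tokenized_len
    if diff > 0:  # too short: append more random ids
        candidate_ids.extend(
            random.randint(100, tokenizer.vocab_size - 100)
            for _ in range(diff))
    else:  # too long: truncate
        candidate_ids = candidate_ids[:diff]
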
3 changes: 2 additions & 1 deletion docs/requirements-docs.txt
@@ -16,4 +16,5 @@ mistral_common >= 1.5.0
aiohttp
starlette
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
requests
67 changes: 67 additions & 0 deletions docs/source/conf.py
@@ -10,11 +10,13 @@
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.

import inspect
import logging
import os
import sys
from typing import List

import requests
from sphinx.ext import autodoc

logger = logging.getLogger(__name__)
@@ -34,6 +36,7 @@
extensions = [
"sphinx.ext.napoleon",
"sphinx.ext.viewcode",
"sphinx.ext.linkcode",
"sphinx.ext.intersphinx",
"sphinx_copybutton",
"sphinx.ext.autodoc",
@@ -94,6 +97,69 @@ def setup(app):
generate_examples()


_cached_base: str = ""
_cached_branch: str = ""


def get_repo_base_and_branch(pr_number):
global _cached_base, _cached_branch
if _cached_base and _cached_branch:
return _cached_base, _cached_branch

url = f"https://api.github.com/repos/vllm-project/vllm/pulls/{pr_number}"
response = requests.get(url)
if response.status_code == 200:
data = response.json()
_cached_base = data['head']['repo']['full_name']
_cached_branch = data['head']['ref']
return _cached_base, _cached_branch
else:
logger.error("Failed to fetch PR details: %s", response)
return None, None


def linkcode_resolve(domain, info):
if domain != 'py':
return None
if not info['module']:
return None
filename = info['module'].replace('.', '/')
module = info['module']

# try to determine the correct file and line number to link to
obj = sys.modules[module]

# get as specific as we can
lineno: int = 0
filename: str = ""
try:
for part in info['fullname'].split('.'):
obj = getattr(obj, part)

if not (inspect.isclass(obj) or inspect.isfunction(obj)
or inspect.ismethod(obj)):
obj = obj.__class__ # Get the class of the instance

lineno = inspect.getsourcelines(obj)[1]
filename = (inspect.getsourcefile(obj)
or f"{filename}.py").split("vllm/", 1)[1]
except Exception:
# For some things, like a class member, won't work, so
# we'll use the line number of the parent (the class)
pass

if filename.startswith("checkouts/"):
# a PR build on readthedocs
pr_number = filename.split("/")[1]
filename = filename.split("/", 2)[2]
base, branch = get_repo_base_and_branch(pr_number)
if base and branch:
return f"https://github.com/{base}/blob/{branch}/{filename}#L{lineno}"

# Otherwise, link to the source file on the main branch
return f"https://github.com/vllm-project/vllm/blob/main/{filename}#L{lineno}"


# Mock out external dependencies here, otherwise the autodoc pages may be blank.
autodoc_mock_imports = [
"compressed_tensors",
@@ -112,6 +178,7 @@ def setup(app):
"tensorizer",
"pynvml",
"outlines",
"xgrammar,"
"librosa",
"soundfile",
"gguf",
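The linkcode_resolve hook added above follows the sphinx.ext.linkcode contract: Sphinx passes a domain string and an info dict with "module" and "fullname", and expects a URL (or None) back. An illustrative invocation (not part of this diff; the module and object names are placeholders):

import importlib

info = {"module": "vllm.engine.llm_engine", "fullname": "LLMEngine.add_request"}
importlib.import_module(info["module"])  # the hook looks the module up in sys.modules
url = linkcode_resolve("py", info)
# Regular builds produce a link of the form
#   https://github.com/vllm-project/vllm/blob/main/<path>#L<lineno>
# while Read the Docs PR builds (source paths under "checkouts/<pr_number>/")
# resolve to the PR's head repository and branch via get_repo_base_and_branch.
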
1 change: 1 addition & 0 deletions requirements-common.txt
@@ -19,6 +19,7 @@ prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer >= 0.10.9, < 0.11
outlines >= 0.0.43, < 0.1
xgrammar
typing_extensions >= 4.10
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
partial-json-parser # used for parsing partial JSON outputs
9 changes: 4 additions & 5 deletions tests/compile/piecewise/test_simple.py
@@ -7,7 +7,6 @@
from torch import nn
from torch.library import Library

from vllm.compilation.compile_context import set_compile_context
from vllm.compilation.counter import compilation_counter
from vllm.compilation.decorators import support_torch_compile
from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig,
@@ -81,6 +80,7 @@ def test_simple_piecewise_compile():
use_cudagraph=True,
splitting_ops=["silly.attention"],
cudagraph_copy_inputs=True,
cudagraph_capture_sizes=[1, 2],
))
with set_current_vllm_config(vllm_config):
model = SillyModel(vllm_config=vllm_config, prefix='')
@@ -96,11 +96,10 @@ def test_simple_piecewise_compile():
6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
):

with set_compile_context([1, 2]):
model(inputs)
model(inputs)

model(torch.randn(2).cuda())
model(torch.randn(1).cuda())
model(torch.randn(2).cuda())
model(torch.randn(1).cuda())

input = torch.zeros(2).cuda()
global global_counter
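Together with the removal of the set_compile_context import, these test edits show the new way to declare CUDA graph capture sizes. A condensed before/after sketch (not part of this diff), using only the config fields shown above:

from vllm.config import CompilationConfig, CompilationLevel

# Before (removed): sizes were supplied at call time.
#     with set_compile_context([1, 2]):
#         model(inputs)
#
# After: sizes are declared on the compilation config, and the model is
# simply called; the capture sizes are picked up from the config.
compilation_config = CompilationConfig(
    level=CompilationLevel.PIECEWISE,
    use_cudagraph=True,
    cudagraph_capture_sizes=[1, 2],
)
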
33 changes: 17 additions & 16 deletions tests/compile/piecewise/test_toy_llama.py
@@ -13,7 +13,6 @@
from torch import nn
from torch.library import Library

from vllm.compilation.compile_context import set_compile_context
from vllm.compilation.counter import compilation_counter
from vllm.compilation.decorators import support_torch_compile
from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig,
@@ -256,6 +255,7 @@ def run_model(llama_config,
compilation_config = CompilationConfig(
level=CompilationLevel.PIECEWISE,
use_cudagraph=True,
cudagraph_capture_sizes=[1, 2],
)
if split_attn:
compilation_config.splitting_ops = ["silly.attention"]
@@ -273,10 +273,9 @@ def run_model(llama_config,
input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda()
positions = torch.arange(B).cuda()

with set_compile_context([1, 2]):
model(input_ids, positions)
model(input_ids[:2], positions[:2])
model(input_ids[:1], positions[:1])
model(input_ids, positions)
model(input_ids[:2], positions[:2])
model(input_ids[:1], positions[:1])

input_ids[:2].zero_()
output = model(input_ids[:2], positions[:2])
@@ -379,10 +378,13 @@ def benchmark():
level=CompilationLevel.PIECEWISE,
use_cudagraph=True,
splitting_ops=["silly.attention"],
cudagraph_capture_sizes=cudagraph_sizes,
)
else:
compilation_config = CompilationConfig(
level=CompilationLevel.PIECEWISE, )
level=CompilationLevel.PIECEWISE,
cudagraph_capture_sizes=cudagraph_sizes,
)

vllm_config = VllmConfig(compilation_config=compilation_config)
with set_current_vllm_config(vllm_config):
@@ -396,17 +398,16 @@ def benchmark():

graphs = {}

with set_compile_context(cudagraph_sizes):
model(input_ids, positions)
for b in cudagraph_sizes[::-1]:
if not piecewise:
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph, pool=pool):
output = model(input_ids[:b], positions[:b])
graphs[b] = (graph, output)
else:
model(input_ids, positions)
for b in cudagraph_sizes[::-1]:
if not piecewise:
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph, pool=pool):
output = model(input_ids[:b], positions[:b])
graphs[b] = (model, output)
graphs[b] = (graph, output)
else:
output = model(input_ids[:b], positions[:b])
graphs[b] = (model, output)
for b in cudagraph_sizes:
if piecewise:
# noqa is for `Function definition does not bind loop variable`
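The non-piecewise branch of benchmark() above captures and replays CUDA graphs by hand. A minimal standalone sketch of that PyTorch pattern (not part of this diff; plain PyTorch, requires a CUDA GPU):

import torch

def step(x: torch.Tensor) -> torch.Tensor:
    return (x * 2.0 + 1.0).relu()

x = torch.zeros(8, device="cuda")
step(x)  # warm-up run before capture
torch.cuda.synchronize()

graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
    out = step(x)  # captured; x and out become the graph's static buffers

# Replay with new data: write into the captured input buffer, then replay.
x.copy_(torch.arange(8, device="cuda", dtype=torch.float32))
graph.replay()
print(out)  # now holds the result for the new input
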