Commit bef565c
Merge branch 'vllm-project:main' into add_chart_helm_example
mfournioux authored Dec 3, 2024
2 parents bcbee96 + f6084f6
Showing 48 changed files with 937 additions and 504 deletions.
6 changes: 4 additions & 2 deletions .buildkite/run-xpu-test.sh
@@ -12,5 +12,7 @@ remove_docker_container() { docker rm -f xpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image and launch offline inference
docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test python3 examples/offline_inference.py
# Run the image and test offline inference/tensor parallel
docker run -it -d --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test /bin/bash
docker exec xpu-test bash -c "python3 examples/offline_inference.py"
docker exec xpu-test bash -c "python3 examples/offline_inference_cli.py -tp 2"
1 change: 0 additions & 1 deletion .buildkite/test-pipeline.yaml
@@ -481,7 +481,6 @@ steps:

- label: LoRA TP Test (Distributed)
num_gpus: 4
soft_fail: true
source_file_dependencies:
- vllm/lora
- tests/lora
47 changes: 30 additions & 17 deletions benchmarks/benchmark_throughput.py
@@ -294,23 +294,36 @@ def main(args: argparse.Namespace):
tokenizer = AutoTokenizer.from_pretrained(
args.tokenizer, trust_remote_code=args.trust_remote_code)
if args.dataset is None:
# Synthesize a prompt with the given input length.
# As tokenizer may add additional tokens like BOS, we need to try
# different lengths to get the desired input length.
for i in range(-10, 10):
prompt = "hi " * (args.input_len + i)
tokenized_prompt = tokenizer(prompt).input_ids
if len(tokenized_prompt) == args.input_len:
break
else:
raise ValueError(
f"Failed to synthesize a prompt with {args.input_len} tokens.")
requests = [
SampleRequest(prompt=prompt,
prompt_len=args.input_len,
expected_output_len=args.output_len)
for _ in range(args.num_prompts)
]
vocab_size = tokenizer.vocab_size
requests = []
for _ in range(args.num_prompts):
# Synthesize a prompt with the given input length.
candidate_ids = [
random.randint(0, vocab_size - 1)
for _ in range(args.input_len)
]
# As tokenizer may add additional tokens like BOS, we need to try
# different lengths to get the desired input length.
for _ in range(5): # Max attempts to correct
candidate_prompt = tokenizer.decode(candidate_ids)
tokenized_len = len(tokenizer.encode(candidate_prompt))

if tokenized_len == args.input_len:
break

# Adjust length based on difference
diff = args.input_len - tokenized_len
if diff > 0:
candidate_ids.extend([
random.randint(100, vocab_size - 100)
for _ in range(diff)
])
else:
candidate_ids = candidate_ids[:diff]
requests.append(
SampleRequest(prompt=candidate_prompt,
prompt_len=args.input_len,
expected_output_len=args.output_len))
else:
requests = sample_requests(tokenizer, args)

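The new synthesis loop above no longer repeats "hi " but draws random token ids, then decodes and re-encodes until the round-tripped prompt hits the requested length. A standalone sketch of the same idea (not part of this diff), assuming a Hugging Face tokenizer; the "gpt2" name and the target length are placeholders:

import random

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder model name
target_len = 128

# Start from random token ids, then decode/re-encode and adjust, because the
# tokenizer may add or merge tokens (e.g. BOS) and change the length.
candidate_ids = [
    random.randint(0, tokenizer.vocab_size - 1) for _ in range(target_len)
]
for _ in range(5):  # bounded number of correction attempts
    prompt = tokenizer.decode(candidate_ids)
    tokenized_len = len(tokenizer.encode(prompt))
    if tokenized_len == target_len:
        break
    diff = target_len - tokenized_len
    if diff > 0:  # too short: append more random ids
        candidate_ids.extend(
            random.randint(100, tokenizer.vocab_size - 100)
            for _ in range(diff))
    else:  # too long: truncate
        candidate_ids = candidate_ids[:diff]
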
3 changes: 2 additions & 1 deletion docs/requirements-docs.txt
@@ -16,4 +16,5 @@ mistral_common >= 1.5.0
aiohttp
starlette
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
requests
67 changes: 67 additions & 0 deletions docs/source/conf.py
@@ -10,11 +10,13 @@
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.

import inspect
import logging
import os
import sys
from typing import List

import requests
from sphinx.ext import autodoc

logger = logging.getLogger(__name__)
@@ -34,6 +36,7 @@
extensions = [
"sphinx.ext.napoleon",
"sphinx.ext.viewcode",
"sphinx.ext.linkcode",
"sphinx.ext.intersphinx",
"sphinx_copybutton",
"sphinx.ext.autodoc",
@@ -94,6 +97,69 @@ def setup(app):
generate_examples()


_cached_base: str = ""
_cached_branch: str = ""


def get_repo_base_and_branch(pr_number):
global _cached_base, _cached_branch
if _cached_base and _cached_branch:
return _cached_base, _cached_branch

url = f"https://api.github.com/repos/vllm-project/vllm/pulls/{pr_number}"
response = requests.get(url)
if response.status_code == 200:
data = response.json()
_cached_base = data['head']['repo']['full_name']
_cached_branch = data['head']['ref']
return _cached_base, _cached_branch
else:
logger.error("Failed to fetch PR details: %s", response)
return None, None


def linkcode_resolve(domain, info):
if domain != 'py':
return None
if not info['module']:
return None
filename = info['module'].replace('.', '/')
module = info['module']

# try to determine the correct file and line number to link to
obj = sys.modules[module]

# get as specific as we can
lineno: int = 0
filename: str = ""
try:
for part in info['fullname'].split('.'):
obj = getattr(obj, part)

if not (inspect.isclass(obj) or inspect.isfunction(obj)
or inspect.ismethod(obj)):
obj = obj.__class__ # Get the class of the instance

lineno = inspect.getsourcelines(obj)[1]
filename = (inspect.getsourcefile(obj)
or f"{filename}.py").split("vllm/", 1)[1]
except Exception:
# For some things, like a class member, won't work, so
# we'll use the line number of the parent (the class)
pass

if filename.startswith("checkouts/"):
# a PR build on readthedocs
pr_number = filename.split("/")[1]
filename = filename.split("/", 2)[2]
base, branch = get_repo_base_and_branch(pr_number)
if base and branch:
return f"https://github.com/{base}/blob/{branch}/{filename}#L{lineno}"

# Otherwise, link to the source file on the main branch
return f"https://github.com/vllm-project/vllm/blob/main/{filename}#L{lineno}"


# Mock out external dependencies here, otherwise the autodoc pages may be blank.
autodoc_mock_imports = [
"compressed_tensors",
@@ -112,6 +178,7 @@ def setup(app):
"tensorizer",
"pynvml",
"outlines",
"xgrammar,"
"librosa",
"soundfile",
"gguf",
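The linkcode_resolve hook added above follows the sphinx.ext.linkcode contract: Sphinx passes a domain string and an info dict with "module" and "fullname", and expects a URL (or None) back. An illustrative invocation (not part of this diff; the module and object names are placeholders):

import importlib

info = {"module": "vllm.engine.llm_engine", "fullname": "LLMEngine.add_request"}
importlib.import_module(info["module"])  # the hook looks the module up in sys.modules
url = linkcode_resolve("py", info)
# Regular builds produce a link of the form
#   https://github.com/vllm-project/vllm/blob/main/<path>#L<lineno>
# while Read the Docs PR builds (source paths under "checkouts/<pr_number>/")
# resolve to the PR's head repository and branch via get_repo_base_and_branch.
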
1 change: 1 addition & 0 deletions requirements-common.txt
@@ -19,6 +19,7 @@ prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer >= 0.10.9, < 0.11
outlines >= 0.0.43, < 0.1
xgrammar
typing_extensions >= 4.10
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
partial-json-parser # used for parsing partial JSON outputs
9 changes: 4 additions & 5 deletions tests/compile/piecewise/test_simple.py
@@ -7,7 +7,6 @@
from torch import nn
from torch.library import Library

from vllm.compilation.compile_context import set_compile_context
from vllm.compilation.counter import compilation_counter
from vllm.compilation.decorators import support_torch_compile
from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig,
@@ -81,6 +80,7 @@ def test_simple_piecewise_compile():
use_cudagraph=True,
splitting_ops=["silly.attention"],
cudagraph_copy_inputs=True,
cudagraph_capture_sizes=[1, 2],
))
with set_current_vllm_config(vllm_config):
model = SillyModel(vllm_config=vllm_config, prefix='')
@@ -96,11 +96,10 @@ def test_simple_piecewise_compile():
6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
):

with set_compile_context([1, 2]):
model(inputs)
model(inputs)

model(torch.randn(2).cuda())
model(torch.randn(1).cuda())
model(torch.randn(2).cuda())
model(torch.randn(1).cuda())

input = torch.zeros(2).cuda()
global global_counter
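Together with the removal of the set_compile_context import, these test edits show the new way to declare CUDA graph capture sizes. A condensed before/after sketch (not part of this diff), using only the config fields shown above:

from vllm.config import CompilationConfig, CompilationLevel

# Before (removed): sizes were supplied at call time.
#     with set_compile_context([1, 2]):
#         model(inputs)
#
# After: sizes are declared on the compilation config, and the model is
# simply called; the capture sizes are picked up from the config.
compilation_config = CompilationConfig(
    level=CompilationLevel.PIECEWISE,
    use_cudagraph=True,
    cudagraph_capture_sizes=[1, 2],
)
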
33 changes: 17 additions & 16 deletions tests/compile/piecewise/test_toy_llama.py
@@ -13,7 +13,6 @@
from torch import nn
from torch.library import Library

from vllm.compilation.compile_context import set_compile_context
from vllm.compilation.counter import compilation_counter
from vllm.compilation.decorators import support_torch_compile
from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig,
@@ -256,6 +255,7 @@ def run_model(llama_config,
compilation_config = CompilationConfig(
level=CompilationLevel.PIECEWISE,
use_cudagraph=True,
cudagraph_capture_sizes=[1, 2],
)
if split_attn:
compilation_config.splitting_ops = ["silly.attention"]
@@ -273,10 +273,9 @@ def run_model(llama_config,
input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda()
positions = torch.arange(B).cuda()

with set_compile_context([1, 2]):
model(input_ids, positions)
model(input_ids[:2], positions[:2])
model(input_ids[:1], positions[:1])
model(input_ids, positions)
model(input_ids[:2], positions[:2])
model(input_ids[:1], positions[:1])

input_ids[:2].zero_()
output = model(input_ids[:2], positions[:2])
@@ -379,10 +378,13 @@ def benchmark():
level=CompilationLevel.PIECEWISE,
use_cudagraph=True,
splitting_ops=["silly.attention"],
cudagraph_capture_sizes=cudagraph_sizes,
)
else:
compilation_config = CompilationConfig(
level=CompilationLevel.PIECEWISE, )
level=CompilationLevel.PIECEWISE,
cudagraph_capture_sizes=cudagraph_sizes,
)

vllm_config = VllmConfig(compilation_config=compilation_config)
with set_current_vllm_config(vllm_config):
@@ -396,17 +398,16 @@ def benchmark():

graphs = {}

with set_compile_context(cudagraph_sizes):
model(input_ids, positions)
for b in cudagraph_sizes[::-1]:
if not piecewise:
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph, pool=pool):
output = model(input_ids[:b], positions[:b])
graphs[b] = (graph, output)
else:
model(input_ids, positions)
for b in cudagraph_sizes[::-1]:
if not piecewise:
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph, pool=pool):
output = model(input_ids[:b], positions[:b])
graphs[b] = (model, output)
graphs[b] = (graph, output)
else:
output = model(input_ids[:b], positions[:b])
graphs[b] = (model, output)
for b in cudagraph_sizes:
if piecewise:
# noqa is for `Function definition does not bind loop variable`
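The non-piecewise branch of benchmark() above captures and replays CUDA graphs by hand. A minimal standalone sketch of that PyTorch pattern (not part of this diff; plain PyTorch, requires a CUDA GPU):

import torch

def step(x: torch.Tensor) -> torch.Tensor:
    return (x * 2.0 + 1.0).relu()

x = torch.zeros(8, device="cuda")
step(x)  # warm-up run before capture
torch.cuda.synchronize()

graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
    out = step(x)  # captured; x and out become the graph's static buffers

# Replay with new data: write into the captured input buffer, then replay.
x.copy_(torch.arange(8, device="cuda", dtype=torch.float32))
graph.replay()
print(out)  # now holds the result for the new input
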