Merge branch 'main' into custom_executor
Yard1 authored Jul 19, 2024
2 parents b2032fd + dbe5588 commit 7bdf67c
Showing 36 changed files with 969 additions and 358 deletions.
@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.758
- name: "exact_match,flexible-extract"
value: 0.759
limit: 1000
num_fewshot: 5
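The config above records baseline GSM8K scores for the nonuniform compressed-tensors Llama-3 checkpoint; the command in the comment reproduces the baseline, and CI compares a fresh run against the recorded values. Below is a minimal sketch of how such a check could look, assuming the config is read with PyYAML and the harness returns per-task measured metrics as a dict; the actual .buildkite runner script is not part of this diff, so the function and tolerance are illustrative only.

import yaml

RTOL = 0.02  # hypothetical tolerance; the real harness may use a different one

def check_against_baseline(config_path, measured):
    # measured: {task_name: {metric_name: freshly measured value}}
    with open(config_path) as f:
        cfg = yaml.safe_load(f)
    for task in cfg["tasks"]:
        for metric in task["metrics"]:
            expected = metric["value"]
            got = measured[task["name"]][metric["name"]]
            assert abs(got - expected) <= RTOL * expected, (
                f"{task['name']}/{metric['name']}: got {got}, expected ~{expected}")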
1 change: 1 addition & 0 deletions .buildkite/lm-eval-harness/configs/models-small.txt
@@ -2,4 +2,5 @@ Meta-Llama-3-8B-Instruct.yaml
Meta-Llama-3-8B-Instruct-FP8.yaml
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
11 changes: 6 additions & 5 deletions .buildkite/test-pipeline.yaml
@@ -21,7 +21,7 @@ steps:
fast_check: true
fast_check_only: true
commands:
- apt-get install curl libsodium23 && pytest -v -s tensorizer_loader # Tensorizer
- apt-get install -y curl libsodium23 && pytest -v -s tensorizer_loader # Tensorizer
- pytest -v -s metrics # Metrics
- "pip install \
opentelemetry-sdk \
@@ -46,14 +46,15 @@ steps:
commands:
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
- pytest -v -s basic_correctness/test_basic_correctness.py
- pytest -v -s basic_correctness/test_cpu_offload.py
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py

- label: Core Test
mirror_hardwares: [amd]
fast_check: true
commands:
commands:
- pytest -v -s core
- pytest -v -s distributed/test_parallel_state.py

@@ -72,7 +73,7 @@ steps:
commands:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
- pytest -v -s distributed/test_pipeline_parallel.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py

@@ -122,7 +123,7 @@ steps:

- label: Engine Test
mirror_hardwares: [amd]
commands:
commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py
# OOM in the CI unless we run this separately
- pytest -v -s tokenization
@@ -220,7 +221,7 @@ steps:
- label: Tensorizer Test
#mirror_hardwares: [amd]
commands:
- apt-get install curl libsodium23
- apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s tensorizer_loader

36 changes: 27 additions & 9 deletions Dockerfile
@@ -8,10 +8,10 @@
ARG CUDA_VERSION=12.4.1
#################### BASE BUILD IMAGE ####################
# prepare basic build environment
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS base
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base

ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3
ARG PYTHON_VERSION=3.10

ENV DEBIAN_FRONTEND=noninteractive

@@ -21,13 +21,16 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& apt-get install -y ccache software-properties-common \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv python3-pip \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
&& if [ "${PYTHON_VERSION}" != "3" ]; then update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1; fi \
&& python3 --version \
&& python3 -m pip --version
&& python3 --version

RUN apt-get update -y \
&& apt-get install -y python3-pip git curl sudo
&& apt-get install -y git curl sudo

# Install pip s.t. it will be compatible with our PYTHON_VERSION
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION}
RUN python3 -m pip --version

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
@@ -58,7 +61,7 @@ ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
#################### WHEEL BUILD IMAGE ####################
FROM base AS build

ARG PYTHON_VERSION=3
ARG PYTHON_VERSION=3.10

# install build dependencies
COPY requirements-build.txt requirements-build.txt
@@ -149,12 +152,27 @@ RUN pip --verbose wheel -r requirements-mamba.txt \

#################### vLLM installation IMAGE ####################
# image with vLLM installed
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS vllm-base
ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.10
WORKDIR /vllm-workspace

RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
&& if [ "${PYTHON_VERSION}" != "3" ]; then update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1; fi \
&& python3 --version

RUN apt-get update -y \
&& apt-get install -y python3-pip git vim
&& apt-get install -y python3-pip git curl

# Install pip s.t. it will be compatible with our PYTHON_VERSION
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION}
RUN python3 -m pip --version

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
2 changes: 1 addition & 1 deletion Dockerfile.openvino
@@ -1,7 +1,7 @@
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
# to run the OpenAI compatible server.

FROM ubuntu:22.04 AS dev
FROM ubuntu:20.04 AS dev

RUN apt-get update -y && \
apt-get install -y python3-pip git
2 changes: 1 addition & 1 deletion Dockerfile.xpu
@@ -1,4 +1,4 @@
FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04
FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu20.04

RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
8 changes: 8 additions & 0 deletions tests/basic_correctness/test_cpu_offload.py
@@ -0,0 +1,8 @@
from ..utils import compare_two_settings


def test_cpu_offload():
compare_two_settings("meta-llama/Llama-2-7b-hf", [],
["--cpu-offload-gb", "4"])
compare_two_settings("nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
[], ["--cpu-offload-gb", "1"])
119 changes: 25 additions & 94 deletions tests/distributed/test_pipeline_parallel.py
@@ -1,30 +1,42 @@
import os

import pytest
from transformers import AutoTokenizer

from ..utils import RemoteOpenAIServer
from ..utils import compare_two_settings

VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"


@pytest.mark.parametrize(
"TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME", [
(2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B"),
(2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B"),
(1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B"),
(1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B"),
(1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B"),
"TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME, DIST_BACKEND",
[
(2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray"),
(2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
(1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
(1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray"),
(1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
(2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp"),
(2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
(1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
(1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp"),
(1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
])
def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME):
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
DIST_BACKEND):
if VLLM_MULTI_NODE and DIST_BACKEND == "mp":
pytest.skip("Skipping multi-node pipeline parallel test for "
"multiprocessing distributed backend")

pp_args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"float16",
"--pipeline-parallel-size",
str(PP_SIZE),
"--tensor-parallel-size",
str(TP_SIZE),
"--distributed-executor-backend",
"ray",
DIST_BACKEND,
]

# compare without pipeline parallelism
@@ -48,85 +60,4 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME):
pp_args.append("--enforce-eager")
tp_args.append("--enforce-eager")

prompt = "Hello, my name is"
token_ids = tokenizer(prompt)["input_ids"]
results = []
for args in (pp_args, tp_args):
with RemoteOpenAIServer(MODEL_NAME, args) as server:
client = server.get_client()

# test models list
models = client.models.list()
models = models.data
served_model = models[0]
results.append({
"test": "models_list",
"id": served_model.id,
"root": served_model.root,
})

# test with text prompt
completion = client.completions.create(model=MODEL_NAME,
prompt=prompt,
max_tokens=5,
temperature=0.0)

results.append({
"test": "single_completion",
"text": completion.choices[0].text,
"finish_reason": completion.choices[0].finish_reason,
"usage": completion.usage,
})

# test using token IDs
completion = client.completions.create(
model=MODEL_NAME,
prompt=token_ids,
max_tokens=5,
temperature=0.0,
)

results.append({
"test": "token_ids",
"text": completion.choices[0].text,
"finish_reason": completion.choices[0].finish_reason,
"usage": completion.usage,
})

# test simple list
batch = client.completions.create(
model=MODEL_NAME,
prompt=[prompt, prompt],
max_tokens=5,
temperature=0.0,
)

results.append({
"test": "simple_list",
"text0": batch.choices[0].text,
"text1": batch.choices[1].text,
})

# test streaming
batch = client.completions.create(
model=MODEL_NAME,
prompt=[prompt, prompt],
max_tokens=5,
temperature=0.0,
stream=True,
)
texts = [""] * 2
for chunk in batch:
assert len(chunk.choices) == 1
choice = chunk.choices[0]
texts[choice.index] += choice.text
results.append({
"test": "streaming",
"texts": texts,
})

n = len(results) // 2
pp_results = results[:n]
tp_results = results[n:]
for pp, tp in zip(pp_results, tp_results):
assert pp == tp
compare_two_settings(MODEL_NAME, pp_args, tp_args)
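The per-endpoint comparison deleted above (models list, single completion, token-ID prompt, batched prompts, and streaming) now lives in the shared compare_two_settings helper in tests/utils, which both this test and the new CPU-offload test reuse. A condensed sketch of what that helper presumably does, reconstructed from the deleted code rather than from the helper's actual source (which this diff does not show):

def compare_two_settings(model_name, args_a, args_b):
    # Run the same prompt against two server configurations and require
    # identical results. RemoteOpenAIServer is the tests/utils helper that
    # the original inline code above imported.
    prompt = "Hello, my name is"
    results = []
    for args in (args_a, args_b):
        with RemoteOpenAIServer(model_name, args) as server:
            client = server.get_client()
            completion = client.completions.create(model=model_name,
                                                   prompt=prompt,
                                                   max_tokens=5,
                                                   temperature=0.0)
            results.append({
                "text": completion.choices[0].text,
                "finish_reason": completion.choices[0].finish_reason,
                "usage": completion.usage,
            })
    assert results[0] == results[1]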
56 changes: 53 additions & 3 deletions tests/samplers/test_rejection_sampler.py
@@ -150,9 +150,54 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
high=vocab_size,
size=(batch_size, k),
dtype=torch.int64)
generators = [None] * batch_size

rejection_sampler(target_probs, bonus_token_ids, draft_probs,
draft_token_ids)
draft_token_ids, generators)


@pytest.mark.parametrize("frac_seeded", [0.0, 0.25, 0.5, 1.0])
@pytest.mark.parametrize("k", [1, 3, 6])
@pytest.mark.parametrize("vocab_size", [30_000, 50_000])
@pytest.mark.parametrize("batch_size", [1, 8, 32, 128])
@pytest.mark.parametrize("n_rep", [100])
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_deterministic_when_seeded(k: int, vocab_size: int, batch_size: int,
frac_seeded: float, n_rep: int,
device: str):
torch.set_default_device(device)
rejection_sampler = RejectionSampler()
rejection_sampler.init_gpu_tensors(rank=0)

draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
bonus_token_ids = torch.randint(low=0,
high=vocab_size,
size=(batch_size, 1),
dtype=torch.int64)
draft_token_ids = torch.randint(low=0,
high=vocab_size,
size=(batch_size, k),
dtype=torch.int64)

seeded_mask = torch.rand(batch_size, dtype=torch.float32) <= frac_seeded

results = []
for _ in range(n_rep):
generators = [
torch.Generator(
device=device).manual_seed(i) if seeded_mask[i] else None
for i in range(batch_size)
]
results.append(
rejection_sampler(target_probs, bonus_token_ids, draft_probs,
draft_token_ids, generators))

for i in range(batch_size):
if seeded_mask[i]:
for j in range(1, n_rep):
assert torch.equal(results[j][i], results[0][i])


@pytest.mark.parametrize("above_or_below_vocab_range", ["above", "below"])
@@ -197,10 +242,11 @@ def test_raises_when_vocab_oob(above_or_below_vocab_range: str,
raise AssertionError()

oob_token_ids[0][0] = rogue_token_id
generators = [None] * batch_size

with pytest.raises(AssertionError):
rejection_sampler(target_probs, bonus_token_ids, draft_probs,
draft_token_ids)
draft_token_ids, generators)


@pytest.mark.parametrize("draft_and_target_probs_equal", [True, False])
@@ -371,11 +417,15 @@ def _estimate_rejection_sampling_pdf(
dtype=torch.int64,
device="cuda").repeat(num_samples, 1)

# unseeded
generators = [None]

# Get output tokens via rejection sampling.
output_token_ids = self.rejection_sampler(target_probs.to("cuda"),
bonus_token_ids.to("cuda"),
draft_probs.to("cuda"),
draft_token_ids.to("cuda"))
draft_token_ids.to("cuda"),
generators)

# Remove bonus tokens
output_token_ids = output_token_ids[:, :-1].flatten()
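The new generators argument threads an optional per-sequence torch.Generator into rejection sampling: seeded rows reproduce exactly across repetitions (which is what test_deterministic_when_seeded asserts above), while rows with None stay nondeterministic. A standalone sketch of that property using plain torch.multinomial rather than the RejectionSampler API:

import torch

probs = torch.rand(50_000)
probs /= probs.sum()

def draw(seed=None):
    # A seeded generator makes the draw repeatable; generator=None does not.
    gen = torch.Generator().manual_seed(seed) if seed is not None else None
    return torch.multinomial(probs, num_samples=8, replacement=True,
                             generator=gen)

assert torch.equal(draw(seed=42), draw(seed=42))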