Multi lora cpu #1

Closed · wants to merge 20 commits
32 changes: 19 additions & 13 deletions tests/lora/conftest.py
@@ -20,6 +20,7 @@
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.model_loader import get_model
from vllm.platforms import current_platform


class ContextIDInfo(TypedDict):
@@ -64,13 +65,16 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
@pytest.fixture
def dist_init():
temp_file = tempfile.mkstemp()[1]
init_distributed_environment(
world_size=1,
rank=0,
distributed_init_method=f"file://{temp_file}",
local_rank=0,
backend="nccl",
)

backend = "nccl"
if current_platform.is_cpu():
backend = "gloo"

init_distributed_environment(world_size=1,
rank=0,
distributed_init_method=f"file://{temp_file}",
local_rank=0,
backend=backend)
initialize_model_parallel(1, 1)
yield
cleanup_dist_env_and_memory(shutdown_ray=True)
@@ -80,13 +84,15 @@ def dist_init():
def dist_init_torch_only():
if torch.distributed.is_initialized():
return
backend = "nccl"
if current_platform.is_cpu():
backend = "gloo"

temp_file = tempfile.mkstemp()[1]
torch.distributed.init_process_group(
backend="nccl",
world_size=1,
rank=0,
init_method=f"file://{temp_file}",
)
torch.distributed.init_process_group(world_size=1,
rank=0,
init_method=f"file://{temp_file}",
backend=backend)


@pytest.fixture
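Note (not part of this PR's diff): both fixtures above now default to the NCCL backend and fall back to Gloo on CPU-only platforms. A minimal standalone sketch of that selection logic, assuming `current_platform.is_cpu()` behaves as it does in this PR; `pick_backend` and `init_single_rank_process_group` are illustrative names only.

# Sketch only: mirrors the backend-selection pattern from the fixtures above.
import tempfile

import torch
from vllm.platforms import current_platform


def pick_backend() -> str:
    # NCCL requires CUDA devices; Gloo is the portable CPU fallback.
    return "gloo" if current_platform.is_cpu() else "nccl"


def init_single_rank_process_group() -> None:
    # Single-process group initialized through a file:// rendezvous,
    # as in the dist_init fixtures.
    temp_file = tempfile.mkstemp()[1]
    torch.distributed.init_process_group(backend=pick_backend(),
                                         world_size=1,
                                         rank=0,
                                         init_method=f"file://{temp_file}")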
46 changes: 34 additions & 12 deletions tests/lora/test_layers.py
@@ -48,10 +48,19 @@
torch.float32: (5e-3, 5e-3),
torch.bfloat16: (3e-2, 2e-2),
}

pytestmark = pytest.mark.skipif(
not (current_platform.is_cuda_alike() or current_platform.is_cpu()),
reason="Backend not supported")

CUDA_DEVICES = [
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
]

CPU_DEVICES = ["cpu"]

DEVICES = CUDA_DEVICES if current_platform.is_cuda_alike() else CPU_DEVICES

# We will launch different triton kernels between the prefill and decode
# stages, so we need to verify this. prefill stage(True) or decode stage(False)
STAGES = [True, False]
@@ -194,14 +203,15 @@ def create_random_inputs(

@torch.inference_mode()
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000])
@pytest.mark.parametrize("stage", STAGES)
def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
# For multi-GPU testing of Triton kernel, we must explicitly set the CUDA
# device, see: https://github.com/triton-lang/triton/issues/2925
# Same below.
torch.cuda.set_device(device)
if current_platform.is_cuda_alike():
torch.cuda.set_device(device)

torch.set_default_device(device)
max_loras = 8
@@ -296,13 +306,15 @@ def create_random_embedding_layer():
# @pytest.mark.skip(
# reason="Fails when loras are in any slot other than the first.")
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000])
@pytest.mark.parametrize("stage", STAGES)
def test_embeddings_with_new_embeddings(dist_init, num_loras, device,
vocab_size, stage) -> None:

torch.cuda.set_device(device)
if current_platform.is_cuda_alike():
torch.cuda.set_device(device)

torch.set_default_device(device)
max_loras = 8
punica_wrapper = PunicaWrapper(8192, 256, device)
@@ -432,13 +444,15 @@ def create_random_embedding_layer():

@torch.inference_mode()
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 256512])
@pytest.mark.parametrize("stage", STAGES)
def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size,
stage) -> None:

torch.cuda.set_device(device)
if current_platform.is_cuda_alike():
torch.cuda.set_device(device)

torch.set_default_device(device)
max_loras = 8
punica_wrapper = PunicaWrapper(8192, 256, device)
@@ -563,13 +577,15 @@ def _pretest():

@torch.inference_mode()
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("stage", STAGES)
@pytest.mark.parametrize("bias_enabled", [True, False])
def test_linear_replicated(dist_init, num_loras, device, stage,
bias_enabled) -> None:

torch.cuda.set_device(device)
if current_platform.is_cuda_alike():
torch.cuda.set_device(device)

torch.set_default_device(device)
punica_wrapper = PunicaWrapper(8192, 256, device)
max_loras = 8
@@ -675,13 +691,15 @@ def create_random_linear_replicated_layer():
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
@pytest.mark.parametrize("orientation", ["row", "column"])
@pytest.mark.parametrize("fully_shard", [True, False])
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("stage", STAGES)
@pytest.mark.parametrize("bias_enabled", [True, False])
def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
device, stage, bias_enabled) -> None:

torch.cuda.set_device(device)
if current_platform.is_cuda_alike():
torch.cuda.set_device(device)

torch.set_default_device(device)
punica_wrapper = PunicaWrapper(8192, 256, device)
max_loras = 8
@@ -797,13 +815,15 @@ def create_random_linear_parallel_layer():
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
@pytest.mark.parametrize("repeats", [1, 2, 3])
@pytest.mark.parametrize("fully_shard", [True, False])
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("stage", STAGES)
@pytest.mark.parametrize("bias_enabled", [True, False])
def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
device, stage, bias_enabled) -> None:

torch.cuda.set_device(device)
if current_platform.is_cuda_alike():
torch.cuda.set_device(device)

torch.set_default_device(device)
punica_wrapper = PunicaWrapper(8192, 256, device)
max_loras = 8
@@ -955,6 +975,8 @@ class FakeConfig:
@pytest.mark.parametrize("rotary_dim", [None, 32])
@pytest.mark.parametrize("head_size", [32, 108])
@pytest.mark.parametrize("seq_len", [11, 1024])
@pytest.mark.skipif(not current_platform.is_cuda_alike(),
reason="Only CUDA backends are supported")
def test_rotary_embedding_long_context(dist_init, num_loras, device,
scaling_factors, max_position,
is_neox_style, rotary_dim, head_size,
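Note (not part of this PR's diff): tests/lora/test_layers.py now drives every test with a platform-dependent DEVICES list and guards `torch.cuda.set_device` so the same test body runs on CPU-only hosts. A condensed sketch of that pattern, assuming the `current_platform` helpers behave as shown in the diff; `test_device_setup` is an illustrative name.

# Sketch only: condensed form of the device-handling pattern in test_layers.py.
import pytest
import torch
from vllm.platforms import current_platform

CUDA_DEVICES = [
    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
]
CPU_DEVICES = ["cpu"]
DEVICES = CUDA_DEVICES if current_platform.is_cuda_alike() else CPU_DEVICES

# Skip the whole module on backends that support neither code path.
pytestmark = pytest.mark.skipif(
    not (current_platform.is_cuda_alike() or current_platform.is_cpu()),
    reason="Backend not supported")


@pytest.mark.parametrize("device", DEVICES)
def test_device_setup(device):
    # torch.cuda.set_device is only meaningful on CUDA-like backends.
    if current_platform.is_cuda_alike():
        torch.cuda.set_device(device)
    torch.set_default_device(device)
    assert torch.empty(1).device.type == torch.device(device).type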
20 changes: 12 additions & 8 deletions tests/lora/test_lora_manager.py
@@ -17,6 +17,7 @@
from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager,
WorkerLoRAManager)
from vllm.model_executor.layers.linear import RowParallelLinear
from vllm.platforms import current_platform

EMBEDDING_MODULES = {
"embed_tokens": "input_embeddings",
@@ -28,9 +29,12 @@
CUDA_DEVICES = [
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
]
CPU_DEVICES = ["cpu"]

DEVICES = CUDA_DEVICES if current_platform.is_cuda_alike() else CPU_DEVICES

@pytest.mark.parametrize("device", CUDA_DEVICES)

@pytest.mark.parametrize("device", DEVICES)
def test_from_lora_tensors(sql_lora_files, device):
tensors = load_file(
os.path.join(sql_lora_files, "adapter_model.safetensors"))
@@ -113,7 +117,7 @@ def test_replace_submodules(dist_init, dummy_model):
manager = LoRAModelManager(
model, 1, 1, 1,
LoRAConfig(max_lora_rank=8, max_cpu_loras=8, max_loras=8),
torch.device("cuda"))
torch.device(DEVICES[0]))
model = manager.model

assert isinstance(model.get_submodule("dense1"),
@@ -125,7 +129,7 @@ def test_replace_submodules(dist_init, dummy_model):
RowParallelLinearWithLoRA)


@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("device", DEVICES)
def test_lora_model_manager(dist_init, dummy_model, device):
model = dummy_model
model.supported_lora_modules = ["dense1", "dense2", "lm_head"]
@@ -186,7 +190,7 @@ def test_lora_model_manager(dist_init, dummy_model, device):
assert manager.punica_wrapper.device == device


@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("device", DEVICES)
def test_lora_lru_cache_model_manager(dist_init, dummy_model, device):
model = dummy_model
model.supported_lora_modules = ["dense1", "dense2", "lm_head"]
@@ -278,7 +282,7 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model, device):
assert manager.device == device


@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("device", DEVICES)
def test_lru_lora_model_manager(dist_init, dummy_model, device):
# This tests just the LRU cache functionality, everything else is
# tested in test_lora_model_manager
@@ -408,7 +412,7 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
assert manager.device == device


@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("device", DEVICES)
def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
sql_lora_files, device):
lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4)
@@ -487,7 +491,7 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
device)


@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("device", DEVICES)
def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
sql_lora_files, device):
# Should remove every LoRA not specified in the request.
@@ -563,7 +567,7 @@ def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
device)


@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("device", DEVICES)
def test_packed_loras(dist_init, dummy_model_gate_up, device):
model = dummy_model_gate_up
model.supported_lora_modules = ["gate_up_proj"]
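Note (not part of this PR's diff): the manager tests that previously hard-coded `torch.device("cuda")` now use the first entry of the platform-dependent DEVICES list, so the same assertions run on CPU. A small illustrative sketch with a simplified DEVICES list:

# Sketch only: how torch.device(DEVICES[0]) resolves per platform.
import torch
from vllm.platforms import current_platform

# Simplified; the real test builds the CUDA list from torch.cuda.device_count().
DEVICES = ["cuda:0"] if current_platform.is_cuda_alike() else ["cpu"]

# torch.device("cuda:0") on CUDA-like hosts, torch.device("cpu") otherwise.
default_device = torch.device(DEVICES[0])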
4 changes: 3 additions & 1 deletion tests/lora/test_mixtral.py
@@ -5,6 +5,7 @@

import vllm
from vllm.lora.request import LoRARequest
from vllm.platforms import current_platform

MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1"

@@ -31,7 +32,8 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int,
@pytest.mark.parametrize("tp_size", [4])
def test_mixtral_lora(mixtral_lora_files, tp_size):
"""Original test, the LoRA model has the common target modules, not all"""
if torch.cuda.device_count() < tp_size:
if torch.cuda.device_count(
) < tp_size and tp_size > 1 and current_platform.is_cuda_alike():
pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")

prompts = [
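Note (not part of this PR's diff): the reworked skip in test_mixtral_lora only consults `torch.cuda.device_count()` when the platform is CUDA-like and tensor parallelism is actually requested, so CPU-only runs are not skipped for lacking GPUs. The same guard in isolation, with `maybe_skip_for_tp` as an illustrative helper name:

# Sketch only: equivalent form of the skip condition added in the diff.
import pytest
import torch
from vllm.platforms import current_platform


def maybe_skip_for_tp(tp_size: int) -> None:
    # Only GPU backends need enough devices for tensor parallelism.
    if (current_platform.is_cuda_alike() and tp_size > 1
            and torch.cuda.device_count() < tp_size):
        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")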
19 changes: 9 additions & 10 deletions tests/lora/test_punica_sizes.py
@@ -7,12 +7,8 @@
import pytest
import torch

from vllm.lora.ops.bgmv_expand import bgmv_expand
from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice
from vllm.lora.ops.bgmv_shrink import bgmv_shrink
from vllm.lora.ops.sgmv_expand import sgmv_expand
from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice
from vllm.lora.ops.sgmv_shrink import sgmv_shrink
from vllm.lora.ops import (bgmv_expand, bgmv_expand_slice, bgmv_shrink,
sgmv_expand, sgmv_expand_slice, sgmv_shrink)
from vllm.platforms import current_platform

from .utils import (generate_data, generate_data_for_expand_nslices,
@@ -110,7 +106,10 @@
MAX_RANKS = [32]
SCALES = [0.5]
SEED = [0]
CUDA_DEVICES = [f"cuda:{0}"]

CUDA_DEVICES = ["cuda:0"]
CPU_DEVICES = ["cpu"]
DEVICES = CUDA_DEVICES if current_platform.is_cuda_alike() else CPU_DEVICES


def assert_close(a, b):
@@ -130,7 +129,7 @@ def assert_close(a, b):
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("op_type", ["shrink", "expand"])
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("device", DEVICES)
def test_punica_sgmv(
batches: int,
num_loras: int,
@@ -220,7 +219,7 @@ def test_punica_sgmv(
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("op_type", ["shrink", "expand"])
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("device", DEVICES)
def test_punica_bgmv(
batches: int,
num_loras: int,
@@ -294,7 +293,7 @@ def test_punica_bgmv(
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("op_type", ["sgmv", "bgmv"])
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("device", DEVICES)
def test_punica_expand_nslices(
batches: int,
num_loras: int,
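Note (not part of this PR's diff): the body of the `assert_close` helper referenced above is collapsed in this view. A plausible shape for such a helper, reusing the per-dtype tolerance mapping shown in tests/lora/test_layers.py earlier in this PR (the values actually used in this module may differ), is sketched below.

# Sketch only: tolerance values borrowed from the test_layers.py snippet above.
from typing import Dict, Tuple

import torch

TOLERANCES: Dict[torch.dtype, Tuple[float, float]] = {
    torch.float32: (5e-3, 5e-3),
    torch.bfloat16: (3e-2, 2e-2),
}


def assert_close(a: torch.Tensor, b: torch.Tensor) -> None:
    rtol, atol = TOLERANCES[a.dtype]
    torch.testing.assert_close(a, b, rtol=rtol, atol=atol)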