Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

[Hardware] using current_platform.seed_everything #9785

Merged
merged 1 commit into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions benchmarks/kernels/benchmark_layernorm.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
import torch

from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
seed_everything)
from vllm.platforms import current_platform
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser


@torch.inference_mode()
Expand All @@ -16,7 +16,7 @@ def main(num_tokens: int,
do_profile: bool = False,
num_warmup_iters: int = 5,
num_iters: int = 100) -> None:
seed_everything(seed)
current_platform.seed_everything(seed)
torch.set_default_device("cuda")

layer = RMSNorm(hidden_size).to(dtype=dtype)
Expand Down
7 changes: 4 additions & 3 deletions benchmarks/kernels/benchmark_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
from transformers import AutoConfig

from vllm.model_executor.layers.fused_moe.fused_moe import *
from vllm.utils import FlexibleArgumentParser, seed_everything
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser


class BenchmarkConfig(TypedDict):
Expand Down Expand Up @@ -167,7 +168,7 @@ class BenchmarkWorker:

def __init__(self, seed: int) -> None:
torch.set_default_device("cuda")
seed_everything(seed)
current_platform.seed_everything(seed)
self.seed = seed

def benchmark(
Expand All @@ -181,7 +182,7 @@ def benchmark(
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
) -> Tuple[Dict[str, int], float]:
seed_everything(self.seed)
current_platform.seed_everything(self.seed)
dtype_str = get_config_dtype_str(dtype,
use_int8_w8a16=use_int8_w8a16,
use_fp8_w8a8=use_fp8_w8a8)
Expand Down
5 changes: 3 additions & 2 deletions benchmarks/kernels/benchmark_paged_attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@
import torch

from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
create_kv_caches_with_random, seed_everything)
create_kv_caches_with_random)

NUM_BLOCKS = 1024
PARTITION_SIZE = 512
Expand All @@ -28,7 +29,7 @@ def main(
device: str = "cuda",
kv_cache_dtype: Optional[str] = None,
) -> None:
seed_everything(seed)
current_platform.seed_everything(seed)

scale = float(1.0 / (head_size**0.5))
query = torch.empty(num_seqs,
Expand Down
6 changes: 3 additions & 3 deletions benchmarks/kernels/benchmark_quant.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
import torch

from vllm import _custom_ops as ops
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
seed_everything)
from vllm.platforms import current_platform
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser


@torch.inference_mode()
Expand All @@ -17,7 +17,7 @@ def main(num_tokens: int,
do_profile: bool = False,
num_warmup_iters: int = 5,
num_iters: int = 100) -> None:
seed_everything(seed)
current_platform.seed_everything(seed)
torch.set_default_device("cuda")

x = torch.randn(num_tokens, hidden_size, dtype=dtype)
Expand Down
5 changes: 3 additions & 2 deletions benchmarks/kernels/benchmark_rope.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@

from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding,
get_rope)
from vllm.utils import FlexibleArgumentParser, seed_everything
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser


def benchmark_rope_kernels_multi_lora(
Expand All @@ -22,7 +23,7 @@ def benchmark_rope_kernels_multi_lora(
max_position: int = 8192,
base: int = 10000,
) -> None:
seed_everything(seed)
current_platform.seed_everything(seed)
torch.set_default_device(device)
if rotary_dim is None:
rotary_dim = head_size
Expand Down
6 changes: 3 additions & 3 deletions tests/kernels/test_activation.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from vllm.model_executor.layers.activation import (FastGELU, FatreluAndMul,
GeluAndMul, NewGELU,
QuickGELU, SiluAndMul)
from vllm.utils import seed_everything
from vllm.platforms import current_platform

from .allclose_default import get_default_atol, get_default_rtol

Expand Down Expand Up @@ -37,7 +37,7 @@ def test_act_and_mul(
seed: int,
device: str,
) -> None:
seed_everything(seed)
current_platform.seed_everything(seed)
torch.set_default_device(device)
x = torch.randn(num_tokens, 2 * d, dtype=dtype)
if activation == "silu":
Expand Down Expand Up @@ -85,7 +85,7 @@ def test_activation(
seed: int,
device: str,
) -> None:
seed_everything(seed)
current_platform.seed_everything(seed)
torch.set_default_device(device)
x = torch.randn(num_tokens, d, dtype=dtype)
layer = activation[0]()
Expand Down
6 changes: 3 additions & 3 deletions tests/kernels/test_attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from tests.kernels.utils import opcheck
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.utils import get_max_shared_memory_bytes, seed_everything
from vllm.utils import get_max_shared_memory_bytes

from .allclose_default import get_default_atol, get_default_rtol

Expand Down Expand Up @@ -144,7 +144,7 @@ def test_paged_attention(
or (version == "rocm" and head_size not in (64, 128))):
pytest.skip()

seed_everything(seed)
current_platform.seed_everything(seed)
torch.set_default_device(device)
scale = float(1.0 / (head_size**0.5))
num_query_heads, num_kv_heads = num_heads
Expand Down Expand Up @@ -382,7 +382,7 @@ def test_multi_query_kv_attention(
seed: int,
device: str,
) -> None:
seed_everything(seed)
current_platform.seed_everything(seed)
torch.set_default_device(device)
# MAX_SEQ_LEN sometimes causes OOM in the reference implementation.
# As the xformers library is already tested with its own tests, we can use
Expand Down
6 changes: 3 additions & 3 deletions tests/kernels/test_awq_triton.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from vllm.model_executor.layers.quantization.awq_triton import (
AWQ_TRITON_SUPPORTED_GROUP_SIZES, awq_dequantize_triton, awq_gemm_triton)
from vllm.utils import seed_everything
from vllm.platforms import current_platform

device = "cuda"

Expand Down Expand Up @@ -80,7 +80,7 @@ def test_dequantize(qweight_rows, qweight_cols, group_size):
zeros_cols = qweight_cols
zeros_dtype = torch.int32

seed_everything(0)
current_platform.seed_everything(0)

qweight = torch.randint(0,
torch.iinfo(torch.int32).max,
Expand Down Expand Up @@ -134,7 +134,7 @@ def test_gemm(N, K, M, splitK, group_size):
qzeros_rows = scales_rows
qzeros_cols = qweight_cols

seed_everything(0)
current_platform.seed_everything(0)

input = torch.rand((input_rows, input_cols),
dtype=input_dtype,
Expand Down
6 changes: 3 additions & 3 deletions tests/kernels/test_blocksparse_attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from vllm.attention.ops.blocksparse_attention.interface import (
LocalStridedBlockSparseAttn)
from vllm.platforms import current_platform
from vllm.utils import get_max_shared_memory_bytes, seed_everything
from vllm.utils import get_max_shared_memory_bytes

from .allclose_default import get_default_atol, get_default_rtol

Expand Down Expand Up @@ -173,7 +173,7 @@ def test_paged_attention(
blocksparse_block_size: int,
blocksparse_head_sliding_step: int,
) -> None:
seed_everything(seed)
current_platform.seed_everything(seed)
torch.set_default_device(device)
scale = float(1.0 / (head_size**0.5))
num_query_heads, num_kv_heads = num_heads
Expand Down Expand Up @@ -384,7 +384,7 @@ def test_varlen_blocksparse_attention_prefill(
seed: int,
device: str,
) -> None:
seed_everything(seed)
current_platform.seed_everything(seed)
torch.set_default_device(device)
# MAX_SEQ_LEN sometimes causes OOM in the reference implementation.
# As the xformers library is already tested with its own tests, we can use
Expand Down
12 changes: 6 additions & 6 deletions tests/kernels/test_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck
from vllm import _custom_ops as ops
from vllm.utils import seed_everything
from vllm.platforms import current_platform

COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')]
DTYPES = [torch.half, torch.bfloat16, torch.float]
Expand Down Expand Up @@ -56,7 +56,7 @@ def test_copy_blocks(
) -> None:
if kv_cache_dtype == "fp8" and head_size % 16:
pytest.skip()
seed_everything(seed)
current_platform.seed_everything(seed)
torch.set_default_device(device)
# Generate random block mappings where each source block is mapped to two
# destination blocks.
Expand Down Expand Up @@ -132,7 +132,7 @@ def test_reshape_and_cache(
) -> None:
if kv_cache_dtype == "fp8" and head_size % 16:
pytest.skip()
seed_everything(seed)
current_platform.seed_everything(seed)
torch.set_default_device(device)
# Create a random slot mapping.
num_slots = block_size * num_blocks
Expand Down Expand Up @@ -224,7 +224,7 @@ def test_reshape_and_cache_flash(
device: str,
kv_cache_dtype: str,
) -> None:
seed_everything(seed)
current_platform.seed_everything(seed)
torch.set_default_device(device)

# Create a random slot mapping.
Expand Down Expand Up @@ -339,7 +339,7 @@ def test_swap_blocks(
if kv_cache_dtype == "fp8" and head_size % 16:
pytest.skip()

seed_everything(seed)
current_platform.seed_everything(seed)

src_device = device if direction[0] == "cuda" else 'cpu'
dst_device = device if direction[1] == "cuda" else 'cpu'
Expand Down Expand Up @@ -408,7 +408,7 @@ def test_fp8_e4m3_conversion(
seed: int,
device: str,
) -> None:
seed_everything(seed)
current_platform.seed_everything(seed)

low = -224.0
high = 224.0
Expand Down
12 changes: 6 additions & 6 deletions tests/kernels/test_causal_conv1d.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from vllm.attention.backends.utils import PAD_SLOT_ID
from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
causal_conv1d_fn, causal_conv1d_update)
from vllm.utils import seed_everything
from vllm.platforms import current_platform


def causal_conv1d_ref(
Expand Down Expand Up @@ -70,7 +70,7 @@ def causal_conv1d_update_ref(x,
bias: (dim,)
cache_seqlens: (batch,), dtype int32.
If not None, the conv_state is treated as a circular buffer.
The conv_state will be updated by copying x to the
The conv_state will be updated by copying x to the
conv_state starting at the index
@cache_seqlens % state_len before performing the convolution.

Expand Down Expand Up @@ -161,7 +161,7 @@ def test_causal_conv1d(batch, dim, seqlen, width, has_bias, silu_activation,
if itype == torch.bfloat16:
rtol, atol = 1e-2, 5e-2
# set seed
seed_everything(0)
current_platform.seed_everything(0)
x = torch.randn(batch, dim, seqlen, device=device,
dtype=itype).contiguous()

Expand Down Expand Up @@ -223,7 +223,7 @@ def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation,
if itype == torch.bfloat16:
rtol, atol = 1e-2, 5e-2
# set seed
seed_everything(0)
current_platform.seed_everything(0)
batch = 2
x = torch.randn(batch, dim, seqlen, device=device, dtype=itype)
x_ref = x.clone()
Expand Down Expand Up @@ -270,7 +270,7 @@ def test_causal_conv1d_update_with_batch_gather(with_padding, dim, width,
rtol, atol = 1e-2, 5e-2

# set seed
seed_everything(0)
current_platform.seed_everything(0)

batch_size = 3
padding = 5 if with_padding else 0
Expand Down Expand Up @@ -343,7 +343,7 @@ def test_causal_conv1d_varlen(with_padding, dim, seqlen, width, has_bias,
if itype == torch.bfloat16:
rtol, atol = 1e-2, 5e-2
# set seed
seed_everything(0)
current_platform.seed_everything(0)
seqlens = []
batch_size = 4
if seqlen < 10:
Expand Down
6 changes: 3 additions & 3 deletions tests/kernels/test_flash_attn.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pytest
import torch

from vllm.utils import seed_everything
from vllm.platforms import current_platform
from vllm.vllm_flash_attn import (flash_attn_varlen_func,
flash_attn_with_kvcache)

Expand Down Expand Up @@ -91,7 +91,7 @@ def test_flash_attn_with_paged_kv(
sliding_window: Optional[int],
) -> None:
torch.set_default_device("cuda")
seed_everything(0)
current_platform.seed_everything(0)
num_seqs = len(kv_lens)
num_query_heads = num_heads[0]
num_kv_heads = num_heads[1]
Expand Down Expand Up @@ -161,7 +161,7 @@ def test_varlen_with_paged_kv(
num_blocks: int,
) -> None:
torch.set_default_device("cuda")
seed_everything(0)
current_platform.seed_everything(0)
num_seqs = len(seq_lens)
query_lens = [x[0] for x in seq_lens]
kv_lens = [x[1] for x in seq_lens]
Expand Down
10 changes: 5 additions & 5 deletions tests/kernels/test_flashinfer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pytest
import torch

from vllm.utils import seed_everything
from vllm.platforms import current_platform

NUM_HEADS = [(16, 16), (32, 8), (64, 8), (6, 1)]
HEAD_SIZES = [128, 256]
Expand Down Expand Up @@ -84,7 +84,7 @@ def test_flashinfer_decode_with_paged_kv(
soft_cap: Optional[float],
) -> None:
torch.set_default_device("cuda")
seed_everything(0)
current_platform.seed_everything(0)
num_seqs = len(kv_lens)
num_query_heads = num_heads[0]
num_kv_heads = num_heads[1]
Expand Down Expand Up @@ -170,7 +170,7 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]],
block_size: int,
soft_cap: Optional[float]) -> None:
torch.set_default_device("cuda")
seed_everything(0)
current_platform.seed_everything(0)
num_seqs = len(seq_lens)
query_lens = [x[0] for x in seq_lens]
kv_lens = [x[1] for x in seq_lens]
Expand Down Expand Up @@ -268,7 +268,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv(
head_size: int, dtype: torch.dtype, block_size: int,
soft_cap: Optional[float]) -> None:
torch.set_default_device("cuda")
seed_everything(0)
current_platform.seed_everything(0)
num_seqs = len(seq_lens)
query_lens = [x[0] for x in seq_lens]
kv_lens = [x[1] for x in seq_lens]
Expand Down Expand Up @@ -381,7 +381,7 @@ def test_flashinfer_decode_with_paged_fp8_kv(
) -> None:
# test doesn't work for num_heads = (16,16)
torch.set_default_device("cuda")
seed_everything(0)
current_platform.seed_everything(0)
num_seqs = len(kv_lens)
num_query_heads = num_heads[0]
num_kv_heads = num_heads[1]
Expand Down
Loading