[Misc] Consolidate and optimize logic for building padded tensors #6541
In the sampling metadata code (imports and `from_lists`):
@@ -2,14 +2,13 @@
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Tuple
 
-import numpy as np
 import torch
 
 from vllm.model_executor.layers.ops.sample import get_num_triton_sampler_splits
 from vllm.sampling_params import SamplingParams, SamplingType
 from vllm.sequence import SequenceData, SequenceGroupMetadata
 from vllm.utils import (async_tensor_h2d, is_pin_memory_available,
-                        maybe_expand_dim)
+                        make_tensor_with_pad, maybe_expand_dim)
 
 _SAMPLING_EPS = 1e-5
 _SEED_0_REPLACEMENT = 3403598558
@@ -466,22 +465,24 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float],
         do_penalties = prompt_tokens or output_tokens
 
         if do_penalties:
-            prompt_max_len = max([len(tokens) for tokens in prompt_tokens],
-                                 default=0)
-            prompt_padded_tokens = np.full(
-                (len(prompt_tokens), prompt_max_len),
+            prompt_t = make_tensor_with_pad(
+                prompt_tokens,
                 vocab_size,
-                dtype=np.int64)
-            for i, tokens in enumerate(prompt_tokens):
-                prompt_padded_tokens[i, :len(tokens)] = tokens
-            output_max_len = max([len(tokens) for tokens in output_tokens],
-                                 default=0)
-            output_padded_tokens = np.full(
-                (len(output_tokens), output_max_len),
+                device="cpu",
+                dtype=torch.int64,
+                pin_memory=pin_memory,
+            )
+            output_t = make_tensor_with_pad(
+                output_tokens,
                 vocab_size,
-                dtype=np.int64)
-            for i, tokens in enumerate(output_tokens):
-                output_padded_tokens[i, :len(tokens)] = tokens
+                device="cpu",
+                dtype=torch.int64,
+                pin_memory=pin_memory,
+            )
+        else:
+            empty_tensor = torch.empty(0, device=device, dtype=torch.long)
+            prompt_t = empty_tensor
+            output_t = empty_tensor
 
         temperatures_t = torch.tensor(
             temperatures,

Review comment: I have merged the if-else blocks based on `do_penalties`.
@@ -531,15 +532,6 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float],
             dtype=torch.long,
             pin_memory=pin_memory,
         )
-        if do_penalties:
-            prompt_tensor = torch.from_numpy(prompt_padded_tokens)
-            output_tensor = torch.from_numpy(output_padded_tokens)
-            if pin_memory:
-                prompt_tensor = prompt_tensor.pin_memory()
-                output_tensor = output_tensor.pin_memory()
-        else:
-            prompt_tensor = None
-            output_tensor = None
         # need to transpose and make contiguous to
         # copy the tensor correctly.
         # [batch_size, n_seeds] -> [n_seeds, batch_size]
@@ -562,16 +554,6 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float],
             extra_seeds_gpu = None
         sampling_seeds_gpu = sampling_seeds_gpu[:num_base_seeds]
 
-        if do_penalties:
-            prompt_tokens_gpu = prompt_tensor.to(device=device,
-                                                 non_blocking=True)
-            output_tokens_gpu = output_tensor.to(device=device,
-                                                 non_blocking=True)
-        else:
-            empty_tensor = torch.empty(0, device=device, dtype=torch.long)
-            prompt_tokens_gpu = empty_tensor
-            output_tokens_gpu = empty_tensor
-
         return cls(
             temperatures=temperatures_t.to(device=device, non_blocking=True),
             top_ps=top_ps_t.to(device=device, non_blocking=True),
@@ -583,8 +565,8 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float],
                                        non_blocking=True),
             repetition_penalties=repetition_penalties_t.to(device=device,
                                                            non_blocking=True),
-            prompt_tokens=prompt_tokens_gpu,
-            output_tokens=output_tokens_gpu,
+            prompt_tokens=prompt_t.to(device=device, non_blocking=True),
+            output_tokens=output_t.to(device=device, non_blocking=True),
             sampling_seeds=sampling_seeds_gpu,
             sample_indices=sample_indices_t.to(device=device,
                                                non_blocking=True),
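For reference, a minimal sketch of the consolidated pattern introduced here (hypothetical token lists and vocab size; in the actual diff, `pin_memory` and `device` come from the surrounding `from_lists` context):

```python
import torch

from vllm.utils import make_tensor_with_pad

# Hypothetical ragged token lists; vocab_size is used as the pad value so
# that padded slots never collide with a real token id.
prompt_tokens = [[101, 7592, 2088], [101, 2023]]
vocab_size = 32_000

# One call builds the padded CPU tensor (optionally in pinned memory) ...
prompt_t = make_tensor_with_pad(
    prompt_tokens,
    vocab_size,
    dtype=torch.int64,
    device="cpu",
    pin_memory=False,  # set True when pinned memory is available
)

# ... which can then be copied to the GPU asynchronously, as from_lists does:
# prompt_gpu = prompt_t.to(device="cuda", non_blocking=True)
```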
In `vllm.utils`:
@@ -20,6 +20,7 @@
                     Union)
 
 import numpy as np
+import numpy.typing as npt
 import psutil
 import torch
 import torch.types
@@ -40,6 +41,15 @@
     "fp8_e5m2": torch.uint8,
 }
 
+TORCH_DTYPE_TO_NUMPY_DTYPE = {
+    torch.float16: np.float16,
+    torch.float32: np.float32,
+    torch.float64: np.float64,
+    torch.uint8: np.uint8,
+    torch.int32: np.int32,
+    torch.int64: np.int64,
+}
+
 P = ParamSpec('P')
 K = TypeVar("K")
 T = TypeVar("T")
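As a standalone illustration of why this mapping is useful (a sketch mirroring a subset of the table above, not part of the diff): `torch.from_numpy` preserves the NumPy dtype, so converting the requested torch dtype to its NumPy counterpart before building the array keeps the resulting tensor's dtype consistent.

```python
import numpy as np
import torch

# Subset of the TORCH_DTYPE_TO_NUMPY_DTYPE table added above.
TORCH_DTYPE_TO_NUMPY_DTYPE = {
    torch.int64: np.int64,
    torch.float32: np.float32,
}

np_dtype = TORCH_DTYPE_TO_NUMPY_DTYPE[torch.int64]
padded = np.full((2, 3), 0, dtype=np_dtype)

# torch.from_numpy keeps the dtype, so the round trip stays consistent.
assert torch.from_numpy(padded).dtype == torch.int64
```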
@@ -617,23 +627,54 @@ def str_to_int_tuple(s: str) -> Tuple[int, ...]:
         f"(e.g., 1, 2, 3). Given input: {s}") from e
 
 
-def make_tensor_with_pad(
-    x: List[List[int]],
-    max_len: int,
-    pad: int,
-    dtype: torch.dtype,
-    device: Optional[Union[str, torch.device]],
-) -> torch.Tensor:
-    """Make a padded tensor of a 2D inputs.
-
-    The padding is applied to the end of each inner list until it reaches
-    `max_len`.
-    """
-    padded_x = np.zeros([len(x), max_len], dtype=np.int32) + pad
-    for ind, blocktb in enumerate(x):
-        assert len(blocktb) <= max_len
-        padded_x[ind, :len(blocktb)] = blocktb
-    return torch.tensor(padded_x, dtype=dtype, device=device)
+def make_array_with_pad(
+    x: List[List[T]],
+    pad: T,
+    dtype: npt.DTypeLike,
+    *,
+    max_len: Optional[int] = None,
+) -> npt.NDArray:
+    """
+    Make a padded array from 2D inputs.
+
+    The padding is applied to the end of each inner list until it reaches
+    `max_len`.
+    """
+    if max_len is None:
+        # Unlike for most functions, map is faster than a genexpr over `len`
+        max_len = max(map(len, x), default=0)
+
+    padded_x = np.full((len(x), max_len), pad, dtype=dtype)
+    for ind, blocktb in enumerate(x):
+        assert len(blocktb) <= max_len
+        padded_x[ind, :len(blocktb)] = blocktb
+
+    return padded_x
+
+
+def make_tensor_with_pad(
+    x: List[List[T]],
+    pad: T,
+    dtype: torch.dtype,
+    *,
+    max_len: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    pin_memory: bool = False,
+) -> torch.Tensor:
+    """
+    Make a padded tensor from 2D inputs.
+
+    The padding is applied to the end of each inner list until it reaches
+    `max_len`.
+    """
+    np_dtype = TORCH_DTYPE_TO_NUMPY_DTYPE[dtype]
+    padded_x = make_array_with_pad(x, pad, np_dtype, max_len=max_len)
+
+    tensor = torch.from_numpy(padded_x).to(device)
+    if pin_memory:
+        tensor = tensor.pin_memory()
+
+    return tensor
 
 
 def async_tensor_h2d(

Review comment (on the switch to `np.full`):

    python -m timeit "import numpy as np; np.zeros(100000) + 2"
    python -m timeit "import numpy as np; np.full(100000, 2)"

I've also fixed the dtype to be consistent with the pytorch one.
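For illustration, a minimal usage sketch of the two new helpers (hypothetical inputs, CPU only):

```python
import numpy as np
import torch

from vllm.utils import make_array_with_pad, make_tensor_with_pad

x = [[1, 2, 3], [4]]

# NumPy path: rows are right-padded with the pad value up to the longest row
# (max_len is inferred when not given).
arr = make_array_with_pad(x, pad=0, dtype=np.int64)
# arr == [[1, 2, 3],
#         [4, 0, 0]]

# Torch path: same padding, returned as a torch tensor that can optionally be
# placed in pinned memory for faster host-to-device copies.
t = make_tensor_with_pad(x, pad=0, dtype=torch.int64, device="cpu")
assert t.shape == (2, 3)
assert t[1, 1].item() == 0
```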
Review comment: I found that this mapping is a duplicate of the one in `vllm.utils`, so I've removed it.