forked from vllm-project/vllm
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Kernel] Replaced blockReduce[...] functions with cub::BlockReduce (vllm-project#7233)
Co-authored-by: Michael Goin <[email protected]>
1 parent 3e051e6, commit 4d13eae
Showing 8 changed files with 237 additions and 116 deletions.
New file: @@ -0,0 +1,89 @@
import random
import time

import torch

from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser


@torch.inference_mode()
def main(num_tokens: int,
         hidden_size: int,
         add_residual: bool,
         dtype: torch.dtype,
         seed: int = 0,
         do_profile: bool = False,
         num_warmup_iters: int = 5,
         num_iters: int = 100) -> None:
    random.seed(seed)
    torch.random.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    torch.set_default_device("cuda")

    layer = RMSNorm(hidden_size).to(dtype=dtype)
    layer.weight.data.normal_(mean=1.0, std=0.1)
    scale = 1 / (2 * hidden_size)
    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
    x *= scale
    residual = torch.randn_like(x) * scale if add_residual else None

    def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
        torch.cuda.synchronize()
        if profile:
            torch.cuda.cudart().cudaProfilerStart()
        start_time = time.perf_counter()

        for _ in range(num_iters):
            # Launches vLLM's fused RMSNorm CUDA kernel (with the fused
            # residual add when a residual is provided).
            layer(x, residual)
        torch.cuda.synchronize()

        end_time = time.perf_counter()
        if profile:
            # Stop the profiler range opened above.
            torch.cuda.cudart().cudaProfilerStop()
        return (end_time - start_time) / num_iters

    # Warmup.
    print("Warming up...")
    run_benchmark = run_cuda_benchmark
    run_benchmark(num_iters=num_warmup_iters, profile=False)

    # Benchmark.
    if do_profile:
        latency = run_benchmark(num_iters=1, profile=True)
    else:
        latency = run_benchmark(num_iters=num_iters, profile=False)
    print(f"Kernel running time: {latency * 1000000:.3f} us")


if __name__ == '__main__':
    parser = FlexibleArgumentParser(
        description="Benchmark the layernorm kernel.")
    parser.add_argument("--num-tokens", type=int, default=4096)
    parser.add_argument("--hidden-size", type=int, default=8192)
    parser.add_argument("--add-residual", action="store_true")
    parser.add_argument("--dtype",
                        type=str,
                        choices=["half", "bfloat16", "float"],
                        default="half")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--num-warmup-iters", type=int, default=5)
    parser.add_argument("--num-iters",
                        type=int,
                        default=100,
                        help="Number of benchmark iterations. "
                        "If --profile is set, this number is ignored")

    args = parser.parse_args()
    print(args)

    main(num_tokens=args.num_tokens,
         hidden_size=args.hidden_size,
         add_residual=args.add_residual,
         dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
         seed=args.seed,
         do_profile=args.profile,
         num_warmup_iters=args.num_warmup_iters,
         num_iters=args.num_iters)
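For orientation (not part of the commit): RMSNorm scales each token by the reciprocal root mean square over the hidden dimension and then applies a learned weight, out = x / sqrt(mean(x**2) + eps) * weight. The sketch below is a naive eager-mode reference one could use to sanity-check the fused CUDA kernel this script times; the name rms_norm_ref is illustrative, and it deliberately simplifies the residual path (vLLM's layer returns a tuple when a residual is passed).

from typing import Optional

import torch


def rms_norm_ref(x: torch.Tensor,
                 weight: torch.Tensor,
                 eps: float = 1e-6,
                 residual: Optional[torch.Tensor] = None) -> torch.Tensor:
    """Naive RMSNorm reference: x / sqrt(mean(x^2) + eps) * weight."""
    if residual is not None:
        # Mirror the fused residual add of layer(x, residual).
        x = x + residual
    orig_dtype = x.dtype
    x = x.float()  # accumulate in float32 for numerical stability
    variance = x.pow(2).mean(dim=-1, keepdim=True)
    x = x * torch.rsqrt(variance + eps)
    return (x * weight.float()).to(orig_dtype)

A check along the lines of torch.testing.assert_close(layer(x), rms_norm_ref(x, layer.weight), rtol=1e-2, atol=1e-3) before the timing loop would catch correctness regressions from kernel changes such as the block-reduction rewrite in this commit.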
New file: @@ -0,0 +1,103 @@
import random
import time

import torch

from vllm import _custom_ops as ops
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser


@torch.inference_mode()
def main(num_tokens: int,
         hidden_size: int,
         static_scale: bool,
         quant_dtype: torch.dtype,
         dtype: torch.dtype,
         seed: int = 0,
         do_profile: bool = False,
         num_warmup_iters: int = 5,
         num_iters: int = 100) -> None:
    random.seed(seed)
    torch.random.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    torch.set_default_device("cuda")

    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
    scale = torch.randn(1, 1, dtype=torch.float32) if static_scale else None

    def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
        torch.cuda.synchronize()
        if profile:
            torch.cuda.cudart().cudaProfilerStart()
        start_time = time.perf_counter()

        for _ in range(num_iters):
            # Launch the int8 or fp8 quantization kernel
            # (dynamic scale when `scale` is None).
            if quant_dtype == torch.int8:
                ops.scaled_int8_quant(x, scale)
            else:
                ops.scaled_fp8_quant(x, scale)
        torch.cuda.synchronize()

        end_time = time.perf_counter()
        if profile:
            # Stop the profiler range opened above.
            torch.cuda.cudart().cudaProfilerStop()
        return (end_time - start_time) / num_iters

    # Warmup.
    print("Warming up...")
    run_benchmark = run_cuda_benchmark
    run_benchmark(num_iters=num_warmup_iters, profile=False)

    # Benchmark.
    if do_profile:
        latency = run_benchmark(num_iters=1, profile=True)
    else:
        latency = run_benchmark(num_iters=num_iters, profile=False)
    print(f"Kernel running time: {latency * 1000000:.3f} us")


if __name__ == '__main__':

    def to_torch_dtype(dt):
        if dt == "int8":
            return torch.int8
        if dt == "fp8":
            return torch.float8_e4m3fn
        raise ValueError(f"Unsupported dtype: {dt}")

    parser = FlexibleArgumentParser(
        description="Benchmark the quantization (fp8 or int8) kernel.")
    parser.add_argument("--num-tokens", type=int, default=4096)
    parser.add_argument("--hidden-size", type=int, default=8192)
    parser.add_argument("--static-scale", action="store_true")
    parser.add_argument("--quant-dtype",
                        type=str,
                        choices=["fp8", "int8"],
                        default="int8")
    parser.add_argument("--dtype",
                        type=str,
                        choices=["half", "bfloat16", "float"],
                        default="half")

    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--num-warmup-iters", type=int, default=5)
    parser.add_argument("--num-iters",
                        type=int,
                        default=100,
                        help="Number of benchmark iterations. "
                        "If --profile is set, this number is ignored")

    args = parser.parse_args()
    print(args)

    main(num_tokens=args.num_tokens,
         hidden_size=args.hidden_size,
         static_scale=args.static_scale,
         quant_dtype=to_torch_dtype(args.quant_dtype),
         dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
         seed=args.seed,
         do_profile=args.profile,
         num_warmup_iters=args.num_warmup_iters,
         num_iters=args.num_iters)
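For orientation (likewise not part of the commit): scaled_int8_quant performs symmetric int8 quantization, using the caller-supplied scale when --static-scale is set and deriving a scale from the data otherwise. The sketch below is a rough per-tensor reference; the name scaled_int8_quant_ref is illustrative, and the CUDA kernel's scale granularity and rounding may differ.

from typing import Optional, Tuple

import torch


def scaled_int8_quant_ref(
        x: torch.Tensor,
        scale: Optional[torch.Tensor] = None
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Symmetric int8 quantization: q = clamp(round(x / scale), -128, 127)."""
    if scale is None:
        # Dynamic scale from the observed absolute maximum (per tensor here;
        # the kernel may use a finer granularity, e.g. per token).
        scale = x.abs().amax().float() / 127.0
    q = (x.float() / scale).round().clamp(-128, 127).to(torch.int8)
    return q, scale

Computing such a dynamic scale on the GPU involves a block-wide max reduction, the kind of operation the commit title describes as moving from hand-written blockReduce helpers to cub::BlockReduce.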