diff --git a/.buildkite/lm-eval-harness/configs/Minitron-4B-Base.yaml b/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml similarity index 60% rename from .buildkite/lm-eval-harness/configs/Minitron-4B-Base.yaml rename to .buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml index a0466748ea71e..3ea0b7bb5cd66 100644 --- a/.buildkite/lm-eval-harness/configs/Minitron-4B-Base.yaml +++ b/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml @@ -1,11 +1,11 @@ -# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nvidia/Minitron-4B-Base -b auto -l 1000 -f 5 -t 1 -model_name: "nvidia/Minitron-4B-Base" +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1 +model_name: "mgoin/Minitron-4B-Base-FP8" tasks: - name: "gsm8k" metrics: - name: "exact_match,strict-match" - value: 0.252 + value: 0.233 - name: "exact_match,flexible-extract" - value: 0.252 + value: 0.236 limit: 1000 num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/models-small.txt b/.buildkite/lm-eval-harness/configs/models-small.txt index bca89f00653e3..bb9cd43e2df04 100644 --- a/.buildkite/lm-eval-harness/configs/models-small.txt +++ b/.buildkite/lm-eval-harness/configs/models-small.txt @@ -4,7 +4,7 @@ Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml -Minitron-4B-Base.yaml +Minitron-4B-Base-FP8.yaml Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml Qwen2-1.5B-Instruct-FP8W8.yaml Meta-Llama-3-8B-QQQ.yaml diff --git a/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh b/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh index 1a88d038b4b52..f6e41fcfdd0be 100644 --- a/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh +++ b/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh @@ -70,23 +70,13 @@ wait_for_server() { kill_gpu_processes() { # kill all processes on GPU. - pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader) - if [ -z "$pids" ]; then - echo "No GPU processes found." - else - for pid in $pids; do - kill -9 "$pid" - echo "Killed process with PID: $pid" - done - echo "All GPU processes have been killed." - fi + ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9 + ps -e | grep pt_main_thread | awk '{print $1}' | xargs kill -9 - # waiting for GPU processes to be fully killed - # loop while nvidia-smi returns any processes - while [ -n "$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)" ]; do + # wait until GPU memory usage smaller than 1GB + while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do sleep 1 - echo "Waiting for GPU processes to be killed" done # remove vllm config file diff --git a/.buildkite/nightly-benchmarks/tests/descriptions.md b/.buildkite/nightly-benchmarks/tests/descriptions.md index 891e4917070d9..da32d1f073cea 100644 --- a/.buildkite/nightly-benchmarks/tests/descriptions.md +++ b/.buildkite/nightly-benchmarks/tests/descriptions.md @@ -1,47 +1,42 @@ ## Latency tests -This test suite aims to test vllm's end-to-end latency under a controlled setup. - - Input length: 32 tokens. - Output length: 128 tokens. - Batch size: fixed (8). -- Models: llama-3 8B, llama-3 70B, mixtral 8x7B. +- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. - Evaluation metrics: end-to-end latency (mean, median, p99). 
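[Reviewer note] The rewritten `kill_gpu_processes` in `run-benchmarks-suite.sh` above now kills the serving processes by name and then polls `nvidia-smi` until the reported GPU memory usage drops below 1 GB. A minimal Python sketch of that same wait loop is below — an illustrative helper only, not part of the suite; it assumes `nvidia-smi` is on `PATH` and, like the bash loop (`head -n 1`), checks only the first GPU by default.

```python
# Illustrative helper (not part of run-benchmarks-suite.sh): poll nvidia-smi
# until the selected GPU reports less than `threshold_mib` MiB of used memory,
# mirroring the bash loop in the revised kill_gpu_processes above.
import subprocess
import time


def wait_for_gpu_memory_to_clear(threshold_mib: int = 1000,
                                 gpu_index: int = 0) -> None:
    while True:
        out = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=memory.used",
             "--format=csv,noheader,nounits"],
            text=True)
        used_mib = int(out.splitlines()[gpu_index].strip())
        if used_mib < threshold_mib:
            return
        time.sleep(1)
```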
-### Latency benchmarking results {latency_tests_markdown_table} -## Throughput tests -This test suite aims to test vllm's throughput. +## Throughput tests - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed). - Output length: the corresponding output length of these 200 prompts. - Batch size: dynamically determined by vllm to achieve maximum throughput. -- Models: llama-3 8B, llama-3 70B, mixtral 8x7B. +- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. - Evaluation metrics: throughput. -### Throughput benchmarking results {throughput_tests_markdown_table} -## Serving tests -This test suite aims to test vllm's real serving metrics. +## Serving tests - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed). - Output length: the corresponding output length of these 200 prompts. - Batch size: dynamically determined by vllm and the arrival pattern of the requests. - **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed). -- Models: llama-3 8B, llama-3 70B, mixtral 8x7B. +- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. +- We also added a speculative decoding test for llama-3 70B, under QPS 2 - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99). -### Serving benchmarking results {serving_tests_markdown_table} + ## json version of the benchmarking tables This section contains the data of the markdown tables above in JSON format. diff --git a/.buildkite/nightly-benchmarks/tests/latency-tests.json b/.buildkite/nightly-benchmarks/tests/latency-tests.json index 06488cd79110a..1841186da158f 100644 --- a/.buildkite/nightly-benchmarks/tests/latency-tests.json +++ b/.buildkite/nightly-benchmarks/tests/latency-tests.json @@ -2,7 +2,7 @@ { "test_name": "latency_llama8B_tp1", "parameters": { - "model": "meta-llama/Meta-Llama-3-8B", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, "load_format": "dummy", "num_iters_warmup": 5, @@ -12,7 +12,7 @@ { "test_name": "latency_llama70B_tp4", "parameters": { - "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "tensor_parallel_size": 4, "load_format": "dummy", "num-iters-warmup": 5, diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests.json b/.buildkite/nightly-benchmarks/tests/serving-tests.json index 300af0524d7c0..facb0eac749ca 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests.json @@ -3,7 +3,7 @@ "test_name": "serving_llama8B_tp1_sharegpt", "qps_list": [1, 4, 16, "inf"], "server_parameters": { - "model": "meta-llama/Meta-Llama-3-8B", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, "swap_space": 16, "disable_log_stats": "", @@ -11,7 +11,7 @@ "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3-8B", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", @@ -22,7 +22,7 @@ "test_name": "serving_llama70B_tp4_sharegpt", "qps_list": [1, 4, 16, "inf"], "server_parameters": { - "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "tensor_parallel_size": 4, "swap_space": 
16, "disable_log_stats": "", @@ -30,7 +30,7 @@ "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", @@ -60,7 +60,7 @@ "test_name": "serving_llama70B_tp4_sharegpt_specdecode", "qps_list": [2], "server_parameters": { - "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "disable_log_requests": "", "tensor_parallel_size": 4, "swap_space": 16, @@ -70,7 +70,7 @@ "use_v2_block_manager": "" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", diff --git a/.buildkite/nightly-benchmarks/tests/throughput-tests.json b/.buildkite/nightly-benchmarks/tests/throughput-tests.json index 41ac135748704..91ef6d16be638 100644 --- a/.buildkite/nightly-benchmarks/tests/throughput-tests.json +++ b/.buildkite/nightly-benchmarks/tests/throughput-tests.json @@ -2,7 +2,7 @@ { "test_name": "throughput_llama8B_tp1", "parameters": { - "model": "meta-llama/Meta-Llama-3-8B", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, "load_format": "dummy", "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", @@ -13,7 +13,7 @@ { "test_name": "throughput_llama70B_tp4", "parameters": { - "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "tensor_parallel_size": 4, "load_format": "dummy", "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 264b7f58ad1ac..7ab9c2056bc36 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -56,7 +56,7 @@ steps: - pytest -v -s worker # Worker - label: Basic Correctness Test # 30min - mirror_hardwares: [amd] + #mirror_hardwares: [amd] fast_check: true source_file_dependencies: - vllm/ @@ -81,7 +81,7 @@ steps: - label: Entrypoints Test # 20min working_dir: "/vllm-workspace/tests" fast_check: true - mirror_hardwares: [amd] + #mirror_hardwares: [amd] source_file_dependencies: - vllm/ commands: @@ -101,10 +101,9 @@ steps: - pytest -v -s distributed/test_pynccl.py - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py -##### fast check tests ##### -##### 1 GPU test ##### - - label: Metrics, Tracing Test # 10min + num_gpus: 2 + fast_check: true source_file_dependencies: - vllm/ - tests/metrics @@ -118,6 +117,9 @@ steps: opentelemetry-semantic-conventions-ai" - pytest -v -s tracing +##### fast check tests ##### +##### 1 GPU test ##### + - label: Regression Test # 5min mirror_hardwares: [amd] source_file_dependencies: @@ -139,7 +141,7 @@ steps: - label: Examples Test # 12min working_dir: "/vllm-workspace/examples" - mirror_hardwares: [amd] + #mirror_hardwares: [amd] source_file_dependencies: - vllm/entrypoints - examples/ @@ -171,14 +173,14 @@ steps: - label: Vision Language Models Test # 42min - mirror_hardwares: [amd] + #mirror_hardwares: [amd] source_file_dependencies: - vllm/ commands: - pytest -v -s models -m vlm - label: Prefix Caching Test # 7min - mirror_hardwares: [amd] + #mirror_hardwares: [amd] source_file_dependencies: - vllm/ - tests/prefix_caching @@ -261,7 +263,6 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - bash ./run-tests.sh -c 
configs/models-small.txt -t 1 - ##### 1 GPU test ##### ##### multi gpus test ##### @@ -290,7 +291,7 @@ steps: - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py - label: Distributed Tests (2 GPUs) # 28min - mirror_hardwares: [amd] + #mirror_hardwares: [amd] working_dir: "/vllm-workspace/tests" num_gpus: 2 source_file_dependencies: @@ -313,11 +314,11 @@ steps: num_gpus: 4 source_file_dependencies: - vllm/ - - tests/distributed/test_pipeline_parallel - tests/distributed/test_pp_cudagraph.py + - tests/distributed/test_pipeline_parallel commands: - - pytest -v -s distributed/test_pipeline_parallel.py - pytest -v -s distributed/test_pp_cudagraph.py + - pytest -v -s distributed/test_pipeline_parallel.py - label: LoRA Long Context (Distributed) # 11min # This test runs llama 13B, so it is required to run on 4 GPUs. diff --git a/.gitignore b/.gitignore index 17184b19127ca..2dfbf64d75c1b 100644 --- a/.gitignore +++ b/.gitignore @@ -189,4 +189,4 @@ _build/ hip_compat.h # Benchmark dataset -*.json +benchmarks/*.json diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index e00696d6d43cb..fd233c71b10a6 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -30,19 +30,36 @@ def benchmark_config( hidden_size: int, topk: int, dtype: torch.dtype, - use_fp8: bool, + use_fp8_w8a8: bool, + use_int8_w8a16: bool, num_iters: int = 100, ) -> float: - init_dtype = torch.float16 if use_fp8 else dtype + init_dtype = torch.float16 if use_fp8_w8a8 else dtype x = torch.randn(num_tokens, hidden_size, dtype=dtype) - w1 = torch.randn(num_experts, - shard_intermediate_size, - hidden_size, - dtype=init_dtype) - w2 = torch.randn(num_experts, - hidden_size, - shard_intermediate_size // 2, - dtype=init_dtype) + if use_int8_w8a16: + w1 = torch.randint(-127, + 127, ( + num_experts, + shard_intermediate_size, + hidden_size, + ), + dtype=torch.int8) + w2 = torch.randint(-127, + 127, ( + num_experts, + hidden_size, + shard_intermediate_size // 2, + ), + dtype=torch.int8) + else: + w1 = torch.randn(num_experts, + shard_intermediate_size, + hidden_size, + dtype=init_dtype) + w2 = torch.randn(num_experts, + hidden_size, + shard_intermediate_size // 2, + dtype=init_dtype) gating_output = torch.randn(num_iters, num_tokens, num_experts, @@ -52,7 +69,11 @@ def benchmark_config( w2_scale = None a1_scale = None a2_scale = None - if use_fp8: + if use_int8_w8a16: + w1_scale = torch.randn((num_experts, 2 * shard_intermediate_size), + dtype=torch.float32) + w2_scale = torch.randn((hidden_size, num_experts), dtype=torch.float32) + if use_fp8_w8a8: w1_scale = torch.randn(num_experts, dtype=torch.float32) w2_scale = torch.randn(num_experts, dtype=torch.float32) a1_scale = torch.randn(1, dtype=torch.float32) @@ -76,7 +97,8 @@ def run(): renormalize=True, inplace=True, override_config=config, - use_fp8=use_fp8, + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a16=use_int8_w8a16, w1_scale=w1_scale, w2_scale=w2_scale, a1_scale=a1_scale, @@ -155,11 +177,13 @@ def benchmark( hidden_size: int, topk: int, dtype: torch.dtype, - use_fp8: bool, + use_fp8_w8a8: bool, + use_int8_w8a16: bool, ) -> Tuple[Dict[str, int], float]: torch.cuda.manual_seed_all(self.seed) - - dtype_str = "float8" if use_fp8 else None + dtype_str = get_config_dtype_str(dtype, + use_int8_w8a16=use_int8_w8a16, + use_fp8_w8a8=use_fp8_w8a8) # NOTE(woosuk): The current naming convention uses w2.shape[2], which # is the intermediate 
size after silu_and_mul. op_config = get_moe_configs(num_experts, shard_intermediate_size // 2, @@ -173,7 +197,8 @@ def benchmark( key=lambda x: abs(x - num_tokens))] kernel_time = benchmark_config(config, num_tokens, num_experts, shard_intermediate_size, hidden_size, - topk, dtype, use_fp8) + topk, dtype, use_fp8_w8a8, + use_int8_w8a16) return config, kernel_time def tune( @@ -184,9 +209,10 @@ def tune( hidden_size: int, topk: int, dtype: torch.dtype, - use_fp8: bool, - search_space: List[BenchmarkConfig], - ) -> BenchmarkConfig: + use_fp8_w8a8: bool, + use_int8_w8a16: bool, + search_space: List[Dict[str, int]], + ) -> Dict[str, int]: best_config = None best_time = float("inf") for config in tqdm(search_space): @@ -198,7 +224,8 @@ def tune( hidden_size, topk, dtype, - use_fp8, + use_fp8_w8a8, + use_int8_w8a16, num_iters=10) except triton.runtime.autotuner.OutOfResources: # Some configurations may be invalid and fail to compile. @@ -224,20 +251,19 @@ def sort_config(config: BenchmarkConfig) -> BenchmarkConfig: } -def save_configs( - configs: Dict[int, BenchmarkConfig], - num_experts: int, - shard_intermediate_size: int, - hidden_size: int, - topk: int, - dtype: torch.dtype, - use_fp8: bool, -) -> None: - dtype_str = "float8" if use_fp8 else None +def save_configs(configs: Dict[int, BenchmarkConfig], num_experts: int, + shard_intermediate_size: int, hidden_size: int, topk: int, + dtype: torch.dtype, use_fp8_w8a8: bool, + use_int8_w8a16: bool) -> None: + dtype_str = get_config_dtype_str(dtype, + use_int8_w8a16=use_int8_w8a16, + use_fp8_w8a8=use_fp8_w8a8) + # NOTE(woosuk): The current naming convention uses w2.shape[2], which # is the intermediate size after silu_and_mul. filename = get_config_file_name(num_experts, shard_intermediate_size // 2, dtype_str) + print(f"Writing best config to {filename}...") with open(filename, "w") as f: json.dump(configs, f, indent=4) @@ -253,6 +279,11 @@ def main(args: argparse.Namespace): topk = config.ffn_config.moe_top_k intermediate_size = config.ffn_config.ffn_hidden_size shard_intermediate_size = 2 * intermediate_size // args.tp_size + elif config.architectures[0] == "JambaForCausalLM": + E = config.num_experts + topk = config.num_experts_per_tok + intermediate_size = config.intermediate_size + shard_intermediate_size = 2 * intermediate_size // args.tp_size else: # Default: Mixtral. 
E = config.num_local_experts @@ -262,7 +293,8 @@ def main(args: argparse.Namespace): hidden_size = config.hidden_size dtype = config.torch_dtype - use_fp8 = args.dtype == "fp8" + use_fp8_w8a8 = args.dtype == "fp8_w8a8" + use_int8_w8a16 = args.dtype == "int8_w8a16" if args.batch_size is None: batch_sizes = [ @@ -294,21 +326,21 @@ def _distribute(method: str, inputs: List[Any]) -> List[Any]: start = time.time() configs = _distribute( "tune", [(batch_size, E, shard_intermediate_size, hidden_size, - topk, dtype, use_fp8, search_space) + topk, dtype, use_fp8_w8a8, use_int8_w8a16, search_space) for batch_size in batch_sizes]) best_configs = { M: sort_config(config) for M, config in zip(batch_sizes, configs) } save_configs(best_configs, E, shard_intermediate_size, hidden_size, - topk, dtype, use_fp8) + topk, dtype, use_fp8_w8a8, use_int8_w8a16) end = time.time() print(f"Tuning took {end - start:.2f} seconds") else: - outputs = _distribute("benchmark", - [(batch_size, E, shard_intermediate_size, - hidden_size, topk, dtype, use_fp8) - for batch_size in batch_sizes]) + outputs = _distribute( + "benchmark", [(batch_size, E, shard_intermediate_size, hidden_size, + topk, dtype, use_fp8_w8a8, use_int8_w8a16) + for batch_size in batch_sizes]) for batch_size, (config, kernel_time) in zip(batch_sizes, outputs): print(f"Batch size: {batch_size}, config: {config}") @@ -323,7 +355,7 @@ def _distribute(method: str, inputs: List[Any]) -> List[Any]: parser.add_argument("--tp-size", "-tp", type=int, default=2) parser.add_argument("--dtype", type=str, - choices=["auto", "fp8"], + choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto") parser.add_argument("--seed", type=int, default=0) parser.add_argument("--batch-size", type=int, required=False) diff --git a/csrc/core/scalar_type.hpp b/csrc/core/scalar_type.hpp index e0f4d9f3c5cf3..b1e10fecb6b54 100644 --- a/csrc/core/scalar_type.hpp +++ b/csrc/core/scalar_type.hpp @@ -313,6 +313,8 @@ class ScalarType { // have ScalarType inherit from torch::CustomClassHolder and have a constexpr // constructor at the same time (torch::CustomClassHolder does not have a // constexpr destructor) +// See also: +// https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA class ScalarTypeTorch : public torch::CustomClassHolder, public ScalarType { public: ScalarTypeTorch(int64_t exponent, int64_t mantissa, int64_t bias, @@ -382,6 +384,29 @@ class ScalarTypeTorch : public torch::CustomClassHolder, public ScalarType { exponent, mantissa, finite_values_only, NanRepr(nan_repr))); } + // This needs to be implemented and throw a TypeError in order for + // PyTorch's opcheck to work on ops that use ScalarTypes. + int64_t len() const { + throw c10::TypeError("__len__ not implemented"); + return 0; + } + + // Serialize a ScalarType into a tuple of pairs. Where each pair + // is a (fieldname, value). + // For simplicity, we are just going to convert to a ScalarTypeId. + std::tuple> obj_flatten() const { + return {{"ScalarType", id()}}; + } + + // Deserialize a scalar type that has been serialized by obj_flatten, + // ostensibly from a tuple of (member name, value) pairs, but in reality + // just a ScalarTypeId. 
+ static SelfPtr obj_unflatten( + std::tuple> const& flat_type) { + return c10::make_intrusive( + from_id(std::get<1>(std::get<0>(flat_type)))); + } + template static void bind_readonly_property(torch::class_& cls, std::string const& name, T Base::*field) { @@ -457,6 +482,7 @@ class ScalarTypeTorch : public torch::CustomClassHolder, public ScalarType { self.get()->min()); }); + bind_function(cls, "__len__", &ScalarTypeTorch::len); bind_function(cls, "__str__", &Base::str); bind_function(cls, "__eq__", [](SelfPtr const& self, SelfPtr const& other) { return *self == *other; @@ -465,6 +491,10 @@ class ScalarTypeTorch : public torch::CustomClassHolder, public ScalarType { return "ScalarType." + self.get()->str(); }); + bind_function(cls, "__obj_flatten__", &ScalarTypeTorch::obj_flatten); + bind_static_function(cls, "__obj_unflatten__", + &ScalarTypeTorch::obj_unflatten); + // Bind static functions (convenience constructors) bind_static_function(cls, "int_", &ScalarTypeTorch::int_); bind_static_function(cls, "uint", &ScalarTypeTorch::uint); diff --git a/csrc/ops.h b/csrc/ops.h index 023455f8a1530..6094599901022 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -63,12 +63,12 @@ void advance_step(int64_t num_seqs, int64_t num_queries, int64_t block_size, torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes, const torch::Tensor& codebooks, const torch::Tensor& scales, - const torch::Tensor& codebook_partition_sizes, + const std::vector& codebook_partition_sizes, const std::optional& bias); -torch::Tensor aqlm_dequant(const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& codebook_partition_sizes); +torch::Tensor aqlm_dequant( + const torch::Tensor& codes, const torch::Tensor& codebooks, + const std::vector& codebook_partition_sizes); torch::Tensor awq_gemm(torch::Tensor _in_feats, torch::Tensor _kernel, torch::Tensor _scaling_factors, torch::Tensor _zeros, @@ -107,13 +107,13 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm, torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k, int64_t size_n, int64_t num_bits); -torch::Tensor ggml_dequantize(torch::Tensor W, int8_t type, int64_t m, +torch::Tensor ggml_dequantize(torch::Tensor W, int64_t type, int64_t m, int64_t n); -torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W, torch::Tensor X, int8_t type, - int64_t row); +torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W, torch::Tensor X, + int64_t type, int64_t row); -torch::Tensor ggml_mul_mat_a8(torch::Tensor W, torch::Tensor X, int8_t type, +torch::Tensor ggml_mul_mat_a8(torch::Tensor W, torch::Tensor X, int64_t type, int64_t row); torch::Tensor fp8_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, diff --git a/csrc/quantization/aqlm/gemm_kernels.cu b/csrc/quantization/aqlm/gemm_kernels.cu index 22da5e4f08a18..79cd2c610b3c2 100644 --- a/csrc/quantization/aqlm/gemm_kernels.cu +++ b/csrc/quantization/aqlm/gemm_kernels.cu @@ -496,14 +496,14 @@ torch::Tensor code2x8_matmat(const torch::Tensor& input, } // Accumulate the partition sizes. 
-int4 accumulate_sizes(const torch::Tensor& codebook_partition_sizes) { +int4 accumulate_sizes(const std::vector& codebook_partition_sizes) { int4 cumulative_sizes; auto cumulative_size = &cumulative_sizes.x; - int i = 0; + size_t i = 0; int last = 0; - assert(codebook_partition_sizes.size(0) <= 4); - for (; i < codebook_partition_sizes.size(0); ++i, ++cumulative_size) { - *cumulative_size = codebook_partition_sizes[i].item() + last; + assert(codebook_partition_sizes.size() <= 4); + for (; i < codebook_partition_sizes.size(); ++i, ++cumulative_size) { + *cumulative_size = codebook_partition_sizes[i] + last; last = *cumulative_size; } // fill in the rest with unreachable. @@ -519,12 +519,12 @@ int4 accumulate_sizes(const torch::Tensor& codebook_partition_sizes) { torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes, const torch::Tensor& codebooks, const torch::Tensor& scales, - const torch::Tensor& codebook_partition_sizes, + const std::vector& codebook_partition_sizes, const std::optional& bias) { int4 cumulative_sizes = vllm::aqlm::accumulate_sizes(codebook_partition_sizes); - int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0); + int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(); int const entries = codebooks.size(1); if (nbooks == 1 && entries == (1 << 16)) { @@ -541,13 +541,13 @@ torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes, return {}; } -torch::Tensor aqlm_dequant(const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& codebook_partition_sizes) { +torch::Tensor aqlm_dequant( + const torch::Tensor& codes, const torch::Tensor& codebooks, + const std::vector& codebook_partition_sizes) { int4 cumulative_sizes = vllm::aqlm::accumulate_sizes(codebook_partition_sizes); - int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0); + int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(); int const entries = codebooks.size(1); const at::cuda::OptionalCUDAGuard device_guard(device_of(codes)); @@ -557,7 +557,8 @@ torch::Tensor aqlm_dequant(const torch::Tensor& codes, auto in_features = codes.size(1) * 8; auto out_features = codes.size(0); - assert(out_features = codebook_partition_sizes.sum().item()); + assert(out_features == std::accumulate(codebook_partition_sizes.begin(), + codebook_partition_sizes.end(), 0)); auto weights = torch::empty({out_features, in_features}, torch::TensorOptions() diff --git a/csrc/quantization/fp8/common.cu b/csrc/quantization/fp8/common.cu index 6dae32b25f9c4..3f77c76ae7ec4 100644 --- a/csrc/quantization/fp8/common.cu +++ b/csrc/quantization/fp8/common.cu @@ -9,6 +9,18 @@ #include "../../reduction_utils.cuh" +#ifndef USE_ROCM +using FP8_TYPE = c10::Float8_e4m3fn; +C10_HOST_DEVICE constexpr auto FP8_E4M3_MAX = + std::numeric_limits::max(); +#else + #include "amd/hip_float8.h" +using FP8_TYPE = c10::Float8_e4m3fnuz; +// Using the default max value from pytorch (240.0) will cause accuracy +// issue when running dynamic quantization. Here use 224.0f for rocm. 
+constexpr auto FP8_E4M3_MAX = 224.0f; +#endif + namespace vllm { __device__ __forceinline__ float atomicMaxFloat(float* addr, float value) { @@ -21,11 +33,9 @@ __device__ __forceinline__ float atomicMaxFloat(float* addr, float value) { return old; } -#define FP8_E4M3_MAX std::numeric_limits::max() - template -__device__ __forceinline__ c10::Float8_e4m3fn scaled_fp8_conversion( - float const val, float const scale) { +__device__ __forceinline__ FP8_TYPE scaled_fp8_conversion(float const val, + float const scale) { float x = 0.0f; if constexpr (is_scale_inverted) { x = val * scale; @@ -34,7 +44,13 @@ __device__ __forceinline__ c10::Float8_e4m3fn scaled_fp8_conversion( } float r = fmax(-FP8_E4M3_MAX, fmin(x, FP8_E4M3_MAX)); +#ifndef USE_ROCM return static_cast(r); +#else + // Use hardware cvt instruction for fp8 on rocm + return c10::Float8_e4m3fnuz(hip_fp8(r).data, + c10::Float8_e4m3fnuz::from_bits()); +#endif } // Compute the absolute maximum m of the input tensor and store @@ -74,8 +90,7 @@ __global__ void segmented_max_reduction(float* __restrict__ scale, // Finally, since cache[0] contains the maximum for this thread block, // atomically write the max to the target location if (threadIdx.x == 0) { - atomicMaxFloat(scale, - cache[0] / std::numeric_limits::max()); + atomicMaxFloat(scale, cache[0] / FP8_E4M3_MAX); } } @@ -88,10 +103,10 @@ struct __align__(8) vec4_t { }; typedef struct __align__(4) { - c10::Float8_e4m3fn x; - c10::Float8_e4m3fn y; - c10::Float8_e4m3fn z; - c10::Float8_e4m3fn w; + FP8_TYPE x; + FP8_TYPE y; + FP8_TYPE z; + FP8_TYPE w; } float8x4_t; @@ -124,7 +139,7 @@ __device__ float thread_max_vec(scalar_t const* __restrict__ input, } template -__device__ void scaled_fp8_conversion_vec(c10::Float8_e4m3fn* __restrict__ out, +__device__ void scaled_fp8_conversion_vec(FP8_TYPE* __restrict__ out, scalar_t const* __restrict__ input, float const scale, int64_t const num_elems, @@ -160,7 +175,7 @@ __device__ void scaled_fp8_conversion_vec(c10::Float8_e4m3fn* __restrict__ out, } template -__global__ void scaled_fp8_quant_kernel(c10::Float8_e4m3fn* __restrict__ out, +__global__ void scaled_fp8_quant_kernel(FP8_TYPE* __restrict__ out, const scalar_t* __restrict__ input, const float* __restrict__ scale, int64_t num_elems) { @@ -175,7 +190,7 @@ __global__ void scaled_fp8_quant_kernel(c10::Float8_e4m3fn* __restrict__ out, template __global__ void dynamic_per_token_scaled_fp8_quant_kernel( - c10::Float8_e4m3fn* __restrict__ out, float* __restrict__ scale, + FP8_TYPE* __restrict__ out, float* __restrict__ scale, scalar_t const* __restrict__ input, float const* __restrict__ scale_ub, const int hidden_size) { float const min_scaling_factor = 1.0f / (FP8_E4M3_MAX * 512.f); @@ -184,7 +199,7 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel( int const token_idx = blockIdx.x; scalar_t const* __restrict__ token_input = &input[token_idx * hidden_size]; - c10::Float8_e4m3fn* __restrict__ token_output = &out[token_idx * hidden_size]; + FP8_TYPE* __restrict__ token_output = &out[token_idx * hidden_size]; // For vectorization, token_input and token_output pointers need to be // aligned at 8-byte and 4-byte addresses respectively. 
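[Reviewer note] The `common.cu` changes above switch the quantization kernels to a platform-dependent `FP8_TYPE`: `c10::Float8_e4m3fn` on CUDA versus `c10::Float8_e4m3fnuz` with a 224.0 maximum on ROCm. A minimal PyTorch sketch of the per-element clamp-and-cast those kernels perform is below — an illustrative reference, not the CUDA/HIP kernel; it assumes a PyTorch build that exposes both float8 dtypes and shows only the non-inverted-scale, non-vectorized path.

```python
# Illustrative reference (not the CUDA/HIP kernel): the per-element
# clamp-and-cast performed by scaled_fp8_conversion, with the FP8 dtype and
# max value chosen per platform.
import torch


def scaled_fp8_convert(x: torch.Tensor, scale: torch.Tensor,
                       is_rocm: bool = False) -> torch.Tensor:
    fp8_dtype = torch.float8_e4m3fnuz if is_rocm else torch.float8_e4m3fn
    # ROCm uses 224.0 instead of torch.finfo(torch.float8_e4m3fnuz).max
    # (240.0) to avoid the accuracy issue noted in the comment above.
    fp8_max = 224.0 if is_rocm else torch.finfo(fp8_dtype).max
    r = (x.float() / scale).clamp(min=-fp8_max, max=fp8_max)
    return r.to(fp8_dtype)
```

This mirrors the reference helpers added to `tests/kernels/quant_utils.py` later in this diff, which clamp to ±224.0 when running on ROCm.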
@@ -241,7 +256,7 @@ void static_scaled_fp8_quant(torch::Tensor& out, // [..., d] VLLM_DISPATCH_FLOATING_TYPES( input.scalar_type(), "scaled_fp8_quant_kernel", [&] { vllm::scaled_fp8_quant_kernel<<>>( - out.data_ptr(), input.data_ptr(), + out.data_ptr(), input.data_ptr(), scale.data_ptr(), num_elems); }); } @@ -261,7 +276,7 @@ void dynamic_scaled_fp8_quant(torch::Tensor& out, // [..., d] vllm::segmented_max_reduction<<>>( scale.data_ptr(), input.data_ptr(), num_elems); vllm::scaled_fp8_quant_kernel<<>>( - out.data_ptr(), input.data_ptr(), + out.data_ptr(), input.data_ptr(), scale.data_ptr(), num_elems); }); } @@ -284,7 +299,7 @@ void dynamic_per_token_scaled_fp8_quant( input.scalar_type(), "dynamic_per_token_scaled_fp8_quant_kernel", [&] { vllm::dynamic_per_token_scaled_fp8_quant_kernel <<>>( - out.data_ptr(), scales.data_ptr(), + out.data_ptr(), scales.data_ptr(), input.data_ptr(), scale_ub.has_value() ? scale_ub->data_ptr() : nullptr, hidden_size); diff --git a/csrc/quantization/gguf/dequantize.cuh b/csrc/quantization/gguf/dequantize.cuh index 03c080f645f02..2069fba759ea0 100644 --- a/csrc/quantization/gguf/dequantize.cuh +++ b/csrc/quantization/gguf/dequantize.cuh @@ -487,7 +487,7 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k, dequantize_block_iq4_xs<<>>(vx, y); } -static to_fp16_cuda_t ggml_get_to_fp16_cuda(int type) { +static to_fp16_cuda_t ggml_get_to_fp16_cuda(int64_t type) { switch (type) { case 2: return dequantize_block_cuda; diff --git a/csrc/quantization/gguf/gguf_kernel.cu b/csrc/quantization/gguf/gguf_kernel.cu index 9beae1bec4034..966d9992b25fd 100644 --- a/csrc/quantization/gguf/gguf_kernel.cu +++ b/csrc/quantization/gguf/gguf_kernel.cu @@ -60,7 +60,7 @@ static void quantize_row_q8_1_cuda(const half* x, void* vy, const int kx, } torch::Tensor ggml_dequantize(torch::Tensor W, // quant weight - int8_t type, int64_t m, int64_t n) { + int64_t type, int64_t m, int64_t n) { const at::cuda::OptionalCUDAGuard device_guard(device_of(W)); auto options = torch::TensorOptions().dtype(torch::kFloat16).device(W.device()); @@ -73,7 +73,7 @@ torch::Tensor ggml_dequantize(torch::Tensor W, // quant weight torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W, // quant weight torch::Tensor X, // input - int8_t type, int64_t row) { + int64_t type, int64_t row) { int col = X.sizes()[1]; const int padded = (col + 512 - 1) / 512 * 512; const at::cuda::OptionalCUDAGuard device_guard(device_of(X)); @@ -172,7 +172,7 @@ torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W, // quant weight torch::Tensor ggml_mul_mat_a8(torch::Tensor W, // quant weight torch::Tensor X, // input - int8_t type, int64_t row) { + int64_t type, int64_t row) { int col = X.sizes()[1]; int padded = (col + 512 - 1) / 512 * 512; int batch = X.sizes()[0]; @@ -239,4 +239,4 @@ torch::Tensor ggml_mul_mat_a8(torch::Tensor W, // quant weight break; } return Y; -} \ No newline at end of file +} diff --git a/docs/source/index.rst b/docs/source/index.rst index 54e4806354575..4e79871e6e78f 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -107,6 +107,7 @@ Documentation quantization/supported_hardware quantization/auto_awq quantization/bnb + quantization/int8 quantization/fp8 quantization/fp8_e5m2_kvcache quantization/fp8_e4m3_kvcache diff --git a/docs/source/quantization/fp8.rst b/docs/source/quantization/fp8.rst index 7f796fc3ab458..d7d9b21b4b949 100644 --- a/docs/source/quantization/fp8.rst +++ b/docs/source/quantization/fp8.rst @@ -1,6 +1,6 @@ .. 
_fp8: -FP8 +FP8 W8A8 ================== vLLM supports FP8 (8-bit floating point) weight and activation quantization using hardware acceleration on GPUs such as Nvidia H100 and AMD MI300x. @@ -15,6 +15,11 @@ The FP8 types typically supported in hardware have two distinct representations, - **E4M3**: Consists of 1 sign bit, 4 exponent bits, and 3 bits of mantissa. It can store values up to +/-448 and ``nan``. - **E5M2**: Consists of 1 sign bit, 5 exponent bits, and 2 bits of mantissa. It can store values up to +/-57344, +/- ``inf``, and ``nan``. The tradeoff for the increased dynamic range is lower precision of the stored values. +.. note:: + + FP8 computation is supported on NVIDIA GPUs with compute capability > 8.9 (Ada Lovelace, Hopper). + FP8 models will run on compute capability > 8.0 (Ampere) as weight-only W8A16, utilizing FP8 Marlin. + Quick Start with Online Dynamic Quantization -------------------------------------------- @@ -33,106 +38,134 @@ In this mode, all Linear modules (except for the final ``lm_head``) have their w Currently, we load the model at original precision before quantizing down to 8-bits, so you need enough memory to load the whole model. -Offline Quantization +Installation +------------ + +To produce performant FP8 quantized models with vLLM, you'll need to install the `llm-compressor `_ library: + +.. code-block:: console + + $ pip install llmcompressor==0.1.0 + +Quantization Process -------------------- -For offline quantization to FP8, please install the `AutoFP8 library `_. +The quantization process involves three main steps: -.. code-block:: bash +1. Loading the model +2. Applying quantization +3. Evaluating accuracy in vLLM - git clone https://github.com/neuralmagic/AutoFP8.git - pip install -e AutoFP8 +1. Loading the Model +^^^^^^^^^^^^^^^^^^^^ -This package introduces the ``AutoFP8ForCausalLM`` and ``BaseQuantizeConfig`` objects for managing how your model will be compressed. +Use ``SparseAutoModelForCausalLM``, which wraps ``AutoModelForCausalLM``, for saving and loading quantized models: -Offline Quantization with Dynamic Activation Scaling Factors ------------------------------------------------------------- +.. code-block:: python -You can use AutoFP8 to produce checkpoints with their weights quantized to FP8 ahead of time and let vLLM handle calculating dynamic scales for the activations at runtime for maximum accuracy. You can enable this with the ``activation_scheme="dynamic"`` argument. + from llmcompressor.transformers import SparseAutoModelForCausalLM + from transformers import AutoTokenizer -.. warning:: + MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" + + model = SparseAutoModelForCausalLM.from_pretrained( + MODEL_ID, device_map="auto", torch_dtype="auto") + tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +2. Applying Quantization +^^^^^^^^^^^^^^^^^^^^^^^^ - Please note that although this mode doesn't give you better performance, it reduces memory footprint compared to online quantization. +For FP8 quantization, we can recover accuracy with simple RTN quantization. We recommend targeting all ``Linear`` layers using the ``FP8_DYNAMIC`` scheme, which uses: + +- Static, per-channel quantization on the weights +- Dynamic, per-token quantization on the activations + +Since simple RTN does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow. .. 
code-block:: python - from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig + from llmcompressor.transformers import oneshot + from llmcompressor.modifiers.quantization import QuantizationModifier - pretrained_model_dir = "meta-llama/Meta-Llama-3-8B-Instruct" - quantized_model_dir = "Meta-Llama-3-8B-Instruct-FP8-Dynamic" + # Configure the simple PTQ quantization + recipe = QuantizationModifier( + targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]) - # Define quantization config with static activation scales - quantize_config = BaseQuantizeConfig(quant_method="fp8", activation_scheme="dynamic") - # For dynamic activation scales, there is no need for calbration examples - examples = [] + # Apply the quantization algorithm. + oneshot(model=model, recipe=recipe) - # Load the model, quantize, and save checkpoint - model = AutoFP8ForCausalLM.from_pretrained(pretrained_model_dir, quantize_config) - model.quantize(examples) - model.save_quantized(quantized_model_dir) + # Save the model. + SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" + model.save_pretrained(SAVE_DIR) + tokenizer.save_pretrained(SAVE_DIR) + +3. Evaluating Accuracy +^^^^^^^^^^^^^^^^^^^^^^ + +Install ``vllm`` and ``lm-evaluation-harness``: + +.. code-block:: console + + $ pip install vllm lm_eval==0.4.3 + +Load and run the model in ``vllm``: + +.. code-block:: python + + from vllm import LLM + model = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic") + model.generate("Hello my name is") + +Evaluate accuracy with ``lm_eval`` (for example on 250 samples of ``gsm8k``): + +.. note:: + + Quantized models can be sensitive to the presence of the ``bos`` token. ``lm_eval`` does not add a ``bos`` token by default, so make sure to include the ``add_bos_token=True`` argument when running your evaluations. + +.. code-block:: console + + $ MODEL=$PWD/Meta-Llama-3-8B-Instruct-FP8-Dynamic + $ lm_eval \ + --model vllm \ + --model_args pretrained=$MODEL,add_bos_token=True \ + --tasks gsm8k --num_fewshot 5 --batch_size auto --limit 250 -In the output of the above script, you should be able to see the quantized Linear modules (FP8DynamicLinear) replaced in the model definition. -Note that the ``lm_head`` Linear module at the end is currently skipped by default. +Here's an example of the resulting scores: .. code-block:: text - LlamaForCausalLM( - (model): LlamaModel( - (embed_tokens): Embedding(128256, 4096) - (layers): ModuleList( - (0-31): 32 x LlamaDecoderLayer( - (self_attn): LlamaSdpaAttention( - (q_proj): FP8DynamicLinear() - (k_proj): FP8DynamicLinear() - (v_proj): FP8DynamicLinear() - (o_proj): FP8DynamicLinear() - (rotary_emb): LlamaRotaryEmbedding() - ) - (mlp): LlamaMLP( - (gate_proj): FP8DynamicLinear() - (up_proj): FP8DynamicLinear() - (down_proj): FP8DynamicLinear() - (act_fn): SiLU() - ) - (input_layernorm): LlamaRMSNorm() - (post_attention_layernorm): LlamaRMSNorm() - ) - ) - (norm): LlamaRMSNorm() - ) - (lm_head): Linear(in_features=4096, out_features=128256, bias=False) - ) - Saving the model to Meta-Llama-3-8B-Instruct-FP8-Dynamic - -Your model checkpoint with quantized weights should be available at ``Meta-Llama-3-8B-Instruct-FP8/``. -We can see that the weights are smaller than the original BF16 precision. + |Tasks|Version| Filter |n-shot| Metric | |Value| |Stderr| + |-----|------:|----------------|-----:|-----------|---|----:|---|-----:| + |gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.768|± |0.0268| + | | |strict-match | 5|exact_match|↑ |0.768|± |0.0268| -.. 
code-block:: bash +Troubleshooting and Support +--------------------------- - ls -lh Meta-Llama-3-8B-Instruct-FP8-Dynamic/ - total 8.5G - -rw-rw-r-- 1 user user 869 Jun 7 14:43 config.json - -rw-rw-r-- 1 user user 194 Jun 7 14:43 generation_config.json - -rw-rw-r-- 1 user user 4.7G Jun 7 14:43 model-00001-of-00002.safetensors - -rw-rw-r-- 1 user user 3.9G Jun 7 14:43 model-00002-of-00002.safetensors - -rw-rw-r-- 1 user user 43K Jun 7 14:43 model.safetensors.index.json - -rw-rw-r-- 1 user user 296 Jun 7 14:43 special_tokens_map.json - -rw-rw-r-- 1 user user 50K Jun 7 14:43 tokenizer_config.json - -rw-rw-r-- 1 user user 8.7M Jun 7 14:43 tokenizer.json +If you encounter any issues or have feature requests, please open an issue on the ``vllm-project/llm-compressor`` GitHub repository. -Finally, you can load the quantized model checkpoint directly in vLLM. -.. code-block:: python +Deprecated Flow +------------------ - from vllm import LLM - model = LLM(model="Meta-Llama-3-8B-Instruct-FP8-Dynamic/") - # INFO 06-10 21:15:41 model_runner.py:159] Loading model weights took 8.4596 GB - result = model.generate("Hello, my name is") +.. note:: + + The following information is preserved for reference and search purposes. + The quantization method described below is deprecated in favor of the ``llmcompressor`` method described above. + +For static per-tensor offline quantization to FP8, please install the `AutoFP8 library `_. + +.. code-block:: bash + + git clone https://github.com/neuralmagic/AutoFP8.git + pip install -e AutoFP8 + +This package introduces the ``AutoFP8ForCausalLM`` and ``BaseQuantizeConfig`` objects for managing how your model will be compressed. Offline Quantization with Static Activation Scaling Factors ----------------------------------------------------------- -For the best inference performance, you can use AutoFP8 with calibration data to produce per-tensor static scales for both the weights and activations by enabling the ``activation_scheme="static"`` argument. +You can use AutoFP8 with calibration data to produce per-tensor static scales for both the weights and activations by enabling the ``activation_scheme="static"`` argument. .. code-block:: python @@ -169,41 +202,3 @@ Finally, you can load the quantized model checkpoint directly in vLLM. # INFO 06-10 21:15:41 model_runner.py:159] Loading model weights took 8.4596 GB result = model.generate("Hello, my name is") -FP8 checkpoint structure explanation ------------------------------------------------------------ - -Here we detail the structure for the FP8 checkpoints. - -The following is necessary to be present in the model's ``config.json``: - -.. code-block:: text - - "quantization_config": { - "quant_method": "fp8", - "activation_scheme": "static" or "dynamic" - } - - -Each quantized layer in the state_dict will have these tensors: - -* If the config has ``"activation_scheme": "static"``: - -.. code-block:: text - - model.layers.0.mlp.down_proj.weight < F8_E4M3 - model.layers.0.mlp.down_proj.input_scale < F32 - model.layers.0.mlp.down_proj.weight_scale < F32 - -* If the config has ``"activation_scheme": "dynamic"``: - -.. code-block:: text - - model.layers.0.mlp.down_proj.weight < F8_E4M3 - model.layers.0.mlp.down_proj.weight_scale < F32 - - -Additionally, there can be `FP8 kv-cache scaling factors `_ contained within quantized checkpoints specified through the ``.kv_scale`` parameter present on the Attention Module, such as: - -.. 
code-block:: text - - model.layers.0.self_attn.kv_scale < F32 diff --git a/docs/source/quantization/int8.rst b/docs/source/quantization/int8.rst new file mode 100644 index 0000000000000..04fa308449507 --- /dev/null +++ b/docs/source/quantization/int8.rst @@ -0,0 +1,145 @@ +.. _int8: + +INT8 W8A8 +================== + +vLLM supports quantizing weights and activations to INT8 for memory savings and inference acceleration. +This quantization method is particularly useful for reducing model size while maintaining good performance. + +Please visit the HF collection of `quantized INT8 checkpoints of popular LLMs ready to use with vLLM `_. + +.. note:: + + INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper). + +Prerequisites +------------- + +To use INT8 quantization with vLLM, you'll need to install the `llm-compressor `_ library: + +.. code-block:: console + + $ pip install llmcompressor==0.1.0 + +Quantization Process +-------------------- + +The quantization process involves four main steps: + +1. Loading the model +2. Preparing calibration data +3. Applying quantization +4. Evaluating accuracy in vLLM + +1. Loading the Model +^^^^^^^^^^^^^^^^^^^^ + +Use ``SparseAutoModelForCausalLM``, which wraps ``AutoModelForCausalLM``, for saving and loading quantized models: + +.. code-block:: python + + from llmcompressor.transformers import SparseAutoModelForCausalLM + from transformers import AutoTokenizer + + MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" + model = SparseAutoModelForCausalLM.from_pretrained( + MODEL_ID, device_map="auto", torch_dtype="auto", + ) + tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +2. Preparing Calibration Data +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When quantizing activations to INT8, you need sample data to estimate the activation scales. +It's best to use calibration data that closely matches your deployment data. +For a general-purpose instruction-tuned model, you can use a dataset like ``ultrachat``: + +.. code-block:: python + + from datasets import load_dataset + + NUM_CALIBRATION_SAMPLES = 512 + MAX_SEQUENCE_LENGTH = 2048 + + # Load and preprocess the dataset + ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft") + ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) + + def preprocess(example): + return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)} + ds = ds.map(preprocess) + + def tokenize(sample): + return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False) + ds = ds.map(tokenize, remove_columns=ds.column_names) + +3. Applying Quantization +^^^^^^^^^^^^^^^^^^^^^^^^ + +Now, apply the quantization algorithms: + +.. 
code-block:: python + + from llmcompressor.transformers import oneshot + from llmcompressor.modifiers.quantization import GPTQModifier + from llmcompressor.modifiers.smoothquant import SmoothQuantModifier + + # Configure the quantization algorithms + recipe = [ + SmoothQuantModifier(smoothing_strength=0.8), + GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]), + ] + + # Apply quantization + oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + ) + + # Save the compressed model + SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token" + model.save_pretrained(SAVE_DIR, save_compressed=True) + tokenizer.save_pretrained(SAVE_DIR) + +This process creates a W8A8 model with weights and activations quantized to 8-bit integers. + +4. Evaluating Accuracy +^^^^^^^^^^^^^^^^^^^^^^ + +After quantization, you can load and run the model in vLLM: + +.. code-block:: python + + from vllm import LLM + model = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token") + +To evaluate accuracy, you can use ``lm_eval``: + +.. code-block:: console + + $ lm_eval --model vllm \ + --model_args pretrained="./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token",add_bos_token=true \ + --tasks gsm8k \ + --num_fewshot 5 \ + --limit 250 \ + --batch_size 'auto' + +.. note:: + + Quantized models can be sensitive to the presence of the ``bos`` token. Make sure to include the ``add_bos_token=True`` argument when running evaluations. + +Best Practices +-------------- + +- Start with 512 samples for calibration data (increase if accuracy drops) +- Use a sequence length of 2048 as a starting point +- Employ the chat template or instruction template that the model was trained with +- If you've fine-tuned a model, consider using a sample of your training data for calibration + +Troubleshooting and Support +--------------------------- + +If you encounter any issues or have feature requests, please open an issue on the ``vllm-project/llm-compressor`` GitHub repository. \ No newline at end of file diff --git a/docs/source/quantization/supported_hardware.rst b/docs/source/quantization/supported_hardware.rst index ecc330d866dbd..6341b583f0cfe 100644 --- a/docs/source/quantization/supported_hardware.rst +++ b/docs/source/quantization/supported_hardware.rst @@ -5,25 +5,138 @@ Supported Hardware for Quantization Kernels The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM: -============== ====== ======= ======= ===== ====== ======= ========= ======= ============== ========== -Implementation Volta Turing Ampere Ada Hopper AMD GPU Intel GPU x86 CPU AWS Inferentia Google TPU -============== ====== ======= ======= ===== ====== ======= ========= ======= ============== ========== -AQLM ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ -AWQ ❌ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ -DeepSpeedFP ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ -FP8 ❌ ❌ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ -Marlin ❌ ❌ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ -GPTQ ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ -SqueezeLLM ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ -bitsandbytes ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ -============== ====== ======= ======= ===== ====== ======= ========= ======= ============== ========== +.. 
list-table:: + :header-rows: 1 + :widths: 20 8 8 8 8 8 8 8 8 8 8 + + * - Implementation + - Volta + - Turing + - Ampere + - Ada + - Hopper + - AMD GPU + - Intel GPU + - x86 CPU + - AWS Inferentia + - Google TPU + * - AWQ + - ✗ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + * - GPTQ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + * - Marlin (GPTQ/AWQ/FP8) + - ✗ + - ✗ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + * - INT8 (W8A8) + - ✗ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + * - FP8 (W8A8) + - ✗ + - ✗ + - ✗ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + * - AQLM + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + * - bitsandbytes + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + * - DeepSpeedFP + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + * - GGUF + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + * - SqueezeLLM + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ Notes: ^^^^^^ - Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. -- "✅" indicates that the quantization method is supported on the specified hardware. -- "❌" indicates that the quantization method is not supported on the specified hardware. +- "✅︎" indicates that the quantization method is supported on the specified hardware. +- "✗" indicates that the quantization method is not supported on the specified hardware. Please note that this compatibility chart may be subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. diff --git a/requirements-common.txt b/requirements-common.txt index e17ff06308011..b0e599a5e5af5 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -9,7 +9,7 @@ tokenizers >= 0.19.1 # Required for Llama 3. protobuf # Required by LlamaTokenizer. fastapi aiohttp -openai +openai >= 1.0 # Ensure modern openai package (ensure types module present) uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server. pillow # Required for image processing diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 4a339bc3a379c..4d54e43d5788c 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -80,6 +80,10 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME, "VLLM_USE_RAY_SPMD_WORKER": "1", "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1", } + # Temporary. Currently when zeromq + SPMD is used, it does not properly + # terminate because of aDAG issue. + pp_args.append("--disable-frontend-multiprocessing") + tp_args.append("--disable-frontend-multiprocessing") try: compare_two_settings(MODEL_NAME, pp_args, tp_args, pp_env) diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py index 9bdace671487f..8f6a54ff5979c 100644 --- a/tests/kernels/quant_utils.py +++ b/tests/kernels/quant_utils.py @@ -2,6 +2,13 @@ import torch +from vllm.utils import is_hip + +# Using the default value (240.0) from pytorch will cause accuracy +# issue on dynamic quantization models. Here use 224.0 for rocm. 
+ROCM_FP8_MAX = 224.0 +FP8_DTYPE = torch.float8_e4m3fnuz if is_hip() else torch.float8_e4m3fn + def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor: return torch.as_tensor(x, dtype=torch.float32, device='cuda') @@ -11,13 +18,15 @@ def ref_dynamic_per_token_quant(x: torch.tensor, scale_ub: Optional[torch.tensor] = None) \ -> Tuple[torch.tensor, torch.tensor]: - assert quant_dtype in [torch.int8, torch.float8_e4m3fn] + assert quant_dtype in [torch.int8, FP8_DTYPE] if scale_ub is not None: - assert quant_dtype == torch.float8_e4m3fn + assert quant_dtype == FP8_DTYPE qtype_traits = torch.iinfo(quant_dtype) if quant_dtype == torch.int8 \ else torch.finfo(quant_dtype) - qtype_max = as_float32_tensor(qtype_traits.max) + qtype_traits_max = ROCM_FP8_MAX if is_hip() else qtype_traits.max + qtype_traits_min = -ROCM_FP8_MAX if is_hip() else qtype_traits.min + qtype_max = as_float32_tensor(qtype_traits_max) s_1 = as_float32_tensor(1.0) s_512 = as_float32_tensor(512.0) @@ -37,15 +46,15 @@ def ref_dynamic_per_token_quant(x: torch.tensor, iscales = as_float32_tensor(s_1 / scales) torch_out = as_float32_tensor(x) * iscales torch_out = torch_out.round() - torch_out = torch_out.clamp(qtype_traits.min, - qtype_traits.max).to(quant_dtype) + torch_out = torch_out.clamp(qtype_traits_min, + qtype_traits_max).to(quant_dtype) else: - assert quant_dtype == torch.float8_e4m3fn + assert quant_dtype == FP8_DTYPE min_scaling_factor = s_1 / (qtype_max * s_512) scales = scales.clamp(min=min_scaling_factor) torch_out = as_float32_tensor(x) / scales - torch_out = torch_out.clamp(qtype_traits.min, - qtype_traits.max).to(quant_dtype) + torch_out = torch_out.clamp(qtype_traits_min, + qtype_traits_max).to(quant_dtype) return torch_out, scales @@ -56,8 +65,10 @@ def ref_dynamic_per_token_quant(x: torch.tensor, def ref_dynamic_per_tensor_fp8_quant(x: torch.tensor) \ -> Tuple[torch.tensor, torch.tensor]: - fp8_traits = torch.finfo(torch.float8_e4m3fn) - fp8_max = as_float32_tensor(fp8_traits.max) + fp8_traits = torch.finfo(FP8_DTYPE) + fp8_traits_max = ROCM_FP8_MAX if is_hip() else fp8_traits.max + fp8_traits_min = -ROCM_FP8_MAX if is_hip() else fp8_traits.min + fp8_max = as_float32_tensor(fp8_traits_max) one = as_float32_tensor(1.0) # For fp8, in order to match the cuda kernel output, we have to do exactly @@ -68,5 +79,5 @@ def ref_dynamic_per_tensor_fp8_quant(x: torch.tensor) \ ref_scale = x_max / fp8_max ref_iscale = one / ref_scale ref_out = (as_float32_tensor(x) * ref_iscale).clamp( - fp8_traits.min, fp8_traits.max).to(dtype=torch.float8_e4m3fn) + fp8_traits_min, fp8_traits_max).to(FP8_DTYPE) return ref_out, ref_scale.view((1, )) diff --git a/tests/kernels/test_fp8_quant.py b/tests/kernels/test_fp8_quant.py index 71a92cbc8f7c7..bae9b39203ff9 100644 --- a/tests/kernels/test_fp8_quant.py +++ b/tests/kernels/test_fp8_quant.py @@ -2,7 +2,8 @@ import torch import vllm._custom_ops as ops -from tests.kernels.quant_utils import (ref_dynamic_per_tensor_fp8_quant, +from tests.kernels.quant_utils import (FP8_DTYPE, + ref_dynamic_per_tensor_fp8_quant, ref_dynamic_per_token_quant) DTYPES = [torch.half, torch.bfloat16, torch.float] @@ -31,8 +32,7 @@ def test_dynamic_per_token_fp8_quant(num_tokens: int, hidden_size: int, scale_ub = torch.mean(x).to(dtype=torch.float32, device='cuda') \ if scale_ub else None - ref_out, ref_scales = ref_dynamic_per_token_quant(x, torch.float8_e4m3fn, - scale_ub) + ref_out, ref_scales = ref_dynamic_per_token_quant(x, FP8_DTYPE, scale_ub) ops_out, ops_scales = ops.scaled_fp8_quant(x, 
scale_ub=scale_ub, use_per_token_if_dynamic=True) diff --git a/tests/models/test_jamba.py b/tests/models/test_jamba.py index 774f2d9d9cdbc..efb7b1c607721 100644 --- a/tests/models/test_jamba.py +++ b/tests/models/test_jamba.py @@ -6,9 +6,12 @@ MODELS = ["ai21labs/Jamba-tiny-random"] +# Fails due to usage of MoE as MLP(E=1_, which is different than the HF impl +# TODO: Fix this with trained model +@pytest.mark.skip() @pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [20]) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("max_tokens", [10]) def test_models( hf_runner, vllm_runner, @@ -17,8 +20,6 @@ def test_models( dtype: str, max_tokens: int, ) -> None: - # To pass the small model tests, we need full precision. - assert dtype == "float" with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) @@ -36,8 +37,8 @@ def test_models( @pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [20]) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [5]) def test_batching( vllm_runner, example_prompts, diff --git a/tests/quantization/test_experts_int8.py b/tests/quantization/test_experts_int8.py new file mode 100644 index 0000000000000..ec31c94efa07f --- /dev/null +++ b/tests/quantization/test_experts_int8.py @@ -0,0 +1,28 @@ +# flake8: noqa +"""Tests experts_int8 quantization startup and generation, +doesn't test correctness +""" +import pytest + +from tests.quantization.utils import is_quant_method_supported + +MODELS = ["ai21labs/Jamba-tiny-random"] + + +@pytest.mark.skipif(not is_quant_method_supported("experts_int8"), + reason="ExpertsInt8 is not supported on this GPU type.") +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("max_tokens", [10]) +def test_model_experts_int8_startup( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, +) -> None: + + with vllm_runner(model, dtype=dtype, + quantization="experts_int8") as vllm_model: + vllm_model.generate_greedy(example_prompts, max_tokens) diff --git a/tests/test_logger.py b/tests/test_logger.py index 52aa73761fd68..29346cd0878b8 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -49,7 +49,8 @@ def test_default_vllm_root_logger_configuration(): handler = logger.handlers[0] assert isinstance(handler, logging.StreamHandler) assert handler.stream == sys.stdout - assert handler.level == logging.INFO + # we use DEBUG level for testing by default + # assert handler.level == logging.INFO formatter = handler.formatter assert formatter is not None diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py index 90f26400952b9..3cee3b890862a 100644 --- a/tests/tracing/test_tracing.py +++ b/tests/tracing/test_tracing.py @@ -114,5 +114,71 @@ def test_traces(trace_service): SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN) == ttft e2e_time = metrics.finished_time - metrics.arrival_time assert attributes.get(SpanAttributes.LLM_LATENCY_E2E) == e2e_time + assert metrics.scheduler_time > 0 assert attributes.get( SpanAttributes.LLM_LATENCY_TIME_IN_SCHEDULER) == metrics.scheduler_time + # Model forward and model execute should be none, since detailed traces is + # not enabled. 
+ assert metrics.model_forward_time is None + assert metrics.model_execute_time is None + + +def test_traces_with_detailed_steps(trace_service): + os.environ[OTEL_EXPORTER_OTLP_TRACES_INSECURE] = "true" + + sampling_params = SamplingParams(temperature=0.01, + top_p=0.1, + max_tokens=256) + model = "facebook/opt-125m" + llm = LLM( + model=model, + otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS, + collect_detailed_traces="all", + ) + prompts = ["This is a short prompt"] + outputs = llm.generate(prompts, sampling_params=sampling_params) + + timeout = 5 + if not trace_service.evt.wait(timeout): + raise TimeoutError( + f"The fake trace service didn't receive a trace within " + f"the {timeout} seconds timeout") + + attributes = decode_attributes(trace_service.request.resource_spans[0]. + scope_spans[0].spans[0].attributes) + assert attributes.get(SpanAttributes.LLM_RESPONSE_MODEL) == model + assert attributes.get( + SpanAttributes.LLM_REQUEST_ID) == outputs[0].request_id + assert attributes.get( + SpanAttributes.LLM_REQUEST_TEMPERATURE) == sampling_params.temperature + assert attributes.get( + SpanAttributes.LLM_REQUEST_TOP_P) == sampling_params.top_p + assert attributes.get( + SpanAttributes.LLM_REQUEST_MAX_TOKENS) == sampling_params.max_tokens + assert attributes.get( + SpanAttributes.LLM_REQUEST_BEST_OF) == sampling_params.best_of + assert attributes.get(SpanAttributes.LLM_REQUEST_N) == sampling_params.n + assert attributes.get(SpanAttributes.LLM_USAGE_PROMPT_TOKENS) == len( + outputs[0].prompt_token_ids) + completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) + assert attributes.get( + SpanAttributes.LLM_USAGE_COMPLETION_TOKENS) == completion_tokens + metrics = outputs[0].metrics + assert attributes.get( + SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue + ttft = metrics.first_token_time - metrics.arrival_time + assert attributes.get( + SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN) == ttft + e2e_time = metrics.finished_time - metrics.arrival_time + assert attributes.get(SpanAttributes.LLM_LATENCY_E2E) == e2e_time + assert metrics.scheduler_time > 0 + assert attributes.get( + SpanAttributes.LLM_LATENCY_TIME_IN_SCHEDULER) == metrics.scheduler_time + assert metrics.model_forward_time > 0 + assert attributes.get( + SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_FORWARD) == pytest.approx( + metrics.model_forward_time / 1000) + assert metrics.model_execute_time > 0 + assert attributes.get(SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_EXECUTE + ) == metrics.model_execute_time + assert metrics.model_forward_time < 1000 * metrics.model_execute_time diff --git a/tests/utils.py b/tests/utils.py index c20a6d9e2cada..beac8cfdb1f11 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -384,6 +384,7 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None: os.setpgrp() from _pytest.outcomes import Skipped pid = os.fork() + print(f"Fork a new process to run a test {pid}") if pid == 0: try: f(*args, **kwargs) @@ -401,11 +402,11 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None: pgid = os.getpgid(pid) _pid, _exitcode = os.waitpid(pid, 0) # ignore SIGTERM signal itself - old_singla_handler = signal.signal(signal.SIGTERM, signal.SIG_IGN) + old_signal_handler = signal.signal(signal.SIGTERM, signal.SIG_IGN) # kill all child processes os.killpg(pgid, signal.SIGTERM) # restore the signal handler - signal.signal(signal.SIGTERM, old_singla_handler) + signal.signal(signal.SIGTERM, old_signal_handler) assert _exitcode == 0, (f"function {f} failed when called with" f" args {args} and 
kwargs {kwargs}") diff --git a/vllm/_core_ext.py b/vllm/_core_ext.py index e3b9fbb938915..aa520e1eafbaf 100644 --- a/vllm/_core_ext.py +++ b/vllm/_core_ext.py @@ -1,6 +1,6 @@ import importlib.util from enum import Enum -from typing import TYPE_CHECKING, Optional, Union +from typing import TYPE_CHECKING, Any, Optional, Tuple, Union import torch @@ -31,14 +31,14 @@ class NanRepr(Enum): @dataclass(frozen=True) class ScalarType: """ - ScalarType can represent a wide range of floating point and integer - types, in particular it can be used to represent sub-byte data types - (something that torch.dtype currently does not support). It is also + ScalarType can represent a wide range of floating point and integer + types, in particular it can be used to represent sub-byte data types + (something that torch.dtype currently does not support). It is also capable of representing types with a bias, i.e.: - `stored_value = value + bias`, - this is useful for quantized types (e.g. standard GPTQ 4bit uses a bias - of 8). The implementation for this class can be found in - csrc/core/scalar_type.hpp, these type signatures should be kept in sync + `stored_value = value + bias`, + this is useful for quantized types (e.g. standard GPTQ 4bit uses a bias + of 8). The implementation for this class can be found in + csrc/core/scalar_type.hpp, these type signatures should be kept in sync with that file. """ @@ -51,15 +51,15 @@ class ScalarType: mantissa: int """ Number of bits in the mantissa if this is a floating point type, - or the number bits representing an integer excluding the sign bit if + or the number bits representing an integer excluding the sign bit if this an integer type. """ bias: int """ - bias used to encode the values in this scalar type - (value = stored_value - bias, default 0) for example if we store the - type as an unsigned integer with a bias of 128 then the value 0 will be + bias used to encode the values in this scalar type + (value = stored_value - bias, default 0) for example if we store the + type as an unsigned integer with a bias of 128 then the value 0 will be stored as 128 and -1 will be stored as 127 and 1 will be stored as 129. """ @@ -73,7 +73,7 @@ class ScalarType: nan_repr: int = NanRepr.IEEE_754.value """ - How NaNs are represent in this scalar type, returns NanRepr value. + How NaNs are represent in this scalar type, returns NanRepr value. (not applicable for integer types) """ @@ -83,14 +83,14 @@ def size_bits(self): def min(self) -> Union[int, float]: """ - Min representable value for this scalar type. + Min representable value for this scalar type. (accounting for bias if there is one) """ raise NotImplementedError def max(self) -> Union[int, float]: """ - Max representable value for this scalar type. + Max representable value for this scalar type. (accounting for bias if there is one) """ raise NotImplementedError @@ -103,28 +103,28 @@ def is_signed(self) -> bool: """ ... 
- def is_floating_point(self): + def is_floating_point(self) -> bool: "If the type is a floating point type" return self.exponent != 0 - def is_integer(self): + def is_integer(self) -> bool: "If the type is an integer type" return self.exponent == 0 - def has_bias(self): + def has_bias(self) -> bool: "If the type has a non-zero bias" return self.bias != 0 - def has_infs(self): + def has_infs(self) -> bool: "If the type is floating point and supports infinity" return not self._finite_values_only - def has_nans(self): + def has_nans(self) -> bool: return self.nan_repr != NanRepr.NONE.value def is_ieee_754(self) -> bool: """ - If the type is a floating point type that follows IEEE 754 + If the type is a floating point type that follows IEEE 754 conventions """ return self.nan_repr == NanRepr.IEEE_754.value and \ @@ -136,6 +136,11 @@ def __str__(self) -> str: def __repr__(self) -> str: raise NotImplementedError + # __len__ needs to be defined (and has to throw TypeError) for pytorch's + # opcheck to work. + def __len__(self) -> int: + raise TypeError + # # Convenience Constructors # @@ -153,16 +158,16 @@ def uint(cls, size_bits: int, bias: Optional[int]) -> 'ScalarType': @classmethod def float_IEEE754(cls, exponent: int, mantissa: int) -> 'ScalarType': """ - Create a standard floating point type + Create a standard floating point type (i.e. follows IEEE 754 conventions). """ return cls(exponent, mantissa, 0, True) @classmethod def float_(cls, exponent: int, mantissa: int, finite_values_only: bool, - nan_repr: int): + nan_repr: int) -> 'ScalarType': """ - Create a non-standard floating point type + Create a non-standard floating point type (i.e. does not follow IEEE 754 conventions). """ return cls(exponent, mantissa, 0, True, finite_values_only, @@ -175,3 +180,93 @@ def float_(cls, exponent: int, mantissa: int, finite_values_only: bool, logger.warning("Failed to import from vllm._core_C with %r", e) ScalarType = torch.classes._core_C.ScalarType + + # Needed for dynamo support of ScalarType. 
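The fake-class registration that follows exists because torch.compile cannot trace the C++ torchbind ScalarType directly: a Python stand-in mirrors its methods and round-trips the object's state through __obj_flatten__ / __obj_unflatten__. A hedged, pure-Python illustration of that flatten/unflatten contract (toy class and names, no torch APIs involved):

class TinyScalarType:
    # Toy stand-in used only to illustrate the flatten/unflatten round trip.

    def __init__(self, exponent: int, mantissa: int, bias: int):
        self.exponent = exponent
        self.mantissa = mantissa
        self.bias = bias

    def obj_flatten(self):
        # The real torchbind class exposes this as __obj_flatten__.
        return (("exponent", self.exponent), ("mantissa", self.mantissa),
                ("bias", self.bias))

    @classmethod
    def obj_unflatten(cls, flat):
        # The fake class rebuilds itself from the flattened (name, value) pairs.
        return cls(**dict(flat))

t = TinyScalarType(exponent=4, mantissa=3, bias=0)
rebuilt = TinyScalarType.obj_unflatten(t.obj_flatten())
assert (rebuilt.exponent, rebuilt.mantissa, rebuilt.bias) == (4, 3, 0)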
+ @torch._library.register_fake_class("_core_C::ScalarType") + class FakeScalarType: + + def __init__(self, scalar_type): + self.ScalarType = scalar_type + + def bias_getter(self) -> int: + return self.ScalarType.bias + + def exponent_getter(self) -> int: + return self.ScalarType.exponent + + def mantissa_getter(self) -> int: + return self.ScalarType.mantissa + + def signed_getter(self) -> bool: + return self.ScalarType.signed + + def size_bits_getter(self) -> int: + return self.ScalarType.size_bits + + @property + def size_bits(self) -> int: + return self.ScalarType.size_bits + + def min(self) -> Union[int, float]: + return self.ScalarType.min() + + def max(self) -> Union[int, float]: + return self.ScalarType.max() + + def is_signed(self) -> bool: + return self.ScalarType.is_signed() + + def is_floating_point(self) -> bool: + return self.ScalarType.is_floating_point() + + def is_integer(self) -> bool: + return self.ScalarType.is_integer() + + def has_bias(self) -> bool: + return self.ScalarType.has_bias() + + def has_infs(self) -> bool: + return self.ScalarType.has_infs() + + def has_nans(self) -> bool: + return self.ScalarType.has_nans() + + def is_ieee_754(self) -> bool: + return self.ScalarType.is_ieee_754() + + def __str__(self) -> str: + return self.ScalarType.__str__() + + def __repr__(self) -> str: + return self.ScalarType.__repr__() + + def __len__(self) -> int: + return self.ScalarType.__len__() + + def __obj_flatten__(self) -> Tuple[Tuple[str, Any], ...]: + return torch.classes._core_C.ScalarType.__obj_flatten__( + self.ScalarType) + + @classmethod + def __obj_unflatten__( + cls, flat_type: Tuple[Tuple[str, Any], ...]) -> 'ScalarType': + return cls( + torch.classes._core_C.ScalarType.__obj_unflatten__(flat_type)) + + @classmethod + def int_(cls, size_bits: int, bias: Optional[int]) -> 'ScalarType': + return ScalarType.int_(size_bits, bias) + + @classmethod + def uint(cls, size_bits: int, bias: Optional[int]) -> 'ScalarType': + return ScalarType.uint(size_bits, bias) + + @classmethod + def float_IEEE754(cls, exponent: int, mantissa: int) -> 'ScalarType': + return ScalarType.float_IEEE754(exponent, mantissa) + + @classmethod + def float_(cls, exponent: int, mantissa: int, finite_values_only: bool, + nan_repr: int) -> 'ScalarType': + return ScalarType.float_(exponent, mantissa, finite_values_only, + nan_repr) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index b6329859830ca..1f0a111a53bcc 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -17,13 +17,7 @@ logger.warning("Failed to import from vllm._C with %r", e) with contextlib.suppress(ImportError): - # ruff: noqa: F401 - import vllm._moe_C - - -def is_custom_op_supported(op_name: str) -> bool: - op, overloads = torch._C._jit_get_operation(op_name) - return op is not None + import vllm._moe_C # noqa: F401 def hint_on_error(fn): @@ -280,14 +274,14 @@ def cutlass_scaled_mm_azp(a: torch.Tensor, # aqlm def aqlm_gemm(input: torch.Tensor, codes: torch.Tensor, codebooks: torch.Tensor, scales: torch.Tensor, - codebook_partition_sizes: torch.Tensor, + codebook_partition_sizes: List[int], bias: Optional[torch.Tensor]) -> torch.Tensor: return torch.ops._C.aqlm_gemm(input, codes, codebooks, scales, codebook_partition_sizes, bias) def aqlm_dequant(codes: torch.Tensor, codebooks: torch.Tensor, - codebook_partition_sizes: torch.Tensor) -> torch.Tensor: + codebook_partition_sizes: List[int]) -> torch.Tensor: return torch.ops._C.aqlm_dequant(codes, codebooks, codebook_partition_sizes) @@ -369,9 +363,12 @@ def scaled_fp8_quant( 
# This code assumes batch_dim and num_tokens are flattened assert (input.ndim == 2) shape: Union[Tuple[int, int], torch.Size] = input.shape + # For rocm, the output fp8 dtype is torch.float_e3m3fnuz + out_dtype: torch.dtype = torch.float8_e4m3fnuz if vllm.utils.is_hip() \ + else torch.float8_e4m3fn if num_token_padding: shape = (max(num_token_padding, input.shape[0]), shape[1]) - output = torch.empty(shape, device=input.device, dtype=torch.float8_e4m3fn) + output = torch.empty(shape, device=input.device, dtype=out_dtype) if scale is None: if use_per_token_if_dynamic: @@ -431,25 +428,17 @@ def marlin_qqq_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, # gguf -def ggml_dequantize(W: torch.Tensor, quant_type: int, m: int, n: int): +def ggml_dequantize(W: torch.Tensor, quant_type: int, m: int, + n: int) -> torch.Tensor: return torch.ops._C.ggml_dequantize(W, quant_type, m, n) -def ggml_mul_mat_vec( - W: torch.Tensor, - X: torch.Tensor, - quant_type: int, - row: int, -): - return torch.ops._C.ggml_mul_mat_vec(W, X, quant_type, row) - - def ggml_mul_mat_vec_a8( W: torch.Tensor, X: torch.Tensor, quant_type: int, row: int, -): +) -> torch.Tensor: return torch.ops._C.ggml_mul_mat_vec_a8(W, X, quant_type, row) @@ -458,7 +447,7 @@ def ggml_mul_mat_a8( X: torch.Tensor, quant_type: int, row: int, -): +) -> torch.Tensor: return torch.ops._C.ggml_mul_mat_a8(W, X, quant_type, row) diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 97b13917ccfaa..23c7830cd6264 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -75,6 +75,9 @@ def copy_blocks( ) -> None: raise NotImplementedError + def advance_step(self, num_seqs: int, num_queries: int): + raise NotImplementedError + @dataclass class AttentionMetadata: diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index f230bb57e3177..f146285bfc9e2 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -297,6 +297,51 @@ def decode_metadata(self) -> Optional["FlashAttentionMetadata"]: ) return self._cached_decode_metadata + def advance_step(self, num_seqs: int, num_queries: int): + """ + Update metadata in-place to advance one decode step. + """ + # GPU in-place update is currently called separately through + # custom_ops.advance_step(). See draft_model_runner. TODO(will): Move + # this logic to the backend. + + # When using cudagraph, the num_seqs is padded to the next captured + # batch sized, but num_queries tracks the actual number of requests in + # the batch. 
For --enforce-eager mode, num_seqs == num_queries + if num_seqs != num_queries: + assert num_seqs > num_queries + assert self.use_cuda_graph + + assert self.num_prefills == 0 + assert self.num_prefill_tokens == 0 + assert self.num_decode_tokens == num_seqs + assert self.slot_mapping.shape == (num_seqs, ) + + assert self.seq_lens is not None + assert len(self.seq_lens) == num_seqs + assert self.seq_lens_tensor is not None + assert self.seq_lens_tensor.shape == (num_seqs, ) + assert self.max_query_len == 1 + assert self.max_prefill_seq_len == 0 + assert self.max_decode_seq_len == max(self.seq_lens) + + assert self.query_start_loc is not None + assert self.query_start_loc.shape == (num_queries + 1, ) + assert self.seq_start_loc is not None + assert self.seq_start_loc.shape == (num_seqs + 1, ) + + assert self.context_lens_tensor is not None + assert self.context_lens_tensor.shape == (num_queries, ) + + assert self.block_tables is not None + assert self.block_tables.shape[0] == num_seqs + + # Update query lengths. Note that we update only queries and not seqs, + # since tensors may be padded due to captured cuda graph batch size + for i in range(num_queries): + self.seq_lens[i] += 1 + self.max_decode_seq_len = max(self.seq_lens) + class FlashAttentionMetadataBuilder( AttentionMetadataBuilder[FlashAttentionMetadata]): diff --git a/vllm/config.py b/vllm/config.py index 19cd4d8b51d7c..95c0b95fbba01 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -240,10 +240,11 @@ def _parse_quant_hf_config(self): def _verify_quantization(self) -> None: supported_quantization = [*QUANTIZATION_METHODS] - rocm_supported_quantization = ["gptq", "squeezellm"] + rocm_supported_quantization = ["gptq", "squeezellm", "fp8"] optimized_quantization_methods = [ "fp8", "marlin", "gptq_marlin_24", "gptq_marlin", "awq_marlin", - "fbgemm_fp8", "compressed_tensors", "compressed-tensors" + "fbgemm_fp8", "compressed_tensors", "compressed-tensors", + "experts_int8" ] tpu_supported_quantization = ["tpu_int8"] if self.quantization is not None: diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 6ed75a6e2ea6b..287de60149670 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -1033,9 +1033,9 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: # Schedule sequence groups. # This function call changes the internal states of the scheduler # such as self.running, self.swapped, and self.waiting. 
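To make the advance_step() bookkeeping above concrete: when CUDA graphs are used, the batch is padded to the captured size, so num_seqs can exceed num_queries, and only the first num_queries entries correspond to live requests. A minimal sketch with toy values (plain Python lists rather than the real metadata tensors):

# Hypothetical padded decode batch: num_seqs = 5 captured slots,
# but only the first num_queries = 3 are real requests.
seq_lens = [5, 7, 3, 0, 0]
num_queries = 3

for i in range(num_queries):
    seq_lens[i] += 1           # each live sequence gained one decoded token

max_decode_seq_len = max(seq_lens)
assert seq_lens == [6, 8, 4, 0, 0]
assert max_decode_seq_len == 8

The vllm/core/scheduler.py hunk that follows moves the perf_counter() start ahead of the _schedule() call, so the reported scheduler_time covers the scheduling pass itself instead of starting only after it returns.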
+ scheduler_start_time = time.perf_counter() scheduler_outputs = self._schedule() now = time.time() - scheduler_start_time = time.perf_counter() if not self.cache_config.enable_prefix_caching: common_computed_block_nums = [] diff --git a/vllm/distributed/device_communicators/cuda_wrapper.py b/vllm/distributed/device_communicators/cuda_wrapper.py index 9c7f41a1f9d62..d5a53381ce621 100644 --- a/vllm/distributed/device_communicators/cuda_wrapper.py +++ b/vllm/distributed/device_communicators/cuda_wrapper.py @@ -49,8 +49,13 @@ def find_loaded_library(lib_name) -> Optional[str]: if not found: # the library is not loaded in the current process return None + # if lib_name is libcudart, we need to match a line with: + # address /path/to/libcudart-hash.so.11.0 start = line.index("/") path = line[start:].strip() + filename = path.split("/")[-1] + assert filename.rpartition(".so")[0].startswith(lib_name), \ + f"Unexpected filename: {filename} for library {lib_name}" return path @@ -98,9 +103,9 @@ class CudaRTLibrary: def __init__(self, so_file: Optional[str] = None): if so_file is None: - so_file = find_loaded_library("libcudart.so") + so_file = find_loaded_library("libcudart") assert so_file is not None, \ - "libcudart.so is not loaded in the current process" + "libcudart is not loaded in the current process" if so_file not in CudaRTLibrary.path_to_library_cache: lib = ctypes.CDLL(so_file) CudaRTLibrary.path_to_library_cache[so_file] = lib diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py index 479dc95a8b667..6229f1d6ec788 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce.py +++ b/vllm/distributed/device_communicators/custom_all_reduce.py @@ -15,7 +15,7 @@ from vllm.utils import cuda_device_count_stateless try: - assert ops.is_custom_op_supported("_C_custom_ar::meta_size") + ops.meta_size() custom_ar = True except Exception: # For AMD GPUs and CPUs diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 315b2f50a919d..6c7259129a109 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -948,11 +948,6 @@ def create_engine_config(self, ) -> EngineConfig: raise ValueError( f"Invalid module {m} in collect_detailed_traces. 
" f"Valid modules are {ALLOWED_DETAILED_TRACE_MODULES}") - if (m == "model" - or m == "all") and self.pipeline_parallel_size > 1: - raise ValueError( - "Collection of detailed traces for the 'model' module is " - "not yet supported with pipeline parallelism.") observability_config = ObservabilityConfig( otlp_traces_endpoint=self.otlp_traces_endpoint, collect_model_forward_time="model" in detailed_trace_modules diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 32bdb2b7d14f4..ecd6dc64d343b 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1,7 +1,7 @@ from contextlib import contextmanager from typing import ClassVar, List, Optional, Sequence, Union, cast, overload -from tqdm import tqdm +from tqdm.auto import tqdm from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast from vllm.engine.arg_utils import EngineArgs diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index ad5ba4a93ba63..a641dcc24aaae 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -60,11 +60,13 @@ _running_tasks: Set[asyncio.Task] = set() -def model_is_embedding(model_name: str, trust_remote_code: bool) -> bool: +def model_is_embedding(model_name: str, trust_remote_code: bool, + quantization: str) -> bool: return ModelConfig(model=model_name, tokenizer=model_name, tokenizer_mode="auto", trust_remote_code=trust_remote_code, + quantization=quantization, seed=0, dtype="auto").embedding_mode @@ -97,7 +99,8 @@ async def build_async_engine_client(args) -> AsyncIterator[AsyncEngineClient]: # If manually triggered or embedding model, use AsyncLLMEngine in process. # TODO: support embedding model via RPC. - if (model_is_embedding(args.model, args.trust_remote_code) + if (model_is_embedding(args.model, args.trust_remote_code, + args.quantization) or args.disable_frontend_multiprocessing): async_engine_client = AsyncLLMEngine.from_engine_args( engine_args, usage_context=UsageContext.OPENAI_API_SERVER) diff --git a/vllm/entrypoints/openai/rpc/server.py b/vllm/entrypoints/openai/rpc/server.py index af406d8715403..471d62631135a 100644 --- a/vllm/entrypoints/openai/rpc/server.py +++ b/vllm/entrypoints/openai/rpc/server.py @@ -38,6 +38,8 @@ def cleanup(self): self.socket.close() self.context.destroy() self.engine.shutdown_background_loop() + # Clear the engine reference so that it can be GC'ed. + self.engine = None async def get_model_config(self, identity): """Send the ModelConfig""" diff --git a/vllm/lora/ops/bgmv_expand.py b/vllm/lora/ops/bgmv_expand.py index dcaf2e3d462cc..0bbc1844ef455 100644 --- a/vllm/lora/ops/bgmv_expand.py +++ b/vllm/lora/ops/bgmv_expand.py @@ -5,8 +5,6 @@ https://arxiv.org/abs/2310.18547 """ -from typing import Dict, Optional - import torch import triton import triton.language as tl @@ -86,14 +84,13 @@ def _bgmv_expand_kernel( @torch.inference_mode() -def bgmv_expand( +def _bgmv_expand( inputs: torch.Tensor, lora_b_weights: torch.Tensor, output_tensor: torch.Tensor, lora_indices_tensor: torch.Tensor, add_inputs: bool = True, - override_config: Optional[Dict[str, int]] = None, -): +) -> None: """ Args: inputs (torch.Tensor): input tensor @@ -105,10 +102,7 @@ def bgmv_expand( batches (int): batch size add_inputs (bool, optional): Defaults to False. adds the final lora results to the output. - override_config (Optional[Dict[str, int]], optional): Defaults to None. 
- Triton grid config """ - assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32] assert lora_b_weights.dtype in [ torch.float16, @@ -138,10 +132,7 @@ def bgmv_expand( ]: CAST_TYPE = True batches = lora_indices_tensor.size(0) - if override_config: - config = override_config - else: - config = get_lora_op_configs("expand", batches, N) + config = get_lora_op_configs("expand", batches, N) grid = lambda META: ( META["SPLIT_N"], batches, @@ -167,3 +158,8 @@ def bgmv_expand( **config, ) return + + +bgmv_expand = torch.library.custom_op("lora::bgmv_expand", + _bgmv_expand, + mutates_args=["output_tensor"]) diff --git a/vllm/lora/ops/bgmv_expand_slice.py b/vllm/lora/ops/bgmv_expand_slice.py index fa6571074f3ab..87d7d9902a4c1 100644 --- a/vllm/lora/ops/bgmv_expand_slice.py +++ b/vllm/lora/ops/bgmv_expand_slice.py @@ -5,8 +5,6 @@ https://arxiv.org/abs/2310.18547 """ -from typing import Dict, Optional - import torch import triton import triton.language as tl @@ -89,7 +87,7 @@ def _bgmv_expand_slice_kernel( @torch.inference_mode() -def bgmv_expand_slice( +def _bgmv_expand_slice( inputs: torch.Tensor, lora_b_weights: torch.Tensor, output_tensor: torch.Tensor, @@ -97,8 +95,7 @@ def bgmv_expand_slice( slice_offset: int, slice_size: int, add_inputs: bool = True, - override_config: Optional[Dict[str, int]] = None, -): +) -> None: """ Args: inputs (torch.Tensor): input tensor @@ -111,10 +108,7 @@ def bgmv_expand_slice( slice_size (int): current output_tensor's size batches (int): batch size add_inputs (bool, optional): Defaults to False. - override_config (Optional[Dict[str, int]], optional): Defaults to None. - Triton grid config """ - assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32] assert lora_b_weights.dtype in [ torch.float16, @@ -149,10 +143,7 @@ def bgmv_expand_slice( batches = lora_indices_tensor.size(0) - if override_config: - config = override_config - else: - config = get_lora_op_configs("expand", batches, N) + config = get_lora_op_configs("expand", batches, N) grid = lambda META: ( META["SPLIT_N"], @@ -180,3 +171,8 @@ def bgmv_expand_slice( **config, ) return + + +bgmv_expand_slice = torch.library.custom_op("lora::bgmv_expand_slice", + _bgmv_expand_slice, + mutates_args=["output_tensor"]) diff --git a/vllm/lora/ops/bgmv_shrink.py b/vllm/lora/ops/bgmv_shrink.py index e69d33078f5aa..c979d758492db 100644 --- a/vllm/lora/ops/bgmv_shrink.py +++ b/vllm/lora/ops/bgmv_shrink.py @@ -5,8 +5,6 @@ https://arxiv.org/abs/2310.18547 """ -from typing import Dict, Optional - import torch import triton import triton.language as tl @@ -78,14 +76,13 @@ def _bgmv_shrink_kernel( @torch.inference_mode() -def bgmv_shrink( +def _bgmv_shrink( inputs: torch.Tensor, lora_a_weights: torch.Tensor, output_tensor: torch.Tensor, lora_indices_tensor: torch.Tensor, scaling: float = 1.0, - override_config: Optional[Dict[str, int]] = None, -): +) -> None: """ Args: inputs (torch.Tensor): input tensor @@ -96,8 +93,6 @@ def bgmv_shrink( applied. batches (int): batch size scaling (float): Scaling factor. - override_config (Optional[Dict[str, int]], optional): Defaults to None. 
- Triton grid config """ assert inputs.dtype == lora_a_weights.dtype assert inputs.dtype in [torch.float16, torch.bfloat16] @@ -119,11 +114,8 @@ def bgmv_shrink( batches = lora_indices_tensor.size(0) N, K = lora_a_weights.shape[-2:] # K=hidden_size,N=rank BLOCK_N = triton.next_power_of_2(N) - if override_config: - config = override_config - else: - # First try to load optimal config from the file - config = get_lora_op_configs("bgmv_shrink", batches, K) + # First try to load optimal config from the file + config = get_lora_op_configs("bgmv_shrink", batches, K) grid = lambda META: ( META["SPLIT_K"], @@ -148,3 +140,8 @@ def bgmv_shrink( **config, ) return + + +bgmv_shrink = torch.library.custom_op("lora::bgmv_shrink", + _bgmv_shrink, + mutates_args=["output_tensor"]) diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/sgmv_expand.py index 4590495469096..80a0b605b0fe2 100644 --- a/vllm/lora/ops/sgmv_expand.py +++ b/vllm/lora/ops/sgmv_expand.py @@ -97,7 +97,7 @@ def _sgmv_expand_kernel( @torch.inference_mode() -def sgmv_expand( +def _sgmv_expand( inputs: torch.Tensor, lora_b_weights: torch.Tensor, output_tensor: torch.Tensor, @@ -107,7 +107,7 @@ def sgmv_expand( batches: int, max_seq_length: int, add_inputs: bool = False, -): +) -> None: """ Args: inputs (torch.Tensor): input tensor @@ -190,3 +190,8 @@ def sgmv_expand( CAST_TYPE, ) return + + +sgmv_expand = torch.library.custom_op("lora::sgmv_expand", + _sgmv_expand, + mutates_args=["output_tensor"]) diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py index ff3bcda071b80..53237166a1c68 100644 --- a/vllm/lora/ops/sgmv_expand_slice.py +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -103,7 +103,7 @@ def _sgmv_expand_slice_kernel( @torch.inference_mode() -def sgmv_expand_slice( +def _sgmv_expand_slice( inputs: torch.Tensor, lora_b_weights: torch.Tensor, output_tensor: torch.Tensor, @@ -115,7 +115,7 @@ def sgmv_expand_slice( slice_offset: int, slice_size: int, add_inputs: bool = False, -): +) -> None: """_summary_ Args: @@ -203,3 +203,8 @@ def sgmv_expand_slice( CAST_TYPE, ) return + + +sgmv_expand_slice = torch.library.custom_op("lora::sgmv_expand_slice", + _sgmv_expand_slice, + mutates_args=["output_tensor"]) diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index 8ab049989abef..51d2a09eee94b 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -101,7 +101,7 @@ def _sgmv_shrink_kernel( @torch.inference_mode() -def sgmv_shrink( +def _sgmv_shrink( inputs: torch.Tensor, lora_a_weights: torch.Tensor, output_tensor: torch.Tensor, @@ -111,7 +111,7 @@ def sgmv_shrink( batches: int, max_seq_length: int, scaling: float, -): +) -> None: """ Args: @@ -187,3 +187,8 @@ def sgmv_shrink( SPLIT_K, ) return + + +sgmv_shrink = torch.library.custom_op("lora::sgmv_shrink", + _sgmv_shrink, + mutates_args=["output_tensor"]) diff --git a/vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json new file mode 100644 index 0000000000000..56c1a4e3af0b4 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 0000000000000..d3677bebb82a7 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json new file mode 100644 index 0000000000000..265768fb900cc --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, 
+ "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "5120": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "9216": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "13312": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "17408": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "25600": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "33792": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "41984": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "50176": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "58368": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 0000000000000..d3be23dfc903b --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "5120": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "9216": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "13312": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "17408": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "25600": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "33792": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "41984": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "50176": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "58368": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json new file mode 100644 index 0000000000000..589f5d39f3141 --- /dev/null +++ 
b/vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "5120": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "9216": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "13312": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "17408": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "25600": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "33792": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "41984": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "50176": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "58368": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json new file mode 100644 index 0000000000000..2c78bfaba7890 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "5120": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "9216": { + 
"BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "13312": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "17408": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "25600": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "33792": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "41984": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "50176": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "58368": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000000000..4da841e74a79f --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 
128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "5120": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "9216": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "13312": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "17408": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "25600": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "33792": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "41984": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "50176": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "58368": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json new file mode 100644 index 0000000000000..200356713c0d0 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "5120": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "9216": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "13312": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "17408": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "25600": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "33792": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "41984": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "50176": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "58368": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 0000000000000..e076615ee541a --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 
+ }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "5120": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "9216": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "13312": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "17408": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "25600": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "33792": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "41984": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "50176": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "58368": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json new file mode 
100644 index 0000000000000..ee896554b9210 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "5120": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "9216": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "13312": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "17408": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "25600": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "33792": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + 
"num_stages": 4 + }, + "41984": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "50176": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "58368": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 0000000000000..05aed8b1c8149 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "5120": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + 
"9216": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "13312": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "17408": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "25600": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "33792": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "41984": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "50176": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "58368": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json new file mode 100644 index 0000000000000..51ad5b299eb22 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 
64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 0000000000000..ee5119182556c --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + 
"num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json new file mode 100644 index 0000000000000..68793c77b33c4 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "5120": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "9216": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "13312": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "17408": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + 
"num_warps": 4, + "num_stages": 4 + }, + "25600": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "33792": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "41984": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "50176": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "58368": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 0000000000000..612910720ed94 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + 
"num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "5120": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "9216": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "13312": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "17408": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "25600": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "33792": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "41984": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "50176": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "58368": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json new file mode 100644 index 0000000000000..51d03d8607122 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json new file mode 100644 index 0000000000000..26f9abd6b789e --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "1536": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json new file mode 100644 index 0000000000000..64be6e6591422 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json 
b/vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 0000000000000..0a6a6a73fa45e --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "5120": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "9216": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "13312": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "17408": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "25600": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "33792": { + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "41984": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "50176": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "58368": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json new file mode 100644 index 0000000000000..7a7508aab0459 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + 
"num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 0000000000000..dbf9a2dd6f048 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json new file mode 100644 index 0000000000000..bbb2386046b11 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + 
"GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "1536": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json similarity index 100% rename from vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json similarity index 100% rename from vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json diff --git 
a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json similarity index 100% rename from vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json similarity index 100% rename from vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json similarity index 100% rename from vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json similarity index 100% rename from vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 413c0b6d0924e..bcf25d2631042 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -11,48 +11,51 @@ import vllm.envs as envs from vllm import _custom_ops as ops from vllm.logger import init_logger +from vllm.platforms import current_platform logger = init_logger(__name__) @triton.jit def fused_moe_kernel( - # Pointers to matrices - a_ptr, - b_ptr, - c_ptr, - a_scale_ptr, - b_scale_ptr, - topk_weights_ptr, - sorted_token_ids_ptr, - expert_ids_ptr, - num_tokens_post_padded_ptr, - # Matrix dimensions - N, - K, - EM, - num_valid_tokens, - # The stride variables represent how much to increase the ptr by when - # moving by 1 element in a particular dimension. E.g. `stride_am` is - # how much to increase `a_ptr` by to get the element one row down - # (A has M rows). 
- stride_am, - stride_ak, - stride_be, - stride_bk, - stride_bn, - stride_cm, - stride_cn, - # Meta-parameters - BLOCK_SIZE_M: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr, - BLOCK_SIZE_K: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, - MUL_ROUTED_WEIGHT: tl.constexpr, - top_k: tl.constexpr, - compute_type: tl.constexpr, - use_fp8: tl.constexpr, -): + # Pointers to matrices + a_ptr, + b_ptr, + c_ptr, + a_scale_ptr, + b_scale_ptr, + topk_weights_ptr, + sorted_token_ids_ptr, + expert_ids_ptr, + num_tokens_post_padded_ptr, + # Matrix dimensions + N, + K, + EM, + num_valid_tokens, + # The stride variables represent how much to increase the ptr by when + # moving by 1 element in a particular dimension. E.g. `stride_am` is + # how much to increase `a_ptr` by to get the element one row down + # (A has M rows). + stride_am, + stride_ak, + stride_be, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + stride_bse, + stride_bsn, + # Meta-parameters + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + MUL_ROUTED_WEIGHT: tl.constexpr, + top_k: tl.constexpr, + compute_type: tl.constexpr, + use_fp8_w8a8: tl.constexpr, + use_int8_w8a16: tl.constexpr): """ Implements the fused computation for a Mixture of Experts (MOE) using token and expert matrices. @@ -113,8 +116,12 @@ def fused_moe_kernel( off_experts = tl.load(expert_ids_ptr + pid_m) b_ptrs = b_ptr + off_experts * stride_be + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn) + if use_int8_w8a16: + b_scale_ptrs = b_scale_ptr + off_experts * stride_bse + offs_bn[ + None, :] * stride_bsn + b_scale = tl.load(b_scale_ptrs) - if use_fp8: + if use_fp8_w8a8: a_scale = tl.load(a_scale_ptr) b_scale = tl.load(b_scale_ptr + off_experts) @@ -136,7 +143,9 @@ def fused_moe_kernel( mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) # We accumulate along the K dimension. 
- if use_fp8: + if use_int8_w8a16: + accumulator = tl.dot(a, b.to(compute_type), acc=accumulator) + elif use_fp8_w8a8: accumulator = tl.dot(a, b, acc=accumulator) else: accumulator += tl.dot(a, b) @@ -149,8 +158,9 @@ def fused_moe_kernel( mask=token_mask, other=0) accumulator = accumulator * moe_weight[:, None] - - if use_fp8: + if use_int8_w8a16: + accumulator = (accumulator * b_scale).to(compute_type) + elif use_fp8_w8a8: accumulator = (accumulator * a_scale * b_scale).to(compute_type) else: accumulator = accumulator.to(compute_type) @@ -229,16 +239,18 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, num_tokens_post_padded: torch.Tensor, mul_routed_weight: bool, top_k: int, config: Dict[str, Any], compute_type: tl.dtype, - use_fp8: bool) -> None: + use_fp8_w8a8: bool, use_int8_w8a16: bool) -> None: assert topk_weights.stride(1) == 1 assert sorted_token_ids.stride(0) == 1 - if not use_fp8: - assert A_scale is None - assert B_scale is None - else: + if use_fp8_w8a8: A, A_scale = ops.scaled_fp8_quant(A, A_scale) assert B_scale is not None + elif use_int8_w8a16: + assert B_scale is not None + else: + assert A_scale is None + assert B_scale is None grid = lambda META: (triton.cdiv(sorted_token_ids.shape[0], META[ 'BLOCK_SIZE_M']) * triton.cdiv(B.shape[1], META['BLOCK_SIZE_N']), ) @@ -264,16 +276,19 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, B.stride(1), C.stride(1), C.stride(2), + B_scale.stride(0) if B_scale is not None and use_int8_w8a16 else 0, + B_scale.stride(1) if B_scale is not None and use_int8_w8a16 else 0, MUL_ROUTED_WEIGHT=mul_routed_weight, top_k=top_k, compute_type=compute_type, - use_fp8=use_fp8, + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a16=use_int8_w8a16, **config, ) def get_config_file_name(E: int, N: int, dtype: Optional[str]) -> str: - device_name = torch.cuda.get_device_name().replace(" ", "_") + device_name = current_platform.get_device_name().replace(" ", "_") dtype_selector = "" if not dtype else f",dtype={dtype}" return f"E={E},N={N},device_name={device_name}{dtype_selector}.json" @@ -426,6 +441,20 @@ def grouped_topk(hidden_states: torch.Tensor, return topk_weights, topk_ids +def get_config_dtype_str(dtype: torch.dtype, + use_int8_w8a16: Optional[bool] = False, + use_fp8_w8a8: Optional[bool] = False): + if use_fp8_w8a8: + return "fp8_w8a8" + elif use_int8_w8a16: + return "int8_w8a16" + elif dtype == torch.float: + # avoiding cases where kernel fails when float32 MoE + # use fp16/bfloat16 configs + return "float32" + return None + + def fused_experts(hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, @@ -433,7 +462,8 @@ def fused_experts(hidden_states: torch.Tensor, topk_ids: torch.Tensor, inplace: bool = False, override_config: Optional[Dict[str, Any]] = None, - use_fp8: bool = False, + use_fp8_w8a8: bool = False, + use_int8_w8a16: bool = False, w1_scale: Optional[torch.Tensor] = None, w2_scale: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, @@ -454,13 +484,16 @@ def fused_experts(hidden_states: torch.Tensor, # https://github.com/vllm-project/vllm/issues/5938 CHUNK_SIZE = envs.VLLM_FUSED_MOE_CHUNK_SIZE M = min(num_tokens, CHUNK_SIZE) + config_dtype = get_config_dtype_str(use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a16=use_int8_w8a16, + dtype=hidden_states.dtype) get_config_func = functools.partial( try_get_optimal_moe_config, w1.shape, w2.shape, topk_ids.shape[1], - "float8" if use_fp8 else None, + config_dtype, override_config=override_config, ) @@ -524,7 +557,8 
@@ def fused_experts(hidden_states: torch.Tensor, topk_ids.shape[1], config, compute_type=compute_type, - use_fp8=use_fp8) + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a16=use_int8_w8a16) ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N)) @@ -542,7 +576,8 @@ def fused_experts(hidden_states: torch.Tensor, 1, config, compute_type=compute_type, - use_fp8=use_fp8) + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a16=use_int8_w8a16) torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1, @@ -562,7 +597,8 @@ def fused_moe( use_grouped_topk: bool = False, num_expert_group: Optional[int] = None, topk_group: Optional[int] = None, - use_fp8: bool = False, + use_fp8_w8a8: bool = False, + use_int8_w8a16: bool = False, w1_scale: Optional[torch.Tensor] = None, w2_scale: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, @@ -588,7 +624,9 @@ def fused_moe( - topk_group: Optional[int]: additional parameter for grouped_topk - use_grouped_topk: If True, use grouped_topk instead of fused_topk note: Deepseekv2 model uses grouped_topk - - use_fp8 (bool): If True, use fp8 arithmetic to compute the inner + - use_fp8_w8a8 (bool): If True, use fp8 arithmetic to compute the inner + products for w1 and w2. Defaults to False. + - use_int8_w8a16 (bool): If True, use int8 weights with fp16/bf16 activations to compute the inner products for w1 and w2. Defaults to False. - w1_scale (Optional[torch.Tensor]): Optional scale to be used for w1. @@ -617,7 +655,8 @@ def fused_moe( topk_ids, inplace=inplace, override_config=override_config, - use_fp8=use_fp8, + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a16=use_int8_w8a16, w1_scale=w1_scale, w2_scale=w2_scale, a1_scale=a1_scale, diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index e1b3bc9b4ad54..95b160f4287f9 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -11,6 +11,8 @@ CompressedTensorsConfig) from vllm.model_executor.layers.quantization.deepspeedfp import ( DeepSpeedFPConfig) +from vllm.model_executor.layers.quantization.experts_int8 import ( + ExpertsInt8Config) from vllm.model_executor.layers.quantization.fbgemm_fp8 import FBGEMMFp8Config from vllm.model_executor.layers.quantization.fp8 import Fp8Config from vllm.model_executor.layers.quantization.gguf import GGUFConfig @@ -43,6 +45,7 @@ "compressed-tensors": CompressedTensorsConfig, "bitsandbytes": BitsAndBytesConfig, "qqq": QQQConfig, + "experts_int8": ExpertsInt8Config, } diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 95ff05b986ab4..c88ca340ebcc5 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -95,7 +95,7 @@ def generic_dequantize_gemm( codebooks: torch. Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size] scales: torch.Tensor, # [num_out_groups, 1, 1, 1] - output_partition_sizes: torch.IntTensor, + output_partition_sizes: List[int], bias: Optional[torch.Tensor], ) -> torch.Tensor: output_shape = input.shape[:-1] + (scales.shape[0], ) @@ -133,7 +133,7 @@ def optimized_dequantize_gemm( codebooks: torch.
Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size] scales: torch.Tensor, # [num_out_groups, 1, 1, 1] - output_partition_sizes: torch.IntTensor, + output_partition_sizes: List[int], bias: Optional[torch.Tensor], ) -> torch.Tensor: weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) @@ -288,10 +288,8 @@ def create_weights(self, layer: torch.nn.Module, codebooks, { # metadata indicates fixed size concatenated along dim 0 - "is_metadata": - True, - "output_partition_sizes": - torch.tensor(output_partition_sizes, device='cpu'), + "is_metadata": True, + "output_partition_sizes": output_partition_sizes }, ) @@ -334,7 +332,7 @@ def apply( codes = layer.codes scales = layer.scales output_partition_sizes = getattr(codebooks, "output_partition_sizes", - None) + []) nbooks = codes.shape[2] ingroups = codebooks.shape[3] diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py new file mode 100644 index 0000000000000..dabf17df78fef --- /dev/null +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -0,0 +1,175 @@ +from typing import Any, Dict, List, Optional + +import torch + +from vllm.distributed import get_tensor_model_parallel_rank, get_tp_group +from vllm.model_executor.layers.fused_moe import FusedMoE, FusedMoEMethodBase +from vllm.model_executor.layers.linear import (LinearBase, + UnquantizedLinearMethod) +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, QuantizeMethodBase) +from vllm.model_executor.utils import set_weight_attrs + + +class ExpertsInt8Config(QuantizationConfig): + """Config class for Int8 experts quantization.""" + + def __init__(self) -> None: + pass + + @classmethod + def get_name(cls) -> str: + return "experts_int8" + + @classmethod + def get_supported_act_dtypes(cls) -> List[torch.dtype]: + return [torch.bfloat16, torch.half] + + @classmethod + def get_min_capability(cls) -> int: + return 80 + + @classmethod + def get_config_filenames(cls) -> List[str]: + return [] + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "ExpertsInt8Config": + return cls() + + def get_quant_method(self, layer: torch.nn.Module, + prefix: str) -> Optional["QuantizeMethodBase"]: + if isinstance(layer, LinearBase): + return UnquantizedLinearMethod() + elif isinstance(layer, FusedMoE): + return ExpertsInt8MoEMethod(self) + return None + + def get_scaled_act_names(self) -> List[str]: + return [] + + +class ExpertsInt8MoEMethod(FusedMoEMethodBase): + + def __init__(self, quant_config: ExpertsInt8Config): + self.quant_config = quant_config + + def create_weights(self, layer: torch.nn.Module, num_experts: int, + hidden_size: int, intermediate_size: int, + params_dtype: torch.dtype, **extra_weight_attrs): + + int8_dtype = torch.int8 + + assert 'weight_loader' in extra_weight_attrs + weight_loader = extra_weight_attrs['weight_loader'] + wrapped_weight_loader = ExpertsInt8MoEMethod.quantizing_weight_loader( + layer, weight_loader) + extra_weight_attrs['weight_loader'] = wrapped_weight_loader + + # Fused gate_up_proj (column parallel) + w13_weight = torch.nn.Parameter(torch.empty(num_experts, + 2 * intermediate_size, + hidden_size, + dtype=int8_dtype), + requires_grad=False) + layer.register_parameter("w13_weight", w13_weight) + set_weight_attrs(w13_weight, extra_weight_attrs) + + # down_proj (row parallel) + w2_weight = torch.nn.Parameter(torch.empty(num_experts, + hidden_size, + intermediate_size, + dtype=int8_dtype), + 
requires_grad=False) + layer.register_parameter("w2_weight", w2_weight) + set_weight_attrs(w2_weight, extra_weight_attrs) + + w13_scale = torch.nn.Parameter(torch.zeros(num_experts, + 2 * intermediate_size, + dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w13_scale", w13_scale) + + w2_scale = torch.nn.Parameter(torch.zeros(num_experts, + hidden_size, + dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w2_scale", w2_scale) + + def apply(self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool = True, + use_grouped_topk: bool = False, + num_expert_group: Optional[int] = None, + topk_group: Optional[int] = None) -> torch.Tensor: + from vllm.model_executor.layers.fused_moe import fused_experts + + topk_weights, topk_ids = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group) + + return fused_experts(x, + layer.w13_weight, + layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + use_int8_w8a16=True, + w1_scale=layer.w13_scale, + w2_scale=layer.w2_scale) + + @staticmethod + def quantizing_weight_loader(layer, weight_loader): + + def quantize_and_call_weight_loader(param: torch.nn.Parameter, + loaded_weight: torch.Tensor, + weight_name: str, shard_id: int, + expert_id: int): + tp_rank = get_tensor_model_parallel_rank() + shard_size = layer.intermediate_size_per_partition + shard = slice(tp_rank * shard_size, (tp_rank + 1) * shard_size) + device = get_tp_group().device + loaded_weight = loaded_weight.to(device) + # w1, gate_proj case: Load into first shard of w13. + if shard_id == "w1": + scales = quantize_in_place_and_get_scales( + loaded_weight[shard, :]) + layer.w13_scale.data[expert_id, 0:shard_size].copy_(scales[:, + 0]) + # w3, up_proj case: Load into second shard of w13. + elif shard_id == "w3": + scales = quantize_in_place_and_get_scales( + loaded_weight[shard, :]) + layer.w13_scale.data[expert_id, shard_size:2 * + shard_size].copy_(scales[:, 0]) + # w2, down_proj case: Load into only shard of w2. 
+ elif shard_id == "w2": + scales = quantize_in_place_and_get_scales(loaded_weight[:, + shard]) + layer.w2_scale.data[expert_id, :].copy_(scales[:, 0]) + else: + raise ValueError( + f"Shard id must be in ['w1', 'w2', 'w3'] but got {shard_id}") + weight_loader(param, loaded_weight, weight_name, shard_id, + expert_id) + + return quantize_and_call_weight_loader + + +def quantize_in_place_and_get_scales(weight: torch.Tensor) -> torch.Tensor: + vmax = torch.iinfo(torch.int8).max + scales = (torch.max(torch.abs(weight), dim=1, keepdim=True)[0] / vmax) + + weight.div_(scales) + weight.round_() + weight.clamp_(-vmax, vmax) + + return scales diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 8b8cf41cdfb3d..fd7682a1c0f51 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -20,10 +20,11 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( all_close_1d, apply_fp8_linear, convert_to_channelwise, create_per_tensor_scale_param, cutlass_fp8_supported, - per_tensor_dequantize, requantize_with_max_scale) + normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize, + requantize_with_max_scale) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform -from vllm.utils import print_warning_once +from vllm.utils import is_hip, print_warning_once ACTIVATION_SCHEMES = ["static", "dynamic"] @@ -120,6 +121,9 @@ def __init__(self, quant_config: Fp8Config): capability = current_platform.get_device_capability() capability = capability[0] * 10 + capability[1] self.use_marlin = capability < 89 or envs.VLLM_TEST_FORCE_FP8_MARLIN + # Disable marlin for rocm + if is_hip(): + self.use_marlin = False def create_weights( self, @@ -168,6 +172,8 @@ def create_weights( scale = create_per_tensor_scale_param(output_partition_sizes, **extra_weight_attrs) layer.register_parameter("input_scale", scale) + else: + layer.register_parameter("input_scale", None) def process_weights_after_loading(self, layer: Module) -> None: # If checkpoint not serialized fp8, quantize the weights. @@ -202,9 +208,23 @@ def process_weights_after_loading(self, layer: Module) -> None: # requantize the logical shards as a single weight. else: # Dequant -> Quant with max scale so we can run per tensor. + weight = layer.weight + weight_scale = layer.weight_scale + + # If rocm, use float8_e4m3fnuz. + if is_hip(): + weight, weight_scale, input_scale = \ + normalize_e4m3fn_to_e4m3fnuz( + weight=weight, + weight_scale=weight_scale, + input_scale=layer.input_scale) + if input_scale is not None: + layer.input_scale = Parameter(input_scale, + requires_grad=False) + weight_scale, weight = requantize_with_max_scale( - weight=layer.weight, - weight_scale=layer.weight_scale, + weight=weight, + weight_scale=weight_scale, logical_widths=layer.logical_widths, ) @@ -214,8 +234,6 @@ def process_weights_after_loading(self, layer: Module) -> None: if self.quant_config.activation_scheme == "static": layer.input_scale = Parameter(layer.input_scale.max(), requires_grad=False) - else: - layer.input_scale = None if self.use_marlin: prepare_fp8_layer_for_marlin(layer) @@ -346,10 +364,12 @@ def process_weights_after_loading(self, layer: Module) -> None: # If checkpoint is fp16, quantize in place.
if not self.quant_config.is_checkpoint_fp8_serialized: + # If rocm, use float8_e4m3fnuz as dtype + fp8_dtype = torch.float8_e4m3fnuz \ + if is_hip() else torch.float8_e4m3fn w13_weight = torch.empty_like(layer.w13_weight.data, - dtype=torch.float8_e4m3fn) - w2_weight = torch.empty_like(layer.w2_weight.data, - dtype=torch.float8_e4m3fn) + dtype=fp8_dtype) + w2_weight = torch.empty_like(layer.w2_weight.data, dtype=fp8_dtype) # Re-initialize w13_scale because we directly quantize # merged w13 weights and generate a single scaling factor. @@ -393,6 +413,32 @@ def process_weights_after_loading(self, layer: Module) -> None: layer.w13_input_scale.max(), requires_grad=False) layer.w2_input_scale = torch.nn.Parameter( layer.w2_input_scale.max(), requires_grad=False) + # If rocm, normalize the weights and scales to e4m3fnuz + if is_hip(): + # Normalize the weights and scales + w13_weight, w13_weight_scale, w13_input_scale = \ + normalize_e4m3fn_to_e4m3fnuz( + layer.w13_weight, layer.w13_weight_scale, + layer.w13_input_scale) + w2_weight, w2_weight_scale, w2_input_scale = \ + normalize_e4m3fn_to_e4m3fnuz( + layer.w2_weight, layer.w2_weight_scale, + layer.w2_input_scale) + # Reset the parameter + layer.w13_weight = torch.nn.Parameter(w13_weight, + requires_grad=False) + layer.w13_weight_scale = torch.nn.Parameter( + w13_weight_scale, requires_grad=False) + if w13_input_scale is not None: + layer.w13_input_scale = torch.nn.Parameter( + w13_input_scale, requires_grad=False) + layer.w2_weight = torch.nn.Parameter(w2_weight, + requires_grad=False) + layer.w2_weight_scale = torch.nn.Parameter(w2_weight_scale, + requires_grad=False) + if w2_input_scale is not None: + layer.w2_input_scale = torch.nn.Parameter( + w2_input_scale, requires_grad=False) # Fp8 moe kernel needs single weight scale for w13 per expert. # We take the max then dequant and requant each expert. @@ -442,7 +488,7 @@ def apply(self, topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, - use_fp8=True, + use_fp8_w8a8=True, w1_scale=layer.w13_weight_scale, w2_scale=layer.w2_weight_scale, a1_scale=layer.w13_input_scale, diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index aa04fcf8310bf..f456286899a53 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -212,6 +212,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.g_idx.data = torch.argsort(layer.g_idx).to(torch.int) else: layer.g_idx.data = torch.empty((0, ), + dtype=torch.int, device=layer.g_idx.device) layer.exllama_state = ExllamaState.READY ops.gptq_shuffle(layer.qweight, layer.g_idx, diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index dbe86902853cd..6cc1c65ddfa82 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -6,9 +6,19 @@ from vllm import _custom_ops as ops from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform +from vllm.utils import is_hip + +# scaled_mm in pytorch on rocm has a bug that requires always +# providing scaling factor for result. This value is created +# as global value to avoid multiple tensor allocations, and +# can be removed once pytorch fixes the bug. 
+TORCH_SCALED_MM_SCALE_RESULT = torch.ones(1).cuda() if is_hip() else None def cutlass_fp8_supported() -> bool: + # cutlass is not supported on Rocm + if is_hip(): + return False capability = current_platform.get_device_capability() capability = capability[0] * 10 + capability[1] @@ -147,13 +157,19 @@ def apply_fp8_linear( if per_tensor_weights and per_tensor_activations: # Fused GEMM_DQ - output, _ = torch._scaled_mm(qinput, - weight, - out_dtype=input.dtype, - scale_a=x_scale, - scale_b=weight_scale, - bias=bias) - return torch.narrow(output, 0, 0, input.shape[0]) + output = torch._scaled_mm( + qinput, + weight, + out_dtype=input.dtype, + scale_a=x_scale, + scale_b=weight_scale, + scale_result=TORCH_SCALED_MM_SCALE_RESULT, + bias=bias) + # Since in torch 2.5, scaled_mm only returns single value + # This should be removed when vllm-nvidia also moves to 2.5 + if is_hip(): + return torch.narrow(output, 0, 0, input.shape[0]) + return torch.narrow(output[0], 0, 0, input.shape[0]) else: # Fallback for channelwise case, where we use unfused DQ @@ -207,3 +223,27 @@ def apply_int8_linear( scale_b=weight_scale, out_dtype=input.dtype, bias=bias) + + +def normalize_e4m3fn_to_e4m3fnuz( + weight: torch.Tensor, + weight_scale: torch.Tensor, + input_scale: Optional[torch.Tensor] = None +) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + assert weight.dtype == torch.float8_e4m3fn + # The bits pattern 10000000(-128) represents zero in e4m3fn + # but NaN in e4m3fnuz. So here we set it to 0. + # https://onnx.ai/onnx/technical/float8.html + weight_as_int8 = weight.view(torch.int8) + ROCM_FP8_NAN_AS_INT = -128 + weight_as_int8[weight_as_int8 == ROCM_FP8_NAN_AS_INT] = 0 + weight = weight_as_int8.view(torch.float8_e4m3fnuz) + + # For the same bits representation, e4m3fnuz value is half of + # the e4m3fn value, so we should double the scaling factor to + # get the same dequantized value. 
+ # https://onnx.ai/onnx/technical/float8.html + weight_scale = weight_scale * 2.0 + if input_scale is not None: + input_scale = input_scale * 2.0 + return weight, weight_scale, input_scale diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 95888e7976ad3..7b3acd7f3c9ea 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -774,7 +774,7 @@ def get_rope( is_neox_style: bool = True, rope_scaling: Optional[Dict[str, Any]] = None, dtype: Optional[torch.dtype] = None, - rotary_percent: float = 1.0, + partial_rotary_factor: float = 1.0, ) -> RotaryEmbedding: if dtype is None: dtype = torch.get_default_dtype() @@ -787,8 +787,8 @@ def get_rope( rope_scaling_args = tuple(rope_scaling_tuple.items()) else: rope_scaling_args = None - if rotary_percent < 1.0: - rotary_dim = int(rotary_dim * rotary_percent) + if partial_rotary_factor < 1.0: + rotary_dim = int(rotary_dim * partial_rotary_factor) key = (head_size, rotary_dim, max_position, base, is_neox_style, rope_scaling_args, dtype) if key in _ROPE_DICT: diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index dd4d63661a692..b82eb14fb5f23 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -16,7 +16,6 @@ from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) -from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -249,37 +248,6 @@ def forward( return hidden_states -class JambaMLP(nn.Module): - - def __init__( - self, - config: JambaConfig, - quant_config: Optional[QuantizationConfig] = None, - ) -> None: - super().__init__() - hidden_size = config.hidden_size - intermediate_size = config.intermediate_size - hidden_act = config.hidden_act - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - quant_config=quant_config) - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - quant_config=quant_config) - if hidden_act != "silu": - raise ValueError(f"Unsupported activation: {hidden_act}. 
" - "Only silu is supported for now.") - self.act_fn = SiluAndMul() - - def forward(self, x): - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) - x, _ = self.down_proj(x) - return x - - class JambaMoE(nn.Module): def __init__(self, @@ -327,6 +295,21 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return hidden_states.view(orig_shape) +class JambaMLP(JambaMoE): + + def __init__(self, + config: JambaConfig, + params_dtype: Optional[torch.dtype] = None, + tp_size: Optional[int] = None, + quant_config: Optional[QuantizationConfig] = None): + super().__init__(config, + num_experts=1, + top_k=1, + params_dtype=params_dtype, + tp_size=tp_size, + quant_config=quant_config) + + class JambaMambaDecoderLayer(nn.Module): def __init__(self, @@ -884,8 +867,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("qkv_proj", "q_proj", "q"), ("qkv_proj", "k_proj", "k"), ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), ] # Params for weights, fp8 weight scales, fp8 activation scales @@ -907,6 +888,10 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if ".self_attn." in name: name = name.replace(".self_attn", "") + if "feed_forward" in name and not _is_moe_layer(name): + ## map MLP layers to expert with ID=0 + name = name.replace("feed_forward", "feed_forward.experts.0") + for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue @@ -921,16 +906,21 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader(param, loaded_weight, shard_id) break else: - for mapping in expert_params_mapping: - param_name, weight_name, expert_id, shard_id = mapping + for ( + param_name, + weight_name, + expert_id, + shard_id, + ) in expert_params_mapping: if weight_name not in name: continue + name = name.replace(weight_name, param_name) param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, - name, + weight_name, shard_id=shard_id, expert_id=expert_id) break @@ -943,3 +933,11 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + + +def _is_moe_layer(name: str): + return any( + [experts_name in name for experts_name in [ + "experts", + "router", + ]]) diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index 57598b49bcca9..7d92a1ffe55df 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -53,7 +53,7 @@ # - There is no gate_proj, just up_proj # - Normal LayerNorm (with a +1 to the weights) instead of RMSNorm # - Squared ReLU instead of SwiGLU -# - Adds a rotary_percent to RoPE +# - Adds a partial_rotary_factor to RoPE def _cast_if_autocast_enabled(*args): @@ -161,7 +161,7 @@ def __init__( self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 self.rope_theta = rope_theta - self.rotary_percent = config.rope_percent + self.partial_rotary_factor = config.partial_rotary_factor self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -187,7 +187,7 @@ def __init__( max_position=max_position_embeddings, base=rope_theta, rope_scaling=rope_scaling, - rotary_percent=self.rotary_percent, + partial_rotary_factor=self.partial_rotary_factor, ) self.attn = Attention(self.num_heads, self.head_dim, diff --git a/vllm/platforms/cuda.py 
b/vllm/platforms/cuda.py index 6642bb8e71bd1..c7557dc34ff64 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -44,6 +44,35 @@ def get_physical_device_capability(device_id: int = 0) -> Tuple[int, int]: return pynvml.nvmlDeviceGetCudaComputeCapability(handle) +@lru_cache(maxsize=8) +@with_nvml_context +def get_physical_device_name(device_id: int = 0) -> str: + handle = pynvml.nvmlDeviceGetHandleByIndex(device_id) + return pynvml.nvmlDeviceGetName(handle) + + +@with_nvml_context +def warn_if_different_devices(): + device_ids: int = pynvml.nvmlDeviceGetCount() + if device_ids > 1: + device_names = [get_physical_device_name(i) for i in range(device_ids)] + if len(set(device_names)) > 1 and os.environ.get( + "CUDA_DEVICE_ORDER") != "PCI_BUS_ID": + logger.warning( + "Detected different devices in the system: \n%s\nPlease" + " make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to " + "avoid unexpected behavior.", "\n".join(device_names)) + + +try: + from sphinx.ext.autodoc.mock import _MockModule + + if not isinstance(pynvml, _MockModule): + warn_if_different_devices() +except ModuleNotFoundError: + warn_if_different_devices() + + def device_id_to_physical_device_id(device_id: int) -> int: if "CUDA_VISIBLE_DEVICES" in os.environ: device_ids = os.environ["CUDA_VISIBLE_DEVICES"].split(",") @@ -61,6 +90,11 @@ def get_device_capability(device_id: int = 0) -> Tuple[int, int]: physical_device_id = device_id_to_physical_device_id(device_id) return get_physical_device_capability(physical_device_id) + @staticmethod + def get_device_name(device_id: int = 0) -> str: + physical_device_id = device_id_to_physical_device_id(device_id) + return get_physical_device_name(physical_device_id) + @staticmethod @with_nvml_context def is_full_nvlink(physical_device_ids: List[int]) -> bool: diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 0760f9554fb78..25b6f26676ef0 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -27,6 +27,10 @@ def is_tpu(self) -> bool: def get_device_capability(device_id: int = 0) -> Tuple[int, int]: raise NotImplementedError + @staticmethod + def get_device_name(device_id: int = 0) -> str: + raise NotImplementedError + @staticmethod def inference_mode(): """A device-specific wrapper of `torch.inference_mode`. 
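
The Platform.get_device_name() hook added above is what the renamed fused-MoE config files at the top of this diff are keyed on. A minimal sketch of that lookup, mirroring get_config_file_name() in fused_moe.py; the standalone helper and the explicit device_name argument are illustrative only, not part of the change:

from typing import Optional

# Illustrative sketch: mirrors get_config_file_name() after this change, with the
# device name passed in explicitly instead of read from current_platform.
def moe_config_file_name(E: int, N: int, dtype: Optional[str],
                         device_name: str) -> str:
    device_name = device_name.replace(" ", "_")
    dtype_selector = "" if not dtype else f",dtype={dtype}"
    return f"E={E},N={N},device_name={device_name}{dtype_selector}.json"

# An fp8 W8A8 MoE layer on an H100 resolves to one of the renamed config files:
# E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
print(moe_config_file_name(8, 3584, "fp8_w8a8", "NVIDIA H100 80GB HBM3"))
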
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 36b3ba8f7d1bb..3f6f5adee5a56 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -13,3 +13,8 @@ class RocmPlatform(Platform): @lru_cache(maxsize=8) def get_device_capability(device_id: int = 0) -> Tuple[int, int]: return torch.cuda.get_device_capability(device_id) + + @staticmethod + @lru_cache(maxsize=8) + def get_device_name(device_id: int = 0) -> str: + return torch.cuda.get_device_name(device_id) diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py index 7707d38a0f666..324044c96d994 100644 --- a/vllm/spec_decode/draft_model_runner.py +++ b/vllm/spec_decode/draft_model_runner.py @@ -97,38 +97,6 @@ def __init__( self.flashinfer_prefill_workspace_buffer = None self.flashinfer_prefill_wrapper = None - def _update_flash_attn_metadata(self, attn_metadata, num_seqs, - num_queries): - assert isinstance(attn_metadata, FlashAttentionMetadata) - - if num_seqs != num_queries: - assert num_seqs > num_queries - assert attn_metadata.use_cuda_graph - - assert attn_metadata.num_prefills == 0 - assert attn_metadata.num_prefill_tokens == 0 - assert attn_metadata.num_decode_tokens == num_seqs - assert attn_metadata.slot_mapping.shape == (num_seqs, ) - - assert len(attn_metadata.seq_lens) == num_seqs - assert attn_metadata.seq_lens_tensor.shape == (num_seqs, ) - assert attn_metadata.max_query_len == 1 - assert attn_metadata.max_prefill_seq_len == 0 - assert attn_metadata.max_decode_seq_len == max(attn_metadata.seq_lens) - - assert attn_metadata.query_start_loc.shape == (num_queries + 1, ) - assert attn_metadata.seq_start_loc.shape == (num_seqs + 1, ) - - assert attn_metadata.context_lens_tensor.shape == (num_queries, ) - - assert attn_metadata.block_tables.shape[0] == num_seqs - - # Update query lengths. Note that we update only queries and not seqs, - # since tensors may be padded due to captured cuda graph batch size - for i in range(num_queries): - attn_metadata.seq_lens[i] += 1 - attn_metadata.max_decode_seq_len = max(attn_metadata.seq_lens) - def _update_sampling_metadata(self, sampling_metadata, num_seqs, num_queries): @@ -166,7 +134,7 @@ def _gpu_advance_step( # Update attn_metadata attn_metadata = model_input.attn_metadata assert isinstance(attn_metadata, FlashAttentionMetadata) - self._update_flash_attn_metadata(attn_metadata, num_seqs, num_queries) + attn_metadata.advance_step(num_seqs, num_queries) # Update GPU tensors ops.advance_step(num_seqs=num_seqs, diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py index a22a9f475dda9..139e6b3cdacbe 100644 --- a/vllm/transformers_utils/configs/nemotron.py +++ b/vllm/transformers_utils/configs/nemotron.py @@ -35,20 +35,20 @@ class NemotronConfig(PretrainedConfig): Args: - vocab_size (`int`, *optional*, defaults to 32000): + vocab_size (`int`, *optional*, defaults to 256000): Vocabulary size of the Nemotron model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [`NemotronModel`] - hidden_size (`int`, *optional*, defaults to 4096): + hidden_size (`int`, *optional*, defaults to 6144): Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 11008): + intermediate_size (`int`, *optional*, defaults to 24576): Dimension of the MLP representations. num_hidden_layers (`int`, *optional*, defaults to 32): Number of hidden layers in the Transformer decoder. 
- num_attention_heads (`int`, *optional*, defaults to 32): + num_attention_heads (`int`, *optional*, defaults to 48): Number of attention heads for each attention layer in the Transformer decoder. - head_dim (`int`, *optional*, defaults to None): + head_dim (`int`, *optional*): Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if None num_key_value_heads (`int`, *optional*): @@ -63,16 +63,16 @@ class NemotronConfig(PretrainedConfig): heads within that group. For more details checkout [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`): The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 2048): + max_position_embeddings (`int`, *optional*, defaults to 4096): The maximum sequence length that this model might ever be used with. - initializer_range (`float`, *optional*, defaults to 0.02): + initializer_range (`float`, *optional*, defaults to 0.0134): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - norm_eps (`float`, *optional*, defaults to 1e-06): + norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the normalization layers. use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values @@ -80,21 +80,16 @@ class NemotronConfig(PretrainedConfig): `config.is_decoder=True`. pad_token_id (`int`, *optional*): Padding token id. - bos_token_id (`int`, *optional*, defaults to 1): + bos_token_id (`int`, *optional*, defaults to 2): Beginning of stream token id. - eos_token_id (`int`, *optional*, defaults to 2): + eos_token_id (`int`, *optional*, defaults to 3): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings rope_theta (`float`, *optional*, defaults to 10000.0): The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE - embeddings. Currently supports two scaling strategies: linear - and dynamic. Their scaling factor must be a float greater than 1. - The expected format is `{"type": strategy name, - "factor": scaling factor}`. When using this flag, don't update - `max_position_embeddings` to the expected new maximum. + partial_rotary_factor (`float`, *optional*, defaults to 0.5): + Percentage of the query and keys which will have rotary embedding. attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. 
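
For context on the rope_percent to partial_rotary_factor rename: the factor only shrinks the slice of each head that receives rotary embeddings, as in the get_rope() change earlier in this diff (rotary_dim = int(rotary_dim * partial_rotary_factor)). A minimal sketch of the effect; the helper name and tensor shapes are illustrative only:

import torch

# Illustrative sketch: with partial_rotary_factor, RoPE touches only the first
# rotary_dim channels of each head; the remainder passes through unchanged.
def split_for_partial_rope(q: torch.Tensor, head_size: int,
                           partial_rotary_factor: float = 0.5):
    rotary_dim = int(head_size * partial_rotary_factor)
    return q[..., :rotary_dim], q[..., rotary_dim:]

q = torch.randn(2, 48, 128)                     # (tokens, heads, head_size); example sizes
q_rot, q_pass = split_for_partial_rope(q, 128)  # factor 0.5 -> rotary_dim = 64
print(q_rot.shape, q_pass.shape)                # torch.Size([2, 48, 64]) torch.Size([2, 48, 64])
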
@@ -106,13 +101,10 @@ class NemotronConfig(PretrainedConfig): ```python >>> from transformers import NemotronModel, NemotronConfig - >>> # Initializing a Nemotron nemotron-15b style configuration >>> configuration = NemotronConfig() - >>> # Initializing a model from the nemotron-15b style configuration >>> model = NemotronModel(configuration) - >>> # Accessing the model configuration >>> configuration = model.config ```""" @@ -140,7 +132,7 @@ def __init__( tie_word_embeddings=False, rope_theta=10000.0, rope_scaling=None, - rope_percent=0.5, + partial_rotary_factor=0.5, attention_bias=False, attention_dropout=0.0, mlp_bias=False, @@ -167,8 +159,10 @@ def __init__( self.use_cache = use_cache self.rope_theta = rope_theta self.rope_scaling = rope_scaling - rope_percent = rope_percent or kwargs.get("rope_percentage", None) - self.rope_percent = rope_percent + # for backward compatibility + partial_rotary_factor = kwargs.get("rope_percent", None) or kwargs.get( + "rope_percentage", None) or partial_rotary_factor + self.partial_rotary_factor = partial_rotary_factor self._rope_scaling_validation() self.attention_bias = attention_bias self.attention_dropout = attention_dropout diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 47068f77ec5df..d01b4e781b608 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1554,6 +1554,21 @@ def execute_model( # Compute the logits in the last pipeline stage. if not get_pp_group().is_last_rank: + if (self.is_driver_worker + and hidden_or_intermediate_states is not None + and isinstance(hidden_or_intermediate_states, + IntermediateTensors) + and self.observability_config is not None + and self.observability_config.collect_model_forward_time): + model_forward_end.synchronize() + model_forward_time = model_forward_start.elapsed_time( + model_forward_end) + orig_model_forward_time = 0.0 + if intermediate_tensors is not None: + orig_model_forward_time = intermediate_tensors.tensors.get( + "model_forward_time", torch.tensor(0.0)).item() + hidden_or_intermediate_states.tensors["model_forward_time"] = ( + torch.tensor(model_forward_time + orig_model_forward_time)) return hidden_or_intermediate_states logits = self.model.compute_logits(hidden_or_intermediate_states, @@ -1573,11 +1588,16 @@ def execute_model( model_forward_end.synchronize() model_forward_time = model_forward_start.elapsed_time( model_forward_end) + orig_model_forward_time = 0.0 + if intermediate_tensors is not None: + orig_model_forward_time = intermediate_tensors.tensors.get( + "model_forward_time", torch.tensor(0.0)).item() # If there are multiple workers, we are still tracking the latency # from the start time of the driver worker to the end time of the # driver worker. The model forward time will then end up covering # the communication time as well. - output.model_forward_time = model_forward_time + output.model_forward_time = (orig_model_forward_time + + model_forward_time) if self.return_hidden_states: # we only need to pass hidden states of most recent token @@ -1735,7 +1755,7 @@ def forward( **kwargs) if intermediate_tensors is not None: for key in intermediate_tensors.tensors: - if key != "model_execute_time": + if key != "model_execute_time" and key != "model_forward_time": self.input_buffers[key].copy_(intermediate_tensors[key], non_blocking=True) # Run the graph. 
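
The model_runner.py hunks above thread the measured forward time through IntermediateTensors so each pipeline-parallel rank adds its own contribution to the total. A simplified sketch of that bookkeeping; the dict stand-in and the example timings are illustrative only:

import torch

# Simplified stand-in for the IntermediateTensors bookkeeping: every rank adds
# its own measured forward time (ms) to whatever earlier ranks already recorded.
def accumulate_forward_time(tensors: dict, model_forward_time: float) -> dict:
    orig = tensors.get("model_forward_time", torch.tensor(0.0)).item()
    tensors["model_forward_time"] = torch.tensor(orig + model_forward_time)
    return tensors

tensors: dict = {}
for rank_forward_ms in (3.2, 2.9, 3.1):  # hypothetical per-rank forward times
    tensors = accumulate_forward_time(tensors, rank_forward_ms)
print(tensors["model_forward_time"])     # tensor(9.2000): the summed forward time
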
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 4c30f20ff076e..8f4372e20d2e7 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -368,7 +368,7 @@ def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype): if torch_dtype == torch.bfloat16: compute_capability = current_platform.get_device_capability() if compute_capability[0] < 8: - gpu_name = torch.cuda.get_device_name() + gpu_name = current_platform.get_device_name() raise ValueError( "Bfloat16 is only supported on GPUs with compute capability " f"of at least 8.0. Your {gpu_name} GPU has compute capability "