diff --git a/tests/unit_tests/ops/data/awq/input.pt b/tests/unit_tests/ops/data/awq/input.pt new file mode 100644 index 00000000..23933bfd Binary files /dev/null and b/tests/unit_tests/ops/data/awq/input.pt differ diff --git a/tests/unit_tests/ops/data/awq/output.pt b/tests/unit_tests/ops/data/awq/output.pt new file mode 100644 index 00000000..aeda8918 Binary files /dev/null and b/tests/unit_tests/ops/data/awq/output.pt differ diff --git a/tests/unit_tests/ops/data/awq/qweight.pt b/tests/unit_tests/ops/data/awq/qweight.pt new file mode 100644 index 00000000..9286af88 Binary files /dev/null and b/tests/unit_tests/ops/data/awq/qweight.pt differ diff --git a/tests/unit_tests/ops/data/awq/qzeros.pt b/tests/unit_tests/ops/data/awq/qzeros.pt new file mode 100644 index 00000000..9a29a304 Binary files /dev/null and b/tests/unit_tests/ops/data/awq/qzeros.pt differ diff --git a/tests/unit_tests/ops/data/awq/scales.pt b/tests/unit_tests/ops/data/awq/scales.pt new file mode 100644 index 00000000..1a51dcdf Binary files /dev/null and b/tests/unit_tests/ops/data/awq/scales.pt differ diff --git a/tests/unit_tests/ops/data/compressed_tensors/linear_w8a8fp8_input.pt b/tests/unit_tests/ops/data/compressed_tensors/linear_w8a8fp8_input.pt new file mode 100644 index 00000000..ab392b5f Binary files /dev/null and b/tests/unit_tests/ops/data/compressed_tensors/linear_w8a8fp8_input.pt differ diff --git a/tests/unit_tests/ops/data/compressed_tensors/linear_w8a8fp8_output.pt b/tests/unit_tests/ops/data/compressed_tensors/linear_w8a8fp8_output.pt new file mode 100644 index 00000000..56e1062d Binary files /dev/null and b/tests/unit_tests/ops/data/compressed_tensors/linear_w8a8fp8_output.pt differ diff --git a/tests/unit_tests/ops/data/compressed_tensors/linear_w8a8fp8_weight.pt b/tests/unit_tests/ops/data/compressed_tensors/linear_w8a8fp8_weight.pt new file mode 100644 index 00000000..5e652925 Binary files /dev/null and b/tests/unit_tests/ops/data/compressed_tensors/linear_w8a8fp8_weight.pt differ diff --git a/tests/unit_tests/ops/data/compressed_tensors/linear_w8a8fp8_weight_scale.pt b/tests/unit_tests/ops/data/compressed_tensors/linear_w8a8fp8_weight_scale.pt new file mode 100644 index 00000000..d3f24ff8 Binary files /dev/null and b/tests/unit_tests/ops/data/compressed_tensors/linear_w8a8fp8_weight_scale.pt differ diff --git a/tests/unit_tests/ops/data/compressed_tensors/linear_wna16_input.pt b/tests/unit_tests/ops/data/compressed_tensors/linear_wna16_input.pt new file mode 100644 index 00000000..8d08d55a Binary files /dev/null and b/tests/unit_tests/ops/data/compressed_tensors/linear_wna16_input.pt differ diff --git a/tests/unit_tests/ops/data/compressed_tensors/linear_wna16_output.pt b/tests/unit_tests/ops/data/compressed_tensors/linear_wna16_output.pt new file mode 100644 index 00000000..0df01685 Binary files /dev/null and b/tests/unit_tests/ops/data/compressed_tensors/linear_wna16_output.pt differ diff --git a/tests/unit_tests/ops/data/compressed_tensors/linear_wna16_weight_packed.pt b/tests/unit_tests/ops/data/compressed_tensors/linear_wna16_weight_packed.pt new file mode 100644 index 00000000..09641c69 Binary files /dev/null and b/tests/unit_tests/ops/data/compressed_tensors/linear_wna16_weight_packed.pt differ diff --git a/tests/unit_tests/ops/data/compressed_tensors/linear_wna16_weight_scale.pt b/tests/unit_tests/ops/data/compressed_tensors/linear_wna16_weight_scale.pt new file mode 100644 index 00000000..1beecf25 Binary files /dev/null and 
b/tests/unit_tests/ops/data/compressed_tensors/linear_wna16_weight_scale.pt differ diff --git a/tests/unit_tests/ops/data/compressed_tensors/linear_wna16_weight_zero_point.pt b/tests/unit_tests/ops/data/compressed_tensors/linear_wna16_weight_zero_point.pt new file mode 100644 index 00000000..69384ea1 Binary files /dev/null and b/tests/unit_tests/ops/data/compressed_tensors/linear_wna16_weight_zero_point.pt differ diff --git a/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_input_hidden_states.pt b/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_input_hidden_states.pt new file mode 100644 index 00000000..cac9e0a8 Binary files /dev/null and b/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_input_hidden_states.pt differ diff --git a/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_input_router_logits.pt b/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_input_router_logits.pt new file mode 100644 index 00000000..58ab4c17 Binary files /dev/null and b/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_input_router_logits.pt differ diff --git a/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_output.pt b/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_output.pt new file mode 100644 index 00000000..8ef218bd Binary files /dev/null and b/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_output.pt differ diff --git a/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_w13_weight_packed.pt b/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_w13_weight_packed.pt new file mode 100644 index 00000000..36abfd14 Binary files /dev/null and b/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_w13_weight_packed.pt differ diff --git a/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_w13_weight_scale.pt b/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_w13_weight_scale.pt new file mode 100644 index 00000000..42633606 Binary files /dev/null and b/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_w13_weight_scale.pt differ diff --git a/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_w2_weight_packed.pt b/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_w2_weight_packed.pt new file mode 100644 index 00000000..e7fdd634 Binary files /dev/null and b/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_w2_weight_packed.pt differ diff --git a/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_w2_weight_scale.pt b/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_w2_weight_scale.pt new file mode 100644 index 00000000..050acf67 Binary files /dev/null and b/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_w2_weight_scale.pt differ diff --git a/tests/unit_tests/ops/data/fp8/linear_input.pt b/tests/unit_tests/ops/data/fp8/linear_input.pt new file mode 100644 index 00000000..03fed11c Binary files /dev/null and b/tests/unit_tests/ops/data/fp8/linear_input.pt differ diff --git a/tests/unit_tests/ops/data/fp8/linear_output.pt b/tests/unit_tests/ops/data/fp8/linear_output.pt new file mode 100644 index 00000000..104d98c3 Binary files /dev/null and b/tests/unit_tests/ops/data/fp8/linear_output.pt differ diff --git a/tests/unit_tests/ops/data/fp8/linear_weight.pt b/tests/unit_tests/ops/data/fp8/linear_weight.pt new file mode 100644 index 00000000..48c6f935 Binary files /dev/null and b/tests/unit_tests/ops/data/fp8/linear_weight.pt differ diff --git a/tests/unit_tests/ops/data/fp8/linear_weight_scale_inv.pt b/tests/unit_tests/ops/data/fp8/linear_weight_scale_inv.pt new file mode 100644 index 00000000..3644d0cc Binary 
files /dev/null and b/tests/unit_tests/ops/data/fp8/linear_weight_scale_inv.pt differ diff --git a/tests/unit_tests/ops/data/fp8/moe_input_hidden_states.pt b/tests/unit_tests/ops/data/fp8/moe_input_hidden_states.pt new file mode 100644 index 00000000..2adb8031 Binary files /dev/null and b/tests/unit_tests/ops/data/fp8/moe_input_hidden_states.pt differ diff --git a/tests/unit_tests/ops/data/fp8/moe_input_router_logits.pt b/tests/unit_tests/ops/data/fp8/moe_input_router_logits.pt new file mode 100644 index 00000000..01ded36f Binary files /dev/null and b/tests/unit_tests/ops/data/fp8/moe_input_router_logits.pt differ diff --git a/tests/unit_tests/ops/data/fp8/moe_output.pt b/tests/unit_tests/ops/data/fp8/moe_output.pt new file mode 100644 index 00000000..4652f3c0 Binary files /dev/null and b/tests/unit_tests/ops/data/fp8/moe_output.pt differ diff --git a/tests/unit_tests/ops/data/fp8/moe_w13_weight.pt b/tests/unit_tests/ops/data/fp8/moe_w13_weight.pt new file mode 100644 index 00000000..4345f6f6 Binary files /dev/null and b/tests/unit_tests/ops/data/fp8/moe_w13_weight.pt differ diff --git a/tests/unit_tests/ops/data/fp8/moe_w13_weight_scale_inv.pt b/tests/unit_tests/ops/data/fp8/moe_w13_weight_scale_inv.pt new file mode 100644 index 00000000..409c1cf8 Binary files /dev/null and b/tests/unit_tests/ops/data/fp8/moe_w13_weight_scale_inv.pt differ diff --git a/tests/unit_tests/ops/data/fp8/moe_w2_weight.pt b/tests/unit_tests/ops/data/fp8/moe_w2_weight.pt new file mode 100644 index 00000000..55a919b4 Binary files /dev/null and b/tests/unit_tests/ops/data/fp8/moe_w2_weight.pt differ diff --git a/tests/unit_tests/ops/data/fp8/moe_w2_weight_scale_inv.pt b/tests/unit_tests/ops/data/fp8/moe_w2_weight_scale_inv.pt new file mode 100644 index 00000000..ef394af6 Binary files /dev/null and b/tests/unit_tests/ops/data/fp8/moe_w2_weight_scale_inv.pt differ diff --git a/tests/unit_tests/ops/data/gptq/input.pt b/tests/unit_tests/ops/data/gptq/input.pt new file mode 100644 index 00000000..4bbfd6a8 Binary files /dev/null and b/tests/unit_tests/ops/data/gptq/input.pt differ diff --git a/tests/unit_tests/ops/data/gptq/output.pt b/tests/unit_tests/ops/data/gptq/output.pt new file mode 100644 index 00000000..88c44aa1 Binary files /dev/null and b/tests/unit_tests/ops/data/gptq/output.pt differ diff --git a/tests/unit_tests/ops/data/gptq/qweight.pt b/tests/unit_tests/ops/data/gptq/qweight.pt new file mode 100644 index 00000000..eab3b77f Binary files /dev/null and b/tests/unit_tests/ops/data/gptq/qweight.pt differ diff --git a/tests/unit_tests/ops/data/gptq/qzeros.pt b/tests/unit_tests/ops/data/gptq/qzeros.pt new file mode 100644 index 00000000..cd4bc995 Binary files /dev/null and b/tests/unit_tests/ops/data/gptq/qzeros.pt differ diff --git a/tests/unit_tests/ops/data/gptq/scales.pt b/tests/unit_tests/ops/data/gptq/scales.pt new file mode 100644 index 00000000..aedc48de Binary files /dev/null and b/tests/unit_tests/ops/data/gptq/scales.pt differ diff --git a/tests/unit_tests/ops/test_hpu_awq.py b/tests/unit_tests/ops/test_hpu_awq.py new file mode 100644 index 00000000..5ddd7970 --- /dev/null +++ b/tests/unit_tests/ops/test_hpu_awq.py @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch +import habana_frameworks.torch as htorch +from utils import get_data_path +from vllm_gaudi.ops.hpu_awq import AWQHPULinearMethod, AWQHPUConfig +from vllm_gaudi.utils import HPUCompileConfig +from vllm.model_executor.layers.linear import 
RowParallelLinear + + +def test_awq_linear_method(dist_init): + config = {"bits": 4, "group_size": 128, "zero_point": True} + oot_quant_config = AWQHPUConfig.from_config(config) + + # Prepare linear layer with oot AWQHPULinearMethod + oot_op = RowParallelLinear(input_size=256, + output_size=128, + bias=False, + input_is_parallel=True, + skip_bias_add=False, + params_dtype=torch.bfloat16, + reduce_results=True, + quant_config=oot_quant_config, + return_bias=False, + disable_tp=False).to("hpu") + assert isinstance(oot_op.quant_method, AWQHPULinearMethod) + + # qweight, qzeros, scales were extracted from first RowParallelLinear of TheBloke/Llama-2-7B-Chat-AWQ + # (with adjusted shape, to make tensors smaller) + qweight = torch.load(get_data_path("data/awq/qweight.pt"), weights_only=False, map_location="hpu") + oot_op.qweight.copy_(qweight) + qzeros = torch.load(get_data_path("data/awq/qzeros.pt"), weights_only=False, map_location="hpu") + oot_op.qzeros.copy_(qzeros) + scales = torch.load(get_data_path("data/awq/scales.pt"), weights_only=False, map_location="hpu").to(torch.bfloat16) + oot_op.scales.copy_(scales) + + oot_op.quant_method.process_weights_after_loading(oot_op) + + if not htorch.utils.internal.is_lazy(): + compile_config = HPUCompileConfig() + oot_op = torch.compile(oot_op, **compile_config.get_compile_args()) + + # Input and expected output + # Output tensor holds the data that was returned by cuda implementation of AWQLinearMethod for given input + # (AWQLinearMethod was triggered offline with the same input as below to get the ref_output) + input = torch.load(get_data_path("data/awq/input.pt"), weights_only=False, map_location="hpu").to(torch.bfloat16) + ref_output = torch.load(get_data_path("data/awq/output.pt"), weights_only=False, + map_location="hpu").to(torch.bfloat16) + + # Execute layer + out = oot_op(input) + + # Check correctness + torch.testing.assert_close(ref_output, out, atol=1e-3, rtol=1e-3) diff --git a/tests/unit_tests/ops/test_hpu_compressed_tensors.py b/tests/unit_tests/ops/test_hpu_compressed_tensors.py new file mode 100644 index 00000000..301640a6 --- /dev/null +++ b/tests/unit_tests/ops/test_hpu_compressed_tensors.py @@ -0,0 +1,304 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch +import habana_frameworks.torch as htorch +from utils import get_data_path +from unittest.mock import MagicMock +from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import CompressedTensorsConfig +from vllm_gaudi.ops.hpu_compressed_tensors import (HPUCompressedTensorsLinearMethod, HPUCompressedTensorsW8A8Fp8, + HPUCompressedTensorsWNA16, HPUCompressedTensorsWNA16MoEMethod) +from vllm_gaudi.utils import HPUCompileConfig +from vllm.forward_context import override_forward_context +from vllm.model_executor.layers.linear import RowParallelLinear +from vllm.model_executor.layers.fused_moe.layer import FusedMoE + + +def test_compressed_tensors_linear_method_w8a8fp8(dist_init): + config = { + 'config_groups': { + 'group_0': { + 'input_activations': { + 'block_structure': None, + 'dynamic': True, + 'group_size': None, + 'num_bits': 8, + 'observer': 'memoryless', + 'observer_kwargs': {}, + 'strategy': 'token', + 'symmetric': True, + 'type': 'float' + }, + 'output_activations': None, + 'targets': ['Linear'], + 'weights': { + 'block_structure': None, + 'dynamic': False, + 'group_size': None, + 'num_bits': 8, + 'observer': 'minmax', + 'observer_kwargs': {}, + 'strategy': 'channel', + 
'symmetric': True, + 'type': 'float' + } + } + }, + 'format': 'naive-quantized', + 'global_compression_ratio': 1.239290831149584, + 'ignore': [], + 'kv_cache_scheme': None, + 'quant_method': 'compressed-tensors', + 'quantization_status': 'frozen' + } + oot_quant_config = CompressedTensorsConfig.from_config(config) + + # Prepare linear layer with oot CompressedTensorsLinearMethod + # with HPUCompressedTensorsW8A8Fp8 scheme + oot_op = RowParallelLinear(input_size=256, + output_size=256, + bias=False, + input_is_parallel=True, + skip_bias_add=False, + params_dtype=torch.bfloat16, + reduce_results=True, + quant_config=oot_quant_config, + return_bias=False, + disable_tp=False).to("hpu") + assert isinstance(oot_op.quant_method, HPUCompressedTensorsLinearMethod) + assert isinstance(oot_op.scheme, HPUCompressedTensorsW8A8Fp8) + + # Weight and weight_scale_inv were extracted from first RowParallelLinear + # layer of RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8-dynamic + # (with adjusted shapes, to make tensors smaller) + weight = torch.load(get_data_path("data/compressed_tensors/linear_w8a8fp8_weight.pt"), + weights_only=False, + map_location="hpu") + oot_op.weight.copy_(weight) + weight_scale = torch.load(get_data_path("data/compressed_tensors/linear_w8a8fp8_weight_scale.pt"), + weights_only=False, + map_location="hpu") + oot_op.weight_scale.copy_(weight_scale) + + oot_op.quant_method.process_weights_after_loading(oot_op) + + if not htorch.utils.internal.is_lazy(): + compile_config = HPUCompileConfig() + oot_op = torch.compile(oot_op, **compile_config.get_compile_args()) + + # Input and expected output + # Output tensor holds data that was returned by cuda impl of CompressedTensorsLinearMethod for given input + # (CompressedTensorsLinearMethod was triggered offline with the same input as below to get the ref_output) + input = torch.load(get_data_path("data/compressed_tensors/linear_w8a8fp8_input.pt"), + weights_only=False, + map_location="hpu") + ref_output = torch.load(get_data_path("data/compressed_tensors/linear_w8a8fp8_output.pt"), + weights_only=False, + map_location="hpu") + + # Execute layer + out = oot_op(input) + + # Check correctness + torch.testing.assert_close(ref_output, out, atol=1e-3, rtol=1e-3) + + +def test_compressed_tensors_linear_method_wna16(dist_init): + config = { + 'config_groups': { + 'group_0': { + 'input_activations': None, + 'output_activations': None, + 'targets': ['Linear'], + 'weights': { + 'actorder': 'weight', + 'block_structure': None, + 'dynamic': False, + 'group_size': 128, + 'num_bits': 4, + 'observer': 'minmax', + 'observer_kwargs': {}, + 'strategy': 'group', + 'symmetric': False, + 'type': 'int' + } + } + }, + 'format': 'pack-quantized', + 'global_compression_ratio': None, + 'ignore': [], + 'kv_cache_scheme': None, + 'quant_method': 'compressed-tensors', + 'quantization_status': 'compressed' + } + oot_quant_config = CompressedTensorsConfig.from_config(config) + + # Prepare linear layer with oot CompressedTensorsLinearMethod + # with HPUCompressedTensorsWNA16 scheme + oot_op = RowParallelLinear(input_size=256, + output_size=256, + bias=False, + input_is_parallel=True, + skip_bias_add=False, + params_dtype=torch.bfloat16, + reduce_results=True, + quant_config=oot_quant_config, + return_bias=False, + disable_tp=False).to("hpu") + assert isinstance(oot_op.quant_method, HPUCompressedTensorsLinearMethod) + assert isinstance(oot_op.scheme, HPUCompressedTensorsWNA16) + + # Weights were extracted from first RowParallelLinear layer of RedHatAI/Qwen3-8B-quantized.w4a16 + # 
(with adjusted shapes, to make tensors smaller) + weight_packed = torch.load(get_data_path("data/compressed_tensors/linear_wna16_weight_packed.pt"), + weights_only=False, + map_location="hpu") + oot_op.weight_packed.copy_(weight_packed) + weight_scale = torch.load(get_data_path("data/compressed_tensors/linear_wna16_weight_scale.pt"), + weights_only=False, + map_location="hpu") + oot_op.weight_scale.copy_(weight_scale) + weight_zero_point = torch.load(get_data_path("data/compressed_tensors/linear_wna16_weight_zero_point.pt"), + weights_only=False, + map_location="hpu") + oot_op.weight_zero_point.copy_(weight_zero_point) + oot_op.weight_shape.data = torch.tensor([256, 256], device='hpu:0') + + oot_op.quant_method.process_weights_after_loading(oot_op) + + if not htorch.utils.internal.is_lazy(): + compile_config = HPUCompileConfig() + oot_op = torch.compile(oot_op, **compile_config.get_compile_args()) + + # Input and expected output + # Output tensor holds data that was returned by cuda impl of CompressedTensorsLinearMethod for given input + # (CompressedTensorsLinearMethod was triggered offline with the same input as below to get the ref_output) + input = torch.load(get_data_path("data/compressed_tensors/linear_wna16_input.pt"), + weights_only=False, + map_location="hpu") + ref_output = torch.load(get_data_path("data/compressed_tensors/linear_wna16_output.pt"), + weights_only=False, + map_location="hpu") + + # Execute layer + out = oot_op(input) + + # Check correctness + torch.testing.assert_close(ref_output, out, atol=1e-3, rtol=1e-3) + + +def test_compressed_tensors_wna16_moe_method(dist_init): + config = { + 'config_groups': { + 'group_0': { + 'input_activations': None, + 'output_activations': None, + 'targets': ['Linear'], + 'weights': { + 'actorder': 'weight', + 'block_structure': None, + 'dynamic': False, + 'group_size': 128, + 'num_bits': 4, + 'observer': 'minmax', + 'observer_kwargs': {}, + 'strategy': 'group', + 'symmetric': True, + 'type': 'int' + } + } + }, + 'format': 'pack-quantized', + 'global_compression_ratio': None, + 'ignore': [], + 'kv_cache_scheme': None, + 'quant_method': 'compressed-tensors', + 'quantization_status': 'compressed' + } + oot_quant_config = CompressedTensorsConfig.from_config(config) + + # Prepare FusedMoE layer with oot HPUCompressedTensorsWNA16MoEMethod + oot_op = FusedMoE(num_experts=128, + top_k=8, + hidden_size=512, + intermediate_size=256, + params_dtype=torch.bfloat16, + reduce_results=True, + renormalize=True, + use_grouped_topk=False, + num_expert_group=None, + topk_group=None, + quant_config=oot_quant_config, + tp_size=None, + ep_size=None, + dp_size=None, + custom_routing_function=None, + scoring_func="softmax", + routed_scaling_factor=1.0, + e_score_correction_bias=None, + apply_router_weight_on_input=False, + activation="silu", + enable_eplb=False, + num_redundant_experts=0, + has_bias=False, + is_sequence_parallel=False, + zero_expert_num=0, + zero_expert_type=None).to("hpu") + assert isinstance(oot_op.quant_method, HPUCompressedTensorsWNA16MoEMethod) + + # Weights were extracted from first FusedMoE layer of RedHatAI/Qwen3-30B-A3B-quantized.w4a16 + # (with adjusted shapes, to make tensors smaller) + w2_weight_packed = torch.load(get_data_path("data/compressed_tensors/moe_wna16_w2_weight_packed.pt"), + weights_only=False, + map_location="hpu") + w2_weight_packed = torch.swapaxes(w2_weight_packed, 0, 1).repeat(128, 1, 1) + oot_op.w2_weight_packed.copy_(w2_weight_packed) + w13_weight_packed = 
torch.load(get_data_path("data/compressed_tensors/moe_wna16_w13_weight_packed.pt"), + weights_only=False, + map_location="hpu") + w13_weight_packed = torch.swapaxes(w13_weight_packed, 0, 1).repeat(128, 1, 1) + oot_op.w13_weight_packed.copy_(w13_weight_packed) + + w2_weight_scale = torch.load(get_data_path("data/compressed_tensors/moe_wna16_w2_weight_scale.pt"), + weights_only=False, + map_location="hpu") + w2_weight_scale = torch.swapaxes(w2_weight_scale, 0, 1).repeat(128, 1, 1) + oot_op.w2_weight_scale.copy_(w2_weight_scale) + w13_weight_scale = torch.load(get_data_path("data/compressed_tensors/moe_wna16_w13_weight_scale.pt"), + weights_only=False, + map_location="hpu") + w13_weight_scale = torch.swapaxes(w13_weight_scale, 0, 1).repeat(128, 1, 1) + oot_op.w13_weight_scale.copy_(w13_weight_scale) + + w2_weight_shape = torch.tensor([512, 256], dtype=torch.bfloat16, device="hpu") + oot_op.w2_weight_shape.copy_(w2_weight_shape.repeat(128, 1)) + w13_weight_shape = torch.tensor([256, 512], dtype=torch.bfloat16, device="hpu") + oot_op.w13_weight_shape.copy_(w13_weight_shape.repeat(128, 1)) + + oot_op.quant_method.process_weights_after_loading(oot_op) + + if not htorch.utils.internal.is_lazy(): + compile_config = HPUCompileConfig() + oot_op = torch.compile(oot_op, **compile_config.get_compile_args()) + + # Input and expected output + # Output tensor holds data that was returned by cuda impl of CompressedTensorsWNA16MarlinMoEMethod for given input + # (CompressedTensorsWNA16MarlinMoEMethod was triggered offline with the same input as below to get the ref_output) + hidden_states = torch.load(get_data_path("data/compressed_tensors/moe_wna16_input_hidden_states.pt"), + weights_only=False, + map_location="hpu") + router_logits = torch.load(get_data_path("data/compressed_tensors/moe_wna16_input_router_logits.pt"), + weights_only=False, + map_location="hpu") + ref_output = torch.load(get_data_path("data/compressed_tensors/moe_wna16_output.pt"), + weights_only=False, + map_location="hpu") + + # Execute layer + mock_ctx = MagicMock(spec=["dp_metadata"]) + mock_ctx.dp_metadata = None + with override_forward_context(mock_ctx): + out = oot_op.forward_impl(hidden_states, router_logits) + + # Check correctness + torch.testing.assert_close(ref_output, out, atol=1e-4, rtol=1e-4) diff --git a/tests/unit_tests/ops/test_hpu_fp8.py b/tests/unit_tests/ops/test_hpu_fp8.py new file mode 100644 index 00000000..f643b918 --- /dev/null +++ b/tests/unit_tests/ops/test_hpu_fp8.py @@ -0,0 +1,142 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch +import habana_frameworks.torch as htorch +from utils import get_data_path +from unittest.mock import MagicMock +from vllm_gaudi.ops.hpu_fp8 import Fp8LinearMethod, HPUFp8MoEMethod +from vllm_gaudi.utils import HPUCompileConfig +from vllm.forward_context import override_forward_context +from vllm.model_executor.layers.quantization.fp8 import Fp8Config +from vllm.model_executor.layers.linear import RowParallelLinear +from vllm.model_executor.layers.fused_moe.layer import FusedMoE + + +def test_fp8_linear_method(dist_init, monkeypatch): + monkeypatch.setenv("VLLM_HPU_FORCE_CHANNEL_FP8", "0") + config = {'activation_scheme': 'dynamic', 'fmt': 'e4m3', 'quant_method': 'fp8', 'weight_block_size': [128, 128]} + oot_quant_config = Fp8Config.from_config(config) + + # Prepare linear layer with oot Fp8LinearMethod + oot_op = RowParallelLinear(input_size=256, + output_size=256, + bias=False, + input_is_parallel=True, + 
skip_bias_add=False, + params_dtype=torch.bfloat16, + reduce_results=True, + quant_config=oot_quant_config, + return_bias=False, + disable_tp=False).to("hpu") + assert isinstance(oot_op.quant_method, Fp8LinearMethod) + + # Weight and weight_scale_inv were extracted from first RowParallelLinear layer of Qwen/Qwen3-8B-FP8 + # (with adjusted shapes, to make tensors smaller) + weight = torch.load(get_data_path("data/fp8/linear_weight.pt"), weights_only=False, map_location="hpu") + oot_op.weight.copy_(weight) + weight_scale_inv = torch.load(get_data_path("data/fp8/linear_weight_scale_inv.pt"), + weights_only=False, + map_location="hpu") + oot_op.weight_scale_inv.copy_(weight_scale_inv) + + oot_op.quant_method.process_weights_after_loading(oot_op) + + if not htorch.utils.internal.is_lazy(): + # Setting fullgraph to False, because currently there is a graph break + compile_config = HPUCompileConfig(fullgraph=False) + oot_op = torch.compile(oot_op, **compile_config.get_compile_args()) + + # Input and expected output + # Output tensor holds the data that was returned by cuda implementation of Fp8LinearMethod for given input + # (Fp8LinearMethod was triggered offline with the same input as below to get the ref_output) + input = torch.load(get_data_path("data/fp8/linear_input.pt"), weights_only=False, map_location="hpu") + ref_output = torch.load(get_data_path("data/fp8/linear_output.pt"), weights_only=False, map_location="hpu") + + # Execute layer + out = oot_op(input) + + # Check correctness + torch.testing.assert_close(ref_output, out, atol=1e-3, rtol=1e-3) + + +def test_fp8_moe_method(dist_init, monkeypatch): + monkeypatch.setenv("VLLM_HPU_FORCE_CHANNEL_FP8", "0") + config = { + 'activation_scheme': 'dynamic', + 'modules_to_not_convert': [], + 'fmt': 'e4m3', + 'quant_method': 'fp8', + 'weight_block_size': [128, 128] + } + oot_quant_config = Fp8Config.from_config(config) + + # Prepare FusedMoE layer with oot HPUFp8MoEMethod + oot_op = FusedMoE(num_experts=128, + top_k=8, + hidden_size=512, + intermediate_size=256, + params_dtype=torch.bfloat16, + reduce_results=True, + renormalize=True, + use_grouped_topk=False, + num_expert_group=None, + topk_group=None, + quant_config=oot_quant_config, + tp_size=None, + ep_size=None, + dp_size=None, + custom_routing_function=None, + scoring_func="softmax", + routed_scaling_factor=1.0, + e_score_correction_bias=None, + apply_router_weight_on_input=False, + activation="silu", + enable_eplb=False, + num_redundant_experts=0, + has_bias=False, + is_sequence_parallel=False, + zero_expert_num=0, + zero_expert_type=None).to("hpu") + assert isinstance(oot_op.quant_method, HPUFp8MoEMethod) + + # Weights were extracted from first FusedMoE layer of Qwen/Qwen3-30B-A3B-FP8 + # (with adjusted shapes, to make tensors smaller) + w13_weight = torch.load(get_data_path("data/fp8/moe_w13_weight.pt"), weights_only=False, map_location="hpu") + oot_op.w13_weight.copy_(w13_weight.repeat(128, 1, 1)) + w13_weight_scale_inv = torch.load(get_data_path("data/fp8/moe_w13_weight_scale_inv.pt"), + weights_only=False, + map_location="hpu") + oot_op.w13_weight_scale_inv.copy_(w13_weight_scale_inv.repeat(128, 1, 1)) + w2_weight = torch.load(get_data_path("data/fp8/moe_w2_weight.pt"), weights_only=False, map_location="hpu") + oot_op.w2_weight.copy_(w2_weight.repeat(128, 1, 1)) + w2_weight_scale_inv = torch.load(get_data_path("data/fp8/moe_w2_weight_scale_inv.pt"), + weights_only=False, + map_location="hpu") + oot_op.w2_weight_scale_inv.copy_(w2_weight_scale_inv.repeat(128, 1, 1)) + + 
oot_op.quant_method.process_weights_after_loading(oot_op) + + if not htorch.utils.internal.is_lazy(): + compile_config = HPUCompileConfig() + oot_op = torch.compile(oot_op, **compile_config.get_compile_args()) + + # Input and expected output + # Output tensor holds the data that was returned by cuda implementation of Fp8MoEMethod for given input + # (Fp8MoEMethod was triggered offline with the same input as below to get the ref_output) + hidden_states = torch.load(get_data_path("data/fp8/moe_input_hidden_states.pt"), + weights_only=False, + map_location="hpu") + router_logits = torch.load(get_data_path("data/fp8/moe_input_router_logits.pt"), + weights_only=False, + map_location="hpu") + ref_output = torch.load(get_data_path("data/fp8/moe_output.pt"), weights_only=False, map_location="hpu") + + # Execute layer + mock_ctx = MagicMock(spec=["dp_metadata"]) + mock_ctx.dp_metadata = None + with override_forward_context(mock_ctx): + out = oot_op.forward_impl(hidden_states, router_logits) + + # Check correctness + torch.testing.assert_close(ref_output, out, atol=1e-3, rtol=1e-3) diff --git a/tests/unit_tests/ops/test_hpu_gptq.py b/tests/unit_tests/ops/test_hpu_gptq.py new file mode 100644 index 00000000..88818e6b --- /dev/null +++ b/tests/unit_tests/ops/test_hpu_gptq.py @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch +import habana_frameworks.torch as htorch +from utils import get_data_path +from vllm_gaudi.ops.hpu_gptq import GPTQHPULinearMethod, GPTQHPUConfig +from vllm_gaudi.utils import HPUCompileConfig +from vllm.model_executor.layers.linear import RowParallelLinear + + +def test_gptq_linear_method(dist_init): + config = {"bits": 4, "group_size": 128, "desc_act": False, "lm_head": False} + oot_quant_config = GPTQHPUConfig.from_config(config) + + # Prepare linear layer with oot GPTQHPULinearMethod + oot_op = RowParallelLinear(input_size=256, + output_size=8, + bias=False, + input_is_parallel=True, + skip_bias_add=False, + params_dtype=torch.bfloat16, + reduce_results=True, + quant_config=oot_quant_config, + return_bias=False, + disable_tp=False).to("hpu") + assert isinstance(oot_op.quant_method, GPTQHPULinearMethod) + + # qweight, qzeros, scales were extracted from first RowParallelLinear of TheBloke/Llama-2-7B-Chat-GPTQ + # (with adjusted shape, to make tensors smaller) + qweight = torch.load(get_data_path("data/gptq/qweight.pt"), weights_only=False, map_location="hpu") + oot_op.qweight.copy_(qweight) + qzeros = torch.load(get_data_path("data/gptq/qzeros.pt"), weights_only=False, map_location="hpu") + oot_op.qzeros.copy_(qzeros) + scales = torch.load(get_data_path("data/gptq/scales.pt"), weights_only=False, map_location="hpu").to(torch.bfloat16) + oot_op.scales.copy_(scales) + + oot_op.quant_method.process_weights_after_loading(oot_op) + + if not htorch.utils.internal.is_lazy(): + compile_config = HPUCompileConfig() + oot_op = torch.compile(oot_op, **compile_config.get_compile_args()) + + # Input and expected output + # Output tensor holds the data that was returned by cuda implementation of GPTQLinearMethod for given input + # (GPTQLinearMethod was triggered offline with the same input as below to get the ref_output) + input = torch.load(get_data_path("data/gptq/input.pt"), weights_only=False, map_location="hpu").to(torch.bfloat16) + ref_output = torch.load(get_data_path("data/gptq/output.pt"), weights_only=False, + map_location="hpu").to(torch.bfloat16) + + # Execute layer + out = oot_op(input) + 
+ # Check correctness + torch.testing.assert_close(ref_output, out, atol=1e-3, rtol=1e-3) diff --git a/tests/unit_tests/ops/test_hpu_layernorm.py b/tests/unit_tests/ops/test_hpu_layernorm.py new file mode 100644 index 00000000..c0920a23 --- /dev/null +++ b/tests/unit_tests/ops/test_hpu_layernorm.py @@ -0,0 +1,71 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest +import torch +import habana_frameworks.torch as htorch +from utils import temporary_op_registry_oot, register_op +from vllm_gaudi.ops.hpu_layernorm import HPURMSNorm +from vllm_gaudi.utils import HPUCompileConfig +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.platforms import current_platform + +DTYPES = [torch.bfloat16, torch.float] +NUM_TOKENS = [7, 83, 4096] +HIDDEN_SIZES = [8, 768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192, 8199] +ADD_RESIDUAL = [False, True] +DEVICE = [current_platform.device_type] +IS_STRIDED = [False, True] + + +@pytest.mark.parametrize("num_tokens", NUM_TOKENS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("add_residual", ADD_RESIDUAL) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("device", DEVICE) +@pytest.mark.parametrize("strided_input", IS_STRIDED) +def test_rms_norm( + num_tokens: int, + hidden_size: int, + add_residual: bool, + dtype: torch.dtype, + device: str, + strided_input: bool, +) -> None: + with temporary_op_registry_oot(): + # prepare native RMSNorm module + native_rms_norm = RMSNorm(hidden_size=hidden_size, eps=1e-05) + native_rms_norm = native_rms_norm.to(dtype=dtype).to(device) + native_rms_norm.weight.data.normal_(mean=1.0, std=0.1) + assert isinstance(native_rms_norm, RMSNorm) and not isinstance(native_rms_norm, HPURMSNorm) + + # Prepare oot HPURMSNorm module + register_op(RMSNorm, HPURMSNorm) + oot_rms_norm = RMSNorm(hidden_size=hidden_size, eps=1e-05) + oot_rms_norm = oot_rms_norm.to(dtype=dtype).to(device) + oot_rms_norm.weight.data = native_rms_norm.weight.data.clone() + assert isinstance(oot_rms_norm, RMSNorm) and isinstance(oot_rms_norm, HPURMSNorm) + + if not htorch.utils.internal.is_lazy(): + compile_config = HPUCompileConfig() + oot_rms_norm = torch.compile(oot_rms_norm, **compile_config.get_compile_args()) + + # Prepare input data + scale = 1 / (2 * hidden_size) + last_dim = 2 * hidden_size if strided_input else hidden_size + x = torch.randn(num_tokens, last_dim, dtype=dtype, device=device) + x = x[..., :hidden_size] + assert x.is_contiguous() != strided_input + x *= scale + residual = torch.randn_like(x) * scale if add_residual else None + + # Execute layers + ref_out = native_rms_norm(x, residual) + out = oot_rms_norm(x, residual) + + # Check correctness + if add_residual: + torch.testing.assert_close(out[0], ref_out[0], atol=1e-2, rtol=1e-2) + torch.testing.assert_close(out[1], ref_out[1], atol=1e-2, rtol=1e-2) + else: + torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2) diff --git a/tests/unit_tests/ops/test_hpu_rotary_embedding.py b/tests/unit_tests/ops/test_hpu_rotary_embedding.py new file mode 100644 index 00000000..b26e6ace --- /dev/null +++ b/tests/unit_tests/ops/test_hpu_rotary_embedding.py @@ -0,0 +1,389 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest +import torch +import habana_frameworks.torch as htorch +from typing import NamedTuple +from utils import temporary_op_registry_oot, register_op +from 
transformers.models.auto.configuration_auto import AutoConfig +from vllm_gaudi.utils import HPUCompileConfig +from vllm_gaudi.ops.hpu_rotary_embedding import (HPURotaryEmbedding, HPULinearScalingRotaryEmbedding, + HPUDynamicNTKScalingRotaryEmbedding, HPUYaRNScalingRotaryEmbedding, + HPUDeepseekScalingRotaryEmbedding, HPULlama3RotaryEmbedding, + HPUPhi3LongRoPEScaledRotaryEmbedding, HPULlama4VisionRotaryEmbedding, + HPUMRotaryEmbedding) +from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding, LinearScalingRotaryEmbedding, + DynamicNTKScalingRotaryEmbedding, YaRNScalingRotaryEmbedding, + DeepseekScalingRotaryEmbedding, Llama3RotaryEmbedding, + Phi3LongRoPEScaledRotaryEmbedding, Llama4VisionRotaryEmbedding, + MRotaryEmbedding) + +# General settings +HIDDEN_SIZES = [4096] +SEQ_LENGTHS = [4096] +HEAD_SIZES = [32, 128, 512, 1024] +ROTARY_DIMS = [8, 32] +MAX_POSITION_EMBEDDINGS = [131072] +BASES = [500000.0] +IS_NEOX_STYLE = [False, True] +SCALING_FACTORS = [1.0, 2.0, 4.0, 8.0] +SCALING_FACTORS_WITH_LIST = [1.0, 2.0, 4.0, 8.0, [2.0, 4.0]] + +# Vision model settings +IMAGE_SIZE = 336 +PATCH_SIZE = 14 +VISION_MAX_POSITION_EMBEDDINGS = [(IMAGE_SIZE // PATCH_SIZE)**2] +VISION_SEQ_LENGTHS = [x + 1 for x in VISION_MAX_POSITION_EMBEDDINGS] + + +class RotaryData(NamedTuple): + """ + Data structure for rotary embedding test parameters. + """ + cls: type + dtype: torch.dtype + device: str + + +def run_rotary_embedding_test(native_rotary_data: RotaryData, oot_rotary_data: RotaryData, seq_length: int, + hidden_size: int, **kwargs) -> None: + """ + Common code for running rotary embedding tests. It compares the output of + the native operator with that of the out-of-tree custom operator. It allows + specifying a separate device for the native operator and the custom operator, + because e.g. the native Llama4VisionRotaryEmbedding cannot be + used on HPU as it relies on a complex dtype. The same applies to the dtype.
+ """ + with temporary_op_registry_oot(): + # prepare native RotaryEmbedding module + with torch.device(native_rotary_data.device): + kwargs["dtype"] = native_rotary_data.dtype + native_rotary_embedding = native_rotary_data.cls(**kwargs) + assert isinstance(native_rotary_embedding, + native_rotary_data.cls) and not isinstance(native_rotary_embedding, oot_rotary_data.cls) + + # Prepare oot RotaryEmbedding module + with torch.device(oot_rotary_data.device): + register_op(native_rotary_data.cls, oot_rotary_data.cls) + kwargs["dtype"] = oot_rotary_data.dtype + oot_rotary_embedding = native_rotary_data.cls(**kwargs) # Use native as it was registered above + assert isinstance(oot_rotary_embedding, native_rotary_data.cls) and isinstance( + oot_rotary_embedding, oot_rotary_data.cls) + + if not htorch.utils.internal.is_lazy(): + compile_config = HPUCompileConfig() + oot_rotary_embedding = torch.compile(oot_rotary_embedding, **compile_config.get_compile_args()) + + # Prepare input data + positions = torch.randint(high=seq_length, + size=(1, seq_length), + dtype=torch.int32, + device=native_rotary_data.device) + query = torch.randn(1, seq_length, hidden_size, dtype=torch.bfloat16, device=native_rotary_data.device) + key = torch.randn(1, seq_length, hidden_size, dtype=torch.bfloat16, device=native_rotary_data.device) + if native_rotary_data.cls in (Llama4VisionRotaryEmbedding, DeepseekScalingRotaryEmbedding): + query = query.view(query.shape[0], query.shape[1], -1, kwargs["head_size"]) + key = key.view(key.shape[0], key.shape[1], -1, kwargs["head_size"]) + + # Execute layers + with torch.device(native_rotary_data.device): + if native_rotary_data.cls in (Llama4VisionRotaryEmbedding, ): + ref_query_out, ref_key_out = native_rotary_embedding(query, key) + elif native_rotary_data.cls in (MRotaryEmbedding, ): + ref_query_out, ref_key_out = native_rotary_embedding(positions.flatten(), query, key) + else: + ref_query_out, ref_key_out = native_rotary_embedding(positions, query, key) + + if native_rotary_data.device != oot_rotary_data.device: + positions = positions.to(oot_rotary_data.device) + query = query.to(oot_rotary_data.device) + key = key.to(oot_rotary_data.device) + + with torch.device(oot_rotary_data.device): + if native_rotary_data.cls in (Llama4VisionRotaryEmbedding, ): + query_out, key_out = oot_rotary_embedding(query, key) + else: + query_out, key_out = oot_rotary_embedding(positions, query, key) + + # Check correctness + if native_rotary_data.device != oot_rotary_data.device: + ref_query_out = ref_query_out.to("cpu") + query_out = query_out.to("cpu") + ref_key_out = ref_key_out.to("cpu") + key_out = key_out.to("cpu") + torch.testing.assert_close(query_out, ref_query_out, atol=1e-2, rtol=1e-2) + torch.testing.assert_close(key_out, ref_key_out, atol=1e-2, rtol=1e-2) + + +@pytest.mark.parametrize("seq_length", SEQ_LENGTHS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) +@pytest.mark.parametrize("max_position_embeddings", MAX_POSITION_EMBEDDINGS) +@pytest.mark.parametrize("base", BASES) +@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) +def test_rotary_embedding( + seq_length: int, + hidden_size: int, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, +) -> None: + kwargs = { + "head_size": head_size, + "rotary_dim": rotary_dim, + "max_position_embeddings": max_position_embeddings, + "base": base, + "is_neox_style": 
is_neox_style, + } + native_rotary_data = RotaryData(cls=RotaryEmbedding, dtype=torch.bfloat16, device="hpu") + oot_rotary_data = RotaryData(cls=HPURotaryEmbedding, dtype=torch.bfloat16, device="hpu") + run_rotary_embedding_test(native_rotary_data, oot_rotary_data, seq_length, hidden_size, **kwargs) + + +@pytest.mark.parametrize("seq_length", SEQ_LENGTHS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) +@pytest.mark.parametrize("max_position_embeddings", MAX_POSITION_EMBEDDINGS) +@pytest.mark.parametrize("base", BASES) +@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) +@pytest.mark.parametrize("scaling_factors", SCALING_FACTORS_WITH_LIST) +def test_linear_scaling_rotary_embedding( + seq_length: int, + hidden_size: int, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, + scaling_factors: float | list[float], +) -> None: + kwargs = { + "head_size": head_size, + "rotary_dim": rotary_dim, + "max_position_embeddings": max_position_embeddings, + "base": base, + "is_neox_style": is_neox_style, + "scaling_factors": scaling_factors, + } + native_rotary_data = RotaryData(cls=LinearScalingRotaryEmbedding, dtype=torch.bfloat16, device="hpu") + oot_rotary_data = RotaryData(cls=HPULinearScalingRotaryEmbedding, dtype=torch.bfloat16, device="hpu") + run_rotary_embedding_test(native_rotary_data, oot_rotary_data, seq_length, hidden_size, **kwargs) + + +@pytest.mark.parametrize("seq_length", SEQ_LENGTHS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) +@pytest.mark.parametrize("max_position_embeddings", MAX_POSITION_EMBEDDINGS) +@pytest.mark.parametrize("base", BASES) +@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) +@pytest.mark.parametrize("scaling_factor", SCALING_FACTORS) +def test_dynamic_ntk_scaling_rotary_embedding( + seq_length: int, + hidden_size: int, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, + scaling_factor: float, +) -> None: + kwargs = { + "head_size": head_size, + "rotary_dim": rotary_dim, + "max_position_embeddings": max_position_embeddings, + "base": base, + "is_neox_style": is_neox_style, + "scaling_factor": scaling_factor, + } + native_rotary_data = RotaryData(cls=DynamicNTKScalingRotaryEmbedding, dtype=torch.bfloat16, device="hpu") + oot_rotary_data = RotaryData(cls=HPUDynamicNTKScalingRotaryEmbedding, dtype=torch.bfloat16, device="hpu") + run_rotary_embedding_test(native_rotary_data, oot_rotary_data, seq_length, hidden_size, **kwargs) + + +@pytest.mark.parametrize("seq_length", SEQ_LENGTHS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) +@pytest.mark.parametrize("max_position_embeddings", MAX_POSITION_EMBEDDINGS) +@pytest.mark.parametrize("base", BASES) +@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) +@pytest.mark.parametrize("scaling_factor", SCALING_FACTORS) +def test_yarn_scaling_rotary_embedding( + seq_length: int, + hidden_size: int, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, + scaling_factor: float, +) -> None: + kwargs = { + "head_size": head_size, + "rotary_dim": rotary_dim, + "max_position_embeddings": max_position_embeddings, + "base": base,
+ "is_neox_style": is_neox_style, + "scaling_factor": scaling_factor, + } + native_rotary_data = RotaryData(cls=YaRNScalingRotaryEmbedding, dtype=torch.bfloat16, device="hpu") + oot_rotary_data = RotaryData(cls=HPUYaRNScalingRotaryEmbedding, dtype=torch.bfloat16, device="hpu") + run_rotary_embedding_test(native_rotary_data, oot_rotary_data, seq_length, hidden_size, **kwargs) + + +@pytest.mark.parametrize("seq_length", SEQ_LENGTHS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("max_position_embeddings", MAX_POSITION_EMBEDDINGS) +@pytest.mark.parametrize("base", BASES) +@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) +@pytest.mark.parametrize("scaling_factor", SCALING_FACTORS) +def test_deepseek_scaling_rotary_embedding( + seq_length: int, + hidden_size: int, + head_size: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, + scaling_factor: float, +) -> None: + kwargs = { + "head_size": head_size, + "rotary_dim": head_size, + "max_position_embeddings": max_position_embeddings, + "base": base, + "is_neox_style": is_neox_style, + "scaling_factor": scaling_factor, + } + native_rotary_data = RotaryData(cls=DeepseekScalingRotaryEmbedding, dtype=torch.bfloat16, device="hpu") + oot_rotary_data = RotaryData(cls=HPUDeepseekScalingRotaryEmbedding, dtype=torch.bfloat16, device="hpu") + run_rotary_embedding_test(native_rotary_data, oot_rotary_data, seq_length, hidden_size, **kwargs) + + +@pytest.mark.parametrize("seq_length", SEQ_LENGTHS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) +@pytest.mark.parametrize("max_position_embeddings", MAX_POSITION_EMBEDDINGS) +@pytest.mark.parametrize("base", BASES) +@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) +@pytest.mark.parametrize("scaling_factor", SCALING_FACTORS) +def test_llama3_rotary_embedding( + seq_length: int, + hidden_size: int, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, + scaling_factor: float, +) -> None: + kwargs = { + "head_size": head_size, + "rotary_dim": rotary_dim, + "max_position_embeddings": max_position_embeddings, + "base": base, + "is_neox_style": is_neox_style, + "scaling_factor": scaling_factor, + "low_freq_factor": 1.0, + "high_freq_factor": 4.0, + "orig_max_position": 8192 + } + native_rotary_data = RotaryData(cls=Llama3RotaryEmbedding, dtype=torch.bfloat16, device="hpu") + oot_rotary_data = RotaryData(cls=HPULlama3RotaryEmbedding, dtype=torch.bfloat16, device="hpu") + run_rotary_embedding_test(native_rotary_data, oot_rotary_data, seq_length, hidden_size, **kwargs) + + +@pytest.mark.skip(reason="Phi3LongRoPEScaledRotaryEmbedding currently does not inherit CustomOp") +@pytest.mark.parametrize("seq_length", SEQ_LENGTHS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) +@pytest.mark.parametrize("max_position_embeddings", MAX_POSITION_EMBEDDINGS) +@pytest.mark.parametrize("base", BASES) +@pytest.mark.parametrize("is_neox_style", [True]) +def test_phi3_long_rope_scaled_rotary_embedding( + seq_length: int, + hidden_size: int, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, +) -> None: + config = AutoConfig.from_pretrained("microsoft/Phi-4-mini-instruct") + kwargs = { + "head_size": head_size, + "rotary_dim": rotary_dim, + "max_position_embeddings": max_position_embeddings, + "base": base, + "is_neox_style": is_neox_style, + "original_max_position_embeddings": config.original_max_position_embeddings, + "short_factor": config.rope_scaling["short_factor"], + "long_factor": config.rope_scaling["long_factor"], + } + native_rotary_data = RotaryData(cls=Phi3LongRoPEScaledRotaryEmbedding, dtype=torch.bfloat16, device="hpu") + oot_rotary_data = RotaryData(cls=HPUPhi3LongRoPEScaledRotaryEmbedding, dtype=torch.bfloat16, device="hpu") + run_rotary_embedding_test(native_rotary_data, oot_rotary_data, seq_length, hidden_size, **kwargs) + + +@pytest.mark.parametrize("seq_length", VISION_SEQ_LENGTHS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("max_position_embeddings", VISION_MAX_POSITION_EMBEDDINGS) +@pytest.mark.parametrize("base", BASES) +@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) +def test_llama4_vision_rotary_embedding( + seq_length: int, + hidden_size: int, + head_size: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, +) -> None: + rotary_dim = int(hidden_size // (hidden_size / head_size) // 2) + kwargs = { + "head_size": head_size, + "rotary_dim": rotary_dim, + "max_position_embeddings": max_position_embeddings, + "base": base, + "is_neox_style": is_neox_style, + } + native_rotary_data = RotaryData(cls=Llama4VisionRotaryEmbedding, dtype=torch.complex64, device="cpu") + oot_rotary_data = RotaryData(cls=HPULlama4VisionRotaryEmbedding, dtype=torch.bfloat16, device="hpu") + run_rotary_embedding_test(native_rotary_data, oot_rotary_data, seq_length, hidden_size, **kwargs) + + +@pytest.mark.parametrize("seq_length", SEQ_LENGTHS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) +@pytest.mark.parametrize("max_position_embeddings", MAX_POSITION_EMBEDDINGS) +@pytest.mark.parametrize("base", BASES) +@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) +def test_m_rotary_embedding( + seq_length: int, + hidden_size: int, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, +) -> None: + kwargs = { + "head_size": head_size, + "rotary_dim": rotary_dim, + "max_position_embeddings": max_position_embeddings, + "base": base, + "is_neox_style": is_neox_style, + "mrope_section": [rotary_dim // 2] + } + native_rotary_data = RotaryData(cls=MRotaryEmbedding, dtype=torch.bfloat16, device="hpu") + oot_rotary_data = RotaryData(cls=HPUMRotaryEmbedding, dtype=torch.bfloat16, device="hpu") + run_rotary_embedding_test(native_rotary_data, oot_rotary_data, seq_length, hidden_size, **kwargs) diff --git a/tests/unit_tests/ops/utils.py b/tests/unit_tests/ops/utils.py new file mode 100644 index 00000000..fd4aeb21 --- /dev/null +++ b/tests/unit_tests/ops/utils.py @@ -0,0 +1,35 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import os +import contextlib +from vllm.model_executor.custom_op import CustomOp + + +@contextlib.contextmanager +def temporary_op_registry_oot(): + """ + Context manager that temporarily modifies the op registry content. + It clears the current op_registry_oot and restores its content on exit. + It is useful for testing purposes, e.g. to deregister the hpu version + of an op
(because a registration made in one test would otherwise + remain in effect in every other test). + """ + old_registry = CustomOp.op_registry_oot + CustomOp.op_registry_oot = {} + try: + yield + finally: + CustomOp.op_registry_oot = old_registry + + +def register_op(base_cls, oot_cls): + """ + Manually registers an oot op. It should be used + within the temporary_op_registry_oot context manager. + """ + CustomOp.op_registry_oot[base_cls.__name__] = oot_cls + + +def get_data_path(filename): + return os.path.join(os.path.dirname(__file__), filename) diff --git a/vllm_gaudi/ops/hpu_fp8.py b/vllm_gaudi/ops/hpu_fp8.py index f2ae8aa5..648877d9 100644 --- a/vllm_gaudi/ops/hpu_fp8.py +++ b/vllm_gaudi/ops/hpu_fp8.py @@ -2,7 +2,6 @@ import torch from vllm_gaudi import envs -from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.fused_moe.layer import FusedMoE from vllm.model_executor.layers.quantization import fp8 @@ -59,7 +58,6 @@ def dequant_fp8_weight(self, layer) -> torch.Tensor: return dequant_weight -@CustomOp.register_oot(name='Fp8MoEMethod') class HPUFp8MoEMethod(Fp8MoEMethod): def __init__(self, quant_config: Fp8Config, layer: torch.nn.Module):