From 482531cf40444a91da436f9e255af0d02dfffb27 Mon Sep 17 00:00:00 2001
From: Pablo Monteagudo Lago <44771380+pablomlago@users.noreply.github.com>
Date: Wed, 11 Dec 2024 11:12:39 +0000
Subject: [PATCH] Fix llm tests transformers (#1118)

---
 tests/brevitas_examples/test_llm.py | 189 ++++++++++++++++++++++++----
 1 file changed, 162 insertions(+), 27 deletions(-)

diff --git a/tests/brevitas_examples/test_llm.py b/tests/brevitas_examples/test_llm.py
index 61cfae010..1a425296f 100644
--- a/tests/brevitas_examples/test_llm.py
+++ b/tests/brevitas_examples/test_llm.py
@@ -72,6 +72,19 @@ def assert_layer_types(model, exp_layer_types):
         assert matched, f"Layer key: {key} not found in {layer_names}"
 
 
+def assert_layer_types_count(model, exp_layer_types_count):
+    layer_types_count = {}
+    for name, layer in model.named_modules():
+        ltype = str(type(layer))
+        if ltype not in layer_types_count:
+            layer_types_count[ltype] = 0
+        layer_types_count[ltype] += 1
+
+    for name, count in exp_layer_types_count.items():
+        curr_count = 0 if name not in layer_types_count else layer_types_count[name]
+        assert count == curr_count, f"Expected {count} instances of layer type: {name}, found {curr_count}."
+
+
 class UpdatableNamespace(Namespace):
 
     def update(self, **kwargs):
@@ -293,9 +306,7 @@ def test_small_models_acc_pt_ge_2_4(caplog, acc_args_and_acc_pt_ge_2_4):
         "mistral-fp8_fnuz",
         "llama-mxfp8",
         "llama-int8-act_equalization=layerwise",
-        "mistral-int8-quant-last-layer",
-        "llama-rotation-mixed-fx",
-        "llama-rotation-full-fx",],
+        "mistral-int8-quant-last-layer",],
     params=[
         {
             "model": "hf-internal-testing/tiny-random-MistralForCausalLM",
@@ -307,7 +318,8 @@ def test_small_models_acc_pt_ge_2_4(caplog, acc_args_and_acc_pt_ge_2_4):
                 "model.layers.0.self_attn.q_proj.input_quant.fused_activation_quant_proxy.tensor_quant":
                     "",
                 "model.layers.0.self_attn.q_proj.weight_quant.tensor_quant":
-                    "",}},
+                    "",},
+        },  # input_quant/weight_quant
         {
             "model": "hf-internal-testing/tiny-random-MistralForCausalLM",
             "input_bit_width": None,
@@ -318,7 +330,8 @@ def test_small_models_acc_pt_ge_2_4(caplog, acc_args_and_acc_pt_ge_2_4):
                 "model.layers.0.self_attn.q_proj.input_quant":
                     "",
                 "model.layers.0.self_attn.q_proj.weight_quant.tensor_quant":
-                    "",}},
+                    "",},
+        },  # input_quant/weight_quant
         {
             "model": "hf-internal-testing/tiny-random-MistralForCausalLM",
             "weight_quant_format": "float_ocp_e4m3",
@@ -331,7 +344,8 @@ def test_small_models_acc_pt_ge_2_4(caplog, acc_args_and_acc_pt_ge_2_4):
                 "model.layers.0.self_attn.q_proj.input_quant.fused_activation_quant_proxy.tensor_quant":
                     "",
                 "model.layers.0.self_attn.q_proj.weight_quant.tensor_quant":
-                    "",}},
+                    "",},
+        },  # input_quant/weight_quant
         {
             "model": "hf-internal-testing/tiny-random-MistralForCausalLM",
             "weight_quant_format": "float_fnuz_e4m3",
@@ -344,7 +358,8 @@ def test_small_models_acc_pt_ge_2_4(caplog, acc_args_and_acc_pt_ge_2_4):
                 "model.layers.0.self_attn.q_proj.input_quant.fused_activation_quant_proxy.tensor_quant":
                     "",
                 "model.layers.0.self_attn.q_proj.weight_quant.tensor_quant":
-                    "",}},
+                    "",},
+        },  # input_quant/weight_quant
         {
             "model": "hf-internal-testing/tiny-random-LlamaForCausalLM",
             "weight_quant_format": "float_ocp_e4m3",
@@ -371,7 +386,7 @@ def test_small_models_acc_pt_ge_2_4(caplog, acc_args_and_acc_pt_ge_2_4):
                 "model.layers.0.self_attn.q_proj.weight_quant.tensor_quant":
                     "",
                 "model.layers.0.self_attn.q_proj.weight_quant.tensor_quant.input_view_impl":
-                    "",}},
+                    "",},},
         {
             "model": "hf-internal-testing/tiny-random-LlamaForCausalLM",
             "act_equalization": "layerwise",
@@ -379,12 +394,130 @@ def test_small_models_acc_pt_ge_2_4(caplog, acc_args_and_acc_pt_ge_2_4):
                 "model.layers.0.self_attn.q_proj":
                     "",
                 "model.layers.0.self_attn.q_proj.layer":
-                    "",}},
+                    "",},},
         {
             "model": "hf-internal-testing/tiny-random-MistralForCausalLM",
             "quantize_last_layer": True,
             "exp_layer_types": {
-                "lm_head": ""}},
+                "lm_head": ""},},
+        ])  # LM Head + Q/K/V/O projs + Up/Gate/Down projs
+def layer_args(default_run_args, request):
+    args = default_run_args
+    layer_dict = request.param
+    exp_layer_types = layer_dict["exp_layer_types"]
+    del layer_dict["exp_layer_types"]
+    args.update(**layer_dict)
+    yield args, exp_layer_types
+
+
+@pytest.mark.llm
+@requires_pt_ge('2.2')
+def test_small_models_quant_layer(caplog, layer_args):
+    caplog.set_level(logging.INFO)
+    args, exp_layer_types = layer_args
+    if args.replace_rmsnorm:
+        if torch_version < version.parse('2.4'):
+            pytest.skip("Replacing RMSNorm requires torch 2.4+ or greater")
+    if hasattr(args, 'rotation') and args.rotation == 'fx' and platform.system() == 'Windows':
+        pytest.skip("Skipping dynamo + windows")
+    float_ppl, quant_ppl, model = validate_args_and_run_main(args)
+    assert_layer_types(model, exp_layer_types)
+
+
+@pytest_cases.fixture(
+    ids=[
+        "mistral-int8",
+        "mistral-weight-only",
+        "mistral-fp8_ocp",
+        "mistral-fp8_fnuz",
+        "llama-mxfp8",
+        "llama-int8-act_equalization=layerwise",
+        "mistral-int8-quant-last-layer",
+        "llama-rotation-mixed-fx",
+        "llama-rotation-full-fx",],
+    params=[
+        {
+            "model": "hf-internal-testing/tiny-random-MistralForCausalLM",
+            "exp_layer_types_count": {
+                "": 1,  # LM Head
+                "":
+                    14,  # Q/K/V/O projs + Up/Gate/Down projs
+                "": 28,
+            }},  # input_quant/weight_quant
+        {
+            "model": "hf-internal-testing/tiny-random-MistralForCausalLM",
+            "input_bit_width": None,
+            "act_calibration": False,
+            "exp_layer_types_count": {
+                "": 1,  # LM Head
+                "":
+                    14,  # Q/K/V/O projs + Up/Gate/Down projs
+                "": 14,
+            }},  # input_quant/weight_quant
+        {
+            "model": "hf-internal-testing/tiny-random-MistralForCausalLM",
+            "weight_quant_format": "float_ocp_e4m3",
+            "weight_quant_type": "sym",
+            "input_quant_format": "float_ocp_e5m2",
+            "input_quant_type": "sym",
+            "exp_layer_types_count": {
+                "": 1,  # LM Head
+                "":
+                    14,  # Q/K/V/O projs + Up/Gate/Down projs
+                "": 28,}},  # input_quant/weight_quant
+        {
+            "model": "hf-internal-testing/tiny-random-MistralForCausalLM",
+            "weight_quant_format": "float_fnuz_e4m3",
+            "weight_quant_type": "sym",
+            "input_quant_format": "float_fnuz_e5m2",
+            "input_quant_type": "sym",
+            "exp_layer_types_count": {
+                "": 1,  # LM Head
+                "":
+                    14,  # Q/K/V/O projs + Up/Gate/Down projs
+                "": 28,}},  # input_quant/weight_quant
+        {
+            "model": "hf-internal-testing/tiny-random-LlamaForCausalLM",
+            "weight_quant_format": "float_ocp_e4m3",
+            "weight_scale_precision": "po2_scale",
+            "weight_param_method": "stats",
+            "weight_quant_granularity": "per_group",
+            "weight_group_size": 16,
+            "weight_quant_type": "sym",
+            "input_quant_format": "float_ocp_e5m2",
+            "input_scale_type": "dynamic",
+            "input_scale_precision": "po2_scale",
+            "input_param_method": "stats",
+            "input_quant_granularity": "per_group",
+            "input_group_size": 16,
+            "input_quant_type": "sym",
+            "act_calibration": False,
+            "exp_layer_types_count": {
+                "":
+                    14,  # Q/K/V/O projs + Up/Gate/Down projs
+                "": 28,  # input_quant/weight_quant
+                "":
+                    14,  # input_quant..input_view_impl/input_quant..scaling_impl.input_view_impl
+                "":
+                    28,  # weight_quant..input_view_impl/weight_quant..scaling_impl.input_view_impl
+                "": 1,  # LM Head
+                "": 5,}},
+        {
+            "model": "hf-internal-testing/tiny-random-LlamaForCausalLM",
+            "act_equalization": "layerwise",
+            "exp_layer_types_count": {
+                "":
+                    14,  # Q/K/V/O projs + Up/Gate/Down projs
+                "": 1,  # LM Head
+                "":
+                    15,  # LM Head + Q/K/V/O projs + Up/Gate/Down projs
+                "": 5,}},
+        {
+            "model": "hf-internal-testing/tiny-random-MistralForCausalLM",
+            "quantize_last_layer": True,
+            "exp_layer_types_count": {
+                "": 15,
+            }},  # LM Head + Q/K/V/O projs + Up/Gate/Down projs
         {
             "model": "hf-internal-testing/tiny-random-LlamaForCausalLM",
             "ln_affine_merge": True,
@@ -394,11 +527,13 @@ def test_small_models_acc_pt_ge_2_4(caplog, acc_args_and_acc_pt_ge_2_4):
             "rotation_orphan_sink": True,
             "convert_layernorm_to_rmsnorm": True,
             "rotation": "fx",
-            "exp_layer_types": {
-                "L__self___model_layers_0_self_attn_k_proj":
-                    "",
-                "L__self___model_layers_0_self_attn_o_proj":
-                    ""}},
+            "exp_layer_types_count": {
+                "":
+                    4,  # Sinks: O proj + Down proj
+                "":
+                    15,  # LM Head + Q/K/V/O projs + Up/Gate/Down projs
+                "": 5,
+                "": 0,}},
         {
             "model": "hf-internal-testing/tiny-random-LlamaForCausalLM",
             "ln_affine_merge": True,
@@ -408,32 +543,32 @@ def test_small_models_acc_pt_ge_2_4(caplog, acc_args_and_acc_pt_ge_2_4):
             "rotation_orphan_sink": False,
             "convert_layernorm_to_rmsnorm": True,
             "rotation": "fx",
-            "exp_layer_types": {
-                "L__self___model_layers_0_self_attn_k_proj":
-                    "",
-                "L__self___model_layers_0_self_attn_o_proj":
-                    ""}},])
-def layer_args(default_run_args, request):
+            "exp_layer_types_count": {
+                "":
+                    15,  # LM Head + Q/K/V projs + Up/Gate/Down projs
+                "": 5,  # Input + Post attention
+                "": 0,}},])
+def layer_args_types_count(default_run_args, request):
     args = default_run_args
     layer_dict = request.param
-    exp_layer_types = layer_dict["exp_layer_types"]
-    del layer_dict["exp_layer_types"]
+    exp_layer_types_count = layer_dict["exp_layer_types_count"]
+    del layer_dict["exp_layer_types_count"]
     args.update(**layer_dict)
-    yield args, exp_layer_types
+    yield args, exp_layer_types_count
 
 
 @pytest.mark.llm
 @requires_pt_ge('2.2')
-def test_small_models_quant_layer(caplog, layer_args):
+def test_small_models_quant_layer_types_count(caplog, layer_args_types_count):
     caplog.set_level(logging.INFO)
-    args, exp_layer_types = layer_args
+    args, exp_layer_types_count = layer_args_types_count
     if args.replace_rmsnorm:
         if torch_version < version.parse('2.4'):
             pytest.skip("Replacing RMSNorm requires torch 2.4+ or greater")
     if hasattr(args, 'rotation') and args.rotation == 'fx' and platform.system() == 'Windows':
         pytest.skip("Skipping dynamo + windows")
     float_ppl, quant_ppl, model = validate_args_and_run_main(args)
-    assert_layer_types(model, exp_layer_types)
+    assert_layer_types_count(model, exp_layer_types_count)
 
 
 @pytest_cases.fixture(