Update: SparseGPT recipes #1142

Merged (4 commits) on Feb 12, 2025

12 changes: 4 additions & 8 deletions examples/finetuning/example_alternating_recipe.yaml
@@ -4,12 +4,10 @@ initial_sparsity_stage:
SparseGPTModifier:
sparsity: 0.5
block_size: 128
- sequential_update: False
percdamp: 0.01
mask_structure: "0:0"
- targets: [
- "re:model.layers.\\d+$"
- ]
+ targets: ["Linear"]
+ ignore: ["re:.*lm_head"]
initial_training_stage:
run_type: train
pruning_modifiers:
@@ -22,12 +20,10 @@ next_sparsity_stage:
SparseGPTModifier:
sparsity: 0.7
block_size: 128
- sequential_update: False
percdamp: 0.01
mask_structure: "0:0"
- targets: [
- "re:model.layers.\\d+$"
- ]
+ targets: ["Linear"]
+ ignore: ["re:.*lm_head"]
next_training_stage:
run_type: train
pruning_modifiers:
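Note: the substantive change in these recipes is that `targets` now selects every Linear submodule directly, instead of matching whole decoder layers with a regex, while the LM head is excluded through `ignore`. A minimal sketch of what that selection amounts to; `linear_targets` is a hypothetical helper written for illustration, not part of llmcompressor:

import re

import torch


def linear_targets(model: torch.nn.Module, ignore_pattern: str = r".*lm_head"):
    """Yield the names of torch.nn.Linear modules not excluded by the ignore regex."""
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear) and not re.match(ignore_pattern, name):
            yield name


# Toy model: one "decoder layer" Linear plus an lm_head that should be skipped.
toy = torch.nn.ModuleDict(
    {
        "layers": torch.nn.ModuleList([torch.nn.Linear(8, 8)]),
        "lm_head": torch.nn.Linear(8, 8),
    }
)
print(list(linear_targets(toy)))  # ['layers.0']
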

@@ -4,7 +4,8 @@ sparsity_stage:
SparseGPTModifier:
sparsity: 0.5
mask_structure: "2:4"
- sequential_update: false
+ targets: ["Linear"]
+ ignore: ["re:.*lm_head"]
finetuning_stage:
run_type: train
finetuning_modifiers:
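For context, mask_structure "2:4" requests semi-structured sparsity: in every contiguous group of four weights, two are zeroed. The sketch below only illustrates the resulting mask shape with a simple magnitude criterion; SparseGPT itself selects and compensates pruned weights using second-order information, so this is not the algorithm, just the pattern it produces:

import torch


def mask_2of4(weight: torch.Tensor) -> torch.Tensor:
    """Zero the 2 smallest-magnitude entries in each group of 4 along the last dim."""
    rows, cols = weight.shape
    groups = weight.reshape(rows, cols // 4, 4)
    keep = groups.abs().topk(2, dim=-1).indices  # indices of the 2 largest entries
    mask = torch.zeros_like(groups).scatter_(-1, keep, 1.0)
    return (groups * mask).reshape(rows, cols)


w = torch.randn(8, 16)
pruned = mask_2of4(w)
# every group of 4 now holds at most 2 nonzero entries
assert (pruned.reshape(8, -1, 4) != 0).sum(dim=-1).max() <= 2
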

@@ -4,7 +4,8 @@ sparsity_stage:
SparseGPTModifier:
sparsity: 0.5
mask_structure: "2:4"
- sequential_update: false
+ targets: ["Linear"]
+ ignore: ["re:.*lm_head"]
finetuning_stage:
run_type: train
finetuning_modifiers:
3 changes: 2 additions & 1 deletion src/llmcompressor/modifiers/obcq/base.py
@@ -33,9 +33,10 @@ class SparseGPTModifier(SparsityModifierMixin, Modifier):
| SparseGPTModifier:
| sparsity: 0.5
| mask_structure: "2:4"
- | sequential_update: True
| dampening_frac: 0.001
| block_size: 128
+ | targets: ['Linear']
+ | ignore: ['re:.*lm_head']

Lifecycle:
- on_initialize
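The docstring sample now mirrors the updated recipe defaults. As a rough sketch, the same stanza expressed directly in Python would look something like the following, assuming SparseGPTModifier accepts these fields as keyword arguments (the field names are taken from the recipe keys above; this is not code from the PR):

from llmcompressor.modifiers.obcq import SparseGPTModifier

# Mirrors the documented recipe sample above.
modifier = SparseGPTModifier(
    sparsity=0.5,
    mask_structure="2:4",
    dampening_frac=0.001,
    block_size=128,
    targets=["Linear"],
    ignore=["re:.*lm_head"],
)
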
24 changes: 20 additions & 4 deletions src/llmcompressor/modifiers/obcq/sgpt_mixin.py
@@ -139,10 +139,26 @@ def on_initialize(self, state: "State", **kwargs) -> bool:

for name, module in get_prunable_layers(layer).items():
name = f"{layer_name}.{name}"
- if not match_targets(name, self.ignore)[0]:
-     self._module_names[module] = name
-     self._module_sparsities[module] = layer_sparsity
-     self.register_hook(module, self.calibrate_module, "forward")
+
+ if match_targets(name, self.ignore)[0]:
+     continue
+
+ # HACK: previously, embeddings were not quantized because they were not
+ # accessible by the layer compressor. For now, we manually ignore it,
+ # but in the FUTURE this should be ignored by the user
+ if isinstance(module, torch.nn.Embedding):
+     continue
+
+ if name.endswith("lm_head"):
+     logger.warning(
+         "`lm_head` was previously auto-ignored by SparseGPT and Wanda "
+         "modifiers and is not advised. Please add `re:.*lm_head` to "
+         "your ignore list if this was unintentional"
+     )
+
+ self._module_names[module] = name
+ self._module_sparsities[module] = layer_sparsity
+ self.register_hook(module, self.calibrate_module, "forward")

# infer and run pipeline
model_name = state.model.__class__.__name__
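The new control flow skips ignored modules first, then skips embeddings, and only warns (rather than silently skipping) when an lm_head is about to be pruned. The ignore entries appear to follow the usual convention where a "re:" prefix switches to regex matching. A self-contained sketch of that convention; `matches_ignore` below is an illustration, not the library's `match_targets`:

import re
from typing import Iterable


def matches_ignore(name: str, ignore: Iterable[str]) -> bool:
    """Entries prefixed with "re:" are treated as regexes; others must match exactly."""
    for pattern in ignore:
        if pattern.startswith("re:"):
            if re.match(pattern[3:], name):
                return True
        elif pattern == name:
            return True
    return False


assert matches_ignore("model.lm_head", ["re:.*lm_head"])
assert not matches_ignore("model.layers.0.mlp.gate_proj", ["re:.*lm_head"])
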
3 changes: 2 additions & 1 deletion tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4.yaml
@@ -3,4 +3,5 @@ sparsity_stage:
SparseGPTModifier:
sparsity: 0.5
mask_structure: "2:4"
- sequential_update: false
+ targets: ["Linear"]
+ ignore: ["re:.*lm_head"]
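A sparsity-only recipe like this one is typically applied through the oneshot entrypoint. A minimal usage sketch, assuming the llmcompressor.transformers oneshot API; the model checkpoint, calibration dataset, and output directory are placeholders, not part of this PR:

from llmcompressor.transformers import oneshot

# Placeholders: any causal LM checkpoint and calibration dataset will do.
MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

oneshot(
    model=MODEL_ID,
    dataset="open_platypus",
    recipe="tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4.yaml",
    max_seq_length=512,
    num_calibration_samples=512,
    output_dir="./tinyllama-sparse-2of4",
)
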

@@ -4,7 +4,8 @@ sparsity_stage:
SparseGPTModifier:
sparsity: 0.5
mask_structure: "2:4"
- sequential_update: false
+ targets: ["Linear"]
+ ignore: ["re:.*lm_head"]
quantization_stage:
run_type: oneshot
quantization_modifiers:

@@ -4,7 +4,8 @@ sparsity_stage:
SparseGPTModifier:
sparsity: 0.5
mask_structure: "2:4"
- sequential_update: false
+ targets: ["Linear"]
+ ignore: ["re:.*lm_head"]
quantization_stage:
run_type: oneshot
quantization_modifiers:
3 changes: 2 additions & 1 deletion tests/e2e/vLLM/recipes/WNA16_2of4/2of4_w4a16_recipe.yaml
@@ -4,7 +4,8 @@ sparsity_stage:
SparseGPTModifier:
sparsity: 0.5
mask_structure: "2:4"
- sequential_update: false
+ targets: ["Linear"]
+ ignore: ["re:.*lm_head"]
quantization_stage:
run_type: oneshot
quantization_modifiers:
1 change: 0 additions & 1 deletion tests/e2e/vLLM/test_vllm.py
@@ -130,7 +130,6 @@ def test_vllm(self):
session.reset()

if SKIP_HF_UPLOAD.lower() != "yes":
-
    logger.info("================= UPLOADING TO HUB ======================")

    stub = f"{HF_MODEL_HUB_NAME}/{self.save_dir}-e2e"

@@ -2,6 +2,6 @@ pruning_stage:
obcq_modifiers:
SparseGPTModifier:
sparsity: 0.5
- sequential_update: true
mask_structure: "2:4"
- targets: ['re:model.layers.\d*$']
+ targets: ["Linear"]
+ ignore: ["re:.*lm_head"]

@@ -2,9 +2,9 @@ pruning_stage:
obcq_modifiers:
SparseGPTModifier:
sparsity: 0.5
- sequential_update: true
mask_structure: "2:4"
- targets: ['re:model.layers.\d*$']
+ targets: ["Linear"]
+ ignore: ["re:.*lm_head"]
quant_stage:
quant_modifiers:
QuantizationModifier:

@@ -3,10 +3,10 @@ test_oneshot_stage:
SparseGPTModifier:
sparsity: 0.7
block_size: 128
- sequential_update: False
percdamp: 0.01
mask_structure: "0:0"
- target_ids: ["attention_mask", "position_ids"]
+ targets: ["Linear"]
+ ignore: ["re:.*lm_head"]
test_train_stage:
pruning_modifiers:
ConstantPruningModifier:

@@ -3,7 +3,6 @@ test_stage:
SparseGPTModifier:
sparsity: 0.7
block_size: 128
- sequential_update: True
percdamp: 0.01
mask_structure: "0:0"
targets: ["model.layers.0"]

@@ -11,7 +11,6 @@ test_stage:
SparseGPTModifier:
sparsity: 0.7
block_size: 128
- sequential_update: False
percdamp: 0.01
mask_structure: "0:0"
targets: [

@@ -18,7 +18,6 @@ test_stage:
SparseGPTModifier:
sparsity: 0.5
block_size: 128
- sequential_update: False
percdamp: 0.01
mask_structure: "0:0"
targets: ["model.layers.0"]
1 change: 0 additions & 1 deletion tests/llmcompressor/transformers/obcq/recipes/sparse.yaml
@@ -3,7 +3,6 @@ test_stage:
SparseGPTModifier:
sparsity: 0.3
block_size: 128
- sequential_update: False
percdamp: 0.01
targets: ["model.layers.0", "model.layers.1"]
mask_structure: "0:0"

@@ -3,7 +3,6 @@ test_stage:
SparseGPTModifier:
sparsity: 0.5
block_size: 128
- sequential_update: False
percdamp: 0.01
mask_structure: "2:4"
targets: [

@@ -3,7 +3,6 @@ test_stage:
SparseGPTModifier:
sparsity: 0.5
block_size: 128
- sequential_update: False
percdamp: 0.01
mask_structure: "0:0"
targets: [

@@ -3,7 +3,6 @@ test_stage:
SparseGPTModifier:
sparsity: 0.5
block_size: 128
- sequential_update: False
targets: [
're:model.layers.3.mlp.gate_proj.weight'
]

@@ -9,7 +9,6 @@ recipe: |
SparseGPTModifier:
sparsity: 0.5
block_size: 128
- sequential_update: False
targets: [
're:model.layers.3.mlp.gate_proj.weight'
]

@@ -10,7 +10,6 @@ recipe: |
SparseGPTModifier:
sparsity: 0.5
block_size: 128
- sequential_update: False
targets: [
're:model.layers.3.mlp.gate_proj.weight'
]