
Commit b04cf98

codereview updates
Signed-off-by: Brian Dellabetta <[email protected]>
1 parent 6faebcb commit b04cf98

File tree

2 files changed: 35 additions & 37 deletions


README.md

Lines changed: 0 additions & 2 deletions
@@ -33,8 +33,6 @@ Some of the exciting new features include:
 * **DeepSeekV3-style Block Quantization Support**: This allows for more efficient compression of large language models without needing a calibration dataset. Quantize a Qwen3 model to [W8A8](examples/quantization_w8a8_fp8/fp8_block_example.py).
 * **Llama4 Quantization Support**: Quantize a Llama4 model to [W4A16](examples/multimodal_vision/llama4_example.py) or [NVFP4](examples/quantization_w4a4_fp4/llama4_example.py). The checkpoint produced can seamlessly run in vLLM.
 * **FP4 Quantization - now with MoE and non-uniform support:** Quantize weights and activations to FP4 and seamlessly run the compressed model in vLLM. Model weights and activations are quantized following the NVFP4 [configuration](https://github.com/neuralmagic/compressed-tensors/blob/f5dbfc336b9c9c361b9fe7ae085d5cb0673e56eb/src/compressed_tensors/quantization/quant_scheme.py#L104). See examples of [fp4 activation support](examples/quantization_w4a4_fp4/llama3_example.py), [MoE support](examples/quantization_w4a4_fp4/qwen_30b_a3b.py), and [Non-uniform quantization support](examples/quantization_non_uniform) where some layers are selectively quantized to fp8 for better recovery. You can also mix other quantization schemes, such as int8 and int4.
-* **Large Model Support with Sequential Onloading**: As of llm-compressor>=0.6.0, you can now quantize very large language models on a single GPU. Models are broken into disjoint layers which are then onloaded to the GPU one layer at a time. For more information on sequential onloading, see [Big Modeling with Sequential Onloading](examples/big_models_with_sequential_onloading/README.md) as well as the [DeepSeek-R1 Example](examples/quantizing_moe/deepseek_r1_example.py).
-* **Axolotl Sparse Finetuning Integration:** Seamlessly finetune sparse LLMs with our Axolotl integration. Learn how to create [fast sparse open-source models with Axolotl and LLM Compressor](https://developers.redhat.com/articles/2025/06/17/axolotl-meets-llm-compressor-fast-sparse-open). See also the [Axolotl integration docs](https://docs.axolotl.ai/docs/custom_integrations.html#llmcompressor).
 
 ### Supported Formats
 * Activation Quantization: W8A8 (int8 and fp8)
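
For context on the block-quantization bullet retained above: a data-free FP8 block quantization run with llm-compressor might look roughly like the sketch below. This is an illustration, not the contents of the linked fp8_block_example.py; the FP8_BLOCK scheme name, the specific Qwen3 checkpoint, and the ignore list are assumptions.

# Hedged sketch of data-free FP8 block (W8A8 fp8) quantization with llm-compressor.
# Scheme name, checkpoint, and ignore list below are assumptions, not copied from the example.
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

MODEL_ID = "Qwen/Qwen3-8B"  # the README bullet only says "a Qwen3 model"
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Block-wise FP8 scales are derived from the weights and activations are
# quantized dynamically at runtime, so no calibration dataset is passed.
recipe = QuantizationModifier(targets="Linear", scheme="FP8_BLOCK", ignore=["lm_head"])
oneshot(model=model, recipe=recipe)

SAVE_DIR = MODEL_ID.split("/")[-1] + "-FP8-BLOCK"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)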

examples/quantization_non_uniform/quantization_multiple_modifiers.py

Lines changed: 35 additions & 35 deletions
@@ -1,4 +1,5 @@
 import argparse
+
 from datasets import load_dataset
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
@@ -19,8 +20,6 @@ def parse_args():
     return parser.parse_args()
 
 
-args = parse_args()
-
 # Select model and load it.
 model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
@@ -35,10 +34,6 @@ def parse_args():
 NUM_CALIBRATION_SAMPLES = 512
 MAX_SEQUENCE_LENGTH = 2048
 
-# Load dataset and preprocess.
-ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
-ds = ds.shuffle(seed=42)
-
 
 def preprocess(example):
     return {
@@ -49,9 +44,6 @@ def preprocess(example):
     }
 
 
-ds = ds.map(preprocess)
-
-
 # Tokenize inputs.
 def tokenize(sample):
     return tokenizer(
@@ -63,8 +55,6 @@ def tokenize(sample):
     )
 
 
-ds = ds.map(tokenize, remove_columns=ds.column_names)
-
 # Configure the quantization algorithm to run.
 # * quantize self_attn layers to W8A8 with GPTQ
 # * quantize mlp layers to W4A16 with AWQ
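
The comments above describe a non-uniform recipe that quantizes self_attn layers to W8A8 with GPTQ and mlp layers to W4A16 with AWQ; the recipe body itself falls outside the hunks shown in this diff. As a rough sketch only, with hypothetical target regexes and ignore list that are not copied from the example:

# Hypothetical sketch of the recipe described by the comments above; the actual
# recipe in quantization_multiple_modifiers.py is not shown in this diff.
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.modifiers.quantization import GPTQModifier

recipe = [
    # W8A8 (int8) GPTQ on the attention projections
    GPTQModifier(targets=r"re:.*self_attn.*", scheme="W8A8", ignore=["lm_head"]),
    # W4A16 AWQ on the MLP projections
    AWQModifier(targets=r"re:.*mlp.*", scheme="W4A16", ignore=["lm_head"]),
]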
@@ -87,27 +77,37 @@ def tokenize(sample):
     ),
 ]
 
-# Apply algorithms.
-oneshot(
-    model=model,
-    dataset=ds,
-    recipe=recipe,
-    max_seq_length=MAX_SEQUENCE_LENGTH,
-    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
-    pipeline="independent" if args.independent else "sequential",
-)
-
-# Confirm generations of the quantized model look sane.
-print("\n\n")
-print("========== SAMPLE GENERATION ==============")
-dispatch_for_generation(model)
-sample = tokenizer("Hello my name is", return_tensors="pt")
-sample = {key: value.to(model.device) for key, value in sample.items()}
-output = model.generate(**sample, max_new_tokens=100)
-print(tokenizer.decode(output[0]))
-print("==========================================\n\n")
-
-# Save to disk compressed.
-SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-gptq-w8a8-self_attn-awq-w4a16-mlp"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-tokenizer.save_pretrained(SAVE_DIR)
+if __name__ == "__main__":
+    args = parse_args()
+    # Load dataset and preprocess.
+    ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
+    ds = ds.shuffle(seed=42)
+    ds = ds.map(preprocess)
+    ds = ds.map(tokenize, remove_columns=ds.column_names)
+
+    # Apply algorithms.
+    oneshot(
+        model=model,
+        dataset=ds,
+        recipe=recipe,
+        max_seq_length=MAX_SEQUENCE_LENGTH,
+        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+        pipeline="independent" if args.independent else "sequential",
+    )
+
+    # Confirm generations of the quantized model look sane.
+    print("\n\n")
+    print("========== SAMPLE GENERATION ==============")
+    dispatch_for_generation(model)
+    sample = tokenizer("Hello my name is", return_tensors="pt")
+    sample = {key: value.to(model.device) for key, value in sample.items()}
+    output = model.generate(**sample, max_new_tokens=100)
+    print(tokenizer.decode(output[0]))
+    print("==========================================\n\n")
+
+    # Save to disk compressed.
+    SAVE_DIR = (
+        model_id.rstrip("/").split("/")[-1] + "-gptq-w8a8-self_attn-awq-w4a16-mlp"
+    )
+    model.save_pretrained(SAVE_DIR, save_compressed=True)
+    tokenizer.save_pretrained(SAVE_DIR)
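
The refactor above gathers all runtime work under if __name__ == "__main__":, the usual guard that keeps the script from executing when it is imported, and it keeps the pipeline choice driven by an --independent flag read in parse_args(). The parser definition sits outside the shown hunks; a minimal sketch of what it might look like follows, where the flag name comes from args.independent but the description, help text, and defaults are assumptions.

# Hypothetical sketch of the parser implied by args.independent; the real
# parse_args() in the example is not shown in this diff.
import argparse


def parse_args():
    parser = argparse.ArgumentParser(
        description="Non-uniform quantization with multiple modifiers"
    )
    parser.add_argument(
        "--independent",
        action="store_true",
        help="Calibrate each modifier in its own pass instead of one sequential pipeline",
    )
    return parser.parse_args()

With a parser like this, the example would run as python quantization_multiple_modifiers.py for the sequential pipeline, or with --independent to calibrate each modifier independently.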
