diff --git a/examples/big_models_with_accelerate/README.md b/examples/big_models_with_accelerate/README.md index c29d0ed39..84cd9ddc1 100644 --- a/examples/big_models_with_accelerate/README.md +++ b/examples/big_models_with_accelerate/README.md @@ -29,8 +29,8 @@ will work properly out of the box for basic quantization with `QuantizationModifier`, even for CPU offloaded models. To enable CPU offloading for second-order quantization methods such as GPTQ, we need to -allocate additional memory upfront when computing the device map. Note that this -device map will only compatible with `GPTQModifier(sequential_update=True, ...)` +allocate additional memory upfront when computing the device map. Not doing so risks +running out of memory. ```python from llmcompressor.transformers.compression.helpers import calculate_offload_device_map @@ -48,12 +48,7 @@ model = SparseAutoModelForCausalLM.from_pretrained( ### Practical Advice -When working with `accelerate`, it is important to keep in mind that CPU offloading and naive pipeline-parallelism will slow down forward passes through the model. As a result, we need to take care to ensure that the quantization methods used fit well with the offloading scheme as methods that require many forward passes though the model will be slowed down. - -General rules of thumb: - CPU offloading is best used with data-free quantization methods (e.g. PTQ with `FP8_DYNAMIC`) - Multi-GPU is fast enough to be used with calibration data-based methods with `sequential_update=False` - It is possible to use Multi-GPU with `sequential_update=True` to save GPU memory, but the runtime will be slower +When working with `accelerate`, keep in mind that CPU offloading and naive pipeline-parallelism will slow down forward passes through the model. As a result, take care to ensure that the quantization method you choose fits well with the offloading scheme, since methods that require many forward passes through the model will be slowed down. If more GPU memory is not available, consider reducing the precision of the loaded model to a lower-width dtype such as `torch.bfloat16`. 
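As a concrete illustration of the data-free route that pairs best with CPU offloading, the sketch below loads the model in `torch.bfloat16` and applies a calibration-free scheme. This is a minimal sketch that assumes the `SparseAutoModelForCausalLM`, `oneshot`, and `QuantizationModifier` APIs used elsewhere in these examples; the model ID, the `FP8_DYNAMIC` scheme, and the save directory are illustrative placeholders.

```python
import torch

from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot

MODEL_ID = "mistralai/Mistral-Nemo-Instruct-2407"  # placeholder; substitute your model

# load in a lower-width dtype and let accelerate place weights across GPUs and CPU
model = SparseAutoModelForCausalLM.from_pretrained(
    MODEL_ID, device_map="auto", torch_dtype=torch.bfloat16
)

# data-free PTQ needs no calibration forward passes, so offloading costs little
recipe = QuantizationModifier(targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
oneshot(model=model, recipe=recipe)

# save in compressed-tensors format (directory name is illustrative)
model.save_pretrained("Mistral-Nemo-Instruct-2407-FP8-Dynamic", save_compressed=True)
```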
## Examples diff --git a/examples/big_models_with_accelerate/multi_gpu_int8_sequential_update.py b/examples/big_models_with_accelerate/mult_gpus_int8_device_map.py similarity index 88% rename from examples/big_models_with_accelerate/multi_gpu_int8_sequential_update.py rename to examples/big_models_with_accelerate/mult_gpus_int8_device_map.py index a9befa0e8..4cb6fb70b 100644 --- a/examples/big_models_with_accelerate/multi_gpu_int8_sequential_update.py +++ b/examples/big_models_with_accelerate/mult_gpus_int8_device_map.py @@ -10,8 +10,10 @@ MODEL_ID = "mistralai/Mistral-Nemo-Instruct-2407" # adjust based off number of desired GPUs +# reserve_for_hessians=True reserves memory which is required by +# GPTQModifier and SparseGPTModifier device_map = calculate_offload_device_map( - MODEL_ID, reserve_for_hessians=True, num_gpus=2, torch_dtype=torch.bfloat16 + MODEL_ID, num_gpus=2, reserve_for_hessians=True, torch_dtype=torch.bfloat16 ) model = SparseAutoModelForCausalLM.from_pretrained( @@ -60,7 +62,9 @@ def tokenize(sample): recipe = [ SmoothQuantModifier(smoothing_strength=0.8), GPTQModifier( - targets="Linear", scheme="W8A8", ignore=["lm_head"], sequential_update=True + targets="Linear", + scheme="W8A8", + ignore=["lm_head"], ), ] diff --git a/examples/big_models_with_accelerate/multi_gpu_int8.py b/examples/big_models_with_accelerate/multi_gpu_int8.py index 4daf8c63e..8d95ffb54 100644 --- a/examples/big_models_with_accelerate/multi_gpu_int8.py +++ b/examples/big_models_with_accelerate/multi_gpu_int8.py @@ -58,14 +58,13 @@ def tokenize(sample): # 3) Configure algorithms. In this case, we: # * quantize the weights to int8 with GPTQ (static per channel) # * quantize the activations to int8 (dynamic per token) -# * run non-sequentially (for seq update, see multi_gpu_int8_sequential_update.py) recipe = [ - GPTQModifier( - targets="Linear", scheme="W8A8", ignore=["lm_head"], sequential_update=False - ), + GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]), ] # 4) Apply algorithms and save in `compressed-tensors` format. +# if you encounter GPU out-of-memory issues, consider using an explicit +# device map (see mult_gpus_int8_device_map.py) oneshot( model=model, tokenizer=tokenizer, diff --git a/examples/quantization_24_sparse_w4a16/2:4_w4a16_group-128_recipe.yaml b/examples/quantization_24_sparse_w4a16/2:4_w4a16_group-128_recipe.yaml index b31504a5a..166e41a66 100644 --- a/examples/quantization_24_sparse_w4a16/2:4_w4a16_group-128_recipe.yaml +++ b/examples/quantization_24_sparse_w4a16/2:4_w4a16_group-128_recipe.yaml @@ -23,7 +23,6 @@ quantization_stage: run_type: oneshot quantization_modifiers: GPTQModifier: - sequential_update: true ignore: ["lm_head"] config_groups: group_0: diff --git a/examples/quantization_24_sparse_w4a16/2:4_w4a16_recipe.yaml b/examples/quantization_24_sparse_w4a16/2:4_w4a16_recipe.yaml index cc42da3f0..2ad00b457 100644 --- a/examples/quantization_24_sparse_w4a16/2:4_w4a16_recipe.yaml +++ b/examples/quantization_24_sparse_w4a16/2:4_w4a16_recipe.yaml @@ -23,7 +23,6 @@ quantization_stage: run_type: oneshot quantization_modifiers: GPTQModifier: - sequential_update: true ignore: ["lm_head"] config_groups: group_0: diff --git a/examples/quantization_w8a8_int8/gemma2_example.py b/examples/quantization_w8a8_int8/gemma2_example.py index 976ac5473..d76ca5379 100644 --- a/examples/quantization_w8a8_int8/gemma2_example.py +++ b/examples/quantization_w8a8_int8/gemma2_example.py @@ -55,7 +55,6 @@ def tokenize(sample): # 3) Select quantization algorithms. 
In this case, we: # * quantize the weights to int8 with GPTQ (static per channel) # * quantize the activations to int8 (dynamic per token) -# Note: set sequential_update: true in the recipe to reduce memory recipe = GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]) # 4) Apply quantization and save to disk compressed. diff --git a/examples/quantization_w8a8_int8/llama3_example.py b/examples/quantization_w8a8_int8/llama3_example.py index 123bc62bc..49652ad8a 100644 --- a/examples/quantization_w8a8_int8/llama3_example.py +++ b/examples/quantization_w8a8_int8/llama3_example.py @@ -57,7 +57,6 @@ def tokenize(sample): # * apply SmoothQuant to make the activations easier to quantize # * quantize the weights to int8 with GPTQ (static per channel) # * quantize the activations to int8 (dynamic per token) -# Note: set sequential_update: true in the recipe to reduce memory recipe = [ SmoothQuantModifier(smoothing_strength=0.8), GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]), ] diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py index 3c02f5d8d..c8a933b2d 100644 --- a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py +++ b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py @@ -70,7 +70,6 @@ def tokenize(sample): targets="Linear", scheme="W8A8", ignore=["lm_head", "re:.*mlp.gate$"], - sequential_update=True, ), ] diff --git a/examples/quantizing_moe/deepseek_recipe_w4a16.yaml b/examples/quantizing_moe/deepseek_recipe_w4a16.yaml index 05e294365..23f276e2f 100644 --- a/examples/quantizing_moe/deepseek_recipe_w4a16.yaml +++ b/examples/quantizing_moe/deepseek_recipe_w4a16.yaml @@ -1,7 +1,6 @@ quant_stage: quant_modifiers: GPTQModifier: - sequential_update: true ignore: [lm_head, "re:.*mlp.gate$"] config_groups: group_0: diff --git a/src/llmcompressor/modifiers/quantization/gptq/base.py b/src/llmcompressor/modifiers/quantization/gptq/base.py index b472e289e..dac1d1534 100644 --- a/src/llmcompressor/modifiers/quantization/gptq/base.py +++ b/src/llmcompressor/modifiers/quantization/gptq/base.py @@ -1,4 +1,4 @@ -import gc +import warnings from typing import Any, Dict, Iterable, List, Optional, Tuple, Union import torch @@ -48,7 +48,6 @@ class GPTQModifier(Modifier): | test_stage: | obcq_modifiers: | GPTQModifier: - | sequential_update: true | dampening_frac: 0.001 | block_size: 128 | config_groups: @@ -66,8 +65,8 @@ class GPTQModifier(Modifier): | actorder: False - :param sequential_update: Whether or not to update weights sequentially by layer, - True saves on GPU memory, default is True + :param sequential_update: Whether or not to update weights sequentially by layer. + This option is deprecated and setting it to False is no longer supported :param targets: list of layer names to compress during GPTQ, or '__ALL__' to compress every layer in the model :param block_size: Used to determine number of columns to compress in one pass @@ -97,7 +96,7 @@ class GPTQModifier(Modifier): and activation 8 bit quantization on the Linear layers. """ - sequential_update: bool = True + sequential_update: bool = True # DEPRECATED targets: Union[str, List[str], None] = None sequential_targets: Union[str, List[str], None] = None block_size: int = 128 @@ -117,13 +116,13 @@ class GPTQModifier(Modifier): @field_validator("sequential_update", mode="before") def validate_sequential_update(cls, value: bool) -> bool: if not value: - logger.warning( - "Not using sequential_update requires allocating all hessians in " - "GPU memory. 
If you are running into GPU memory issues, consider " - "using sequential_update=True" + warnings.warn( + "`sequential_update=False` is no longer supported, setting " + "sequential_update=True", + DeprecationWarning, ) - return value + return True def on_initialize_structure(self, state: State, **kwargs): """ @@ -245,7 +244,7 @@ def initialize_compression( compressible layers of model, and sets the device :param model: model to initialize for compression - :param dataloader: calibration data for GPTQ + :param dataloader: calibration data, not used by GPTQ in this function """ self.model = model self.compressible_layers_ = self.compressible_layers() @@ -257,16 +256,12 @@ def initialize_compression( args = self._pruning_arguments() comp_cls = self._compression_class() compressor = LayerCompressor(comp_cls, self.model, layer, idx, name, args) - - # if running sequentially, allocate all hessians now - if not self.sequential_update: - compressor.pre_compress() - self.layer_compressors_.append(compressor) - if self.sequential_update: - first_layer_compressor = self.layer_compressors_[0] - first_layer_compressor.set_early_stop() + # for the initial forward data pass, add an early stop exception in order + # to capture inputs right before being compressed by first module + first_layer_compressor = self.layer_compressors_[0] + first_layer_compressor.set_early_stop() @torch.no_grad() def apply_compression( @@ -289,43 +284,32 @@ def apply_compression( forward_pass_use_cache = self.model.config.use_cache self.model.config.use_cache = False - # in non-sequential mode we run calibration through the full model - # in sequential mode we run calibration up to the first transformer target + # run_calibration_forward uses the early stop exception to capture values + # as intermediates right before the forward pass of the first module intermediates = run_calibration_forward( self.model, dataloader, mask_padding=True ) self.layer_compressors_[0].clear_early_stop() - # empty cache if not using sequential update - if not self.sequential_update: - del intermediates - gc.collect() - torch.cuda.empty_cache() - num_layers = len(self.compressible_layers_) for idx, layer_compressor in enumerate(self.layer_compressors_): logger.info(f"\n===== Compressing layer {idx+1}/{num_layers} " " =====") - if self.sequential_update: - # in sequential mode we run the forward pass for each transformer layer - # one at a time, caching the intermediate outputs between layers - logger.info(f"Calibrating {layer_compressor.name}...") - layer_compressor.pre_compress() - unquantized_outputs = layer_compressor.calibrate_layer(intermediates) + # run the forward pass for each transformer layer (block) one at a time + logger.info(f"Calibrating {layer_compressor.name}...") + layer_compressor.pre_compress() + unquantized_outputs = layer_compressor.calibrate_layer(intermediates) layer_compressor.compress() layer_compressor.post_compress() layer_compressor.revert_layer_wrappers() - if self.sequential_update: - quantized_outputs = layer_compressor.calibrate_layer(intermediates) - error = get_output_error(unquantized_outputs, quantized_outputs) - logger.info(f"Mean output error from quantization: {error:.3f}") - intermediates = quantized_outputs - del unquantized_outputs - - gc.collect() - torch.cuda.empty_cache() + # perform a second forward pass of the module to calculate weight-quantized + # outputs for use as inputs to the next layer (block) + quantized_outputs = layer_compressor.calibrate_layer(intermediates) + error = 
get_output_error(unquantized_outputs, quantized_outputs) + logger.info(f"Mean output error from quantization: {error:.3f}") + intermediates = quantized_outputs self.model.config.use_cache = forward_pass_use_cache diff --git a/tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_channel.yaml b/tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_channel.yaml index 84d6505cb..f61fba898 100644 --- a/tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_channel.yaml +++ b/tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_channel.yaml @@ -1,7 +1,6 @@ quant_stage: quant_modifiers: QuantizationModifier: - sequential_update: false ignore: [lm_head] config_groups: group_0: diff --git a/tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_per_tensor.yaml b/tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_per_tensor.yaml index 8a6dfbde6..ce6c1498a 100644 --- a/tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_per_tensor.yaml +++ b/tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_per_tensor.yaml @@ -1,7 +1,6 @@ quant_stage: quant_modifiers: QuantizationModifier: - sequential_update: false ignore: [lm_head] config_groups: group_0: diff --git a/tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_static_per_tensor_act.yaml b/tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_static_per_tensor_act.yaml index 6cfa275af..7528f7dfb 100644 --- a/tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_static_per_tensor_act.yaml +++ b/tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_static_per_tensor_act.yaml @@ -1,7 +1,6 @@ quant_stage: quant_modifiers: QuantizationModifier: - sequential_update: false ignore: [lm_head] config_groups: group_0: diff --git a/tests/e2e/vLLM/recipes/INT8/recipe_int8_tensor_weight_static_per_tensor_act.yaml b/tests/e2e/vLLM/recipes/INT8/recipe_int8_tensor_weight_static_per_tensor_act.yaml index 6ddcc63b4..16e39d8b0 100644 --- a/tests/e2e/vLLM/recipes/INT8/recipe_int8_tensor_weight_static_per_tensor_act.yaml +++ b/tests/e2e/vLLM/recipes/INT8/recipe_int8_tensor_weight_static_per_tensor_act.yaml @@ -1,7 +1,6 @@ quant_stage: quant_modifiers: QuantizationModifier: - sequential_update: false ignore: [lm_head] config_groups: group_0: diff --git a/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_channel_quant.yaml b/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_channel_quant.yaml index b667b2d10..1c4fcf7ab 100644 --- a/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_channel_quant.yaml +++ b/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_channel_quant.yaml @@ -1,7 +1,6 @@ quant_stage: quant_modifiers: QuantizationModifier: - sequential_update: false ignore: [lm_head] config_groups: group_0: diff --git a/tests/e2e/vLLM/recipes/WNA16/recipe_w8a16_channel_quant.yaml b/tests/e2e/vLLM/recipes/WNA16/recipe_w8a16_channel_quant.yaml index bafd7928d..ecf57221a 100644 --- a/tests/e2e/vLLM/recipes/WNA16/recipe_w8a16_channel_quant.yaml +++ b/tests/e2e/vLLM/recipes/WNA16/recipe_w8a16_channel_quant.yaml @@ -1,7 +1,6 @@ quant_stage: quant_modifiers: QuantizationModifier: - sequential_update: false ignore: [lm_head] config_groups: group_0: diff --git a/tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_group.yaml b/tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_group.yaml index a42e2922e..b9b9db154 100644 --- a/tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_group.yaml +++ b/tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_group.yaml @@ -1,7 +1,6 @@ quant_stage: quant_modifiers: GPTQModifier: - sequential_update: false ignore: ["lm_head"] config_groups: group_0: diff --git 
a/tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml b/tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml index 54b172477..0c8476883 100644 --- a/tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml +++ b/tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml @@ -1,7 +1,6 @@ quant_stage: quant_modifiers: GPTQModifier: - sequential_update: false ignore: ["lm_head"] config_groups: group_0: diff --git a/tests/examples/test_big_models_with_accelerate.py b/tests/examples/test_big_models_with_accelerate.py index 26141ec88..3901bfd70 100644 --- a/tests/examples/test_big_models_with_accelerate.py +++ b/tests/examples/test_big_models_with_accelerate.py @@ -49,9 +49,9 @@ def test_readme_has_install_command(self, example_dir: str): ], ), pytest.param( - "multi_gpu_int8_sequential_update.py", + "mult_gpus_int8_device_map.py", "", - id="multi_gpu_int8_sequential_update", + id="mult_gpus_int8_device_map", marks=[requires_gpu_count(2), pytest.mark.multi_gpu], ), ], diff --git a/tests/llmcompressor/transformers/compression/recipes/new_quant_actorder_group.yaml b/tests/llmcompressor/transformers/compression/recipes/new_quant_actorder_group.yaml index 44ad696b4..6ee1d31d5 100644 --- a/tests/llmcompressor/transformers/compression/recipes/new_quant_actorder_group.yaml +++ b/tests/llmcompressor/transformers/compression/recipes/new_quant_actorder_group.yaml @@ -15,5 +15,4 @@ test_stage: output_activations: null targets: ["Linear"] GPTQModifier: - block_size: 128 - sequential_update: False \ No newline at end of file + block_size: 128 \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/recipes/new_quant_actorder_weight.yaml b/tests/llmcompressor/transformers/compression/recipes/new_quant_actorder_weight.yaml index def4b362f..468259a9c 100644 --- a/tests/llmcompressor/transformers/compression/recipes/new_quant_actorder_weight.yaml +++ b/tests/llmcompressor/transformers/compression/recipes/new_quant_actorder_weight.yaml @@ -15,5 +15,4 @@ test_stage: output_activations: null targets: ["Linear"] GPTQModifier: - block_size: 128 - sequential_update: False \ No newline at end of file + block_size: 128 \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/recipes/new_quant_channel.yaml b/tests/llmcompressor/transformers/compression/recipes/new_quant_channel.yaml index 0d386df13..f36ac0595 100644 --- a/tests/llmcompressor/transformers/compression/recipes/new_quant_channel.yaml +++ b/tests/llmcompressor/transformers/compression/recipes/new_quant_channel.yaml @@ -13,6 +13,4 @@ test_stage: output_activations: null targets: ["Linear"] GPTQModifier: - block_size: 128 - sequential_update: False - \ No newline at end of file + block_size: 128 \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/recipes/new_quant_full.yaml b/tests/llmcompressor/transformers/compression/recipes/new_quant_full.yaml index f3c2db93c..6df9cd8af 100644 --- a/tests/llmcompressor/transformers/compression/recipes/new_quant_full.yaml +++ b/tests/llmcompressor/transformers/compression/recipes/new_quant_full.yaml @@ -2,7 +2,6 @@ test_stage: quant_modifiers: GPTQModifier: block_size: 128 - sequential_update: False ignore: ["lm_head", "model.layers.0.mlp.down_proj"] config_groups: group_0: diff --git a/tests/llmcompressor/transformers/compression/recipes/new_quant_group.yaml b/tests/llmcompressor/transformers/compression/recipes/new_quant_group.yaml index 31ba456bd..02387f6c9 100644 --- 
a/tests/llmcompressor/transformers/compression/recipes/new_quant_group.yaml +++ b/tests/llmcompressor/transformers/compression/recipes/new_quant_group.yaml @@ -14,5 +14,4 @@ test_stage: output_activations: null targets: ["Linear"] GPTQModifier: - block_size: 128 - sequential_update: False \ No newline at end of file + block_size: 128 \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/recipes/new_quant_weight.yaml b/tests/llmcompressor/transformers/compression/recipes/new_quant_weight.yaml index 34e0a77e0..67aa5df3f 100644 --- a/tests/llmcompressor/transformers/compression/recipes/new_quant_weight.yaml +++ b/tests/llmcompressor/transformers/compression/recipes/new_quant_weight.yaml @@ -14,5 +14,4 @@ test_stage: targets: ["Linear", "Embedding"] GPTQModifier: block_size: 128 - sequential_update: False targets: ["re:model.layers.\\d+$"] \ No newline at end of file diff --git a/tests/llmcompressor/transformers/obcq/recipes/additional_sparsity_with_quant.yaml b/tests/llmcompressor/transformers/obcq/recipes/additional_sparsity_with_quant.yaml index 2764ac033..906d0c8da 100644 --- a/tests/llmcompressor/transformers/obcq/recipes/additional_sparsity_with_quant.yaml +++ b/tests/llmcompressor/transformers/obcq/recipes/additional_sparsity_with_quant.yaml @@ -19,7 +19,6 @@ test_stage: ] preserve_sparsity_mask: True GPTQModifier: - sequential_update: False dampening_frac: 0.01 targets: [ "model.layers.0", diff --git a/tests/llmcompressor/transformers/obcq/recipes/quant.yaml b/tests/llmcompressor/transformers/obcq/recipes/quant.yaml index f5dd8a271..435503e50 100644 --- a/tests/llmcompressor/transformers/obcq/recipes/quant.yaml +++ b/tests/llmcompressor/transformers/obcq/recipes/quant.yaml @@ -4,7 +4,6 @@ test_stage: smoothing_strength: 0.6 GPTQModifier: block_size: 128 - sequential_update: False percdamp: 0.01 config_groups: group_0: diff --git a/tests/llmcompressor/transformers/obcq/recipes/quant_and_sparse.yaml b/tests/llmcompressor/transformers/obcq/recipes/quant_and_sparse.yaml index f0e98f0ed..05022fd80 100644 --- a/tests/llmcompressor/transformers/obcq/recipes/quant_and_sparse.yaml +++ b/tests/llmcompressor/transformers/obcq/recipes/quant_and_sparse.yaml @@ -15,7 +15,6 @@ test_stage: targets: ["Linear"] GPTQModifier: block_size: 128 - sequential_update: True SparseGPTModifier: sparsity: 0.5 block_size: 128
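With `sequential_update` removed from the recipes above, GPTQ always runs layer by layer, and recipes or scripts that still pass the flag only trigger a deprecation warning before the value is coerced back to `True`. Below is a minimal sketch of that contract, assuming `GPTQModifier` remains a pydantic model whose field validators run at construction time, as the updated `base.py` implies.

```python
import warnings

from llmcompressor.modifiers.quantization import GPTQModifier

# the deprecated flag no longer disables sequential updates; the validator
# emits a DeprecationWarning and coerces the value back to True
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    modifier = GPTQModifier(
        targets="Linear", scheme="W8A8", ignore=["lm_head"], sequential_update=False
    )

assert modifier.sequential_update is True
assert any(issubclass(w.category, DeprecationWarning) for w in caught)
```

Recipes that simply omit the key, like the YAML files updated above, are the preferred form going forward.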