diff --git a/src/brevitas/core/scaling/pre_scaling.py b/src/brevitas/core/scaling/pre_scaling.py
index 632242507..dd125396d 100644
--- a/src/brevitas/core/scaling/pre_scaling.py
+++ b/src/brevitas/core/scaling/pre_scaling.py
@@ -14,6 +14,7 @@
 from brevitas.core.stats import SCALAR_SHAPE
 from brevitas.core.stats.stats_wrapper import _Stats
 from brevitas.function import abs_binary_sign_grad
+from brevitas.function import get_upper_bound_on_l1_norm
 
 __all__ = ["ParameterPreScalingWeightNorm", "AccumulatorAwareParameterPreScaling"]
 
@@ -170,25 +171,6 @@ def __init__(
         )
         self.accumulator_bit_width = accumulator_bit_width_impl
 
-    @brevitas.jit.script_method
-    def get_upper_bound_on_l1_norm(self, input_bit_width: Tensor, input_is_signed: bool) -> Tensor:
-        """Calculate the upper bound on the l1-norm of the weights using the derivations from
-        `Quantized Neural Networks for Low-Precision Accumulation with Guaranteed Overflow Avoidance`
-        by I.Colbert, A.Pappalardo, and J.Petri-Koenig."""
-        assert input_bit_width is not None, "A2Q relies on input bit-width."
-        assert input_is_signed is not None, "A2Q relies on input sign."
-        input_is_signed = float(input_is_signed)  # 1. if signed else 0.
-        # This is the minimum of the two maximum magnitudes that P could take, which are -2^{P-1}
-        # and 2^{P-1}-1. Note that evaluating to -2^{P-1} would mean there is a possibility of overflow
-        # on the positive side of this range.
-        max_accumulator_bit_width = self.accumulator_bit_width()  # P
-        max_accumulator_mag = pow(2., max_accumulator_bit_width - 1.) - 1.  # 2^{P-1}-1
-        # This is the maximum possible magnitude that the input data could take. When the data is signed,
-        # this is 2^{N-1}. When the data is unsigned, this is 2^N - 1. We use a slightly looser bound here
-        # to simplify our derivations on the export validation.
-        max_input_mag_inverse = pow(2., input_is_signed - input_bit_width)
-        return max_accumulator_mag * max_input_mag_inverse
-
     @brevitas.jit.script_method
     def forward(self, weights: Tensor, input_bit_width: Tensor, input_is_signed: bool) -> Tensor:
         """Takes weights as input and returns the pre-clipping scaling factor"""
@@ -196,7 +178,8 @@ def forward(self, weights: Tensor, input_bit_width: Tensor, input_is_signed: boo
         d_w = self.stats(weights)  # denominator for weight normalization
         s = self.scaling_impl(weights)  # s
         g = abs_binary_sign_grad(self.restrict_clamp_scaling(self.value))  # g
-        T = self.get_upper_bound_on_l1_norm(input_bit_width, input_is_signed)  # T / s
+        T = get_upper_bound_on_l1_norm(
+            self.accumulator_bit_width(), input_bit_width, input_is_signed)  # T / s
         g = torch.clamp_max(g / s, T)
         value = d_w / g  # calculating final pre-clipping scaling factor
         # re-apply clamp_min_ste from restrict_scaling_impl to the specified pre_scaling_min_val
diff --git a/src/brevitas/function/ops.py b/src/brevitas/function/ops.py
index f68ae9ede..ec326602d 100644
--- a/src/brevitas/function/ops.py
+++ b/src/brevitas/function/ops.py
@@ -201,3 +201,24 @@ def max_float(exponent_bit_width: Tensor, mantissa_bit_width: Tensor, exponent_b
             device=mantissa_bit_width.device)))
     max_val = max_mantissa * (2 ** max_exponent)
     return max_val
+
+
+def get_upper_bound_on_l1_norm(
+        accumulator_bit_width: Tensor, input_bit_width: Tensor, input_is_signed: bool) -> Tensor:
+    """Calculate the upper bound on the l1-norm of the weights using the derivations from
+    `Quantized Neural Networks for Low-Precision Accumulation with Guaranteed Overflow Avoidance`
+    by I.Colbert, A.Pappalardo, and J.Petri-Koenig."""
+    assert input_bit_width is not None, "A2Q relies on input bit-width."
+    assert input_is_signed is not None, "A2Q relies on input sign."
+    assert accumulator_bit_width is not None, "A2Q relies on accumulator bit-width."
+    input_is_signed = float(input_is_signed)  # 1. if signed else 0.
+    # This is the minimum of the two maximum magnitudes that P could take, which are -2^{P-1}
+    # and 2^{P-1}-1. Note that evaluating to -2^{P-1} would mean there is a possibility of
+    # overflow on the positive side of this range.
+    max_accumulator_bit_width = accumulator_bit_width  # P
+    max_accumulator_mag = pow(2., max_accumulator_bit_width - 1.) - 1.  # 2^{P-1}-1
+    # This is the maximum possible magnitude that the input data could take. When the data is
+    # signed, this is 2^{N-1}. When the data is unsigned, this is 2^N - 1. We use a slightly
+    # looser bound here to simplify our derivations on the export validation.
+    max_input_mag_inverse = pow(2., input_is_signed - input_bit_width)
+    return max_accumulator_mag * max_input_mag_inverse
diff --git a/src/brevitas/graph/gpfq.py b/src/brevitas/graph/gpfq.py
index 2d312a549..cad5d9043 100644
--- a/src/brevitas/graph/gpfq.py
+++ b/src/brevitas/graph/gpfq.py
@@ -8,6 +8,7 @@
 import torch
 import unfoldNd
 
+from brevitas.function import get_upper_bound_on_l1_norm
 from brevitas.graph.gpxq import GPxQ
 from brevitas.graph.gpxq import gpxq_mode
 from brevitas.graph.gpxq import StopFwdException
@@ -45,7 +46,9 @@ def __init__(
             use_quant_activations: bool = True,
             p: float = 1.0,
             return_forward_output: bool = False,
-            act_order: bool = False) -> None:
+            act_order: bool = False,
+            use_gpfa2q: bool = False,
+            accumulator_bit_width: Optional[int] = None) -> None:
         if not inplace:
             model = deepcopy(model)
         super().__init__(
@@ -61,6 +64,10 @@ def __init__(
         self.model.forward = self.catch_stopfwd
         self.p = p
 
+        # GPFA2Q params
+        self.use_gpfa2q = use_gpfa2q
+        self.accumulator_bit_width = accumulator_bit_width
+
     def catch_stopfwd(self, *args, **kwargs):
         # Collect quant input
         try:
@@ -96,13 +103,23 @@ def catch_stopfwd(self, *args, **kwargs):
 
     def initialize_module_optimizer(
             self, layer, name, act_order, len_parallel_layers, create_weight_orig):
-        return GPFQ(
-            layer=layer,
-            name=name,
-            act_order=act_order,
-            len_parallel_layers=len_parallel_layers,
-            create_weight_orig=create_weight_orig,
-            p=self.p)
+        if not self.use_gpfa2q:
+            return GPFQ(
+                layer=layer,
+                name=name,
+                act_order=act_order,
+                len_parallel_layers=len_parallel_layers,
+                create_weight_orig=create_weight_orig,
+                p=self.p)
+        else:
+            return GPFA2Q(
+                layer=layer,
+                name=name,
+                act_order=act_order,
+                len_parallel_layers=len_parallel_layers,
+                create_weight_orig=create_weight_orig,
+                p=self.p,
+                accumulator_bit_width=self.accumulator_bit_width)
 
 
 class GPFQ(GPxQ):
@@ -110,14 +127,7 @@ class GPFQ(GPxQ):
     Based on https://github.com/YixuanSeanZhou/Quantized_Neural_Nets/tree/main
     """
 
-    def __init__(
-            self,
-            layer,
-            name,
-            act_order,
-            len_parallel_layers=1,
-            create_weight_orig=True,
-            p=1.0) -> None:
+    def __init__(self, layer, name, act_order, len_parallel_layers, create_weight_orig, p) -> None:
 
         super().__init__(layer, name, act_order, len_parallel_layers, create_weight_orig)
 
@@ -256,3 +266,115 @@ def single_layer_update(self):
 
         del self.float_input
         del self.quantized_input
+
+
+class GPFA2Q(GPFQ):
+    """GPFQ variant that bounds the l1-norm of the quantized weights of each output channel.
+
+    The bound is ``T * s``, with ``T`` returned by ``get_upper_bound_on_l1_norm`` for the
+    given ``accumulator_bit_width`` and the bit-width/sign of the quantized input, and ``s``
+    returned by ``layer.quant_weight_scale()``. A quantized weight that would push the
+    running l1-norm of its channel past the bound is set to zero.
+    """
+
+    def __init__(
+            self,
+            layer,
+            name,
+            act_order,
+            len_parallel_layers,
+            create_weight_orig,
+            accumulator_bit_width,
+            p) -> None:
+        GPFQ.__init__(
+            self,
+            layer=layer,
+            name=name,
+            act_order=act_order,
+            len_parallel_layers=len_parallel_layers,
+            create_weight_orig=create_weight_orig,
+            p=p)
+        self.accumulator_bit_width = accumulator_bit_width
+        assert self.accumulator_bit_width is not None, "GPFA2Q relies on accumulator bit-width."
+
+    def single_layer_update(self):
+        """Quantize the layer weights with GPFQ, zeroing values that break the l1-norm bound.
+
+        Raises:
+            ValueError: If no quantized input has been stored in ``self.quant_input``.
+        """
+        # raise error in case no quant-input is here
+        if self.quant_input is None:
+            raise ValueError(
+                'Expected quant input to calculate Upper Bound on L1 norm, but received None')
+        weight = self.layer.weight.data
+        dev = weight.device
+        dtype = weight.dtype
+        if isinstance(self.layer, SUPPORTED_CONV_OP):
+            if isinstance(self.layer, (qnn.QuantConvTranspose1d, qnn.QuantConvTranspose2d)):
+                weight = weight.transpose(1, 0)  # This performs a view
+            weight = weight.flatten(1)
+        weight = weight.view(self.groups, -1, weight.shape[-1])  # [Groups, OC/Groups, IC]
+        U = torch.zeros(
+            weight.shape[0], weight.shape[1], self.float_input.shape[1], device=dev, dtype=dtype)
+        self.float_input = self.float_input.to(dev)
+        self.quantized_input = self.quantized_input.to(dev)
+
+        # get upper bound
+        input_bit_width = self.quant_input.bit_width
+        input_is_signed = self.quant_input.signed
+        T = get_upper_bound_on_l1_norm(
+            torch.tensor(self.accumulator_bit_width), input_bit_width, input_is_signed)
+        s = self.layer.quant_weight_scale()
+        s = s.view(self.groups, -1)  # [Groups, OC/Groups]
+
+        l1_norm = torch.zeros(weight.shape[:-1], device=dev)
+
+        # We don't need full Hessian, we just need the diagonal
+        self.H_diag = self.quantized_input.transpose(2, 1).square().sum(
+            2)  # summing over Batch dimension
+        permutation_list = []
+        for group_index in range(self.groups):
+            if self.act_order:
+                # Re-order Hessian_diagonal so that weights associated to
+                # higher magnitude activations are quantized first
+                perm = torch.argsort(self.H_diag[group_index, :], descending=True)
+            else:
+                # No permutation, permutation tensor is an ordered index
+                perm = torch.arange(weight.shape[-1], device=dev)
+            permutation_list.append(perm)
+
+        for t in range(weight.shape[-1]):
+            for group_index in range(self.groups):
+                U[group_index] += torch.matmul(
+                    weight[group_index, :, permutation_list[group_index][t]].unsqueeze(1),
+                    self.float_input[group_index, :, permutation_list[group_index][t]].unsqueeze(
+                        0))  #[OC/Groups, 1] * [1, INSHAPE[1]]
+                norm = torch.linalg.norm(
+                    self.quantized_input[group_index, :, permutation_list[group_index][t]], 2) ** 2
+                if norm > 0:
+                    q_arg = U[group_index].matmul(
+                        self.quantized_input[group_index, :,
+                                             permutation_list[group_index][t]]) / norm
+                else:
+                    q_arg = torch.zeros_like(U[group_index, :, 0])
+
+                weight[group_index, :, permutation_list[group_index][t]] = q_arg
+            q = self.get_quant_weights(t, 0, permutation_list)
+
+            for group_index in range(self.groups):
+                candidate_l1 = l1_norm[group_index] + torch.abs(q[group_index])
+                candidate_l1_mask = candidate_l1 > T * s[group_index]
+                # set all values to 0 that are exceeding T * s
+                weight[group_index, :, permutation_list[group_index][t]][candidate_l1_mask] = 0
+                q[group_index][candidate_l1_mask] = 0
+                # Always accumulate the accepted magnitudes: zeroed channels contribute 0, and
+                # the remaining channels must be counted even when another channel was clipped
+                l1_norm[group_index] += torch.abs(q[group_index])
+                U[group_index] -= torch.matmul(
+                    q[group_index].unsqueeze(1),
+                    self.quantized_input[group_index, :,
+                                         permutation_list[group_index][t]].unsqueeze(0))
+
+        del self.float_input
+        del self.quantized_input
diff --git a/src/brevitas/graph/gptq.py b/src/brevitas/graph/gptq.py
index b10943f1b..56171ac6f 100644
--- a/src/brevitas/graph/gptq.py
+++ b/src/brevitas/graph/gptq.py
@@ -114,13 +114,8 @@ class GPTQ(GPxQ):
     """
 
     def __init__(
-            self,
-            layer,
-            name,
-            act_order,
-            len_parallel_layers=1,
-            create_weight_orig=True,
-            num_blocks=100) -> None:
+            self, layer, name, act_order, len_parallel_layers, create_weight_orig,
+            num_blocks) -> None:
         super().__init__(layer, name, act_order, len_parallel_layers, create_weight_orig)
 
         dev = self.layer.weight.device
diff --git a/src/brevitas/graph/gpxq.py b/src/brevitas/graph/gpxq.py
index 1279950a8..ddeef1c53 100644
--- a/src/brevitas/graph/gpxq.py
+++ b/src/brevitas/graph/gpxq.py
@@ -11,6 +11,8 @@
 from typing import List, Optional, Set
 import warnings
 
+import torch
+
 from brevitas.graph.calibrate import DisableEnableQuantization
 import brevitas.nn as qnn
 from brevitas.quant_tensor import QuantTensor
@@ -175,13 +177,23 @@ def process_input(self, inp):
             if self.layer.weight_quant_requires_quant_input:
                 # Can minimize memory allocation by not storing actual values
                 self.quant_input = QuantTensor(
-                    value=None,
+                    value=torch.empty(
+                        1, dtype=self.layer.weight.dtype, device=self.layer.weight.device),
                     scale=inp.scale,
                     zero_point=inp.zero_point,
                     bit_width=inp.bit_width,
                     signed=inp.signed,
                     training=inp.training)
             inp = inp.value
+        elif self.layer.is_input_quant_enabled:
+            self.quant_input = QuantTensor(
+                value=torch.empty(
+                    1, dtype=self.layer.weight.dtype, device=self.layer.weight.device),
+                scale=self.layer.quant_input_scale(),
+                zero_point=self.layer.quant_input_zero_point(),
+                bit_width=self.layer.quant_input_bit_width(),
+                signed=self.layer.is_quant_input_signed,
+                training=self.layer.training)
 
         # If input is unbatched, add batch_size = 1
         if len(inp.shape) == 1: