From 2cb8c9dfd781a6ce45ed9c19c12b7ed0b650892c Mon Sep 17 00:00:00 2001 From: Ian Colbert <88047104+i-colbert@users.noreply.github.com> Date: Sat, 26 Oct 2024 15:33:08 -0700 Subject: [PATCH] Feat (ptq): adding accumulator-aware extensions to GPxQ (#1060) --- src/brevitas/graph/gpfq.py | 239 +++++----- src/brevitas/graph/gptq.py | 163 +++---- src/brevitas_examples/common/axe.py | 436 ++++++++++++++++++ .../imagenet_classification/ptq/ptq_common.py | 54 ++- .../ptq/ptq_evaluate.py | 58 +-- src/brevitas_examples/llm/README.md | 13 + src/brevitas_examples/llm/llm_quant/gpxq.py | 56 ++- src/brevitas_examples/llm/main.py | 50 +- 8 files changed, 812 insertions(+), 257 deletions(-) create mode 100644 src/brevitas_examples/common/axe.py diff --git a/src/brevitas/graph/gpfq.py b/src/brevitas/graph/gpfq.py index 33ee5fbb4..92e3da2bf 100644 --- a/src/brevitas/graph/gpfq.py +++ b/src/brevitas/graph/gpfq.py @@ -31,110 +31,6 @@ from brevitas.quant_tensor import _unpack_quant_tensor -class gpfq_mode(gpxq_mode): - """ - Apply GPFQ algorithm. - - Args: - model (Module): The model to quantize with GPFQ - group_of_parallel_layers (Optional, List[str]): .List of lists where each inner list is a group - of layer names that can be optimized in parallel. Default: None - inplace (bool): Wheter to apply GPFQ inplace or perform a deepcopy. Default: True - create_weight_orig (bool): If True, store the original floating point weights before applying - gpfq. These weights will be used anytime quantization is disabled. Default: True - use_quant_activations (bool): Wheter to leave quantize activations enabled while performing - GPFQ. Default: False - p (float): The percentage of processed inputs to use. Default: 1.0 - return_forward_output (bool): If True, returns the output of the forward pass. Otherwise the - forward call inside the context manager returns None. Default: False - act_order (bool): Whether to order greedy path following by Hessian approximation. Default: False - - Example: - >>> with torch.no_grad(): - >>> with gpfq_mode(model) as gpfq: - >>> gpfq_model = gpfq.model - >>> for i in tqdm(range(gpfq.num_layers)): - >>> for img, t in calib_loader: - >>> img = img.cuda() - >>> gpfq_model(img) - >>> gpfq.update() - """ - - def __init__( - self, - model: nn.Module, - group_of_parallel_layers: Optional[List[str]] = None, - inplace: bool = True, - create_weight_orig: bool = True, - use_quant_activations: bool = True, - p: float = 1.0, - return_forward_output: bool = False, - act_order: bool = False, - gpfq_class: Optional[nn.Module] = None) -> None: - if not inplace: - model = deepcopy(model) - super().__init__( - model, - group_of_parallel_layers, - inplace, - create_weight_orig, - use_quant_activations, - act_order, - return_forward_output) - - self.p = p - if gpfq_class is None: - gpfq_class = GPFQ - self.gpfq_class = gpfq_class - assert issubclass(gpfq_class, GPxQ), \ - "Error: expected `gpfq_class` to be derived from `brevitas.graph.gpxq.GPxQ`." - - def catch_stopfwd(self, *args, **kwargs): - # Collect quant input - try: - self.orig_forward(*args, **kwargs) - except StopFwdException: - pass - - # Disable quantization - self.return_quant_tensor_state = disable_return_quant_tensor(self.model) - self.disable_quant_inference.disable_param_quantization(self.model, is_training=False) - self.disable_quant_inference.disable_act_quantization(self.model, is_training=False) - # Collect float input - try: - self.orig_forward(*args, **kwargs) - except StopFwdException: - pass - - # Re-enable quantization. 
If activation quantization is disabled, - # we also disable bias quantization - self.disable_quant_inference.enable_param_quantization(self.model, is_training=False) - if self.use_quant_activations: - self.disable_quant_inference.enable_act_quantization(self.model, is_training=False) - else: - self.disable_quant_inference.disable_bias_quantization(self.model, is_training=False) - restore_return_quant_tensor(self.model, self.return_quant_tensor_state) - - if self.return_forward_output: - # If we want to return the output of the network, we need to disable all hooks - for name, gpxq_class in self.gpxq_layers.items(): - gpxq_class.disable_pre_forward_hook = True - out = self.orig_forward(*args, **kwargs) - for name, gpxq_class in self.gpxq_layers.items(): - gpxq_class.disable_pre_forward_hook = False - return out - - def initialize_module_optimizer( - self, layer, name, act_order, len_parallel_layers, create_weight_orig): - return self.gpfq_class( - layer=layer, - name=name, - act_order=act_order, - len_parallel_layers=len_parallel_layers, - create_weight_orig=create_weight_orig, - p=self.p) - - class GPFQ(GPxQ): """ Based on https://github.com/YixuanSeanZhou/Quantized_Neural_Nets/tree/main @@ -243,7 +139,11 @@ def single_layer_update(self): self.float_input = self.float_input.to(dev) self.quant_input = self.quant_input.to(dev) U = torch.zeros( - weight.shape[0], weight.shape[1], self.float_input.shape[1], device=dev, dtype=dtype) + weight.shape[0], + weight.shape[1], + self.float_input.shape[1], + device=dev, + dtype=torch.float32) # We don't need full Hessian, we just need the diagonal # Summing over batch dimension H_diag = self.quant_input.transpose(2, 1).square().sum(2) @@ -261,7 +161,8 @@ def single_layer_update(self): for t in range(weight.shape[-1]): for group_index in range(self.groups): U[group_index] += torch.matmul( - weight[group_index, :, permutation_list[group_index][t]].unsqueeze(1), + weight[group_index, :, + permutation_list[group_index][t]].unsqueeze(1).to(torch.float32), self.float_input[group_index, :, permutation_list[group_index][t]].unsqueeze( 0)) #[OC/Groups, 1] * [1, INSHAPE[1]] norm = torch.linalg.norm( @@ -272,11 +173,11 @@ def single_layer_update(self): else: q_arg = torch.zeros_like(U[group_index, :, 0]) - weight[group_index, :, permutation_list[group_index][t]] = q_arg + weight[group_index, :, permutation_list[group_index][t]] = q_arg.to(dtype) q = self.get_quant_weights(t, 0, permutation_list) for group_index in range(self.groups): U[group_index] -= torch.matmul( - q[group_index].unsqueeze(1), + q[group_index].unsqueeze(1).to(torch.float32), self.quant_input[group_index, :, permutation_list[group_index][t]].unsqueeze(0)) del self.float_input @@ -360,7 +261,7 @@ def update_batch(self, module, input, current_layer): # if quant is not enabled, then it is the float input; if it is a float input # then a quant input has already happened and we can update G if not is_quant_enabled: - # Computing the normalized H matrix using CPU buffer + # Computing the normalized G matrix using CPU buffer self.B.copy_(self.quant_input.bmm(inp_processed.transpose(2, 1))) self.G += self.B self.quant_input = None # NOTE: set back to None now that we've used it @@ -401,6 +302,8 @@ def _get_permutation_list(self, weight: Tensor): def single_layer_update(self, percdamp: float = 0.01): assert not self.layer.weight_quant.requires_quant_input, \ "Error: GPFQ does not support weight quantizers that require quantized inputs." 
+ if hasattr(self.layer, "allocate_params"): + self.layer.allocate_params(self.layer) weight = self.layer.weight.data dev = weight.device dtype = weight.dtype @@ -446,14 +349,17 @@ def single_layer_update(self, percdamp: float = 0.01): permutation_list = self._get_permutation_list(weight) U = torch.zeros( - weight.shape[0], weight.shape[1], self.float_input.shape[1], device=dev, - dtype=dtype) # [Groups, OC/groups, Samples] + weight.shape[0], + weight.shape[1], + self.float_input.shape[1], + device=dev, + dtype=torch.float32) # [Groups, OC/groups, Samples] for t in range(weight.shape[-1]): for group_index in range(self.groups): i = permutation_list[group_index][t] U[group_index] += torch.matmul( - weight[group_index, :, i].unsqueeze(1), + weight[group_index, :, i].unsqueeze(1).to(torch.float32), self.float_input[group_index, :, i].unsqueeze(0), ) # [OC/Groups, 1] * [1, INSHAPE[1]] norm = norms[group_index, i] @@ -461,13 +367,116 @@ def single_layer_update(self, percdamp: float = 0.01): q_arg = U[group_index].matmul(self.quant_input[group_index, :, i]) / norm else: q_arg = torch.zeros_like(U[group_index, :, 0]) - weight[group_index, :, i] = q_arg + weight[group_index, :, i] = q_arg.to(dtype) q_groups = self.get_quant_weights(t, 0, permutation_list) for group_index in range(self.groups): U[group_index] -= torch.matmul( - q_groups[group_index].unsqueeze(1), + q_groups[group_index].unsqueeze(1).to(torch.float32), self.quant_input[group_index, :, permutation_list[group_index][t]].unsqueeze(0), ) - + if hasattr(self.layer, 'offload_params'): + self.layer.offload_params(self.layer) del self.float_input del self.quant_input + + +class gpfq_mode(gpxq_mode): + """ + Apply GPFQ algorithm. + + Args: + model (Module): The model to quantize with GPFQ + group_of_parallel_layers (Optional, List[str]): .List of lists where each inner list is a group + of layer names that can be optimized in parallel. Default: None + inplace (bool): Wheter to apply GPFQ inplace or perform a deepcopy. Default: True + create_weight_orig (bool): If True, store the original floating point weights before applying + gpfq. These weights will be used anytime quantization is disabled. Default: True + use_quant_activations (bool): Wheter to leave quantize activations enabled while performing + GPFQ. Default: False + p (float): The percentage of processed inputs to use. Default: 1.0 + return_forward_output (bool): If True, returns the output of the forward pass. Otherwise the + forward call inside the context manager returns None. Default: False + act_order (bool): Whether to order greedy path following by Hessian approximation. Default: False + gpfq_class (GPFQ): The uninitialized class to perform GPFQ. 
Default: `brevitas.graph.gpfq.GPFQv2`, + which is the memory-efficient formulation + + Example: + >>> with torch.no_grad(): + >>> with gpfq_mode(model) as gpfq: + >>> gpfq_model = gpfq.model + >>> for i in tqdm(range(gpfq.num_layers)): + >>> for img, t in calib_loader: + >>> img = img.cuda() + >>> gpfq_model(img) + >>> gpfq.update() + """ + + def __init__( + self, + model: nn.Module, + group_of_parallel_layers: Optional[List[str]] = None, + inplace: bool = True, + create_weight_orig: bool = True, + use_quant_activations: bool = True, + p: float = 1.0, + return_forward_output: bool = False, + act_order: bool = False, + gpfq_class: GPFQ = GPFQv2) -> None: + if not inplace: + model = deepcopy(model) + super().__init__( + model, + group_of_parallel_layers, + inplace, + create_weight_orig, + use_quant_activations, + act_order, + return_forward_output) + + self.p = p + self.gpfq_class = gpfq_class + + def catch_stopfwd(self, *args, **kwargs): + # Collect quant input + try: + self.orig_forward(*args, **kwargs) + except StopFwdException: + pass + + # Disable quantization + self.return_quant_tensor_state = disable_return_quant_tensor(self.model) + self.disable_quant_inference.disable_param_quantization(self.model, is_training=False) + self.disable_quant_inference.disable_act_quantization(self.model, is_training=False) + # Collect float input + try: + self.orig_forward(*args, **kwargs) + except StopFwdException: + pass + + # Re-enable quantization. If activation quantization is disabled, + # we also disable bias quantization + self.disable_quant_inference.enable_param_quantization(self.model, is_training=False) + if self.use_quant_activations: + self.disable_quant_inference.enable_act_quantization(self.model, is_training=False) + else: + self.disable_quant_inference.disable_bias_quantization(self.model, is_training=False) + restore_return_quant_tensor(self.model, self.return_quant_tensor_state) + + if self.return_forward_output: + # If we want to return the output of the network, we need to disable all hooks + for name, gpxq_class in self.gpxq_layers.items(): + gpxq_class.disable_pre_forward_hook = True + out = self.orig_forward(*args, **kwargs) + for name, gpxq_class in self.gpxq_layers.items(): + gpxq_class.disable_pre_forward_hook = False + return out + + def initialize_module_optimizer( + self, layer, name, act_order, len_parallel_layers, create_weight_orig): + return self.gpfq_class( + layer=layer, + name=name, + act_order=act_order, + len_parallel_layers=len_parallel_layers, + create_weight_orig=create_weight_orig, + p=self.p) diff --git a/src/brevitas/graph/gptq.py b/src/brevitas/graph/gptq.py index a1380da4e..667e47d40 100644 --- a/src/brevitas/graph/gptq.py +++ b/src/brevitas/graph/gptq.py @@ -23,85 +23,6 @@ import brevitas.nn as qnn -class gptq_mode(gpxq_mode): - """ - Apply GPTQ algorithm https://arxiv.org/abs/2210.17323. - - Args: - model (Module): The model to quantize with GPTQ - group_of_parallel_layers (Optional, List[str]): .List of lists where each inner list is a group - of layer names that can be optimized in parallel. Default: None - inplace (bool): Wheter to apply GPTQ inplace or perform a deepcopy. Default: True - create_weight_orig (bool): If True, store the original floating point weights before applying - gptq. These weights will be used anytime quantization is disabled. Default: True - use_quant_activations (bool): Wheter to leave quantize activations enabled while performing - GPTQ. 
Default: False - num_blocks (int): The number of sub-blocks to use to speed-up GPTQ computation. Default: 100 - act_order (bool): Whether to order greedy path following by Hessian approximation. Default: False - return_forward_output (bool): If True, returns the output of the forward pass. Otherwise the - forward call inside the context manager returns None. Default: False - - Example: - >>> with torch.no_grad(): - >>> with gptq_mode(model) as gptq: - >>> gptq_model = gptq.model - >>> for i in tqdm(range(gptq.num_layers)): - >>> for img, t in calib_loader: - >>> img = img.cuda() - >>> gptq_model(img) - >>> gptq.update() - """ - - def __init__( - self, - model, - group_of_parallel_layers: Optional[List[str]] = None, - inplace: bool = True, - create_weight_orig: bool = True, - use_quant_activations: bool = True, - num_blocks: int = 100, - return_forward_output: bool = False, - act_order: bool = False) -> None: - if not inplace: - model = deepcopy(model) - super().__init__( - model, - group_of_parallel_layers, - inplace, - create_weight_orig, - use_quant_activations, - act_order, - return_forward_output) - - # How many subblock to use during GPTQ for each layer - self.num_blocks = num_blocks - - def catch_stopfwd(self, *args, **kwargs): - try: - self.orig_forward(*args, **kwargs) - except StopFwdException: - pass - finally: - if self.return_forward_output: - # If we want to return the output of the network, we need to disable all hooks - for name, gpxq_class in self.gpxq_layers.items(): - gpxq_class.disable_pre_forward_hook = True - out = self.orig_forward(*args, **kwargs) - for name, gpxq_class in self.gpxq_layers.items(): - gpxq_class.disable_pre_forward_hook = False - return out - - def initialize_module_optimizer( - self, layer, name, act_order, len_parallel_layers, create_weight_orig): - return GPTQ( - layer=layer, - name=name, - act_order=act_order, - len_parallel_layers=len_parallel_layers, - create_weight_orig=create_weight_orig, - num_blocks=self.num_blocks) - - class GPTQ(GPxQ): """ Adapted from https://github.com/IST-DASLab/gptq, released under the following LICENSE: @@ -275,7 +196,7 @@ def single_layer_update(self, percdamp=.01): q_groups = self.get_quant_weights(i, i1, permutation_list) # [groups, OC/groups] for group_index in range(self.groups): perm = permutation_list[group_index] - q = q_groups[group_index] # [OC/groups] + q = q_groups[group_index].to(torch.float32) # [OC/groups] w = weight[group_index, :, perm[i1:i2][i]].to(torch.float32) # [OC/groups] d = h_inv_block[group_index, i, i] # [1] error = (w - q) / d # [OC/groups] @@ -292,3 +213,85 @@ def single_layer_update(self, percdamp=.01): i2:].to(dev))).to(dtype) if hasattr(self.layer, 'offload_params'): self.layer.offload_params(self.layer) + + +class gptq_mode(gpxq_mode): + """ + Apply GPTQ algorithm https://arxiv.org/abs/2210.17323. + + Args: + model (Module): The model to quantize with GPTQ + group_of_parallel_layers (Optional, List[str]): .List of lists where each inner list is a group + of layer names that can be optimized in parallel. Default: None + inplace (bool): Wheter to apply GPTQ inplace or perform a deepcopy. Default: True + create_weight_orig (bool): If True, store the original floating point weights before applying + gptq. These weights will be used anytime quantization is disabled. Default: True + use_quant_activations (bool): Wheter to leave quantize activations enabled while performing + GPTQ. Default: False + num_blocks (int): The number of sub-blocks to use to speed-up GPTQ computation. 
Default: 100 + act_order (bool): Whether to order greedy path following by Hessian approximation. Default: False + return_forward_output (bool): If True, returns the output of the forward pass. Otherwise the + forward call inside the context manager returns None. Default: False + gptq_class (GPTQ): The uninitialized class to perform GPTQ. Default: `brevitas.graph.gptq.GPTQ` + + Example: + >>> with torch.no_grad(): + >>> with gptq_mode(model) as gptq: + >>> gptq_model = gptq.model + >>> for i in tqdm(range(gptq.num_layers)): + >>> for img, t in calib_loader: + >>> img = img.cuda() + >>> gptq_model(img) + >>> gptq.update() + """ + + def __init__( + self, + model, + group_of_parallel_layers: Optional[List[str]] = None, + inplace: bool = True, + create_weight_orig: bool = True, + use_quant_activations: bool = True, + num_blocks: int = 100, + return_forward_output: bool = False, + act_order: bool = False, + gptq_class: GPTQ = GPTQ) -> None: + if not inplace: + model = deepcopy(model) + super().__init__( + model, + group_of_parallel_layers, + inplace, + create_weight_orig, + use_quant_activations, + act_order, + return_forward_output) + + # How many subblock to use during GPTQ for each layer + self.num_blocks = num_blocks + self.gptq_class = gptq_class + + def catch_stopfwd(self, *args, **kwargs): + try: + self.orig_forward(*args, **kwargs) + except StopFwdException: + pass + finally: + if self.return_forward_output: + # If we want to return the output of the network, we need to disable all hooks + for name, gpxq_class in self.gpxq_layers.items(): + gpxq_class.disable_pre_forward_hook = True + out = self.orig_forward(*args, **kwargs) + for name, gpxq_class in self.gpxq_layers.items(): + gpxq_class.disable_pre_forward_hook = False + return out + + def initialize_module_optimizer( + self, layer, name, act_order, len_parallel_layers, create_weight_orig): + return self.gptq_class( + layer=layer, + name=name, + act_order=act_order, + len_parallel_layers=len_parallel_layers, + create_weight_orig=create_weight_orig, + num_blocks=self.num_blocks) diff --git a/src/brevitas_examples/common/axe.py b/src/brevitas_examples/common/axe.py new file mode 100644 index 000000000..39e22535d --- /dev/null +++ b/src/brevitas_examples/common/axe.py @@ -0,0 +1,436 @@ +# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause + +import math +import warnings + +import numpy as np +import torch +from torch import Tensor + +try: + from torch.linalg import LinAlgError +except: + LinAlgError = RuntimeError + +from brevitas.graph.gpfq import GPFQv2 +from brevitas.graph.gptq import GPTQ +from brevitas.graph.gpxq import SUPPORTED_CONV_OP +from brevitas.graph.gpxq import SUPPORTED_TCONV_OP + + +def _get_average_of_nonzero_magnitudes(vec: np.ndarray, radius: float = 1.0): + assert radius > 0, "Error: radius needs to be strictly positive." + assert vec.ndim == 1, "Error: projection assumes a vector, not a matrix." + assert vec.min() >= 0, "Error: assuming a vector of non-negative numbers." 
+ n_elems = vec.shape[0] + # if we are already within the simplex, then the best projection is itself + if vec.sum() <= radius: + return 0.0 + # using algorithm detailed in "Efficient Projections onto the L1-Ball for Learning in High Dimensions" + v = vec + u = np.sort(v)[::-1] + cumsum_u = np.cumsum(u) + rho = np.nonzero(u * np.arange(1, n_elems + 1) > (cumsum_u - radius))[0][-1] + theta = float(cumsum_u[rho] - radius) / (rho + 1) + return theta + + +def calc_average_nonzero_mag(weight: Tensor, lim: Tensor) -> Tensor: + thetas = torch.zeros(weight.shape[0], device=weight.device) + for i in range(weight.shape[0]): + l = lim[i].item() if lim.ndim > 0 else lim.item() + w = weight[i].cpu().detach().numpy() + t = _get_average_of_nonzero_magnitudes(np.abs(w), l) + thetas[i] = t + return thetas + + +def pad_tensor_with_zeros(tensor: Tensor, tile_size: int) -> Tensor: + pad_size = tile_size - (tensor.shape[1] % tile_size) + if pad_size == tile_size: + return tensor + padding = torch.zeros((tensor.shape[0], pad_size), device=tensor.device) + pad_tensor = torch.concat([tensor, padding], axis=1) + return pad_tensor + + +class A2GPTQ(GPTQ): + """ + Accumulator-aware GPTQ as proposed in https://arxiv.org/pdf/2409.17092 + """ + + def __init__( + self, + layer, + name, + act_order, + len_parallel_layers, + create_weight_orig, + num_blocks, + max_accumulator_bit_width, + max_accumulator_tile_size) -> None: + super().__init__( + layer, name, act_order, len_parallel_layers, create_weight_orig, num_blocks) + self.max_accumulator_bit_width = max_accumulator_bit_width + self.max_accumulator_tile_size = max_accumulator_tile_size + if self.max_accumulator_tile_size is None: + self.max_accumulator_tile_size = self.columns + assert self.max_accumulator_tile_size > 2, "Error: accumulator tile size needs to be bigger than 2." + assert self.max_accumulator_bit_width > 2, "Error: accumulator bit width needs to be bigger than 2." + + def single_layer_update(self, percdamp=0.01): + assert not self.layer.weight_quant.requires_quant_input, "Error: GPTQ does not support weight quantizers that require quantized inputs." + if self.quant_metadata is None: + raise ValueError( + "Expected self.quant_metadata to calculate accumualtor bounds, but recevied None. " + "Make sure that either the input to the model is an IntQuantTensor or the layer has an input quant enabled. " + "Also, check if `use_quant_activations=True` in `gptq_mode` when `max_accumulator_bit_width` is specified. " + ) + if hasattr(self.layer, "allocate_params"): + self.layer.allocate_params(self.layer) + weight = self.layer.weight.data + dev = weight.device + + # Store the original dtype of the weights + # During computation, everything is converted to float32. 
+ # When the weights are updated, we cast everything back to the original dtype + dtype = weight.dtype + + if isinstance(self.layer, SUPPORTED_CONV_OP): + if isinstance(self.layer, SUPPORTED_TCONV_OP): + weight = weight.transpose(1, 0) # This performs a view + weight = weight.flatten(1) + + # TODO: add support for signed input activations + if self.quant_metadata.signed: + raise NotImplementedError("Signed inputs not yet supported.") + + # TODO: currently assuming round-to-zero; need to handle other rounding functions + rounding_mode = self.layer.weight_quant.rounding_mode + if rounding_mode.lower() != "round": + raise NotImplementedError(f"{rounding_mode} not yet supported.") + + n_tiles = math.ceil(weight.shape[-1] / self.max_accumulator_tile_size) + scales: Tensor = self.layer.weight_quant.scale() + if isinstance(self.layer, SUPPORTED_CONV_OP): + if isinstance(self.layer, SUPPORTED_TCONV_OP): + scales = scales.transpose(1, 0) # This performs a view + scales = scales.flatten(1) + P = torch.tensor(self.max_accumulator_bit_width) + N = self.quant_metadata.bit_width + # NOTE: using sign-magnitude here, which is sufficient to support both + # sign-magnitude and 2s complement accumulators + self.upper_lim = (pow(2, P - 1) - 1) / float(pow(2, N) - 1) # A + self.lower_lim = -self.upper_lim # B + Z = (pow(2, P) - 2) / float(pow(2, N) - 1) # l1-norm lim for zero-centered weight vector + # translating into the quantized range; need to pad to get these thresholds + wT = pad_tensor_with_zeros(weight / scales, self.max_accumulator_tile_size).view( + -1, self.max_accumulator_tile_size) # [OC * Tiles, IC / Tiles] + thresholds = calc_average_nonzero_mag( + wT - wT.mean(axis=1, keepdim=True), Z) # [Groups * OC * Tiles] + thresholds = thresholds.view(self.groups, -1, + n_tiles).transpose(1, 2) # [Groups, Tiles, OC/Groups] + del wT + # supporting groupwise quantization where each tile has its own scaling factor + if self.layer.weight_quant.is_groupwise: + scales = pad_tensor_with_zeros(scales, self.max_accumulator_tile_size).view( + -1, self.max_accumulator_tile_size) # [Groups, OC * Tiles, IC / Tiles] + scales = scales[:, 0] # [Groups * OC * Tiles, 1] + scales = scales.view(self.groups, -1, + n_tiles).transpose(1, 2) # [Groups, Tiles, OC/Groups] + # else each tile has the same scaling factor (per-tensor or per-channel) + else: + scales = scales.view(self.groups, 1, -1) # [Groups, 1, OC/Groups] + scales = scales.repeat(1, n_tiles, 1) # [Groups, Tiles, OC/Groups] + thresholds *= scales # translating centers back to the float range + weight = weight.view(self.groups, -1, weight.shape[-1]) # [Groups, OC/Groups, IC] + + # List with permutation tensors for the Hessian and weight matrix. + # If act_order is False, the tensors will be ordered indexes. + # For groupwise convolution, we have one tensor per group, + # thus len(permutation_list) is always equal to self.groups. + # We do not explicity permute the weight matrix, only the Hessian. + permutation_list = [] + weight = weight.view(self.groups, -1, weight.shape[-1]) + # For groupwise convolution, these operations are groupwise so we iterate + for i in range(self.groups): + # If a diagonal element on the Hessian is zero, we can set to 0 the corresponding + # column in the weight matrix. 
+ # The diagonal element is set to 1 to avoid division-by-zero + dead = torch.diag(self.H[i, :, :]) == 0 + self.H[i, dead, dead] = 1 + # If the diagonal of activations is zero, we set the weight to zero + weight[i, :, dead] = 0 + if self.act_order: + # Re-order Hessian so that weights associated to + # higher magnitude activations are quantized first + perm = torch.argsort(torch.diag(self.H[i, :, :]), descending=True) + self.H[i, :, :] = self.H[i, perm, :][:, perm] + else: + # No permutation, permutation tensor is a ordered index + perm = torch.tensor(range(self.H.shape[-1]), device=dev) + permutation_list.append(perm) + + # Try/Except in case the inverse Hessian cannot be computed + try: + for i in range(self.groups): + damp = percdamp * torch.mean(torch.diag(self.H[i, :, :])) + diag = torch.arange(self.columns, device='cpu') + self.H[i, diag, diag] += damp + self.H[i, :, :] = torch.linalg.cholesky(self.H[i, :, :]) + self.H[i, :, :] = torch.cholesky_inverse(self.H[i, :, :]) + self.H[i, :, :] = torch.linalg.cholesky(self.H[i, :, :], upper=True) + h_inv = self.H + except LinAlgError: + warnings.warn( + f'Failed to compute the inverse of the Hessian for layer {self.name} ' + f'GPTQ will not be applied. ' + f'Increasing the number of samples might fix this issue') + return + finally: + del self.H, self.B + + # initialize cumulative l1-norm + a = torch.zeros_like(thresholds, device=dev) # positive limits + b = torch.zeros_like(thresholds, device=dev) # negative limits + + for i1 in range(0, self.columns, self.blocksize): + i2 = min(i1 + self.blocksize, self.columns) + count = i2 - i1 + error_block = torch.zeros_like( + weight[:, :, permutation_list[-1][i1:i2]], + dtype=torch.float32) # [groups, OC/groups, i2-i1] + + h_inv_block = h_inv[:, i1:i2, i1:i2] + for i in range(count): + # need to apply soft thresholding and clamping before quantization + for group_index in range(self.groups): + perm = permutation_list[group_index] + bx = perm[i1:i2][i] // self.max_accumulator_tile_size # block index + # calculate the q_max and q_min for the right group and right block + q_max = scales[group_index, bx, :] * torch.clamp_min( + self.upper_lim - a[group_index, bx, :] - 0.5, 0.0) # [OC/groups] + q_min = scales[group_index, bx, :] * torch.clamp_max( + self.lower_lim - b[group_index, bx, :] + 0.5, 0.0) # [OC/groups] + q_arg = weight[group_index, :, perm[i1:i2][i]] # [OC/groups] + # soft thresholding then clamping + q_arg = q_arg.sign() * torch.relu( + q_arg.abs() - thresholds[group_index, bx]) # [OC/groups] + q_arg.clamp_(q_min, q_max) # clamping to bounds + weight[group_index, :, perm[i1:i2][i]] = q_arg + q_groups = self.get_quant_weights(i, i1, permutation_list) # [Groups, OC/groups] + for group_index in range(self.groups): + perm = permutation_list[group_index] + q = q_groups[group_index].to(torch.float32) # [OC/groups] + w = weight[group_index, :, perm[i1:i2][i]].to(torch.float32) # [OC/groups] + d = h_inv_block[group_index, i, i] # [1] + error = (w - q) / d # [OC/groups] + error_block[group_index, :, i] = error + # We need to update the original weights + weight[group_index, :, perm[i1:i2][i:]] -= ( + error.unsqueeze(1).matmul( + h_inv_block[group_index, i, i:].unsqueeze(0).to(dev))).to(dtype) + # update the tracking mechanisms + for group_index in range(self.groups): + perm = permutation_list[group_index] + bx = perm[i1:i2][i] // self.max_accumulator_tile_size # block index + q = q_groups[group_index] / scales[group_index, bx] # [OC/groups] + # increment cumulative l1-norm + a[group_index, bx, q >= 0] 
+= q[q >= 0] + b[group_index, bx, q <= 0] += q[q <= 0] + assert (a <= self.upper_lim).all() and (a >= 0).all() + assert (b >= self.lower_lim).all() and (b <= 0).all() + + for group_index in range(self.groups): + perm = permutation_list[group_index] + weight[group_index, :, perm[i2:]] -= ( + error_block[group_index].matmul(h_inv[group_index, i1:i2, + i2:].to(dev))).to(dtype) + if hasattr(self.layer, "offload_params"): + self.layer.offload_params(self.layer) + + del thresholds, scales # memory management + + +class A2GPFQ(GPFQv2): + """ + Memory-efficient, accumulator-aware GPFQ as proposed in https://arxiv.org/pdf/2409.17092 + """ + + def __init__( + self, + layer, + name, + act_order, + len_parallel_layers, + create_weight_orig, + p, + max_accumulator_bit_width, + max_accumulator_tile_size) -> None: + super().__init__(layer, name, act_order, len_parallel_layers, create_weight_orig, p) + self.max_accumulator_bit_width = max_accumulator_bit_width + self.max_accumulator_tile_size = max_accumulator_tile_size + if self.max_accumulator_tile_size is None: + self.max_accumulator_tile_size = self.columns + assert self.max_accumulator_tile_size > 2, "Error: accumulator tile size needs to be bigger than 2." + assert self.max_accumulator_bit_width > 2, "Error: accumulator bit width needs to be bigger than 2." + + def single_layer_update(self, percdamp=0.01): + assert not self.layer.weight_quant.requires_quant_input, \ + "Error: GPFQ does not support weight quantizers that require quantized inputs." + if self.quant_metadata is None: + raise ValueError( + "Expected self.quant_metadata to calculate accumualtor bounds, but recevied None. " + "Make sure that either the input to the model is an IntQuantTensor or the layer has an input quant enabled. " + "Also, check if `use_quant_activations=True` in `gpfq_mode` when `max_accumulator_bit_width` is specified. " + ) + if hasattr(self.layer, "allocate_params"): + self.layer.allocate_params(self.layer) + weight: Tensor = self.layer.weight.data + dev = weight.device + + # Store the original dtype of the weights + # During computation, everything is converted to float32. 
+ # When the weights are updated, we cast everything back to the original dtype + dtype = weight.dtype + + if isinstance(self.layer, SUPPORTED_CONV_OP): + if isinstance(self.layer, SUPPORTED_TCONV_OP): + weight = weight.transpose(1, 0) # This performs a view + weight = weight.flatten(1) + + # TODO: add support for signed input activations + if self.quant_metadata.signed: + raise NotImplementedError("Signed inputs not yet supported.") + + # TODO: currently assuming round-to-zero; need to handle other rounding functions + rounding_mode = self.layer.weight_quant.rounding_mode + if rounding_mode.lower() != "round": + raise NotImplementedError(f"{rounding_mode} not yet supported.") + + n_tiles = math.ceil(weight.shape[-1] / self.max_accumulator_tile_size) + scales: Tensor = self.layer.weight_quant.scale() + if isinstance(self.layer, SUPPORTED_CONV_OP): + if isinstance(self.layer, SUPPORTED_TCONV_OP): + scales = scales.transpose(1, 0) # This performs a view + scales = scales.flatten(1) + P = torch.tensor(self.max_accumulator_bit_width) + N = self.quant_metadata.bit_width + # NOTE: using sign-magnitude here, which is sufficient to support both + # sign-magnitude and 2s complement accumulators + self.upper_lim = (pow(2, P - 1) - 1) / float(pow(2, N) - 1) # A + self.lower_lim = -self.upper_lim # B + Z = (pow(2, P) - 2) / float(pow(2, N) - 1) # l1-norm lim for zero-centered weight vector + # translating into the quantized range; need to pad to get these thresholds + wT = pad_tensor_with_zeros(weight / scales, self.max_accumulator_tile_size).view( + -1, self.max_accumulator_tile_size) # [OC * Tiles, IC / Tiles] + thresholds = calc_average_nonzero_mag( + wT - wT.mean(axis=1, keepdim=True), Z) # [Groups * OC * Tiles] + thresholds = thresholds.view(self.groups, -1, + n_tiles).transpose(1, 2) # [Groups, Tiles, OC/Groups] + del wT + # supporting groupwise quantization where each tile has its own scaling factor + if self.layer.weight_quant.is_groupwise: + scales = pad_tensor_with_zeros(scales, self.max_accumulator_tile_size).view( + -1, self.max_accumulator_tile_size) # [Groups, OC * Tiles, IC / Tiles] + scales = scales[:, 0] # [Groups * OC * Tiles, 1] + scales = scales.view(self.groups, -1, + n_tiles).transpose(1, 2) # [Groups, Tiles, OC/Groups] + # else each tile has the same scaling factor (per-tensor or per-channel) + else: + scales = scales.view(self.groups, 1, -1) # [Groups, 1, OC/Groups] + scales = scales.repeat(1, n_tiles, 1) # [Groups, Tiles, OC/Groups] + thresholds *= scales # translating centers back to the float range + + weight = weight.view(self.groups, -1, weight.shape[-1]) # [Groups, OC/Groups, IC] + + # initialize cumulative l1-norm + a = torch.zeros_like(thresholds, device=dev) # positive limit + b = torch.zeros_like(thresholds, device=dev) # negative limit + + # Try/Except in case the square root of H cannot be computed + try: + norms = torch.zeros((self.groups, self.columns), device=dev, dtype=torch.float32) + self.H = self.H.to(dev) + diag = torch.arange(self.columns, device='cpu') + for i in range(self.groups): + # stablize H with a dampening factor and then square root the matrix + damp = percdamp * self.H[i].diag().mean() + self.H[i, diag, diag] += damp + norms[i] = self.H[i].diag() # set the norms post-dampening + eigvals, eigvecs = torch.linalg.eigh(self.H[i]) + eigvals.clamp_min_(0.0).sqrt_() # should be positive-definite + self.H[i] = eigvecs @ torch.diag(eigvals) @ eigvecs.t() + del eigvecs, eigvals, diag + self.quant_input = self.H # NOTE: do this here for the 
`get_permutation_list` function + except LinAlgError: + warnings.warn( + f'Failed to compute the matrix square root of H for layer {self.name} ' + f'GPFQ will not be applied. ' + f'Increasing the number of samples might fix this issue') + return + + # Try/Except in case the inverse of H cannot be computed + try: + self.float_input = self.H.clone() # going to calculate H^{-1} here + for i in range(self.groups): + # from our matrix sqrt, we know G is symmetric and positive-definite, so we + # can use Cholesky decomposition as an efficient, numerically stable inverse + L = torch.linalg.cholesky(self.float_input[i]) + self.float_input[i] = torch.cholesky_inverse(L) + self.float_input = torch.bmm(self.float_input.to(dev), self.G.to(dev)) + del L # memory management + except LinAlgError: + warnings.warn( + f'Failed to compute the inverse of H for layer {self.name} ' + f'GPFQ will not be applied. ' + f'Increasing the number of samples might fix this issue') + return + finally: + del self.H, self.G, self.B # memory management + + permutation_list = self._get_permutation_list(weight) + + U = torch.zeros( + weight.shape[0], + weight.shape[1], + self.float_input.shape[1], + device=dev, + dtype=torch.float32) # [Groups, OC/groups, Samples] + + for t in range(weight.shape[-1]): + for group_index in range(self.groups): + i = permutation_list[group_index][t] + U[group_index] += torch.matmul( + weight[group_index, :, i].unsqueeze(1).to(torch.float32), + self.float_input[group_index, :, i].unsqueeze(0)) + norm = norms[group_index, i] + if norm > 0: + q_arg = U[group_index].matmul(self.quant_input[group_index, :, i]) / norm + else: + q_arg = torch.zeros_like(U[group_index, :, 0]) + bx = i // self.max_accumulator_tile_size # block index + q_arg = q_arg.sign() * torch.relu( + q_arg.abs() - thresholds[group_index, bx, :]) # soft thresholding + q_max = scales[group_index, bx] * torch.clamp_min( + self.upper_lim - a[group_index, bx, :] - 0.5, 0.0) + q_min = scales[group_index, bx] * torch.clamp_max( + self.lower_lim - b[group_index, bx, :] + 0.5, 0.0) + q_arg.clamp_(q_min, q_max) + weight[group_index, :, i] = q_arg.to(dtype) + q_groups: Tensor = self.get_quant_weights(t, 0, permutation_list) + for group_index in range(self.groups): + i = permutation_list[group_index][t] + U[group_index] -= torch.matmul( + q_groups[group_index].unsqueeze(1).to(torch.float32), + self.quant_input[group_index, :, i].unsqueeze(0)) + bx = i // self.max_accumulator_tile_size # block index + q = q_groups[group_index] / scales[group_index, bx] # [OC/groups] + # increment cumulative l1-norm + a[group_index, bx, q >= 0] += q[q >= 0] + b[group_index, bx, q <= 0] += q[q <= 0] + assert (a <= self.upper_lim).all() and (a >= 0).all() + assert (b >= self.lower_lim).all() and (b <= 0).all() + + del self.quant_input, self.float_input diff --git a/src/brevitas_examples/imagenet_classification/ptq/ptq_common.py b/src/brevitas_examples/imagenet_classification/ptq/ptq_common.py index 0151c9232..38ed85678 100644 --- a/src/brevitas_examples/imagenet_classification/ptq/ptq_common.py +++ b/src/brevitas_examples/imagenet_classification/ptq/ptq_common.py @@ -1,11 +1,10 @@ # Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. 
# SPDX-License-Identifier: BSD-3-Clause -from copy import deepcopy +from functools import partial import math import torch -import torch.backends.cudnn as cudnn from tqdm import tqdm from brevitas.core.function_wrapper.shape import OverBatchOverTensorView @@ -16,6 +15,8 @@ from brevitas.graph.calibrate import norm_correction_mode from brevitas.graph.equalize import activation_equalization_mode from brevitas.graph.gpfq import gpfq_mode +from brevitas.graph.gpfq import GPFQv2 +from brevitas.graph.gptq import GPTQ from brevitas.graph.gptq import gptq_mode from brevitas.graph.quantize import layerwise_quantize from brevitas.graph.quantize import quantize @@ -60,7 +61,6 @@ from brevitas.quant.scaled_int import Int32Bias from brevitas.quant.shifted_scaled_int import ShiftedUint8ActPerTensorFixedPoint from brevitas.quant.shifted_scaled_int import ShiftedUint8ActPerTensorFloat -from brevitas.quant.shifted_scaled_int import ShiftedUint8ActPerTensorFloatHQO from brevitas.quant.shifted_scaled_int import ShiftedUint8ActPerTensorFloatMSE from brevitas.quant.shifted_scaled_int import ShiftedUint8WeightPerChannelFloat from brevitas.quant.shifted_scaled_int import ShiftedUint8WeightPerChannelFloatHQO @@ -68,6 +68,8 @@ from brevitas.quant.shifted_scaled_int import ShiftedUint8WeightPerTensorFloat from brevitas.quant.shifted_scaled_int import ShiftedUint8WeightPerTensorFloatHQO from brevitas.quant.shifted_scaled_int import ShiftedUint8WeightPerTensorFloatMSE +from brevitas_examples.common.axe import A2GPFQ +from brevitas_examples.common.axe import A2GPTQ from brevitas_examples.common.generative.quantizers import Int8DynamicActPerTensorFloat from brevitas_examples.common.generative.quantizers import ShiftedUint8DynamicActPerTensorFloat from brevitas_examples.imagenet_classification.ptq.learned_round_utils import learned_round_iterator @@ -574,12 +576,32 @@ def apply_act_equalization(model, calib_loader, layerwise): model(images) -def apply_gptq(calib_loader, model, act_order=False): +def apply_gptq( + calib_loader, + model, + act_order=False, + use_quant_activations=False, + create_weight_orig=False, + max_accumulator_bit_width=None, + max_accumulator_tile_size=128): + if max_accumulator_bit_width is not None: + # Use accumulator-aware extension (AXE) framework + print(f"Using AXE to target {max_accumulator_bit_width}-bit accumulation...") + gptq_class = partial( + A2GPTQ, + max_accumulator_bit_width=max_accumulator_bit_width, + max_accumulator_tile_size=max_accumulator_tile_size) + else: + gptq_class = GPTQ model.eval() dtype = next(model.parameters()).dtype device = next(model.parameters()).device with torch.no_grad(): - with gptq_mode(model, act_order=act_order, use_quant_activations=True) as gptq: + with gptq_mode(model, + act_order=act_order, + use_quant_activations=use_quant_activations, + create_weight_orig=create_weight_orig, + gptq_class=gptq_class) as gptq: gptq_model = gptq.model for i in tqdm(range(gptq.num_layers)): for i, (images, target) in enumerate(calib_loader): @@ -593,21 +615,27 @@ def apply_gpfq( calib_loader, model, act_order, - p=1.0, - use_gpfa2q=False, - accumulator_bit_width=None, - compression_rate=0.0): + create_weight_orig=False, + max_accumulator_bit_width=None, + max_accumulator_tile_size=128): model.eval() dtype = next(model.parameters()).dtype device = next(model.parameters()).device + if max_accumulator_bit_width is not None: + # Use accumulator-aware extension (AXE) framework + print(f"Using AXE to target {max_accumulator_bit_width}-bit accumulation...") + gpfq_class = 
partial( + A2GPFQ, + max_accumulator_bit_width=max_accumulator_bit_width, + max_accumulator_tile_size=max_accumulator_tile_size) + else: + gpfq_class = GPFQv2 with torch.no_grad(): with gpfq_mode(model, - p=p, + create_weight_orig=create_weight_orig, use_quant_activations=True, act_order=act_order, - use_gpfa2q=use_gpfa2q, - accumulator_bit_width=accumulator_bit_width, - compression_rate=compression_rate) as gpfq: + gpfq_class=gpfq_class) as gpfq: gpfq_model = gpfq.model for i in tqdm(range(gpfq.num_layers)): for i, (images, target) in enumerate(calib_loader): diff --git a/src/brevitas_examples/imagenet_classification/ptq/ptq_evaluate.py b/src/brevitas_examples/imagenet_classification/ptq/ptq_evaluate.py index 8a70e29ba..34bdfbc96 100644 --- a/src/brevitas_examples/imagenet_classification/ptq/ptq_evaluate.py +++ b/src/brevitas_examples/imagenet_classification/ptq/ptq_evaluate.py @@ -233,10 +233,15 @@ def validate_args(args): type=int, help='Exponent bit width used with float quantization for activations (default: 3)') parser.add_argument( - '--accumulator-bit-width', + '--gpxq-accumulator-bit-width', default=None, type=int, - help='Accumulator Bit Width for GPFA2Q (default: None)') + help='Accumulator Bit Width for GPxQ (default: None)') +parser.add_argument( + '--gpxq-accumulator-tile-size', + default=None, + type=int, + help='Accumulator tile size for GPxQ (default: None)') parser.add_argument('--onnx-opset-version', default=None, type=int, help='ONNX opset version') parser.add_argument( '--channel-splitting-ratio', @@ -245,17 +250,20 @@ def validate_args(args): help= 'Split Ratio for Channel Splitting. When set to 0.0, Channel Splitting will not be applied. (default: 0.0)' ) -parser.add_argument( - '--compression-rate', - default=0.0, - type=float, - help='Specify compression rate < 1.0 for random projection. Default is 0.0 and does not use RP.' 
-) add_bool_arg(parser, 'gptq', default=False, help='GPTQ (default: disabled)') add_bool_arg(parser, 'gpfq', default=False, help='GPFQ (default: disabled)') -add_bool_arg(parser, 'gpfa2q', default=False, help='GPFA2Q (default: disabled)') add_bool_arg( parser, 'gpxq-act-order', default=False, help='GPxQ Act order heuristic (default: disabled)') +add_bool_arg( + parser, + 'gptq-use-quant-activations', + default=False, + help='Use quant activations for GPTQ (default: disabled)') +add_bool_arg( + parser, + 'gpxq-create-weight-orig', + default=False, + help='Maintain original weights for non-quant forward pass (default: disabled)') add_bool_arg(parser, 'learned-round', default=False, help='Learned round (default: disabled)') add_bool_arg(parser, 'calibrate-bn', default=False, help='Calibrate BN (default: disabled)') add_bool_arg( @@ -270,7 +278,7 @@ def validate_args(args): help='Merge BN layers before quantizing the model (default: enabled)') add_bool_arg( parser, - 'uint_sym_act_for_unsigned_values', + 'uint-sym-act-for-unsigned-values', default=True, help='Use unsigned act quant when possible (default: enabled)') add_bool_arg(parser, 'compile', default=False, help='Use torch.compile (default: disabled)') @@ -312,7 +320,6 @@ def main(): f"w{args.weight_bit_width}_" f"{'gptq_' if args.gptq else ''}" f"{'gpfq_' if args.gpfq else ''}" - f"{'gpfa2q_' if args.gpfa2q else ''}" f"{'gpxq_act_order_' if args.gpxq_act_order else ''}" f"{'learned_round_' if args.learned_round else ''}" f"{'weight_narrow_range_' if args.weight_narrow_range else ''}" @@ -335,10 +342,8 @@ def main(): f"Weight bit width: {args.weight_bit_width} - " f"GPTQ: {args.gptq} - " f"GPFQ: {args.gpfq} - " - f"GPFA2Q: {args.gpfa2q} - " - f"GPFQ P: {args.gpfq_p} - " f"GPxQ Act Order: {args.gpxq_act_order} - " - f"GPFA2Q Accumulator Bit Width: {args.accumulator_bit_width} - " + f"GPxQ Accumulator Bit Width: {args.gpxq_accumulator_bit_width} - " f"Learned Round: {args.learned_round} - " f"Weight narrow range: {args.weight_narrow_range} - " f"Bias bit width: {args.bias_bit_width} - " @@ -412,7 +417,9 @@ def main(): if args.act_equalization is not None: print("Applying activation equalization:") apply_act_equalization(model, calib_loader, layerwise=args.act_equalization == 'layerwise') + device = next(iter(model.parameters())).device + # Define the quantized model quant_model = quantize_model( model, @@ -452,24 +459,21 @@ def main(): apply_gpfq( calib_loader, quant_model, - p=args.gpfq_p, act_order=args.gpxq_act_order, - compression_rate=args.compression_rate) + create_weight_orig=args.gpxq_create_weight_orig, + max_accumulator_bit_width=args.gpxq_accumulator_bit_width, + max_accumulator_tile_size=args.gpxq_accumulator_tile_size) - if args.gpfa2q: - print("Performing GPFA2Q:") - apply_gpfq( + if args.gptq: + print("Performing GPTQ:") + apply_gptq( calib_loader, quant_model, - p=args.gpfq_p, act_order=args.gpxq_act_order, - use_gpfa2q=args.gpfa2q, - accumulator_bit_width=args.accumulator_bit_width, - compression_rate=args.compression_rate) - - if args.gptq: - print("Performing GPTQ:") - apply_gptq(calib_loader, quant_model, act_order=args.gpxq_act_order) + use_quant_activations=args.gptq_use_quant_activations, + create_weight_orig=args.gpxq_create_weight_orig, + max_accumulator_bit_width=args.gpxq_accumulator_bit_width, + max_accumulator_tile_size=args.gpxq_accumulator_tile_size) if args.learned_round: print("Applying Learned Round:") diff --git a/src/brevitas_examples/llm/README.md b/src/brevitas_examples/llm/README.md index 
457c74804..5cd067e64 100644 --- a/src/brevitas_examples/llm/README.md +++ b/src/brevitas_examples/llm/README.md @@ -34,6 +34,9 @@ usage: main.py [-h] [--model MODEL] [--seed SEED] [--nsamples NSAMPLES] [--input-quant-granularity {per_tensor,per_row,per_group}] [--input-group-size INPUT_GROUP_SIZE] [--quantize-input-zero-point] [--quantize-last-layer] [--gptq] + [--gpfq] [--gpxq-act-order] [--gpxq-use-quant-activations] [--gpxq-create-weight-orig] + [--gpxq-max-accumulator-bit-width GPXQ_MAX_ACCUMULATOR_BIT_WIDTH] + [--gpxq-max-accumulator-tile-size GPXQ_MAX_ACCUMULATOR_TILE_SIZE] [--act-calibration] [--bias-corr] [--ln-affine-merge] [--no-quantize] [--no-float16] [--replace-mha] [--weight-equalization] @@ -105,6 +108,16 @@ options: --quantize-last-layer Quantize last nn.Linear layer. --gptq Apply GPTQ. + --gpfq Apply GPFQ. + --gpxq-act-order Apply GPxQ activation ordering. + --gpxq-use-quant-activations + Use quantized activations in GPxQ. + --gpxq-create-weight-orig + Create weight_orig in GPxQ. + --gpxq-max-accumulator-bit-width GPXQ_MAX_ACCUMULATOR_BIT_WIDTH + Maximum accumulator bit width for GPxQ using AXE. + --gpxq-max-accumulator-tile-size GPXQ_MAX_ACCUMULATOR_TILE_SIZE + Maximum accumulator tile size for GPxQ using AXE. --act-calibration Apply activation calibration. --bias-corr Apply bias correction. --ln-affine-merge Merge LN affine params. diff --git a/src/brevitas_examples/llm/llm_quant/gpxq.py b/src/brevitas_examples/llm/llm_quant/gpxq.py index 44b99772f..5e61306d4 100644 --- a/src/brevitas_examples/llm/llm_quant/gpxq.py +++ b/src/brevitas_examples/llm/llm_quant/gpxq.py @@ -1,9 +1,8 @@ -""" -Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause -""" from copy import deepcopy +from functools import partial from accelerate.utils.operations import send_to_device import torch @@ -13,9 +12,13 @@ from brevitas.graph.calibrate import DisableEnableQuantization from brevitas.graph.calibrate import restore_return_quant_tensor from brevitas.graph.gpfq import gpfq_mode +from brevitas.graph.gpfq import GPFQv2 +from brevitas.graph.gptq import GPTQ from brevitas.graph.gptq import gptq_mode from brevitas.graph.gpxq import StopFwdException from brevitas.utils.python_utils import recurse_getattr +from brevitas_examples.common.axe import A2GPFQ +from brevitas_examples.common.axe import A2GPTQ @torch.no_grad() @@ -109,20 +112,33 @@ def apply_gptq( use_quant_activations=False, create_weight_orig=False, group_of_parallel_layers=None, - block_name=None): + block_name=None, + max_accumulator_bit_width=None, + max_accumulator_tile_size=128): + if max_accumulator_bit_width is not None: + # Use accumulator-aware extension (AXE) framework + print(f"Using AXE to target {max_accumulator_bit_width}-bit accumulation...") + gptq_class = partial( + A2GPTQ, + max_accumulator_bit_width=max_accumulator_bit_width, + max_accumulator_tile_size=max_accumulator_tile_size) + else: + gptq_class = GPTQ if block_name is not None: context_manager_kwargs = { 'act_order': act_order, 'group_of_parallel_layers': group_of_parallel_layers, 'create_weight_orig': create_weight_orig, - 'use_quant_activations': use_quant_activations} + 'use_quant_activations': use_quant_activations, + 'gptq_class': gptq_class} block_optimization(model, dataloader, block_name, gptq_mode, context_manager_kwargs) else: with gptq_mode(model, use_quant_activations=use_quant_activations, 
group_of_parallel_layers=group_of_parallel_layers, act_order=act_order, - create_weight_orig=create_weight_orig) as gptq: + create_weight_orig=create_weight_orig, + gptq_class=gptq_class) as gptq: gptq_model = gptq.model for _ in tqdm(range(gptq.num_layers)): for inps in dataloader: @@ -131,14 +147,36 @@ def apply_gptq( @torch.no_grad() -def apply_gpfq(model, dataloader, act_order=True, group_of_parallel_layers=None, block_name=None): +def apply_gpfq( + model, + dataloader, + act_order=True, + group_of_parallel_layers=None, + block_name=None, + max_accumulator_bit_width=None, + max_accumulator_tile_size=128): + if max_accumulator_bit_width is not None: + # Use accumulator-aware extension (AXE) framework + print(f"Using AXE to target {max_accumulator_bit_width}-bit accumulation...") + gpfq_class = partial( + A2GPFQ, + max_accumulator_bit_width=max_accumulator_bit_width, + max_accumulator_tile_size=max_accumulator_tile_size) + else: + gpfq_class = GPFQv2 if block_name is not None: - raise RuntimeError("Block optimization not support for GPFQ at the moment") + context_manager_kwargs = { + 'act_order': act_order, + 'group_of_parallel_layers': group_of_parallel_layers, + 'create_weight_orig': True, + 'gpfq_class': gpfq_class} + block_optimization(model, dataloader, block_name, gpfq_mode, context_manager_kwargs) else: with gpfq_mode(model, act_order=act_order, group_of_parallel_layers=group_of_parallel_layers, - create_weight_orig=True) as gpfq: + create_weight_orig=True, + gpfq_class=gpfq_class) as gpfq: gpfq_model = gpfq.model for _ in tqdm(range(gpfq.num_layers)): for inps in dataloader: diff --git a/src/brevitas_examples/llm/main.py b/src/brevitas_examples/llm/main.py index bf995a426..4a87f5a1a 100644 --- a/src/brevitas_examples/llm/main.py +++ b/src/brevitas_examples/llm/main.py @@ -1,7 +1,5 @@ -""" -Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause -""" import argparse import sys @@ -74,6 +72,20 @@ def validate(args): if not args.no_quantize: if args.gptq and args.gpfq: warn("Both GPTQ and GPFQ are enabled.") + if args.gpxq_max_accumulator_bit_width is not None: + assert args.weight_quant_format == 'int', "AXE only supports integer formats." + assert args.input_quant_format == 'int', "AXE only supports integer formats." + assert args.input_bit_width is not None, \ + "Specify input bit width; activation quantization is required to guarantee accumulator bounds." + if not (args.gptq or args.gpfq): + warn("Max accumulator bit width is specified, but no GPxQ is enabled.") + if args.gpxq_max_accumulator_tile_size is not None: + if args.weight_quant_granularity == 'per_group': + assert args.gpxq_max_accumulator_tile_size == args.weight_group_size, \ + "Group size must be equal to tile size with per_group quantization." + if args.input_quant_granularity == 'per_group': + assert args.gpxq_max_accumulator_tile_size == args.input_group_size, \ + "Group size must be equal to tile size with per_group quantization." if args.export_target is not None: assert args.input_quant_format == 'int', "Only integer quantization supported for export currently." 
if args.export_target is not None and args.input_bit_width is not None: @@ -158,8 +170,7 @@ def main(args): seed=args.seed, require_fx=require_fx, device=None, - fuse_sequences=args.fuse_sequences, - ) + fuse_sequences=args.fuse_sequences) validation_loader = get_dataset_for_model( args.model, @@ -171,8 +182,7 @@ def main(args): seed=args.seed, require_fx=require_fx, device=None, - fuse_sequences=args.fuse_sequences, - ) + fuse_sequences=args.fuse_sequences) device = next(iter(model.parameters())).device print("Data loaded.") @@ -287,7 +297,9 @@ def main(args): act_order=args.gpxq_act_order, use_quant_activations=args.gpxq_use_quant_activations, create_weight_orig=args.gpxq_create_weight_orig, - block_name=args.gpxq_block_name) + block_name=args.gpxq_block_name, + max_accumulator_bit_width=args.gpxq_max_accumulator_bit_width, + max_accumulator_tile_size=args.gpxq_max_accumulator_tile_size) print("GPTQ applied.") if args.gpfq: @@ -296,7 +308,9 @@ def main(args): model, calibration_loader, act_order=args.gpxq_act_order, - block_name=args.gpxq_block_name) + block_name=args.gpxq_block_name, + max_accumulator_bit_width=args.gpxq_max_accumulator_bit_width, + max_accumulator_tile_size=args.gpxq_max_accumulator_tile_size) print("GPFQ applied.") if args.bias_corr: @@ -304,7 +318,7 @@ def main(args): apply_bias_correction(model, calibration_loader) print("Bias correction applied.") - if args.eval: + if args.eval and not args.no_quantize: print("Model eval...") quant_ppl = compute_perplexity( model, validation_loader, context_length=args.seqlen // 2, tokenizer=tokenizer) @@ -455,13 +469,23 @@ def parse_args(args): parser.add_argument('--gptq', action='store_true', help='Apply GPTQ.') parser.add_argument('--gpfq', action='store_true', help='Apply GPFQ.') parser.add_argument( - '--gpxq-act-order', action='store_true', help='Apply GPXQ activation ordering.') + '--gpxq-act-order', action='store_true', help='Apply GPxQ activation ordering.') parser.add_argument( '--gpxq-use-quant-activations', action='store_true', - help='Use quantized activations in GPXQ.') + help='Use quantized activations in GPxQ.') parser.add_argument( - '--gpxq-create-weight-orig', action='store_true', help='Create weight_orig in GPXQ.') + '--gpxq-create-weight-orig', action='store_true', help='Create weight_orig in GPxQ.') + parser.add_argument( + '--gpxq-max-accumulator-bit-width', + type=int, + default=None, + help='Maximum accumulator bit width for GPxQ using AXE.') + parser.add_argument( + '--gpxq-max-accumulator-tile-size', + type=int, + default=None, + help='Maximum accumulator tile size for GPxQ using AXE.') parser.add_argument( '--act-calibration', action='store_true', help='Apply activation calibration.') parser.add_argument('--bias-corr', action='store_true', help='Apply bias correction.')
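
For reference, a minimal sketch of how the new `gptq_class` hook introduced by this patch is meant to be wired, mirroring `apply_gptq` in ptq_common.py above. The `run_gptq` helper and the `model` / `calib_loader` names are placeholders for illustration, not part of the patch.

from functools import partial

import torch
from tqdm import tqdm

from brevitas.graph.gptq import GPTQ
from brevitas.graph.gptq import gptq_mode
from brevitas_examples.common.axe import A2GPTQ


def run_gptq(model, calib_loader, max_accumulator_bit_width=None, max_accumulator_tile_size=128):
    if max_accumulator_bit_width is not None:
        # Accumulator-aware extension (AXE): constrain each weight tile so that
        # dot products fit in a `max_accumulator_bit_width`-bit accumulator
        gptq_class = partial(
            A2GPTQ,
            max_accumulator_bit_width=max_accumulator_bit_width,
            max_accumulator_tile_size=max_accumulator_tile_size)
    else:
        gptq_class = GPTQ
    model.eval()
    with torch.no_grad():
        # A2GPTQ needs quantized input metadata to derive accumulator bounds,
        # hence use_quant_activations=True
        with gptq_mode(model, use_quant_activations=True, gptq_class=gptq_class) as gptq:
            gptq_model = gptq.model
            for _ in tqdm(range(gptq.num_layers)):
                for images, _ in calib_loader:
                    gptq_model(images)
                gptq.update()
    return model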
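
A2GPTQ and A2GPFQ derive their per-tile budgets from the same two quantities computed in single_layer_update (upper_lim and Z). A quick numeric check, assuming unsigned N-bit inputs and a sign-magnitude view of a P-bit accumulator; `axe_limits` is a hypothetical helper used only for this illustration.

def axe_limits(accumulator_bit_width: int, input_bit_width: int):
    # Mirrors the bounds in A2GPTQ/A2GPFQ.single_layer_update:
    # unsigned N-bit inputs, sign-magnitude view of a P-bit accumulator
    P, N = accumulator_bit_width, input_bit_width
    upper_lim = (2 ** (P - 1) - 1) / float(2 ** N - 1)  # A: positive budget per output
    lower_lim = -upper_lim                              # B: negative budget per output
    z = (2 ** P - 2) / float(2 ** N - 1)                # l1-norm limit for a zero-centered tile
    return upper_lim, lower_lim, z


# e.g. a 16-bit accumulator with 8-bit unsigned inputs leaves an l1 budget of
# roughly 257 per accumulator tile in the quantized weight domain
a, b, z = axe_limits(16, 8)
print(a, b, z)  # ~128.5, ~-128.5, ~257.0
assert abs(z - 2 * a) < 1e-9  # Z is exactly twice the one-sided budget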
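
The soft-thresholding centers in axe.py come from the Euclidean projection onto the l1-ball (Duchi et al., "Efficient Projections onto the L1-Ball for Learning in High Dimensions"). A small check of the helper added by this patch; the example magnitudes and radius are arbitrary.

import numpy as np

from brevitas_examples.common.axe import _get_average_of_nonzero_magnitudes

vec = np.array([3.0, 1.0, 0.5, 0.25])  # non-negative magnitudes, l1-norm 4.75
radius = 2.0
theta = _get_average_of_nonzero_magnitudes(vec, radius)  # -> 1.0 for this example
projected = np.clip(vec - theta, 0.0, None)              # soft-threshold by theta
assert np.isclose(projected.sum(), radius)               # lands on the l1 ball of radius 2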