pytorch · Xia-Weiwen · May 6, 2025 · May 8, 2025 · May 16, 2025 · jerryzh168
diff --git a/torchao/quantization/observer.py b/torchao/quantization/observer.py
@@ -73,7 +73,7 @@ def get_block_size(
         granularity: The granularity type of the quantization
     """
     if isinstance(granularity, PerTensor):
-        return input_shape
+        return [-1]
     elif isinstance(granularity, PerAxis):
         block_size = list(input_shape)
         block_size[granularity.axis] = 1

diff --git a/torchao/quantization/pt2e/_affine_quantization.py b/torchao/quantization/pt2e/_affine_quantization.py
@@ -113,7 +113,8 @@ def _get_reduction_params(block_size, input_size):
           shape_for_reduction: (3, 3, 5, 2, 10)
           reduction_dim: [0, 1, 3, 4]
     """
-    assert len(block_size) == len(input_size)
+    assert block_size == [-1] or len(block_size) == len(input_size)
+    block_size = input_size if block_size == [-1] else block_size
     shape_for_reduction = []
     reduction_dims = []
     cur_dim = 0

diff --git a/torchao/quantization/pt2e/observer.py b/torchao/quantization/pt2e/observer.py
@@ -1793,7 +1793,7 @@ def get_block_size(
         "Please provide an instance of Granularity, not subclass of it"
     )
     if isinstance(granularity, PerTensor):
-        return input_shape
+        return [-1]
     elif isinstance(granularity, PerAxis):
         block_size = list(input_shape)
         block_size[granularity.axis] = 1

diff --git a/torchao/quantization/quant_primitives.py b/torchao/quantization/quant_primitives.py
@@ -255,7 +255,8 @@ def _get_reduction_params(block_size, input_size):
           shape_for_reduction: (3, 3, 5, 2, 10)
           reduction_dim: [0, 1, 3, 4]
     """
-    assert len(block_size) == len(input_size)
+    assert block_size == [-1] or len(block_size) == len(input_size)
+    block_size = input_size if block_size == [-1] else block_size
     shape_for_reduction = []
     reduction_dims = []
     cur_dim = 0
@@ -365,6 +366,9 @@ def _quantize_affine(
     # torch.uintx dtypes yet
     if output_dtype in _SUB_BYTE_UINT_BOUNDS:
         output_dtype = torch.uint8
+    if block_size == [-1]:
+        # per-tensor quantization
+        block_size = input.shape
     return _quantize_affine_no_dtype_cast(
         input,
         block_size,
@@ -520,6 +524,9 @@ def _dequantize_affine(
         torch.float16,
         torch.bfloat16,
     ], f"Unsupported output dtype: {output_dtype}"
+    if block_size == [-1]:
+        # per-tensor quantization
+        block_size = input.shape
     quant_min, quant_max = _get_and_check_qmin_qmax(input_dtype, quant_min, quant_max)
     return _dequantize_affine_no_dtype_check(
         input,
@@ -878,6 +885,9 @@ def _choose_qparams_affine(
             scale_dtype = input.dtype
         if eps is None:
             eps = torch.finfo(input.dtype).eps
+        if block_size == [-1]:
+            # per-tensor quantization
+            block_size = input.shape
 
         assert len(block_size) == input.dim(), (
             f"Got input dim:{input.dim()}, block_size: {block_size}"