Commit 218e45c

fix: update grad_output quant to avoid redundant work
Signed-off-by: kshitij12345 <[email protected]>
1 parent: 2f61c40

File tree

1 file changed (+7, −0 lines)

transformer_engine/pytorch/module/linear.py

Lines changed: 7 additions & 0 deletions
```diff
@@ -524,6 +524,13 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None],
             columnwise=columnwise_usage,
         )
 
+        # Adjust the quantization direction approach depending
+        # on whether dgrad and wgrad calculations will be performed.
+        if not ctx.requires_dgrad and ctx.grad_output_quantizer is not None:
+            ctx.grad_output_quantizer.set_usage(rowwise=False)
+        if not ctx.requires_wgrad and ctx.grad_output_quantizer is not None:
+            ctx.grad_output_quantizer.set_usage(columnwise=False)
+
         # Prepare grad output tensor
         # Note: Cast to expected dtype and perform tensor-parallel communication
         nvtx_range_push(f"{nvtx_label}.grad_output_preprocess")
```
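For context, the quantized grad_output is generally needed in two layouts during the backward pass: a rowwise copy that feeds the dgrad GEMM (grad_output against the weight) and a columnwise/transposed copy that feeds the wgrad GEMM (grad_output against the input). If one of the two gradients is not required, producing the corresponding layout is redundant work, which is what this commit avoids by narrowing the quantizer's usage flags first. The sketch below illustrates the pattern with a toy quantizer; it is not Transformer Engine's implementation, `ToyQuantizer` and its `quantize` method are hypothetical, and only the `set_usage(rowwise=..., columnwise=...)` call mirrors the API seen in the diff above.

```python
# A minimal sketch, assuming a quantizer that can produce a rowwise copy
# (for the dgrad GEMM) and a columnwise/transposed copy (for the wgrad GEMM).
# ToyQuantizer is hypothetical; only set_usage mirrors the diff above.
import torch


class ToyQuantizer:
    def __init__(self) -> None:
        # By default, both layouts are produced.
        self.rowwise = True
        self.columnwise = True

    def set_usage(self, *, rowwise: bool | None = None,
                  columnwise: bool | None = None) -> None:
        if rowwise is not None:
            self.rowwise = rowwise
        if columnwise is not None:
            self.columnwise = columnwise

    def quantize(self, t: torch.Tensor):
        # Each copy costs a cast (and, for the columnwise copy, a transpose);
        # skipping an unused layout avoids that work entirely.
        row = t.to(torch.float16) if self.rowwise else None
        col = t.t().contiguous().to(torch.float16) if self.columnwise else None
        return row, col


grad_output = torch.randn(128, 256)
quantizer = ToyQuantizer()

# Mirrors the commit: if dgrad is not needed, drop the rowwise copy;
# if wgrad is not needed, drop the columnwise copy.
requires_dgrad, requires_wgrad = True, False
if not requires_dgrad:
    quantizer.set_usage(rowwise=False)
if not requires_wgrad:
    quantizer.set_usage(columnwise=False)

row, col = quantizer.quantize(grad_output)
print(row is not None, col is not None)  # True False: columnwise work skipped
```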
