
Added MCore FSDP support for TE #1890

Open · wants to merge 5 commits into main
19 changes: 13 additions & 6 deletions transformer_engine/pytorch/module/layernorm_linear.py
@@ -447,7 +447,11 @@ def forward(
         ctx.requires_wgrad = weight.requires_grad
         ctx.quantized_weight = quantized_weight
         if fuse_wgrad_accumulation and weight.requires_grad:
-            ctx.main_grad = weight.main_grad
+            # This check is needed to ensure that main_grad is not created
+            # during the forward pass when using MCore FSDP as it creates
+            # the main_grad buffer lazily before backprop
+            if not hasattr(param, "__fsdp_param__"):
+                ctx.main_grad = weight.main_grad
         ctx.grad_input_quantizer = grad_input_quantizer
         ctx.grad_weight_quantizer = grad_weight_quantizer
         ctx.grad_output_quantizer = grad_output_quantizer
@@ -527,11 +531,14 @@ def backward(
         ctx.tensor_objects = None

         # Since main_grad can be modified inplace, it should not be a part of saved_tensors
-        main_grad = (
-            ctx.main_grad
-            if weight is not None and ctx.fuse_wgrad_accumulation and ctx.requires_wgrad
-            else None
-        )
+        if not hasattr(param, "__fsdp_param__"):
+            main_grad = (
+                ctx.main_grad
+                if weight is not None and ctx.fuse_wgrad_accumulation and ctx.requires_wgrad
+                else None
+            )
+        else:
+            main_grad = origin_weight.get_main_grad()

         # Gather intermediate/activation tensors if needed
         # NOTE: weight_fp8 = weight when ctx.fp8 == False and torch.disttributed.FSDP already
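For context on why the checks above are needed: MCore FSDP is expected to allocate the fp32 main_grad buffer lazily, just before backprop, so the forward pass must not read weight.main_grad for FSDP-managed parameters. The sketch below illustrates that interaction; the __fsdp_param__ marker (checked on the parameter object) and get_main_grad() come from the diff, while make_fsdp_style_param and the lazy-allocation details are assumptions for illustration, not MCore FSDP's actual implementation.

    import torch

    def make_fsdp_style_param(data: torch.Tensor) -> torch.nn.Parameter:
        # Hypothetical stand-in for an MCore-FSDP-managed parameter: the fp32
        # main_grad buffer does not exist at forward time and is created lazily.
        param = torch.nn.Parameter(data)
        param.__fsdp_param__ = True   # marker probed with hasattr() in the diff
        param._main_grad = None       # nothing allocated yet

        def get_main_grad():
            # Materialize the accumulation buffer on first request (before backprop).
            if param._main_grad is None:
                param._main_grad = torch.zeros_like(param, dtype=torch.float32)
            return param._main_grad

        param.get_main_grad = get_main_grad
        return param

    weight = make_fsdp_style_param(torch.randn(8, 8))

    # Forward-time logic from the diff: skip caching main_grad for FSDP params.
    # The eager path (e.g. Megatron DDP) assumes main_grad was attached at init.
    cached_main_grad = weight.main_grad if not hasattr(weight, "__fsdp_param__") else None

    # Backward-time logic from the diff: fetch the lazily created buffer on demand.
    main_grad = weight.get_main_grad() if hasattr(weight, "__fsdp_param__") else cached_main_grad
    assert main_grad.shape == weight.shape and main_grad.dtype == torch.float32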
19 changes: 13 additions & 6 deletions transformer_engine/pytorch/module/linear.py
@@ -397,7 +397,11 @@ def forward(
         ctx.grad_output_quantizer = grad_output_quantizer
         ctx.fuse_wgrad_accumulation = fuse_wgrad_accumulation
         if fuse_wgrad_accumulation and weight.requires_grad:
-            ctx.main_grad = weight.main_grad
+            # This check is needed to ensure that main_grad is not created
+            # during the forward pass when using MCore FSDP as it creates
+            # the main_grad buffer lazily before backprop
+            if not hasattr(param, "__fsdp_param__"):
+                ctx.main_grad = weight.main_grad

         ctx.debug = debug
         ctx.cpu_offloading = cpu_offloading
@@ -452,11 +456,14 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None],
         ctx.tensor_objects = None

         # Since main_grad can be modified inplace, it should not be a part of saved_tensors
-        main_grad = (
-            ctx.main_grad
-            if weight is not None and ctx.fuse_wgrad_accumulation and ctx.requires_wgrad
-            else None
-        )
+        if not hasattr(param, "__fsdp_param__"):
+            main_grad = (
+                ctx.main_grad
+                if weight is not None and ctx.fuse_wgrad_accumulation and ctx.requires_wgrad
+                else None
+            )
+        else:
+            main_grad = weight.get_main_grad()

         if ctx.cpu_offloading:
             if ctx.grad_added_to_main_grad:
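The change in linear.py mirrors the one in layernorm_linear.py, so both backward passes now share the same dispatch. A hypothetical helper like the one below (resolve_main_grad is not part of the PR, just a compact restatement of that shared branch) captures the logic in one place:

    from typing import Optional
    import torch

    def resolve_main_grad(ctx, weight: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
        # Illustrative only: mirrors the backward-pass branch added in both modules.
        if weight is not None and hasattr(weight, "__fsdp_param__"):
            # MCore FSDP path: the buffer is allocated lazily, so ask for it now.
            return weight.get_main_grad()
        if weight is not None and ctx.fuse_wgrad_accumulation and ctx.requires_wgrad:
            # Eager path (e.g. Megatron DDP): reuse the buffer cached during forward.
            return ctx.main_grad
        return None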