Fix FusedRMSLinear backward compute (#11095)

lshpku · web-flow · commit 1b10b5733eb6 · 2025-09-23T15:11:51.000+08:00
diff --git a/paddlenlp/transformers/deepseek_v2/modeling.py b/paddlenlp/transformers/deepseek_v2/modeling.py
@@ -1996,9 +1996,10 @@ def backward(ctx, d_q, d_kv):
                 quant_method="1x128",
                 input_transpose=True,
             )
-            FP8LinearFunctionBase.compute_fp8_linear(
-                (d_q_fp8, d_q_scale), q_down_weight, weight_transpose=False, out=h_grad.view([-1, h_grad.shape[-1]])
+            h_grad_0 = FP8LinearFunctionBase.compute_fp8_linear(
+                (d_q_fp8, d_q_scale), q_down_weight, weight_transpose=False
             )
+            h_grad = h_grad + h_grad_0
 
             def q_down_weight_grad(h_t_fp8, h_t_scale, d_q_t_fp8, d_q_t_scale, q_down_weight):
                 FP8LinearFunctionBase.kitchen_gemm(

Original file line number	Diff line number	Diff line change
`@@ -1996,9 +1996,10 @@ def backward(ctx, d_q, d_kv):`
`1996`	`1996`	`quant_method="1x128",`
`1997`	`1997`	`input_transpose=True,`
`1998`	`1998`	`)`
`1999`		`- FP8LinearFunctionBase.compute_fp8_linear(`
`2000`		`- (d_q_fp8, d_q_scale), q_down_weight, weight_transpose=False, out=h_grad.view([-1, h_grad.shape[-1]])`
	`1999`	`+ h_grad_0 = FP8LinearFunctionBase.compute_fp8_linear(`
	`2000`	`+ (d_q_fp8, d_q_scale), q_down_weight, weight_transpose=False`
`2001`	`2001`	`)`
	`2002`	`+ h_grad = h_grad + h_grad_0`
`2002`	`2003`
`2003`	`2004`	`def q_down_weight_grad(h_t_fp8, h_t_scale, d_q_t_fp8, d_q_t_scale, q_down_weight):`
`2004`	`2005`	`FP8LinearFunctionBase.kitchen_gemm(`