
Commit

[fix] fix incorrect number of gradients ;
duanjunwen committed Dec 10, 2024
1 parent 70b0ae1 commit 37b670e
Showing 2 changed files with 4 additions and 2 deletions.
4 changes: 2 additions & 2 deletions colossalai/shardformer/layer/_operation.py
@@ -736,7 +736,7 @@ def execute_w_pass(_input_, _grad_output_, _weight_main_grad_=None, wgrad_gemm_f
        if ctx.async_grad_reduce_scatter:
            handle.wait()

-        return output, grad_weight, grad_bias, None, None, None, None
+        return output, grad_weight, grad_bias, None, None, None, None, None


def _ring_as_reducescatter(
@@ -930,7 +930,7 @@ def execute_w_pass(_input_, _grad_output_, _weight_main_grad_=None, wgrad_gemm_f
        # grad_weight = grad_output.t().matmul(total_input)
        grad_bias = grad_output.sum(dim=0) if use_bias else None

-        return grad_input, grad_weight, grad_bias, None, None, None
+        return grad_input, grad_weight, grad_bias, None, None, None, None


class _ReduceScatterForwardGatherBackward(torch.autograd.Function):
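Why one more `None`: `torch.autograd.Function.backward` must return exactly one value for each argument that was passed to `forward` (excluding `ctx`), with `None` for inputs that need no gradient, such as process groups or boolean flags. The `forward` of the ops touched here takes one more argument than the old `backward` accounted for (presumably the `use_zbv` flag threaded through in linear.py below), so each `backward` needs one additional trailing `None`. A minimal sketch of that contract, using illustrative names rather than the actual ColossalAI ops:

import torch

class ScaleWithFlag(torch.autograd.Function):
    """Toy op whose forward takes three inputs: (x, scale, some_flag)."""

    @staticmethod
    def forward(ctx, x, scale, some_flag):
        ctx.scale = scale
        return x * scale

    @staticmethod
    def backward(ctx, grad_output):
        # One return value per forward argument: a gradient for `x`,
        # then None for the non-tensor `scale` and `some_flag`.
        # Returning too few values makes autograd raise a RuntimeError
        # complaining about an incorrect number of gradients.
        return grad_output * ctx.scale, None, None

x = torch.randn(4, requires_grad=True)
ScaleWithFlag.apply(x, 2.0, True).sum().backward()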
2 changes: 2 additions & 0 deletions colossalai/shardformer/layer/linear.py
@@ -350,6 +350,7 @@ def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:
                True,
                self.seq_parallel_dim,
                ring=self.seq_parallel_mode == "ring",
+                use_zbv=self.use_zbv,
            )
        else:
            output_parallel = linear_with_async_comm(
@@ -580,6 +581,7 @@ def forward(self, input_: Tensor) -> Tensor:
                process_group=self.process_group,
                dim=self.seq_parallel_dim,
                ring=self.seq_parallel_mode == "ring",
+                use_zbv=self.use_zbv,
            )
        else:
            output_parallel = F.linear(input_, self.weight)
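On the layer side, `forward` now passes `self.use_zbv` through to the sequence-parallel linear ops, so the underlying autograd Functions receive one extra input; the added trailing `None`s in `_operation.py` keep `backward` in step with that call. A quick, hypothetical sanity check (not part of the repo) for keeping the two counts in sync:

import inspect

def expected_backward_returns(fn_cls: type) -> int:
    """How many values `backward` must return for a torch.autograd.Function:
    one per `forward` parameter, excluding `ctx`."""
    params = inspect.signature(fn_cls.forward).parameters
    return len(params) - 1  # drop ctx

# With the toy Function above: expected_backward_returns(ScaleWithFlag) == 3.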
