Commit

fix communication bugs
liangyuwang committed Aug 6, 2024
1 parent eb66368 commit db1495b
Showing 3 changed files with 17 additions and 10 deletions.
5 changes: 4 additions & 1 deletion tiny_deepspeed/core/zero/ddp/module.py
@@ -16,9 +16,12 @@
 
 def sync_grad(grad, async_op=True): # communication complexity: 2g
     if async_op:
-        return dist.all_reduce(grad, async_op=True)
+        work = dist.all_reduce(grad, async_op=True)
     else:
         dist.all_reduce(grad, async_op=False)
+        work = None
+    torch.cuda.synchronize()
+    return work
 
 
 class Linear(linear.Linear):
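
sync_grad now returns the torch.distributed work handle when async_op=True (and None otherwise), with a CUDA synchronize on both paths. A minimal sketch of how a caller might consume those handles, assuming torch.distributed is initialized and the sync_grad above is in scope; the function name and the params iterable are illustrative, not taken from the repo:

# Hedged sketch (not part of this commit): consuming the handles returned by sync_grad.
# Assumes torch.distributed is initialized and sync_grad from ddp/module.py is in scope;
# `params` is any iterable of parameters whose .grad tensors are populated.
def reduce_all_grads_ddp(params):
    pending = [sync_grad(p.grad, async_op=True) for p in params if p.grad is not None]
    # Other backward-pass work could overlap with the in-flight all-reduces here.
    for work in pending:
        if work is not None:
            work.wait()  # block until each all_reduce has finished before the optimizer step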
6 changes: 4 additions & 2 deletions tiny_deepspeed/core/zero/zero1/module.py
@@ -16,10 +16,12 @@
 
 def sync_grad(grad, async_op=True, rank_id=None): # communication complexity: g
     if async_op:
-        return dist.reduce(grad, dst=rank_id, async_op=True)
+        work = dist.reduce(grad, dst=rank_id, async_op=True)
     else:
         dist.reduce(grad, dst=rank_id, async_op=False)
-        return None
+        work = None
+    torch.cuda.synchronize()
+    return work
 
 
 class Linear(linear.Linear):
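
The ZeRO-1 variant reduces each gradient to a single owner rank via dist.reduce rather than all-reducing it, which is why the comment counts the communication as g instead of 2g. A hedged sketch of how per-parameter owners could be chosen and the handles drained; the round-robin owner_of helper is a hypothetical illustration, not the repo's actual partitioning scheme:

import torch.distributed as dist

def owner_of(param_index, world_size):
    # Hypothetical round-robin ownership, for illustration only.
    return param_index % world_size

def reduce_all_grads_zero1(params):
    world_size = dist.get_world_size()
    pending = []
    for i, p in enumerate(params):
        if p.grad is not None:
            pending.append(sync_grad(p.grad, async_op=True, rank_id=owner_of(i, world_size)))
    for work in pending:
        if work is not None:
            work.wait()  # after this, only the owner rank holds the summed gradient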
16 changes: 9 additions & 7 deletions tiny_deepspeed/core/zero/zero2/module.py
@@ -16,22 +16,24 @@
 
 def sync_grad(grad, async_op=True, rank_id=None): # communication complexity: g
     if async_op:
-        return dist.reduce(grad, dst=rank_id, async_op=True)
+        work = dist.reduce(grad, dst=rank_id, async_op=True)
     else:
         dist.reduce(grad, dst=rank_id, async_op=False)
-        return None
+        work = None
+    torch.cuda.synchronize()
+    return work
 
 def desync_grad(grad, rank_id=None):
     if grad is not None and rank_id is not None:
         if dist.get_rank() != rank_id:
             # print(dist.get_rank(), rank_id)
             grad.data = torch.randn(1, device=grad.device, dtype=grad.dtype)
             grad.data.to("cpu") # should actually be released but impossible in pytorch, maybe solved by plugin C++
-            torch.cuda.synchronize()
-            return None
-        else:
-            return grad
-    return grad
+            grad = None
+        torch.cuda.synchronize()
+        return grad
+    else:
+        return None
 
 
 class Linear(linear.Linear):
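
In the ZeRO-2 path, desync_grad now shrinks and drops the gradient on every rank except its owner (returning None there), so only the owner keeps the reduced gradient for its optimizer shard. A hedged sketch of how the two helpers might be chained per parameter; the owner argument and the param.grad reassignment are assumptions for illustration, not code from this commit:

def reduce_and_shard_grad(param, owner):
    # Hedged sketch (not part of this commit): reduce the gradient to its owner rank,
    # wait for the collective, then release it everywhere else.
    if param.grad is None:
        return
    work = sync_grad(param.grad, async_op=True, rank_id=owner)
    if work is not None:
        work.wait()  # make sure the reduce has landed before touching the gradient
    param.grad = desync_grad(param.grad, rank_id=owner)  # owner keeps grad; other ranks get None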
