
Commit ad04693

Group allreduce futures

1 parent db07843 · commit ad04693

4 files changed: +16 −37 lines

torchft/collectives.py
Lines changed: 6 additions & 3 deletions

@@ -135,7 +135,7 @@ def allocate_reduce_scatter_output(
     return tensor, padded_sizes


-class _QuantizedOpFuture(Future[None]):
+class _QuantizedOpFuture(Future[list[torch.Tensor]]):
     def __init__(
         self,
         sync_stream: cuda.Stream,
@@ -145,11 +145,12 @@ def __init__(
         self._sync_stream = sync_stream
         self._keep_alive_tensors = keep_alive_tensors

-    def wait(self) -> None:
+    def wait(self) -> list[torch.Tensor]:
         # Wait for the synchronization to complete.
         cuda.current_stream().wait_stream(self._sync_stream)
         # Clean up intermediate buffers.
         del self._keep_alive_tensors
+        return []


 def reduce_scatter_quantized(
@@ -284,7 +285,7 @@ def allreduce_quantized(
     opts: AllreduceOptions | ReduceOp,
     process_group: "ProcessGroup",
     sync_stream: cuda.Stream | None = None,
-) -> Future[None]:
+) -> Future[list[torch.Tensor]]:
     """
     Performs a quantized all-reduce operation on a list of tensors.

@@ -314,6 +315,8 @@ def allreduce_quantized(
         A Future that can be used to wait for the operation to complete and
         clean up intermediate buffers.

+        The future's value is set to an empty list
+
     Raises:
         NotImplementedError: If the reduce operation is not ReduceOp.AVG.
     """

torchft/local_sgd.py
Lines changed: 0 additions & 10 deletions

@@ -467,16 +467,6 @@ def __init__(
         if fragment_update_alpha < 0 or fragment_update_alpha > 1:
             raise ValueError("fragment_update_alpha must be between 0 and 1")

-        # TODO: Support multiple fragments
-        # This requires changing the manager to support `should_commit` for each
-        # fragment separately.
-        if len(model_fragments) != 1:
-            raise ValueError("Multiple fragments are not supported yet")
-
-        # TODO: Support `fragment_sync_delay`
-        if fragment_sync_delay != 0:
-            raise ValueError("Fragment synchronization delay is not supported yet")
-
         # TODO: Support `fragment_update_alpha`
         if fragment_update_alpha != 0.0:
             raise ValueError(

torchft/manager.py
Lines changed: 9 additions & 22 deletions

@@ -259,7 +259,6 @@ def __init__(
         self._quorum_id = -1
         self._errored: Optional[ExceptionWithTraceback] = None
         self._healing = False
-        self._pending_work: List[torch.futures.Future[object]] = []
         self._batches_committed = 0

         # first step is 1
@@ -332,15 +331,17 @@ def allreduce(
             # Run the allreduce async and save the work object so we can wait on
             # it later.
             fut: Optional[
-                torch.futures.Future[None]
-                | torch.futures.Future[torch.Tensor]
-                | torch.futures.Future[List[torch.Tensor]]
+                torch.futures.Future[torch.Tensor]
+                | torch.futures.Future[list[torch.Tensor]]
             ] = None
             if should_quantize and IS_TRITON_AVAILABLE:
-                fut = allreduce_quantized([tensor], ReduceOp.AVG, self._pg)
+                fut = allreduce_quantized([tensor], ReduceOp.SUM, self._pg)
             else:
-                work = self._pg.allreduce([tensor], ReduceOp.SUM)
-                fut = work.get_future()
+                sync_stream = torch.cuda.Stream()
+                sync_stream.wait_stream(torch.cuda.current_stream())
+                with torch.cuda.stream(sync_stream):
+                    work = self._pg.allreduce([tensor], ReduceOp.SUM)
+                    fut = work.get_future()

             # schedule grad normalization as a continuation
             # on the Future
@@ -357,11 +358,9 @@ def callback(
                 return tensor

             assert fut is not None
-            if not should_quantize:
-                fut = fut.then(callback)
+            fut = fut.then(callback)
             fut = self.wrap_future(fut, tensor)
             return fut
-
         except Exception as e:
             self._logger.exception(
                 f"got exception in all reduce -- skipping remaining: {e}"
@@ -429,7 +428,6 @@ def callback(
             return default

         fut = fut.then(callback)
-        self._pending_work.append(cast(torch.futures.Future[object], fut))
         return fut

     def start_quorum(
@@ -694,21 +692,10 @@ def should_commit(self, timeout: Optional[timedelta] = None) -> bool:
        Raises:
            RuntimeError: if should_commit fails max_retries times in a row and max_retries is set
        """
-        for work in self._pending_work:
-            # check at the beginning of since .wait() may trigger errors
-            if self._errored is not None:
-                break
-
-            # We swallow the error at in a future then callback so this will
-            # never return an error.
-            work.wait()
-
        # make sure recovery is complete before committing
        if self._recovery_stream is not None:
            self._recovery_stream.synchronize()

-        self._pending_work = []
-
        if err := self._pg.errored():
            self.report_error(err)

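The non-quantized path now launches the collective on a dedicated CUDA stream and relies on the returned future, rather than a `_pending_work` list drained in `should_commit`, to order completion. A self-contained sketch of that pattern using plain `torch.distributed` (assuming an initialized NCCL process group and a CUDA tensor; torchft goes through its own `ProcessGroup` wrapper instead):

```python
import torch
import torch.distributed as dist

def async_allreduce_avg(grad: torch.Tensor) -> torch.futures.Future:
    """Sketch of the new allreduce flow: launch on a side stream, chain the
    normalization callback on the future, and hand the future to the caller."""
    sync_stream = torch.cuda.Stream()
    # Ensure kernels that produced `grad` are ordered before the collective.
    sync_stream.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(sync_stream):
        work = dist.all_reduce(grad, op=dist.ReduceOp.SUM, async_op=True)
        fut = work.get_future()

    def _normalize(_: torch.futures.Future) -> torch.Tensor:
        # SUM followed by divide-by-world-size gives the average, matching the
        # callback chained in manager.py.
        grad.div_(dist.get_world_size())
        return grad

    # The continuation replaces the old "append to _pending_work, wait in
    # should_commit" bookkeeping: whoever holds the future waits on it directly.
    return fut.then(_normalize)
```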

torchft/process_group.py
Lines changed: 1 addition & 2 deletions

@@ -775,8 +775,7 @@ def abort(self) -> None:

     def errored(self) -> Optional[Exception]:
         # force a synchronization to ensure all work is complete
-        torch.cuda.synchronize()
-
+        torch.cuda.current_stream().synchronize()
         return self._errored

     def getBackendName(self) -> str:
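For context on the narrower synchronization (an illustrative snippet, not part of the commit): `torch.cuda.synchronize()` blocks until every stream on the device is idle, while `torch.cuda.current_stream().synchronize()` only waits for work queued on the caller's current stream, so a side stream that is still running collectives does not stall the error check.

```python
import torch

# Illustrative only: device-wide vs. current-stream synchronization.
if torch.cuda.is_available():
    side = torch.cuda.Stream()
    with torch.cuda.stream(side):
        a = torch.randn(4096, 4096, device="cuda")
        b = a @ a  # long-ish kernel queued on the side stream

    torch.cuda.current_stream().synchronize()  # waits only for the current stream
    torch.cuda.synchronize()                   # waits for every stream, including `side`
```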
