Commit a6ec9ef

Group allreduce futures

1 parent db07843

8 files changed, +275 -70 lines changed


torchft/ddp.py

Lines changed: 9 additions & 1 deletion
@@ -68,7 +68,15 @@ def __init__(self, manager: "Manager", module: nn.Module, **kwargs: object) -> N
     def _comm_hook(
         state: "Manager", bucket: dist.GradBucket
     ) -> torch.futures.Future[torch.Tensor]:
-        return state.allreduce(bucket.buffer())
+        work = state.allreduce(bucket.buffer())
+        fut = work.get_future()
+
+        def callback(fut: torch.futures.Future[None]) -> torch.Tensor:
+            nonlocal bucket
+            return bucket.buffer()
+
+        fut = fut.then(callback)
+        return fut
 
 
 class PureDistributedDataParallel(nn.Module):
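
A hedged sketch of what the new hook does, written as standalone code rather than quoting the diff: DDP's comm-hook contract still expects a Future[Tensor] that resolves to the bucket's gradient, while Manager.allreduce now returns a c10d Work, so the hook adapts one into the other. The names `manager` and `make_comm_hook` below are illustrative, not part of the commit.

import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP


def make_comm_hook(manager):
    def comm_hook(state, bucket: dist.GradBucket) -> torch.futures.Future[torch.Tensor]:
        # The allreduce reduces bucket.buffer() in place and returns a Work handle.
        work = manager.allreduce(bucket.buffer())
        # Work.get_future() completes when the collective does; chain a callback
        # that hands the (already reduced) buffer back so DDP gets a Future[Tensor].
        return work.get_future().then(lambda _: bucket.buffer())

    return comm_hook


# Usage (assuming `module` is an nn.Module and `manager` is a torchft Manager):
# model = DDP(module)
# model.register_comm_hook(state=None, hook=make_comm_hook(manager))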

torchft/ddp_test.py

Lines changed: 5 additions & 5 deletions
@@ -10,11 +10,12 @@
 import torch
 import torch.distributed as dist
 from torch import nn
+from torch.distributed.distributed_c10d import Work
 from torch.futures import Future
 
 from torchft.ddp import DistributedDataParallel, PureDistributedDataParallel
 from torchft.manager import Manager
-from torchft.process_group import ProcessGroupBabyGloo, ProcessGroupGloo
+from torchft.process_group import ProcessGroupBabyGloo, ProcessGroupGloo, _DummyWork
 
 
 class TestDDP(TestCase):
@@ -39,14 +40,13 @@ def test_ddp(self) -> None:
 
         call_count = 0
 
-        def allreduce(tensor: torch.Tensor) -> Future[torch.Tensor]:
+        def allreduce(tensor: torch.Tensor) -> Work:
             nonlocal call_count
 
             call_count += 1
 
-            fut = Future()  # pyre-fixme[29]: not a function
-            fut.set_result(tensor)
-            return fut
+            work = _DummyWork(None)
+            return work
 
         manager.allreduce = allreduce
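
The test now stubs Manager.allreduce with `_DummyWork(None)` imported from torchft.process_group. Its definition is not part of this diff; as a rough mental model (an assumption, not the actual class), a no-op Work that completes immediately could look like this:

import torch
from torch.distributed.distributed_c10d import Work


class ImmediateWork(Work):  # hypothetical stand-in for torchft's _DummyWork
    def __init__(self, result: object) -> None:
        super().__init__()
        self._future: torch.futures.Future = torch.futures.Future()
        self._future.set_result(result)  # resolved before anyone can wait on it

    def wait(self, timeout=None) -> bool:
        # nothing to wait for; the "collective" is already done
        return True

    def get_future(self) -> torch.futures.Future:
        return self._future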

torchft/local_sgd.py

Lines changed: 4 additions & 13 deletions
@@ -17,6 +17,7 @@
 import torch
 import torch.distributed as dist
 from torch import nn, optim
+from torch.distributed.distributed_c10d import Work
 from torch.distributed.tensor import DTensor
 from torch.nn.parameter import Parameter
 from torch.optim.optimizer import Optimizer
@@ -197,9 +198,7 @@ def __init__(
         self._outer_optimizer = outer_optimizer
 
         # Stores pending all reduce
-        self._allreduce_futures: list[
-            torch.futures.Future[None] | torch.futures.Future[torch.Tensor]
-        ] = []
+        self._allreduce_futures: list[Work] = []
 
         if bucket_cap_mb is not None:
             self.bucket_cap_mb = int(bucket_cap_mb * 1024 * 1024)
@@ -467,16 +466,6 @@ def __init__(
         if fragment_update_alpha < 0 or fragment_update_alpha > 1:
             raise ValueError("fragment_update_alpha must be between 0 and 1")
 
-        # TODO: Support multiple fragments
-        # This requires changing the manager to support `should_commit` for each
-        # fragment separately.
-        if len(model_fragments) != 1:
-            raise ValueError("Multiple fragments are not supported yet")
-
-        # TODO: Support `fragment_sync_delay`
-        if fragment_sync_delay != 0:
-            raise ValueError("Fragment synchronization delay is not supported yet")
-
         # TODO: Support `fragment_update_alpha`
         if fragment_update_alpha != 0.0:
             raise ValueError(
@@ -522,6 +511,8 @@ def __init__(
                 use_bucketization,
                 bucket_cap_mb,
                 should_quantize,
+                fragment_sync_delay,
+                fragment_update_alpha,
             )
             for i, model_fragment in enumerate(model_fragments)
         ]
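
With `_allreduce_futures` now holding c10d Work handles rather than torch futures, the fragment's accumulate-then-drain pattern looks roughly like the sketch below. This is illustrative only: the class and method names are assumptions, not the module's actual code.

from torch.distributed.distributed_c10d import Work


class _FragmentSketch:
    def __init__(self) -> None:
        # pending async allreduces, drained before the outer optimizer step
        self._allreduce_futures: list[Work] = []

    def _average_grads(self, manager, params) -> None:
        # kick off one async allreduce per gradient and keep the Work handle
        for p in params:
            if p.grad is not None:
                self._allreduce_futures.append(manager.allreduce(p.grad))

    def wait(self) -> None:
        # block until every pending collective has finished, then clear the list
        for work in self._allreduce_futures:
            work.wait()
        self._allreduce_futures = []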

torchft/manager.py

Lines changed: 21 additions & 37 deletions
@@ -39,14 +39,14 @@
 
 import torch
 from torch.distributed import ReduceOp, TCPStore
-from torch.distributed.distributed_c10d import AllreduceOptions, ReduceOp
+from torch.distributed.distributed_c10d import AllreduceOptions, ReduceOp, Work
 
 from torchft._torchft import ManagerClient, ManagerServer
 from torchft.checkpointing import CheckpointTransport, HTTPTransport
 from torchft.futures import future_timeout
 
 if TYPE_CHECKING:
-    from torchft.process_group import ProcessGroup
+    from torchft.process_group import ProcessGroup, _DummyWork
 
 IS_TRITON_AVAILABLE = True
 try:
@@ -259,7 +259,6 @@ def __init__(
         self._quorum_id = -1
         self._errored: Optional[ExceptionWithTraceback] = None
         self._healing = False
-        self._pending_work: List[torch.futures.Future[object]] = []
         self._batches_committed = 0
 
         # first step is 1
@@ -296,9 +295,8 @@ def shutdown(self, wait: bool = True) -> None:
             self._manager.shutdown()
         self._executor.shutdown(wait=wait)
 
-    def allreduce(
-        self, tensor: torch.Tensor, should_quantize: bool = False
-    ) -> torch.futures.Future[torch.Tensor]:
+    @torch.profiler.record_function("torchft::manager::allreduce")
+    def allreduce(self, tensor: torch.Tensor, should_quantize: bool = False) -> Work:
         """
         Fault tolerant allreduce the tensor and return a Future that will be completed when
         the tensor is ready.
@@ -318,9 +316,8 @@ def allreduce(
             a Future that will be completed with the allreduced tensor
         """
         if self.errored():
-            fut = torch.futures.Future()  # pyre-fixme[29]: not a function
-            fut.set_result(tensor)
-            return fut
+            work = _DummyWork(None)
+            return work
 
         self.wait_quorum()
 
@@ -332,45 +329,44 @@
             # Run the allreduce async and save the work object so we can wait on
             # it later.
             fut: Optional[
-                torch.futures.Future[None]
-                | torch.futures.Future[torch.Tensor]
-                | torch.futures.Future[List[torch.Tensor]]
+                torch.futures.Future[None] | torch.futures.Future[list[torch.Tensor]]
             ] = None
+            work: Optional[Work] = None
+
             if should_quantize and IS_TRITON_AVAILABLE:
-                fut = allreduce_quantized([tensor], ReduceOp.AVG, self._pg)
+                assert False, "allreduce_quantized is not supported yet"
+                # TODO: Support `allreduce_quantized`
+                # fut = allreduce_quantized([tensor], ReduceOp.AVG, self._pg)
             else:
                 work = self._pg.allreduce([tensor], ReduceOp.SUM)
+            assert work is not None
             fut = work.get_future()
 
             # schedule grad normalization as a continuation
            # on the Future
             def callback(
                 fut: torch.futures.Future[List[torch.Tensor]],
-            ) -> torch.Tensor:
+            ) -> None:
                 nonlocal tensor
 
                 # check for exceptions
                 fut.value()
 
                 tensor /= self.num_participants()
 
-                return tensor
-
             assert fut is not None
-            if not should_quantize:
-                fut = fut.then(callback)
-            fut = self.wrap_future(fut, tensor)
-            return fut
-
+            fut = fut.then(callback)
+            fut = self.wrap_future(fut, None)
+            return work
         except Exception as e:
             self._logger.exception(
                 f"got exception in all reduce -- skipping remaining: {e}"
             )
             self.report_error(e)
 
-            fut = torch.futures.Future()  # pyre-fixme[29]: not a function
-            fut.set_result(tensor)
-            return fut
+            work = _DummyWork(None)
+
+            return work
 
     def report_error(self, e: Exception) -> None:
         """
@@ -429,7 +425,6 @@ def callback(
             return default
 
         fut = fut.then(callback)
-        self._pending_work.append(cast(torch.futures.Future[object], fut))
        return fut
 
     def start_quorum(
@@ -562,7 +557,7 @@ def _async_quorum(
         self._logger.info(f"reconfiguring for {quorum_id=} {store_prefixed_addr=}")
         # We use the replica rank and world as we want all replicas in the PG.
         try:
-            with torch.profiler.record_function("torchft::manager::_pg.configure"):
+            with torch.profiler.record_function("torchft::manager::_pg::configure"):
                 self._pg.configure(
                     store_prefixed_addr, replica_rank, replica_world_size
                 )
@@ -694,21 +689,10 @@ def should_commit(self, timeout: Optional[timedelta] = None) -> bool:
         Raises:
             RuntimeError: if should_commit fails max_retries times in a row and max_retries is set
         """
-        for work in self._pending_work:
-            # check at the beginning of since .wait() may trigger errors
-            if self._errored is not None:
-                break
-
-            # We swallow the error at in a future then callback so this will
-            # never return an error.
-            work.wait()
-
         # make sure recovery is complete before committing
         if self._recovery_stream is not None:
             self._recovery_stream.synchronize()
 
-        self._pending_work = []
-
         if err := self._pg.errored():
             self.report_error(err)
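
A hedged, caller-side sketch of the new Manager.allreduce contract (assuming `manager` is a configured torchft Manager with an active quorum; this mirrors how the updated tests drive the API rather than quoting the commit):

import torch

grad = torch.ones(8)
work = manager.allreduce(grad)  # returns a c10d Work handle right away

# As the updated tests do, block on the handle; the reduce writes into `grad`
# in place and the continuation registered inside allreduce divides it by
# manager.num_participants().
work.wait()

# Or stay asynchronous and chain follow-up work on the handle's future.
work.get_future().then(lambda _: print("allreduce done"))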

torchft/manager_test.py

Lines changed: 6 additions & 12 deletions
@@ -164,9 +164,7 @@ def test_quorum_happy(self, client_mock: MagicMock) -> None:
 
         manager.start_quorum()
         manager.allreduce(torch.tensor([1.0])).wait()
-        self.assertEqual(len(manager._pending_work), 1)
         self.assertTrue(manager.should_commit())
-        self.assertEqual(len(manager._pending_work), 0)
 
         self.assertEqual(manager._quorum_id, 123)
         self.assertEqual(manager.current_step(), 1)
@@ -554,8 +552,6 @@ def test_manager_wrap_future(self, client_mock: MagicMock) -> None:
         self.assertIs(error.original_exception, e)
         self.assertEqual(wrapped_fut.value(), 2)
 
-        self.assertEqual(manager._pending_work, [wrapped_fut])
-
     @patch("torchft.manager.ManagerClient", autospec=True)
     def test_manager_wrap_future_timeout(self, client_mock: MagicMock) -> None:
         manager = self._create_manager(timeout=timedelta(seconds=0.01))
@@ -590,18 +586,16 @@ def test_manager_numerics(self, client_mock: MagicMock) -> None:
         manager._pg.allreduce.return_value = _DummyWork(None)
 
         self.assertTrue(manager.is_participating())
-        fut = torch.futures.Future()  # pyre-fixme[29]: not a function
-        fut = manager.allreduce(torch.tensor([1.0]))
-        result = fut.value()
-        torch.testing.assert_close(result, torch.tensor([1.0 / 5]))
+        tensor = torch.tensor([1.0])
+        manager.allreduce(tensor).wait()
+        torch.testing.assert_close(tensor, torch.tensor([1.0 / 5]))
 
         # check healing numerics
         manager._healing = True
         self.assertFalse(manager.is_participating())
-        fut = torch.futures.Future()  # pyre-fixme[29]: not a function
-        fut = manager.allreduce(torch.tensor([1.0]))
-        result = fut.value()
-        torch.testing.assert_close(result, torch.tensor([0.0]))
+        tensor = torch.tensor([1.0])
+        manager.allreduce(tensor).wait()
+        torch.testing.assert_close(tensor, torch.tensor([0.0]))
 
     @patch("torchft.manager.ManagerClient", autospec=True)
     def test_quorum_happy_timeouts(self, client_mock: MagicMock) -> None:

torchft/process_group.py

Lines changed: 1 addition & 1 deletion
@@ -775,7 +775,7 @@ def abort(self) -> None:
 
     def errored(self) -> Optional[Exception]:
         # force a synchronization to ensure all work is complete
-        torch.cuda.synchronize()
+        torch.cuda.current_stream().synchronize()
 
         return self._errored
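
The one-line change narrows the synchronization scope: torch.cuda.synchronize() waits for all streams on the current device, while synchronizing only the current stream waits just for the work queued on it (useful when other streams, for example a recovery stream, may still be busy). A small illustration of the difference, assuming a CUDA device is available (not code from the commit):

import torch

if torch.cuda.is_available():
    side = torch.cuda.Stream()
    with torch.cuda.stream(side):
        # queue a large matmul on a side stream
        a = torch.randn(4096, 4096, device="cuda")
        b = torch.randn(4096, 4096, device="cuda")
        c = a @ b

    # waits only for kernels queued on the current (default) stream;
    # the matmul on `side` may still be running afterwards
    torch.cuda.current_stream().synchronize()

    # waits for every stream on the device, including `side`
    torch.cuda.synchronize()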

train_ddp.py

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ def main() -> None:
     # majority of groups will be available so few batches will be dropped.
     sampler = DistributedSampler(
         trainset,
-        replica_group=REPLICA_GROUP_ID,
+        replica_rank=REPLICA_GROUP_ID,
         num_replica_groups=NUM_REPLICA_GROUPS,
         group_rank=0,
         # for DDP we can use replica groups of size 1, FSDP/PP/CP would need more.
