
Commit b4d5433

Group allreduce futures

1 parent a1d65a6

File tree: 5 files changed, +102 -67 lines

torchft/collectives.py

Lines changed: 6 additions & 3 deletions
@@ -46,7 +46,7 @@ def allreduce_quantized(
     opts: AllreduceOptions | ReduceOp,
     process_group: "ProcessGroup",
     sync_stream: cuda.Stream | None = None,
-) -> Future[None]:
+) -> Future[list[torch.Tensor]]:
     """
     Performs a quantized all-reduce operation on a list of tensors.

@@ -76,6 +76,8 @@ def allreduce_quantized(
        A Future that can be used to wait for the operation to complete and
        clean up intermediate buffers.

+       The future's value is set to an empty list
+
    Raises:
        NotImplementedError: If the reduce operation is not ReduceOp.AVG.
    """
@@ -137,7 +139,7 @@ def allreduce_quantized(
        # Dequantize and copy to output buffer.
        fused_dequantize_from_fp8(tensors, quantized_tensors, world_size)

-    class QuantizedAllReduceFuture(Future[None]):
+    class QuantizedAllReduceFuture(Future[list[torch.Tensor]]):
        def __init__(
            self,
            sync_stream: cuda.Stream,
@@ -149,12 +151,13 @@ def __init__(
            self._quantized_tensors = quantized_tensors
            self._quantized_tensors_out = quantized_tensors_out

-        def wait(self) -> None:
+        def wait(self) -> list[torch.Tensor]:
            # Wait for the synchronization to complete.
            cuda.current_stream().wait_stream(self._sync_stream)
            # Clean up intermediate buffers.
            del self._quantized_tensors_out
            del self._quantized_tensors
+            return []

    # pyre-ignore[29]
    return QuantizedAllReduceFuture(

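For context on the signature change above, a minimal usage sketch (not part of the commit; the process group setup and tensor list are assumed). The returned future is now typed Future[list[torch.Tensor]], but its value is an empty list, so callers still rely on the in-place update of the input tensors:

    import torch
    from torch.distributed import ReduceOp
    from torchft.collectives import allreduce_quantized

    def average_in_place(tensors: list[torch.Tensor], process_group) -> None:
        # `process_group` is an already-initialized torchft ProcessGroup (assumed).
        fut = allreduce_quantized(tensors, ReduceOp.AVG, process_group)
        result = fut.wait()  # blocks until the sync stream completes
        assert result == []  # value carries no data; tensors were averaged in place
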
torchft/local_sgd.py

Lines changed: 41 additions & 33 deletions
@@ -147,15 +147,15 @@ def _average(self) -> list[torch.Tensor]:
        """
        Averages the model parameters across the manager and returns the averaged parameters.
        """
-        works = []
        averaged_parameters = []
        for p in self._model.parameters():
            # Create a new tensor to store the averaged parameter
            avg_param = extract_local_tensor(p)
-            works.append(self._manager.allreduce(avg_param))
            averaged_parameters.append(avg_param)
-        for work in works:
-            work.wait()
+
+        work = self._manager.collect_all_allreduce(averaged_parameters)
+        work.wait()
+
        return averaged_parameters


@@ -193,9 +193,7 @@ def __init__(
        self._outer_optimizer = outer_optimizer

        # Stores pending all reduce
-        self._allreduce_futures: List[
-            torch.futures.Future[None] | torch.futures.Future[torch.Tensor]
-        ] = []
+        self._allreduce_futures: list[torch.futures.Future[None]] = []

        if bucket_cap_mb is not None:
            self.bucket_cap_mb = int(bucket_cap_mb * 1024 * 1024)
@@ -320,18 +318,27 @@ def _average_grads(self) -> None:

    def _allreduce_per_param(self) -> None:
        """Performs allreduce on each gradient tensor separately (original method)."""
+        tensors = []
+
        for p in self._model_fragment.parameters():
            # Perform allreduce on the pseudogradients
            assert p.grad is not None
            if isinstance(p, DTensor):
-                work = self._manager.allreduce(
-                    p.grad._local_tensor, should_quantize=self.should_quantize
-                )
+                tensors.append(p.grad._local_tensor)
            else:
-                work = self._manager.allreduce(
-                    p.grad, should_quantize=self.should_quantize
-                )
-            self._allreduce_futures.append(work)
+                tensors.append(p.grad)
+
+        work = self._manager.collect_all_allreduce(
+            tensors, should_quantize=self.should_quantize
+        )
+
+        def callback(
+            fut: torch.futures.Future[List[torch.futures.Future[torch.Tensor]]],
+        ) -> None:
+            return
+
+        work = work.then(callback)
+        self._allreduce_futures.append(work)

    def bucketize_and_allreduce(
        self,
@@ -351,6 +358,9 @@ def bucketize_and_allreduce(
        total_size = sum(t.numel() for t in tensors)
        dtype, device = tensors[0].dtype, tensors[0].device

+        flat_buffers: list[torch.Tensor] = []
+        all_bucket_tensors: list[list[Tuple[torch.Tensor, int, int]]] = []
+
        offset = 0
        flat_index = 0
        while offset < total_size:
@@ -372,19 +382,27 @@ def bucketize_and_allreduce(
                pack_offset += numel
                flat_index += 1

-            work = self._manager.allreduce(
-                flat_buffer, should_quantize=self.should_quantize
-            )
+            flat_buffers.append(flat_buffer)
+            all_bucket_tensors.append(bucket_tensors)
+
+            offset += chunk_size

-            def callback(fut: torch.futures.Future[torch.Tensor]) -> None:
-                nonlocal bucket_tensors, flat_buffer
+        def callback(
+            fut: torch.futures.Future[List[torch.futures.Future[torch.Tensor]]],
+        ) -> None:
+            nonlocal all_bucket_tensors, flat_buffers
+
+            for i in range(len(flat_buffers)):
+                bucket_tensors = all_bucket_tensors[i]
+                flat_buffer = flat_buffers[i]
                for t, pack_offset, numel in bucket_tensors:
                    t.copy_(flat_buffer[pack_offset : pack_offset + numel].view_as(t))

-            work = work.then(callback)
-            self._allreduce_futures.append(work)
-
-            offset += chunk_size
+        work = self._manager.collect_all_allreduce(
+            flat_buffers, should_quantize=self.should_quantize
+        )
+        work = work.then(callback)
+        self._allreduce_futures.append(work)

    def _allreduce_bucketized(self) -> None:
        """
@@ -455,16 +473,6 @@ def __init__(
        if sync_every < len(model_fragments):
            raise ValueError("Only 1 fragment can be syncrhonized at a time")

-        # TODO: Support multiple fragments
-        # This requires changing the manager to support `should_commit` for each
-        # fragment separately.
-        if len(model_fragments) != 1:
-            raise ValueError("Multiple fragments are not supported yet")
-
-        # TODO: Support `fragment_sync_delay`
-        if fragment_sync_delay != 0:
-            raise ValueError("Fragment synchronization delay is not supported yet")
-
        # TODO: Support `fragment_update_alpha`
        if fragment_update_alpha != 0.0:
            raise ValueError(

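The net effect of the local_sgd.py changes is that pseudogradients are gathered into one list and reduced with a single grouped call instead of one allreduce per parameter. A simplified sketch of that pattern, assuming a manager exposing the new collect_all_allreduce API and an nn.Module fragment (not a drop-in copy of the code above; the DTensor import path is an assumption):

    import torch
    import torch.nn as nn
    from torch.distributed.tensor import DTensor  # assumed import path

    def grouped_pseudograd_allreduce(manager, fragment: nn.Module, should_quantize: bool = False):
        tensors = []
        for p in fragment.parameters():
            assert p.grad is not None
            # DTensor parameters contribute their local shard's gradient.
            tensors.append(p.grad._local_tensor if isinstance(p, DTensor) else p.grad)
        # One grouped call replaces a per-parameter allreduce loop.
        return manager.collect_all_allreduce(tensors, should_quantize=should_quantize)
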
torchft/local_sgd_test.py

Lines changed: 18 additions & 15 deletions
@@ -4,7 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

-from typing import Dict
+from typing import Dict, List
 from unittest import TestCase
 from unittest.mock import MagicMock, create_autospec

@@ -86,7 +86,7 @@ def test_local_sgd_healthy(self) -> None:
        manager.should_commit.return_value = True
        self.assertEqual(local_sgd._local_step, 0)
        self.assertEqual(manager.should_commit.call_count, 1)
-        self.assertEqual(manager.allreduce.call_count, 4)
+        self.assertEqual(manager.collect_all_allreduce.call_count, 1)

    def test_extract_local_tensor(self) -> None:
        regular_tensor = torch.rand(3, 3, requires_grad=True)
@@ -172,7 +172,7 @@ def test_diloco_healthy(self) -> None:
            diloco._fragments[0].original_parameters, _params_dict(model)
        )
        self.assertEqual(manager.should_commit.call_count, 1)
-        self.assertEqual(manager.allreduce.call_count, parameter_count)
+        self.assertEqual(manager.collect_all_allreduce.call_count, 1)

        outer_opt_state = outer_optimizer.state_dict()
        self.assertEqual(len(outer_opt_state["state"]), parameter_count)
@@ -220,13 +220,12 @@ def test_diloco_allreduce_call_efficiency(
            loss.backward()
            inner_optimizer.step()

-        allreduce_calls = manager.allreduce.call_count
-        param_count = len([p for p in model.parameters() if p.requires_grad])
+        allreduce_calls = manager.collect_all_allreduce.call_count

        if expect_fewer_calls:
-            self.assertLess(int(allreduce_calls), int(param_count))
+            self.assertEqual(int(allreduce_calls), 1)
        else:
-            self.assertEqual(int(allreduce_calls), int(param_count))
+            self.assertEqual(int(allreduce_calls), 1)

    def test_bucketization_correctness(self) -> None:
        class TinyModel(nn.Module):
@@ -251,16 +250,20 @@ def forward(self, x):
        manager._use_async_quorum = False
        manager.should_commit.return_value = True

-        # Define fake allreduce: multiplies buffer by 2
-        def fake_allreduce(
-            tensor: Tensor, should_quantize: bool
-        ) -> torch.futures.Future[Tensor]:
-            tensor.mul_(2)
+        # Define fake collect_all_allreduce: multiplies all buffers by 2
+        def fake_collect_all_allreduce(
+            tensors: List[Tensor], should_quantize: bool
+        ) -> torch.futures.Future[List[torch.futures.Future[Tensor]]]:
+            for tensor in tensors:
+                tensor.mul_(2)
            fut = torch.futures.Future()  # pyre-fixme[29]: not a function
-            fut.set_result(tensor)
-            return fut
+            fut.set_result(tensors)

-        manager.allreduce.side_effect = fake_allreduce
+            futs = torch.futures.Future()  # pyre-fixme[29]: not a function
+            futs.set_result([fut])
+            return futs
+
+        manager.collect_all_allreduce.side_effect = fake_collect_all_allreduce

        diloco = DiLoCo(
            manager, [model], inner_opt, outer_opt, sync_every=2, use_bucketization=True

torchft/manager.py

Lines changed: 37 additions & 13 deletions
@@ -278,8 +278,33 @@ def shutdown(self, wait: bool = True) -> None:
        self._manager.shutdown()
        self._executor.shutdown(wait=wait)

+    def collect_all_allreduce(
+        self, tensors: List[torch.Tensor], should_quantize: bool = False
+    ) -> torch.futures.Future[List[torch.futures.Future[torch.Tensor]]]:
+        futs: List[torch.futures.Future[torch.Tensor]] = []
+        default_futs: List[torch.futures.Future[torch.Tensor]] = []
+
+        for tensor in tensors:
+            fut = self.allreduce(tensor, should_quantize=should_quantize)
+            futs.append(fut)
+
+            default_fut = torch.futures.Future()  # pyre-fixme[29]: not a function
+            default_fut.set_result(tensor)
+            default_futs.append(default_fut)
+
+        fut = torch.futures.collect_all(futs)
+
+        return self.wrap_future(fut, default_futs)
+
    def allreduce(
        self, tensor: torch.Tensor, should_quantize: bool = False
+    ) -> torch.futures.Future[torch.Tensor]:
+        fut = self._allreduce(tensor, should_quantize=should_quantize)
+        fut = self.wrap_future(fut, tensor)
+        return fut
+
+    def _allreduce(
+        self, tensor: torch.Tensor, should_quantize: bool = False
    ) -> torch.futures.Future[torch.Tensor]:
        """
        Fault tolerant allreduce the tensor and return a Future that will be completed when
@@ -314,9 +339,8 @@ def allreduce(
            # Run the allreduce async and save the work object so we can wait on
            # it later.
            fut: Optional[
-                torch.futures.Future[None]
+                torch.futures.Future[List[torch.Tensor]]
                | torch.futures.Future[torch.Tensor]
-                | torch.futures.Future[List[torch.Tensor]]
            ] = None
            if should_quantize and IS_TRITON_AVAILABLE:
                fut = allreduce_quantized([tensor], ReduceOp.AVG, self._pg)
@@ -331,19 +355,16 @@ def callback(
            ) -> torch.Tensor:
                nonlocal tensor

-                # check for exceptions
                fut.value()

-                tensor /= self.num_participants()
+                if not should_quantize:
+                    tensor /= self.num_participants()

                return tensor

            assert fut is not None
-            if not should_quantize:
-                fut = fut.then(callback)
-            fut = self.wrap_future(fut, tensor)
+            fut = fut.then(callback)
            return fut
-
        except Exception as e:
            self._logger.exception(
                f"got exception in all reduce -- skipping remaining: {e}"
@@ -668,21 +689,24 @@ def should_commit(self, timeout: Optional[timedelta] = None) -> bool:
        Raises:
            RuntimeError: if should_commit fails max_retries times in a row and max_retries is set
        """
-        for work in self._pending_work:
-            # check at the beginning of since .wait() may trigger errors
-            if self._errored is not None:
+        while True:
+            if len(self._pending_work) == 0:
                break

+            work = self._pending_work.pop(0)
            # We swallow the error at in a future then callback so this will
            # never return an error.
            work.wait()

+            # Remove all work if there was an error.
+            # We won't commit in this case as well.
+            if self._errored is None:
+                break
+
        # make sure recovery is complete before committing
        if self._recovery_stream is not None:
            self._recovery_stream.synchronize()

-        self._pending_work = []
-
        if err := self._pg.errored():
            self.report_error(err)


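Based on the new Manager.collect_all_allreduce above, a hedged sketch of how a caller might consume the grouped future; the manager instance and tensors are assumed, and error handling is simplified:

    import torch

    def sync_all(manager, tensors: list[torch.Tensor]) -> None:
        # Returns Future[List[Future[torch.Tensor]]]: the outer future comes from
        # torch.futures.collect_all over the per-tensor allreduce futures and is
        # wrapped by wrap_future with the original tensors as the fallback value.
        grouped = manager.collect_all_allreduce(tensors, should_quantize=False)
        for inner in grouped.wait():
            inner.wait()  # each inner future resolves to a reduced tensor
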
torchft/process_group.py

Lines changed: 0 additions & 3 deletions
@@ -774,9 +774,6 @@ def abort(self) -> None:
        super().abort()

    def errored(self) -> Optional[Exception]:
-        # force a synchronization to ensure all work is complete
-        torch.cuda.synchronize()
-
        return self._errored

    def getBackendName(self) -> str: