From 234093249a30dbc8c35fc8255f519fcd2235dd18 Mon Sep 17 00:00:00 2001 From: Zhiyuan Chen Date: Tue, 21 Mar 2023 19:52:33 +0800 Subject: [PATCH] fix id/uuid consistency in distributed training Signed-off-by: Zhiyuan Chen --- danling/runner/base_runner.py | 5 ++++- danling/runner/torch_runner.py | 13 +++++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/danling/runner/base_runner.py b/danling/runner/base_runner.py index 04ba3559..88d0f2f3 100644 --- a/danling/runner/base_runner.py +++ b/danling/runner/base_runner.py @@ -124,11 +124,14 @@ def init_tensorboard(self, *args, **kwargs) -> None: """ raise NotImplementedError - def set_seed(self, bias: Optional[int] = None) -> None: + def set_seed(self, seed: Optional[int] = None, bias: Optional[int] = None) -> None: r""" Set up random seed. Args: + seed: Random seed to set. + Defaults to `self.seed` (`config.seed`). + bias: Make the seed different for each processes. This avoids same data augmentation are applied on every processes. diff --git a/danling/runner/torch_runner.py b/danling/runner/torch_runner.py index 491f7e2d..e6c50d54 100644 --- a/danling/runner/torch_runner.py +++ b/danling/runner/torch_runner.py @@ -61,11 +61,14 @@ def init_tensorboard(self, *args, **kwargs) -> None: self.writer = SummaryWriter(*args, **kwargs) self.writer.add_scalar = catch(OSError, verbose=False)(self.writer.add_scalar) # type: ignore - def set_seed(self, bias: Optional[int] = None) -> None: + def set_seed(self, seed: Optional[int] = None, bias: Optional[int] = None) -> None: r""" Set up random seed. Args: + seed: Random seed to set. + Defaults to `self.seed` (`config.seed`). + bias: Make the seed different for each processes. This avoids same data augmentation are applied on every processes. @@ -75,7 +78,8 @@ def set_seed(self, bias: Optional[int] = None) -> None: Set to `False` to disable this feature. """ - seed = self.seed + if seed is None: + seed = self.seed if self.distributed: object_list = [seed] dist.broadcast_object_list(object_list) @@ -84,6 +88,7 @@ def set_seed(self, bias: Optional[int] = None) -> None: bias = self.rank if bias: seed += bias + self.seed = seed torch.manual_seed(seed) torch.cuda.manual_seed(seed) np.random.seed(seed) @@ -185,6 +190,10 @@ def init_distributed(self) -> None: """ self.accelerator = Accelerator(**self.accelerate) + if self.distributed: + object_list = [self.id, self.uuid] + dist.broadcast_object_list(object_list) + self.id, self.uuid = object_list[0], object_list[1] def __getattr__(self, name: str) -> Any: if self.accelerator is not None and hasattr(self.accelerator, name):