From d75d1f5c3d71c0d994c3505d3a8c7ad316da34f5 Mon Sep 17 00:00:00 2001
From: Wenbin Chen
Date: Tue, 5 Nov 2024 07:26:12 +0000
Subject: [PATCH 1/4] Set "ds_grads_remaining" to 0 when module doesn't have this variable

"ds_grads_remaining" is used to trigger post_backward_function(). If the
module is called more than once in one training step, this variable is
re-initialized on every call. If backward() is also called multiple times,
"ds_grads_remaining" is reduced to a negative number, so
post_backward_function() is not called as expected. This leads to extra
fetch operations or extra memory usage. Set "ds_grads_remaining" to 0 only
when it is not yet initialized to fix this issue.

Signed-off-by: Wenbin Chen
---
 deepspeed/runtime/zero/parameter_offload.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/deepspeed/runtime/zero/parameter_offload.py b/deepspeed/runtime/zero/parameter_offload.py
index 082d7e874e4d..f945f5166190
--- a/deepspeed/runtime/zero/parameter_offload.py
+++ b/deepspeed/runtime/zero/parameter_offload.py
@@ -392,7 +392,8 @@ def _run_before_forward_function(input):
                                          _run_after_backward_hook, inputs)
 
         def _post_backward_module_hook(module, inputs):
-            module.ds_grads_remaining = 0
+            if not hasattr(module, "ds_grads_remaining"):
+                module.ds_grads_remaining = 0
 
             if not hasattr(module, "post_bwd_fn"):
 

From b83df91c55a4408817f89c3466b54f76fccb88d4 Mon Sep 17 00:00:00 2001
From: Wenbin Chen
Date: Tue, 5 Nov 2024 07:48:50 +0000
Subject: [PATCH 2/4] Set "__n_available_params" to 0 in release_and_reset_all()

Fix a bug where, after the first training step, the number of allocated
parameters may be bigger than "__max_n_available_params".

"__n_available_params" is set to 0 in reset_step(), which is called in
backward(). All parameters are released in release_and_reset_all(), which
is called in step(), and "__n_available_params" is reduced each time a
parameter is released. This means that when step() is called after
backward(), "__n_available_params" is driven to a negative number. Because
"__n_available_params" is used to restrict the amount of fetched
parameters, a negative value means more parameters can be fetched than the
upper bound ("__max_n_available_params") allows.

Move "__n_available_params = 0" to release_and_reset_all() to fix this issue.
Signed-off-by: Wenbin Chen
---
 deepspeed/runtime/zero/partitioned_param_coordinator.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/deepspeed/runtime/zero/partitioned_param_coordinator.py b/deepspeed/runtime/zero/partitioned_param_coordinator.py
index 49f477cc4a1b..596d0e9c20f9
--- a/deepspeed/runtime/zero/partitioned_param_coordinator.py
+++ b/deepspeed/runtime/zero/partitioned_param_coordinator.py
@@ -252,7 +252,6 @@ def reset_step(self) -> None:
         self.__most_recent_step_id_param_fetched_for = collections.defaultdict(lambda: int(-1e10))
         self.__step_id_module_fetched_for = collections.defaultdict(lambda: collections.deque())
         self.__step_id = 0
-        self.__n_available_params = 0
         self.__profiler.reset_events()
 
     def _dump_params(self, tag, sub_module, params, step_id=None):
@@ -430,7 +429,7 @@ def release_and_reset_all(self, module: Module) -> None:
             # there's a hook execution issue
             param.ds_active_sub_modules.clear()
             self.__release_param(param)
-
+        self.__n_available_params = 0
         for param in iter_params(module, recurse=True):
             if param.ds_status != ZeroParamStatus.NOT_AVAILABLE:
                 raise RuntimeError(f"{param.ds_summary()} expected to be released")

From 9f0d2aeeae8cc5a0041e8fa0fca9679a98e112cc Mon Sep 17 00:00:00 2001
From: Wenbin Chen
Date: Wed, 13 Nov 2024 08:09:38 +0000
Subject: [PATCH 3/4] Add stage3 unit test for running the model twice in one step

If the model is run more than once in one training step, there may be
issues. Add a unit test to catch these kinds of problems.

Signed-off-by: Wenbin Chen
---
 .../runtime/zero/test_zero_multiple_run.py | 57 +++++++++++++++++++
 1 file changed, 57 insertions(+)
 create mode 100644 tests/unit/runtime/zero/test_zero_multiple_run.py

diff --git a/tests/unit/runtime/zero/test_zero_multiple_run.py b/tests/unit/runtime/zero/test_zero_multiple_run.py
new file mode 100644
index 000000000000..aa8c6d719248
--- /dev/null
+++ b/tests/unit/runtime/zero/test_zero_multiple_run.py
@@ -0,0 +1,57 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+import deepspeed
+import torch
+from unit.common import DistributedTest, preferred_dtype
+from unit.simple_model import SimpleModel, random_dataloader
+
+
+class TestZ3MultipleModelCall(DistributedTest):
+    world_size = 1
+
+    def test_z3_multiple_model_call(self):
+        config_dict = {
+            "train_micro_batch_size_per_gpu": 1,
+            "gradient_accumulation_steps": 1,
+            "steps_per_print": 1,
+            "zero_optimization": {
+                "stage": 3
+            },
+            "fp16": {
+                "enabled": True,
+                "initial_scale_power": 8
+            },
+            "optimizer": {
+                "type": "Adam",
+                "params": {
+                    "lr": 1e-3
+                }
+            },
+        }
+        if preferred_dtype() is torch.float16:
+            config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8}
+        elif preferred_dtype() is torch.bfloat16:
+            config_dict["bf16"] = {"enabled": True}
+        hidden_dim, nlayers = 2048, 3
+        model = SimpleModel(hidden_dim=hidden_dim, nlayers=nlayers)
+        model_engine, _, _, _ = deepspeed.initialize(config=config_dict,
+                                                     model=model,
+                                                     model_parameters=model.parameters())
+        data_loader = iter(
+            random_dataloader(model=model_engine, total_samples=10, hidden_dim=hidden_dim, device=model_engine.device))
+
+        for n, batch in enumerate(data_loader):
+            loss1 = model_engine(batch[0], batch[1])
+            with torch.no_grad():
+                loss2 = model_engine(batch[0], batch[1])
+            loss = loss1 + loss2
+            model_engine.backward(loss)
+            for name, submodule in model_engine.module.linears._modules.items():
+                assert hasattr(submodule, "ds_grads_remaining"), \
+                    f"linears.{name} does not have variable ds_grads_remaining"
+                assert submodule.ds_grads_remaining == 0, \
+                    f"ds_grads_remaining of linears.{name} is not 0 ({submodule.ds_grads_remaining})"
+            model_engine.step()

From 7cabf80a9e76deb532bf1923779f701fb2fea492 Mon Sep 17 00:00:00 2001
From: Wenbin Chen
Date: Wed, 20 Nov 2024 02:28:28 +0000
Subject: [PATCH 4/4] Fix dtype error

Signed-off-by: Wenbin Chen
---
 tests/unit/runtime/zero/test_zero_multiple_run.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tests/unit/runtime/zero/test_zero_multiple_run.py b/tests/unit/runtime/zero/test_zero_multiple_run.py
index aa8c6d719248..d4eb3a578cc9
--- a/tests/unit/runtime/zero/test_zero_multiple_run.py
+++ b/tests/unit/runtime/zero/test_zero_multiple_run.py
@@ -20,10 +20,6 @@ def test_z3_multiple_model_call(self):
             "zero_optimization": {
                 "stage": 3
             },
-            "fp16": {
-                "enabled": True,
-                "initial_scale_power": 8
-            },
             "optimizer": {
                 "type": "Adam",
                 "params": {
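
Editor's note: the sketch below restates, outside DeepSpeed, the counter bookkeeping
that patch 1 fixes and that the new unit test checks. It is a toy model only:
simulate_step, reset_every_forward, and the dict-based module state are invented
names for illustration, not DeepSpeed APIs. It contrasts resetting the per-module
gradient counter on every forward call (old behavior) with initializing it only
once (patched behavior), for the call pattern the test uses: one forward pass
with gradients, one under no_grad, then a single backward().

    # Standalone toy model of the "ds_grads_remaining" bookkeeping (not DeepSpeed code;
    # simulate_step and reset_every_forward are hypothetical names).

    def simulate_step(reset_every_forward: bool) -> dict:
        """Simulate one training step in which the module is called twice:
        once normally and once under no_grad, followed by one backward()."""
        module = {"post_backward_fired": False}

        def forward(requires_grad: bool) -> None:
            # Old behavior: the forward hook resets the counter on every call.
            # Patched behavior: it only initializes the counter if it is missing.
            if reset_every_forward or "ds_grads_remaining" not in module:
                module["ds_grads_remaining"] = 0
            if requires_grad:
                # One backward hook will fire later for this call.
                module["ds_grads_remaining"] += 1

        def backward_hook() -> None:
            module["ds_grads_remaining"] -= 1
            if module["ds_grads_remaining"] == 0:
                # This is where ZeRO-3 would run post_backward_function()
                # and release the module's partitioned parameters.
                module["post_backward_fired"] = True

        forward(requires_grad=True)    # loss1 = model(batch)
        forward(requires_grad=False)   # loss2 = model(batch) under torch.no_grad()
        backward_hook()                # backward() fires the hook from the first call
        return module

    print(simulate_step(reset_every_forward=True))
    # {'post_backward_fired': False, 'ds_grads_remaining': -1}  <- counter goes negative
    print(simulate_step(reset_every_forward=False))
    # {'post_backward_fired': True, 'ds_grads_remaining': 0}

With the guarded initialization, the counter ends the step at 0, which is what the
assertions in test_zero_multiple_run.py verify for every submodule after backward().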