From d75d1f5c3d71c0d994c3505d3a8c7ad316da34f5 Mon Sep 17 00:00:00 2001
From: Wenbin Chen
Date: Tue, 5 Nov 2024 07:26:12 +0000
Subject: [PATCH 1/4] Set "ds_grads_remaining" to 0 when module doesn't have this variable

"ds_grads_remaining" is used to trigger post_backward_function(). If the
module is called more than once in one training step, this variable is
re-initialized on every call. If backward() is also called multiple times,
"ds_grads_remaining" is reduced to a negative number, so
post_backward_function() is not called as expected. This leads to extra
fetch operations or extra memory usage. Set "ds_grads_remaining" to 0 only
when it is not yet initialized to fix this issue.

Signed-off-by: Wenbin Chen
---
 deepspeed/runtime/zero/parameter_offload.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/deepspeed/runtime/zero/parameter_offload.py b/deepspeed/runtime/zero/parameter_offload.py
index 082d7e874e4d..f945f5166190
--- a/deepspeed/runtime/zero/parameter_offload.py
+++ b/deepspeed/runtime/zero/parameter_offload.py
@@ -392,7 +392,8 @@ def _run_before_forward_function(input):
                                          _run_after_backward_hook, inputs)
 
         def _post_backward_module_hook(module, inputs):
-            module.ds_grads_remaining = 0
+            if not hasattr(module, "ds_grads_remaining"):
+                module.ds_grads_remaining = 0
 
             if not hasattr(module, "post_bwd_fn"):
 

From b83df91c55a4408817f89c3466b54f76fccb88d4 Mon Sep 17 00:00:00 2001
From: Wenbin Chen
Date: Tue, 5 Nov 2024 07:48:50 +0000
Subject: [PATCH 2/4] Set "__n_available_params" to 0 in release_and_reset_all()

Fix a bug where, after the first training step, the number of allocated
parameters may be bigger than "__max_n_available_params".

"__n_available_params" is set to 0 in reset_step(), which is called in
backward(). All parameters are released in release_and_reset_all(), which
is called in step(), and "__n_available_params" is reduced each time a
parameter is released. This means that when step() is called after
backward(), "__n_available_params" is driven to a negative number. Because
"__n_available_params" is used to restrict the amount of fetched
parameters, a negative value means more parameters can be fetched than the
upper bound ("__max_n_available_params") allows.

Move "__n_available_params = 0" to release_and_reset_all() to fix this issue.
Signed-off-by: Wenbin Chen
---
 deepspeed/runtime/zero/partitioned_param_coordinator.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/deepspeed/runtime/zero/partitioned_param_coordinator.py b/deepspeed/runtime/zero/partitioned_param_coordinator.py
index 49f477cc4a1b..596d0e9c20f9
--- a/deepspeed/runtime/zero/partitioned_param_coordinator.py
+++ b/deepspeed/runtime/zero/partitioned_param_coordinator.py
@@ -252,7 +252,6 @@ def reset_step(self) -> None:
         self.__most_recent_step_id_param_fetched_for = collections.defaultdict(lambda: int(-1e10))
         self.__step_id_module_fetched_for = collections.defaultdict(lambda: collections.deque())
         self.__step_id = 0
-        self.__n_available_params = 0
         self.__profiler.reset_events()
 
     def _dump_params(self, tag, sub_module, params, step_id=None):
@@ -430,7 +429,7 @@ def release_and_reset_all(self, module: Module) -> None:
             # there's a hook execution issue
             param.ds_active_sub_modules.clear()
             self.__release_param(param)
-
+        self.__n_available_params = 0
         for param in iter_params(module, recurse=True):
             if param.ds_status != ZeroParamStatus.NOT_AVAILABLE:
                 raise RuntimeError(f"{param.ds_summary()} expected to be released")

From 9f0d2aeeae8cc5a0041e8fa0fca9679a98e112cc Mon Sep 17 00:00:00 2001
From: Wenbin Chen
Date: Wed, 13 Nov 2024 08:09:38 +0000
Subject: [PATCH 3/4] Add stage3 unit test for running the model twice in one step

If the model is run more than once in one training step, there may be
issues. Add a unit test to catch these kinds of problems.

Signed-off-by: Wenbin Chen
---
 .../runtime/zero/test_zero_multiple_run.py | 57 +++++++++++++++++++
 1 file changed, 57 insertions(+)
 create mode 100644 tests/unit/runtime/zero/test_zero_multiple_run.py

diff --git a/tests/unit/runtime/zero/test_zero_multiple_run.py b/tests/unit/runtime/zero/test_zero_multiple_run.py
new file mode 100644
index 000000000000..aa8c6d719248
--- /dev/null
+++ b/tests/unit/runtime/zero/test_zero_multiple_run.py
@@ -0,0 +1,57 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+import deepspeed
+import torch
+from unit.common import DistributedTest, preferred_dtype
+from unit.simple_model import SimpleModel, random_dataloader
+
+
+class TestZ3MultipleModelCall(DistributedTest):
+    world_size = 1
+
+    def test_z3_multiple_model_call(self):
+        config_dict = {
+            "train_micro_batch_size_per_gpu": 1,
+            "gradient_accumulation_steps": 1,
+            "steps_per_print": 1,
+            "zero_optimization": {
+                "stage": 3
+            },
+            "fp16": {
+                "enabled": True,
+                "initial_scale_power": 8
+            },
+            "optimizer": {
+                "type": "Adam",
+                "params": {
+                    "lr": 1e-3
+                }
+            },
+        }
+        if preferred_dtype() is torch.float16:
+            config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8}
+        elif preferred_dtype() is torch.bfloat16:
+            config_dict["bf16"] = {"enabled": True}
+        hidden_dim, nlayers = 2048, 3
+        model = SimpleModel(hidden_dim=hidden_dim, nlayers=nlayers)
+        model_engine, _, _, _ = deepspeed.initialize(config=config_dict,
+                                                     model=model,
+                                                     model_parameters=model.parameters())
+        data_loader = iter(
+            random_dataloader(model=model_engine, total_samples=10, hidden_dim=hidden_dim, device=model_engine.device))
+
+        for n, batch in enumerate(data_loader):
+            loss1 = model_engine(batch[0], batch[1])
+            with torch.no_grad():
+                loss2 = model_engine(batch[0], batch[1])
+            loss = loss1 + loss2
+            model_engine.backward(loss)
+            for name, submodule in model_engine.module.linears._modules.items():
+                assert hasattr(submodule, "ds_grads_remaining"), \
+                    f"linears.{name} does not have variable ds_grads_remaining"
+                assert submodule.ds_grads_remaining == 0, \
+                    f"ds_grads_remaining of linears.{name} is not 0 ({submodule.ds_grads_remaining})"
+            model_engine.step()

From 7cabf80a9e76deb532bf1923779f701fb2fea492 Mon Sep 17 00:00:00 2001
From: Wenbin Chen
Date: Wed, 20 Nov 2024 02:28:28 +0000
Subject: [PATCH 4/4] Fix dtype error

Signed-off-by: Wenbin Chen
---
 tests/unit/runtime/zero/test_zero_multiple_run.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tests/unit/runtime/zero/test_zero_multiple_run.py b/tests/unit/runtime/zero/test_zero_multiple_run.py
index aa8c6d719248..d4eb3a578cc9
--- a/tests/unit/runtime/zero/test_zero_multiple_run.py
+++ b/tests/unit/runtime/zero/test_zero_multiple_run.py
@@ -20,10 +20,6 @@ def test_z3_multiple_model_call(self):
             "zero_optimization": {
                 "stage": 3
             },
-            "fp16": {
-                "enabled": True,
-                "initial_scale_power": 8
-            },
             "optimizer": {
                 "type": "Adam",
                 "params": {
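
Editor's note: the sketch below restates, outside DeepSpeed, the counter bookkeeping
that patch 1 fixes and that the new unit test checks. It is a toy model only:
simulate_step, reset_every_forward, and the dict-based module state are invented
names for illustration, not DeepSpeed APIs. It contrasts resetting the per-module
gradient counter on every forward call (old behavior) with initializing it only
once (patched behavior), for the call pattern the test uses: one forward pass
with gradients, one under no_grad, then a single backward().

    # Standalone toy model of the "ds_grads_remaining" bookkeeping (not DeepSpeed code;
    # simulate_step and reset_every_forward are hypothetical names).

    def simulate_step(reset_every_forward: bool) -> dict:
        """Simulate one training step in which the module is called twice:
        once normally and once under no_grad, followed by one backward()."""
        module = {"post_backward_fired": False}

        def forward(requires_grad: bool) -> None:
            # Old behavior: the forward hook resets the counter on every call.
            # Patched behavior: it only initializes the counter if it is missing.
            if reset_every_forward or "ds_grads_remaining" not in module:
                module["ds_grads_remaining"] = 0
            if requires_grad:
                # One backward hook will fire later for this call.
                module["ds_grads_remaining"] += 1

        def backward_hook() -> None:
            module["ds_grads_remaining"] -= 1
            if module["ds_grads_remaining"] == 0:
                # This is where ZeRO-3 would run post_backward_function()
                # and release the module's partitioned parameters.
                module["post_backward_fired"] = True

        forward(requires_grad=True)    # loss1 = model(batch)
        forward(requires_grad=False)   # loss2 = model(batch) under torch.no_grad()
        backward_hook()                # backward() fires the hook from the first call
        return module

    print(simulate_step(reset_every_forward=True))
    # {'post_backward_fired': False, 'ds_grads_remaining': -1}  <- counter goes negative
    print(simulate_step(reset_every_forward=False))
    # {'post_backward_fired': True, 'ds_grads_remaining': 0}

With the guarded initialization, the counter ends the step at 0, which is what the
assertions in test_zero_multiple_run.py verify for every submodule after backward().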