Commit 0b6f5df

Add a note on deepspeed's gradient accumulation
vwxyzjn committed Jan 15, 2025
1 parent 4365dea commit 0b6f5df
Showing 2 changed files with 2 additions and 0 deletions.
open_instruct/ppo_vllm_thread_ray.py (1 change: 1 addition & 0 deletions)
@@ -1152,6 +1152,7 @@ def vllm_generate(
     mini_batch_end = mini_batch_start + args.local_mini_batch_size
     mini_batch_inds = b_inds[mini_batch_start:mini_batch_end]
     gradient_accumulation_idx = 0
+    # NOTE: deepspeed handles gradient accumulation automatically; see https://github.com/microsoft/DeepSpeed/issues/758#issuecomment-801580724
     for micro_batch_start in range(0, args.local_mini_batch_size, args.per_device_train_batch_size):
         # print("micro batch start", micro_batch_start, self.rank)
         micro_batch_end = micro_batch_start + args.per_device_train_batch_size
open_instruct/ppo_vllm_thread_ray_gtrl.py (1 change: 1 addition & 0 deletions)
@@ -1225,6 +1225,7 @@ def vllm_generate(
     mini_batch_end = mini_batch_start + args.local_mini_batch_size
     mini_batch_inds = b_inds[mini_batch_start:mini_batch_end]
     gradient_accumulation_idx = 0
+    # NOTE: deepspeed handles gradient accumulation automatically; see https://github.com/microsoft/DeepSpeed/issues/758#issuecomment-801580724
     for micro_batch_start in range(0, args.local_mini_batch_size, args.per_device_train_batch_size):
         # print("micro batch start", micro_batch_start, self.rank)
         micro_batch_end = micro_batch_start + args.per_device_train_batch_size
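
For context on the note itself: DeepSpeed's engine absorbs the accumulation bookkeeping that a plain PyTorch loop would otherwise do by hand. Below is a minimal sketch of that behavior, not code from this repository; the toy model, data, and config values are illustrative assumptions.

# A minimal sketch (assumed toy setup, not from open_instruct) of how
# DeepSpeed handles gradient accumulation inside its engine.
import torch
import deepspeed

model = torch.nn.Linear(8, 1)                          # stand-in for the policy model
micro_batches = [torch.randn(4, 8) for _ in range(8)]  # stand-in micro-batches

ds_config = {
    "train_micro_batch_size_per_gpu": 4,
    "gradient_accumulation_steps": 4,  # deepspeed tracks the accumulation boundary itself
    "optimizer": {"type": "Adam", "params": {"lr": 1e-3}},
}

engine, _, _, _ = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config=ds_config,
)

for x in micro_batches:
    loss = engine(x).mean()
    engine.backward(loss)  # internally scales the loss by 1/gradient_accumulation_steps
    engine.step()          # runs optimizer.step() only on every 4th call; otherwise a no-op

Because engine.backward() does the loss scaling and engine.step() is a no-op except on accumulation boundaries, the micro-batch loops patched above need no manual loss division or conditional optimizer step, which is exactly what the added comment points out.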
