Merge pull request #244 from NREL/gb/sum_grads
sum multi gpu gradients and apply once to weight equally in adam mome…
grantbuster authored Nov 14, 2024
2 parents 1abd3fd + 1b7eff9 commit de76035
Showing 2 changed files with 29 additions and 17 deletions.
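
The commit title refers to Adam momentum: every call to apply_gradients advances the optimizer's moment estimates and step count, so calling it once per GPU would treat each GPU's partial gradient as its own update. Summing the per-GPU gradients and applying them once per batch weights every GPU's contribution equally. A minimal sketch of the idea, assuming TensorFlow's Keras Adam optimizer (the variable and gradient values here are illustrative, not from the repo):

import tensorflow as tf

weights = tf.Variable([1.0, 2.0])
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

# pretend these gradients came back from two GPUs that each saw half a batch
per_gpu_grads = [tf.constant([0.1, 0.2]), tf.constant([0.3, 0.4])]

# behavior after this commit: sum first, then apply once per batch so the
# optimizer's momentum terms see a single combined gradient
total_grad = tf.add_n(per_gpu_grads)
optimizer.apply_gradients([(total_grad, weights)])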
20 changes: 15 additions & 5 deletions sup3r/models/abstract.py
@@ -1321,9 +1321,9 @@ def run_gradient_descent(
             Flag to break up the batch for parallel gradient descent
             calculations on multiple gpus. If True and multiple GPUs are
             present, each batch from the batch_handler will be divided up
-            between the GPUs and the resulting gradient from each GPU will
-            constitute a single gradient descent step with the nominal learning
-            rate that the model was initialized with.
+            between the GPUs and resulting gradients from each GPU will be
+            summed and then applied once per batch at the nominal learning
+            rate that the model and optimizer were initialized with.
         calc_loss_kwargs : dict
             Kwargs to pass to the self.calc_loss() method
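
The multi_gpu docstring above describes splitting each batch between the available GPUs before any gradients are computed. A rough sketch of that kind of split, assuming numpy arrays with samples on the first axis (the helper name is illustrative, not the repo's actual implementation):

import numpy as np

def split_batch_for_gpus(low_res, n_gpus):
    # divide one batch into roughly equal chunks, one chunk per GPU
    return np.array_split(low_res, n_gpus, axis=0)

batch = np.random.rand(8, 10, 10, 3)           # 8 samples in one batch
chunks = split_batch_for_gpus(batch, n_gpus=2)
print([c.shape for c in chunks])               # [(4, 10, 10, 3), (4, 10, 10, 3)]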
@@ -1376,9 +1376,19 @@ def run_gradient_descent(
                             **calc_loss_kwargs,
                         )
                     )
-            for _, future in enumerate(futures):
+
+            # sum the gradients from each gpu to weight equally in
+            # optimizer momentum calculation
+            total_grad = None
+            for future in futures:
                 grad, loss_details = future.result()
-                optimizer.apply_gradients(zip(grad, training_weights))
+                if total_grad is None:
+                    total_grad = grad
+                else:
+                    for i, igrad in enumerate(grad):
+                        total_grad[i] += igrad
+
+            optimizer.apply_gradients(zip(total_grad, training_weights))

         self.timer.stop()
         logger.debug(
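
To see the new accumulation pattern in isolation: each future returns one gradient list per GPU, the lists are summed element-wise, and apply_gradients runs a single time per batch. A self-contained sketch under the assumption of a TensorFlow optimizer and a stand-in per-GPU step (the helper below is hypothetical, not the repo's actual method):

import tensorflow as tf
from concurrent.futures import ThreadPoolExecutor

training_weights = [tf.Variable([1.0, 2.0])]
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)

def fake_gpu_step(scale):
    # stand-in for a per-GPU forward/backward pass on one batch chunk;
    # returns a list of gradients (one per weight) plus loss details
    return [tf.constant([0.1, 0.1]) * scale], {'loss': scale}

with ThreadPoolExecutor() as exe:
    futures = [exe.submit(fake_gpu_step, scale) for scale in (1.0, 2.0)]

# same pattern as the diff above: sum the gradients from each gpu so they
# weight equally in the optimizer momentum calculation
total_grad = None
for future in futures:
    grad, loss_details = future.result()
    if total_grad is None:
        total_grad = grad
    else:
        for i, igrad in enumerate(grad):
            total_grad[i] += igrad

optimizer.apply_gradients(zip(total_grad, training_weights))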
26 changes: 14 additions & 12 deletions sup3r/models/base.py
@@ -89,10 +89,10 @@ def __init__(
         default_device : str | None
             Option for default device placement of model weights. If None and a
             single GPU exists, that GPU will be the default device. If None and
-            multiple GPUs exist, the CPU will be the default device (this was
-            tested as most efficient given the custom multi-gpu strategy
-            developed in self.run_gradient_descent()). Examples: "/gpu:0" or
-            "/cpu:0"
+            multiple GPUs exist, the first GPU will be the default device
+            (this was tested as most efficient given the custom multi-gpu
+            strategy developed in self.run_gradient_descent()). Examples:
+            "/gpu:0" or "/cpu:0"
         name : str | None
             Optional name for the GAN.
         """
@@ -685,10 +685,11 @@ def train_epoch(
             Flag to break up the batch for parallel gradient descent
             calculations on multiple gpus. If True and multiple GPUs are
             present, each batch from the batch_handler will be divided up
-            between the GPUs and the resulting gradient from each GPU will
-            constitute a single gradient descent step with the nominal learning
-            rate that the model was initialized with. If true and multiple gpus
-            are found, default_device device should be set to /cpu:0
+            between the GPUs and resulting gradients from each GPU will be
+            summed and then applied once per batch at the nominal learning
+            rate that the model and optimizer were initialized with.
+            If true and multiple gpus are found, ``default_device`` device
+            should be set to /gpu:0

         Returns
         -------
@@ -931,10 +931,11 @@ def train(
             Flag to break up the batch for parallel gradient descent
             calculations on multiple gpus. If True and multiple GPUs are
             present, each batch from the batch_handler will be divided up
-            between the GPUs and the resulting gradient from each GPU will
-            constitute a single gradient descent step with the nominal learning
-            rate that the model was initialized with. If true and multiple gpus
-            are found, default_device device should be set to /cpu:0
+            between the GPUs and resulting gradients from each GPU will be
+            summed and then applied once per batch at the nominal learning
+            rate that the model and optimizer were initialized with.
+            If true and multiple gpus are found, ``default_device`` device
+            should be set to /gpu:0
         tensorboard_log : bool
             Whether to write log file for use with tensorboard. Log data can
             be viewed with ``tensorboard --logdir <logdir>`` where ``<logdir>``
