@@ -44,6 +44,7 @@ end subroutine minimize
     real :: momentum = 0
     logical :: nesterov = .false.
     real, allocatable, private :: velocity(:)
+    integer, private :: start_index = 1
   contains
     procedure :: init => init_sgd
     procedure :: minimize => minimize_sgd
@@ -59,6 +60,7 @@ end subroutine minimize
     real :: decay_rate = 0.9
     real :: epsilon = 1e-8
     real, allocatable, private :: rms_gradient(:)
+    integer, private :: start_index = 1
   contains
     procedure :: init => init_rmsprop
     procedure :: minimize => minimize_rmsprop
@@ -82,6 +84,7 @@ end subroutine minimize
     real :: weight_decay_decoupled = 0 ! decoupled weight decay regularization (AdamW)
     real, allocatable, private :: m(:), v(:)
     integer, private :: t = 0
+    integer, private :: start_index = 1
   contains
     procedure :: init => init_adam
     procedure :: minimize => minimize_adam
@@ -99,6 +102,7 @@ end subroutine minimize
     real :: learning_rate_decay = 0
     real, allocatable, private :: sum_squared_gradient(:)
     integer, private :: t = 0
+    integer, private :: start_index = 1
   contains
     procedure :: init => init_adagrad
     procedure :: minimize => minimize_adagrad
@@ -121,19 +125,38 @@ pure subroutine minimize_sgd(self, param, gradient)
     !! update rule.
     class(sgd), intent(inout) :: self
     real, intent(inout) :: param(:)
-    real, intent(in) :: gradient(:)
+    real, intent(in) :: gradient(:) ! Always the same size as param
+    integer :: end_index
 
     if (self % momentum > 0) then
+
+      ! end_index is part of the bookkeeping for updating velocity because each
+      ! batch update makes two calls to minimize, one for the weights and one for
+      ! the biases.
+      ! We use start_index and end_index to update the appropriate sections
+      ! of the velocity array.
+      end_index = self % start_index + size(param) - 1
+
       ! Apply momentum update
-      self % velocity = self % momentum * self % velocity &
+      self % velocity(self % start_index:end_index) = &
+        self % momentum * self % velocity(self % start_index:end_index) &
         - self % learning_rate * gradient
       if (self % nesterov) then
         ! Apply Nesterov update
-        param = param + self % momentum * self % velocity &
+        param = param + self % momentum * self % velocity(self % start_index:end_index) &
           - self % learning_rate * gradient
       else
-        param = param + self % velocity
+        param = param + self % velocity(self % start_index:end_index)
+      end if
+
+      if (self % start_index == 1) then
+        ! We updated the weights part, now we shift forward for the biases part
+        self % start_index = end_index + 1
+      else
+        ! We updated the biases part, now we shift back to start for the next batch
+        self % start_index = 1
       end if
+
     else
       ! Apply regular update
       param = param - self % learning_rate * gradient
@@ -157,14 +180,27 @@ pure subroutine minimize_rmsprop(self, param, gradient)
     class(rmsprop), intent(inout) :: self
     real, intent(inout) :: param(:)
     real, intent(in) :: gradient(:)
+    integer :: end_index
+
+    end_index = self % start_index + size(param) - 1
 
     ! Compute the RMS of the gradient using the RMSProp rule
-    self % rms_gradient = self % decay_rate * self % rms_gradient &
+    self % rms_gradient(self % start_index:end_index) = &
+      self % decay_rate * self % rms_gradient(self % start_index:end_index) &
       + (1 - self % decay_rate) * gradient**2
 
     ! Update the network parameters based on the new RMS of the gradient
     param = param - self % learning_rate &
-      / sqrt(self % rms_gradient + self % epsilon) * gradient
+      / sqrt(self % rms_gradient(self % start_index:end_index) + self % epsilon) &
+      * gradient
+
+    if (self % start_index == 1) then
+      ! We updated the weights part, now we shift forward for the biases part
+      self % start_index = end_index + 1
+    else
+      ! We updated the biases part, now we shift back to start for the next batch
+      self % start_index = 1
+    end if
 
   end subroutine minimize_rmsprop
 
@@ -185,20 +221,27 @@ pure subroutine minimize_adam(self, param, gradient)
     class(adam), intent(inout) :: self
     real, intent(inout) :: param(:)
     real, intent(in) :: gradient(:)
+    integer :: end_index
+
+    end_index = self % start_index + size(param) - 1
 
     self % t = self % t + 1
 
     ! If weight_decay_l2 > 0, use L2 regularization;
     ! otherwise, default to regular Adam.
     associate(g => gradient + self % weight_decay_l2 * param)
-      self % m = self % beta1 * self % m + (1 - self % beta1) * g
-      self % v = self % beta2 * self % v + (1 - self % beta2) * g**2
+      self % m(self % start_index:end_index) = &
+        self % beta1 * self % m(self % start_index:end_index) &
+        + (1 - self % beta1) * g
+      self % v(self % start_index:end_index) = &
+        self % beta2 * self % v(self % start_index:end_index) &
+        + (1 - self % beta2) * g**2
     end associate
 
     ! Compute bias-corrected first and second moment estimates.
     associate( &
-      m_hat => self % m / (1 - self % beta1**self % t), &
-      v_hat => self % v / (1 - self % beta2**self % t) &
+      m_hat => self % m(self % start_index:end_index) / (1 - self % beta1**self % t), &
+      v_hat => self % v(self % start_index:end_index) / (1 - self % beta2**self % t) &
     )
 
       ! Update parameters.
@@ -208,6 +251,14 @@ pure subroutine minimize_adam(self, param, gradient)
 
     end associate
 
+    if (self % start_index == 1) then
+      ! We updated the weights part, now we shift forward for the biases part
+      self % start_index = end_index + 1
+    else
+      ! We updated the biases part, now we shift back to start for the next batch
+      self % start_index = 1
+    end if
+
   end subroutine minimize_adam
 
 
@@ -226,6 +277,9 @@ pure subroutine minimize_adagrad(self, param, gradient)
     class(adagrad), intent(inout) :: self
     real, intent(inout) :: param(:)
     real, intent(in) :: gradient(:)
+    integer :: end_index
+
+    end_index = self % start_index + size(param) - 1
 
     ! Update the current time step
     self % t = self % t + 1
@@ -239,13 +293,23 @@ pure subroutine minimize_adagrad(self, param, gradient)
         / (1 + (self % t - 1) * self % learning_rate_decay) &
       )
 
-      self % sum_squared_gradient = self % sum_squared_gradient + g**2
+      self % sum_squared_gradient(self % start_index:end_index) = &
+        self % sum_squared_gradient(self % start_index:end_index) + g**2
 
-      param = param - learning_rate * g / (sqrt(self % sum_squared_gradient) &
+      param = param - learning_rate * g &
+        / (sqrt(self % sum_squared_gradient(self % start_index:end_index)) &
         + self % epsilon)
 
     end associate
 
+    if (self % start_index == 1) then
+      ! We updated the weights part, now we shift forward for the biases part
+      self % start_index = end_index + 1
+    else
+      ! We updated the biases part, now we shift back to start for the next batch
+      self % start_index = 1
+    end if
+
   end subroutine minimize_adagrad
 
 end module nf_optimizers
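
The bookkeeping comments in the minimize_sgd hunk above note that each batch update makes two calls to minimize, one for a layer's weights and one for its biases, and that start_index/end_index select the matching slice of the optimizer's state array. The following is a minimal, self-contained sketch, not part of nf_optimizers; the program name, array sizes, and values are invented for illustration. It shows how the index toggles between the weights slice and the biases slice of a shared velocity array across two consecutive calls.

program sgd_slice_sketch
  implicit none

  ! Toy layer: 4 weights and 2 biases sharing one velocity array,
  ! as a single optimizer instance would see them.
  real :: weights(4) = 1.0
  real :: biases(2) = 0.5
  real :: grad_w(4) = 0.1
  real :: grad_b(2) = 0.2
  real :: velocity(6) = 0.0                 ! size(weights) + size(biases)
  real, parameter :: learning_rate = 0.01
  real, parameter :: momentum = 0.9
  integer :: start_index = 1

  ! One batch update = two calls: weights first, then biases.
  call update(weights, grad_w)              ! uses velocity(1:4), start_index -> 5
  call update(biases, grad_b)               ! uses velocity(5:6), start_index -> 1

  print *, 'weights after update:', weights
  print *, 'biases after update: ', biases

contains

  subroutine update(param, gradient)
    real, intent(inout) :: param(:)
    real, intent(in) :: gradient(:)
    integer :: end_index

    end_index = start_index + size(param) - 1

    ! Momentum update restricted to this call's slice of velocity
    velocity(start_index:end_index) = &
      momentum * velocity(start_index:end_index) &
      - learning_rate * gradient
    param = param + velocity(start_index:end_index)

    ! Toggle: forward to the biases slice, or back to 1 for the next batch
    if (start_index == 1) then
      start_index = end_index + 1
    else
      start_index = 1
    end if

  end subroutine update

end program sgd_slice_sketch

The same toggle is applied verbatim to rms_gradient, m, v, and sum_squared_gradient in the other optimizers, which is why each minimize_* routine in the diff ends with the identical start_index update.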