@@ -4,10 +4,10 @@ program quadratic_fit
   ! descent.
   use nf, only: dense, input, network
   use nf_dense_layer, only: dense_layer
-  use nf_optimizers, only: sgd
+  use nf_optimizers, only: sgd, rmsprop, adam
 
   implicit none
-  type(network) :: net(6)
+  type(network) :: net(9)
 
   ! Training parameters
   integer, parameter :: num_epochs = 1000
@@ -16,6 +16,9 @@ program quadratic_fit
   integer, parameter :: batch_size = 100
   real, parameter :: learning_rate = 0.01
   real, parameter :: decay_rate = 0.9
+  real, parameter :: beta1 = 0.85
+  real, parameter :: beta2 = 0.95
+  real, parameter :: epsilon = 1e-8
 
   ! Input and output data
   real, allocatable :: x(:), y(:) ! training data
@@ -51,19 +54,46 @@ program quadratic_fit
   call sgd_optimizer(net(1), x, y, xtest, ytest, learning_rate, num_epochs)
 
   ! SGD, momentum
-  call sgd_optimizer(net(2), x, y, xtest, ytest, learning_rate, num_epochs, momentum=0.9)
+  call sgd_optimizer( &
+    net(2), x, y, xtest, ytest, learning_rate, num_epochs, momentum=0.9 &
+  )
 
   ! SGD, momentum with Nesterov
-  call sgd_optimizer(net(3), x, y, xtest, ytest, learning_rate, num_epochs, momentum=0.9, nesterov=.true.)
+  call sgd_optimizer( &
+    net(3), x, y, xtest, ytest, learning_rate, num_epochs, &
+    momentum=0.9, nesterov=.true. &
+  )
 
   ! Batch SGD optimizer
   call batch_gd_optimizer(net(4), x, y, xtest, ytest, learning_rate, num_epochs)
 
   ! Mini-batch SGD optimizer
-  call minibatch_gd_optimizer(net(5), x, y, xtest, ytest, learning_rate, num_epochs, batch_size)
+  call minibatch_gd_optimizer( &
+    net(5), x, y, xtest, ytest, learning_rate, num_epochs, batch_size &
+  )
 
   ! RMSProp optimizer
-  call rmsprop_optimizer(net(6), x, y, xtest, ytest, learning_rate, num_epochs, decay_rate)
+  call rmsprop_optimizer( &
+    net(6), x, y, xtest, ytest, learning_rate, num_epochs, decay_rate &
+  )
+
+  ! Adam optimizer
+  call adam_optimizer( &
+    net(7), x, y, xtest, ytest, learning_rate, num_epochs, &
+    beta1, beta2, epsilon &
+  )
+
+  ! Adam optimizer with L2 regularization
+  call adam_optimizer( &
+    net(8), x, y, xtest, ytest, learning_rate, num_epochs, &
+    beta1, beta2, epsilon, weight_decay_l2=1e-4 &
+  )
+
+  ! Adam optimizer with decoupled weight decay regularization
+  call adam_optimizer( &
+    net(9), x, y, xtest, ytest, learning_rate, num_epochs, &
+    beta1, beta2, epsilon, weight_decay_decoupled=1e-5 &
+  )
 
 contains
 
@@ -73,7 +103,9 @@ real elemental function quadratic(x) result(y)
     y = (x**2 / 2 + x / 2 + 1) / 2
   end function quadratic
 
-  subroutine sgd_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs, momentum, nesterov)
+  subroutine sgd_optimizer( &
+    net, x, y, xtest, ytest, learning_rate, num_epochs, momentum, nesterov &
+  )
     ! In the stochastic gradient descent (SGD) optimizer, we run the forward
     ! and backward passes and update the weights for each training sample,
     ! one at a time.
@@ -109,12 +141,19 @@ subroutine sgd_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs, mom
       do i = 1, size(x)
         call net % forward([x(i)])
         call net % backward([y(i)])
-        call net % update(sgd(learning_rate=learning_rate, momentum=momentum_value, nesterov=nesterov_value))
+        call net % update( &
+          sgd( &
+            learning_rate=learning_rate, &
+            momentum=momentum_value, &
+            nesterov=nesterov_value &
+          ) &
+        )
       end do
 
       if (mod(n, num_epochs / 10) == 0) then
         ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))]
-        print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
+        print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', &
+          n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
       end if
 
     end do
@@ -123,7 +162,9 @@ subroutine sgd_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs, mom
 
   end subroutine sgd_optimizer
 
-  subroutine batch_gd_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs)
+  subroutine batch_gd_optimizer( &
+    net, x, y, xtest, ytest, learning_rate, num_epochs &
+  )
     ! Like the stochastic gradient descent (SGD) optimizer, except that here we
     ! accumulate the weight gradients for all training samples and update the
     ! weights once per epoch.
@@ -147,7 +188,8 @@ subroutine batch_gd_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs
 
       if (mod(n, num_epochs / 10) == 0) then
         ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))]
-        print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
+        print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', &
+          n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
       end if
 
     end do
@@ -156,7 +198,9 @@ subroutine batch_gd_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs
 
   end subroutine batch_gd_optimizer
 
-  subroutine minibatch_gd_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs, batch_size)
+  subroutine minibatch_gd_optimizer( &
+    net, x, y, xtest, ytest, learning_rate, num_epochs, batch_size &
+  )
     ! Like the batch SGD optimizer, except that here we accumulate the weight
     ! gradients over a number of mini batches and update the weights once per mini batch.
     !
@@ -203,7 +247,8 @@ subroutine minibatch_gd_optimizer(net, x, y, xtest, ytest, learning_rate, num_ep
 
       if (mod(n, num_epochs / 10) == 0) then
         ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))]
-        print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
+        print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', &
+          n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
       end if
 
     end do
@@ -212,79 +257,36 @@ subroutine minibatch_gd_optimizer(net, x, y, xtest, ytest, learning_rate, num_ep
 
   end subroutine minibatch_gd_optimizer
 
-  subroutine rmsprop_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs, decay_rate)
+  subroutine rmsprop_optimizer( &
+    net, x, y, xtest, ytest, learning_rate, num_epochs, decay_rate &
+  )
     ! RMSprop optimizer for updating weights using root mean square
     type(network), intent(inout) :: net
     real, intent(in) :: x(:), y(:)
     real, intent(in) :: xtest(:), ytest(:)
     real, intent(in) :: learning_rate, decay_rate
     integer, intent(in) :: num_epochs
     integer :: i, j, n
-    real, parameter :: epsilon = 1e-8 ! Small constant to avoid division by zero
     real, allocatable :: ypred(:)
 
-    ! Define a dedicated type to store the RMSprop gradients.
-    ! This is needed because array sizes vary between layers and we need to
-    ! keep track of gradients for each layer over time.
-    ! For now this works only for dense layers.
-    ! We will need to define a similar type for conv2d layers.
-    type :: rms_gradient_dense
-      real, allocatable :: dw(:,:)
-      real, allocatable :: db(:)
-    end type rms_gradient_dense
-
-    type(rms_gradient_dense), allocatable :: rms(:)
-
     print '(a)', 'RMSProp optimizer'
     print '(34("-"))'
 
-    ! Here we allocate the array or RMS gradient derived types.
-    ! We need one for each dense layer, however we will allocate it to the
-    ! length of all layers as it will make housekeeping easier.
-    allocate(rms(size(net % layers)))
-
     do n = 1, num_epochs
 
       do i = 1, size(x)
         call net % forward([x(i)])
         call net % backward([y(i)])
       end do
 
-      ! RMSprop update rule
-      do j = 1, size(net % layers)
-        select type (this_layer => net % layers(j) % p)
-          type is (dense_layer)
-
-            ! If this is our first time here for this layer, allocate the
-            ! internal RMS gradient arrays and initialize them to zero.
-            if (.not. allocated(rms(j) % dw)) then
-              allocate(rms(j) % dw, mold=this_layer % dw)
-              allocate(rms(j) % db, mold=this_layer % db)
-              rms(j) % dw = 0
-              rms(j) % db = 0
-            end if
-
-            ! Update the RMS gradients using the RMSprop moving average rule
-            rms(j) % dw = decay_rate * rms(j) % dw + (1 - decay_rate) * this_layer % dw**2
-            rms(j) % db = decay_rate * rms(j) % db + (1 - decay_rate) * this_layer % db**2
-
-            ! Update weights and biases using the RMSprop update rule
-            this_layer % weights = this_layer % weights - learning_rate &
-              / sqrt(rms(j) % dw + epsilon) * this_layer % dw
-            this_layer % biases = this_layer % biases - learning_rate &
-              / sqrt(rms(j) % db + epsilon) * this_layer % db
-
-            ! We have updated the weights and biases, so we need to reset the
-            ! gradients to zero for the next epoch.
-            this_layer % dw = 0
-            this_layer % db = 0
-
-        end select
-      end do
+      call net % update( &
+        rmsprop(learning_rate=learning_rate, decay_rate=decay_rate) &
+      )
 
       if (mod(n, num_epochs / 10) == 0) then
         ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))]
-        print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
+        print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', &
+          n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
       end if
 
     end do
@@ -293,6 +295,69 @@ subroutine rmsprop_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs,
 
   end subroutine rmsprop_optimizer
 
+  subroutine adam_optimizer( &
+    net, x, y, xtest, ytest, learning_rate, num_epochs, beta1, beta2, epsilon, &
+    weight_decay_l2, weight_decay_decoupled &
+  )
+    ! Adam optimizer
+    type(network), intent(inout) :: net
+    real, intent(in) :: x(:), y(:)
+    real, intent(in) :: xtest(:), ytest(:)
+    real, intent(in) :: learning_rate, beta1, beta2, epsilon
+    real, intent(in), optional :: weight_decay_l2
+    real, intent(in), optional :: weight_decay_decoupled
+    integer, intent(in) :: num_epochs
+    real, allocatable :: ypred(:)
+    integer :: i, n
+    real :: weight_decay_l2_val
+    real :: weight_decay_decoupled_val
+
+    ! Set default values for weight_decay_l2
+    if (.not. present(weight_decay_l2)) then
+      weight_decay_l2_val = 0.0
+    else
+      weight_decay_l2_val = weight_decay_l2
+    end if
+
+    ! Set default values for weight_decay_decoupled
+    if (.not. present(weight_decay_decoupled)) then
+      weight_decay_decoupled_val = 0.0
+    else
+      weight_decay_decoupled_val = weight_decay_decoupled
+    end if
+
+    print '(a)', 'Adam optimizer'
+    print '(34("-"))'
+
+    do n = 1, num_epochs
+      do i = 1, size(x)
+        call net % forward([x(i)])
+        call net % backward([y(i)])
+      end do
+
+      call net % update( &
+        adam( &
+          learning_rate=learning_rate, &
+          beta1=beta1, &
+          beta2=beta2, &
+          epsilon=epsilon, &
+          weight_decay_l2=weight_decay_l2_val, &
+          weight_decay_decoupled=weight_decay_decoupled_val &
+        ) &
+      )
+
+      if (mod(n, num_epochs / 10) == 0) then
+        ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))]
+        print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', &
+          n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
+      end if
+
+    end do
+
+    print *, ''
+
+  end subroutine adam_optimizer
+
   subroutine shuffle(arr)
     ! Shuffle an array using the Fisher-Yates algorithm.
     integer, intent(inout) :: arr(:)
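
For reference, the refactored sgd calls map onto the usual gradient descent update rules: plain SGD steps w_{t+1} = w_t - \eta \nabla L(w_t) per sample, while momentum=0.9 adds a velocity term. In a common formulation (the exact convention inside nf's sgd type is not visible in this diff):

  v_{t+1} = \alpha v_t - \eta \nabla L(w_t)
  w_{t+1} = w_t + v_{t+1}

with \alpha the momentum coefficient; setting nesterov=.true. evaluates the gradient at the look-ahead point w_t + \alpha v_t instead of at w_t.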
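
The hand-written RMSprop loop deleted above kept a per-layer moving average of squared gradients; that bookkeeping now presumably lives inside the library's rmsprop type. The update it implemented, with \rho = decay_rate and \eta = learning_rate, was

  v \leftarrow \rho v + (1 - \rho) g^2
  w \leftarrow w - \eta \, g / \sqrt{v + \epsilon}

where \epsilon = 10^{-8} guards against division by zero, exactly as in the deleted code.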
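
The adam constructor exposes beta1, beta2, and epsilon, which in the standard Adam formulation control the first- and second-moment averages and the numerical guard (whether nf applies \epsilon inside or outside the square root is not shown here):

  m \leftarrow \beta_1 m + (1 - \beta_1) g, \qquad v \leftarrow \beta_2 v + (1 - \beta_2) g^2
  \hat m = m / (1 - \beta_1^t), \qquad \hat v = v / (1 - \beta_2^t)
  w \leftarrow w - \eta \, \hat m / (\sqrt{\hat v} + \epsilon)

Judging by the argument names, weight_decay_l2 presumably folds the classic L2 penalty into the gradient (g \leftarrow g + \lambda w) before the moments are formed, while weight_decay_decoupled presumably subtracts \eta \lambda w from the weights after the Adam step, in the AdamW style.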
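
Condensed from the calls above, a minimal sketch of the pattern this commit converges on: run the forward and backward passes over the data, then hand a freshly constructed optimizer to net % update. Only the forward/backward/update calls and the adam keyword arguments are taken from the diff; the network topology, training data, and hyperparameter values below are illustrative.

program adam_sketch
  use nf, only: dense, input, network
  use nf_optimizers, only: adam
  implicit none

  type(network) :: net
  real :: x(100), y(100)
  integer :: i, n

  ! Illustrative 1-input, 1-output topology; not taken from this commit.
  net = network([input(1), dense(3), dense(1)])

  call random_number(x)
  y = (x**2 / 2 + x / 2 + 1) / 2  ! same quadratic target as the example

  do n = 1, 1000
    ! Accumulate gradients over the epoch, then apply a single Adam step.
    do i = 1, size(x)
      call net % forward([x(i)])
      call net % backward([y(i)])
    end do
    call net % update( &
      adam(learning_rate=0.01, beta1=0.85, beta2=0.95, epsilon=1e-8) &
    )
  end do

end program adam_sketch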