
Commit ccc180e

layernorm: public api
1 parent: bdefd02

File tree

5 files changed: +78, -15 lines


src/nf.f90

Lines changed: 2 additions & 1 deletion
@@ -11,7 +11,8 @@ module nf
     linear2d, &
     maxpool2d, &
     reshape, &
-    self_attention
+    self_attention, &
+    layer_normalization
   use nf_loss, only: mse, quadratic
   use nf_metrics, only: corr, maxabs
   use nf_network, only: network
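
With layer_normalization re-exported here, user code can pull it straight from the nf module. The sketch below is illustrative only and is not part of this commit; the two-argument input() shape, the concrete layer sizes, and the network-level get_num_params() call are assumptions made for the example.

program layernorm_api_demo
  use nf, only: network, input, linear2d, self_attention, layer_normalization
  implicit none
  type(network) :: net

  ! layer_normalization may follow linear2d or self_attention
  ! (see the "Upstream layers permitted" checks in nf_layer_submodule.f90 below).
  net = network([ &
    input(12, 64), &            ! hypothetical (sequence_length, model_dimension)
    self_attention(4), &
    layer_normalization(), &
    linear2d(64), &
    layer_normalization() &
  ])

  print *, 'Trainable parameters:', net % get_num_params()
end program layernorm_api_demo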

src/nf/nf_layer_constructors.f90

Lines changed: 19 additions & 10 deletions
@@ -17,7 +17,8 @@ module nf_layer_constructors
     linear2d, &
     maxpool2d, &
     reshape, &
-    self_attention
+    self_attention, &
+    layer_normalization

   interface input

@@ -222,15 +223,23 @@ module function linear2d(out_features) result(res)
       !! Resulting layer instance
   end function linear2d

-  module function self_attention(num_heads) result(res)
-    !! Rank-2 (sequence_length, out_features) self attention constructor.
-    !! sequence_length and model_dimension are determined at layer initialization, based on the
-    !! output shape of the previous layer.
-    integer, intent(in) :: num_heads
-      !! Number of attention heads
-    type(layer) :: res
-      !! Resulting layer instance
-  end function self_attention
+  module function self_attention(num_heads) result(res)
+    !! Rank-2 (sequence_length, out_features) self attention constructor.
+    !! sequence_length and model_dimension are determined at layer initialization, based on the
+    !! output shape of the previous layer.
+    integer, intent(in) :: num_heads
+      !! Number of attention heads
+    type(layer) :: res
+      !! Resulting layer instance
+  end function self_attention
+
+  module function layer_normalization() result(res)
+    !! Layer Normalization
+    !! (x - mean(x)) / sqrt(variance(x) + eps) * gamma + beta
+    !! Based upon `Ba, Jimmy Lei, Jamie Ryan Kiros, and Geoffrey E. Hinton (2016)`:
+    !! https://arxiv.org/abs/1607.06450v1
+    type(layer) :: res
+  end function layer_normalization

   end interface

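For reference, here is the formula from the new constructor's docstring written out as code. This is a minimal, self-contained sketch of the math only, assuming per-feature-vector normalization with learned gamma and beta and a small eps; it is not the nf_layernorm_layer implementation, which is outside this commit.

module layernorm_math_sketch
  implicit none
contains

  ! (x - mean(x)) / sqrt(variance(x) + eps) * gamma + beta, applied to one feature vector
  pure function layernorm_row(x, gamma, beta, eps) result(y)
    real, intent(in) :: x(:)      ! one row of a (sequence_length, model_dimension) activation
    real, intent(in) :: gamma(:)  ! learned scale, same size as x
    real, intent(in) :: beta(:)   ! learned shift, same size as x
    real, intent(in) :: eps       ! small constant for numerical stability
    real :: y(size(x))
    real :: mu, var

    mu = sum(x) / size(x)
    var = sum((x - mu)**2) / size(x)
    y = (x - mu) / sqrt(var + eps) * gamma + beta
  end function layernorm_row

end module layernorm_math_sketch
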
src/nf/nf_layer_constructors_submodule.f90

Lines changed: 8 additions & 0 deletions
@@ -12,6 +12,7 @@
   use nf_reshape_layer, only: reshape3d_layer
   use nf_linear2d_layer, only: linear2d_layer
   use nf_self_attention_layer, only: self_attention_layer
+  use nf_layernorm_layer, only: layernorm_layer
   use nf_activation, only: activation_function, relu, sigmoid

   implicit none
@@ -179,4 +180,11 @@ module function self_attention(num_heads) result(res)
     allocate(res % p, source=self_attention_layer(num_heads))
   end function self_attention

+  module function layer_normalization() result(res)
+    type(layer) :: res
+
+    res % name = 'layer_normalization'
+    allocate(res % p, source=layernorm_layer())
+  end function layer_normalization
+
 end submodule nf_layer_constructors_submodule

src/nf/nf_layer_submodule.f90

Lines changed: 46 additions & 4 deletions
@@ -12,6 +12,7 @@
   use nf_reshape_layer, only: reshape3d_layer
   use nf_linear2d_layer, only: linear2d_layer
   use nf_self_attention_layer, only: self_attention_layer
+  use nf_layernorm_layer, only: layernorm_layer
   use nf_optimizers, only: optimizer_base_type

 contains
@@ -60,6 +61,8 @@ pure module subroutine backward_1d(self, previous, gradient)
           call this_layer % backward(prev_layer % output, gradient)
         type is(self_attention_layer)
           call this_layer % backward(prev_layer % output, gradient)
+        type is(layernorm_layer)
+          call this_layer % backward(prev_layer % output, gradient)
       end select

   end select
@@ -84,6 +87,8 @@ pure module subroutine backward_2d(self, previous, gradient)
           call this_layer % backward(prev_layer % output, gradient)
         type is(self_attention_layer)
           call this_layer % backward(prev_layer % output, gradient)
+        type is(layernorm_layer)
+          call this_layer % backward(prev_layer % output, gradient)
       end select

     type is(self_attention_layer)
@@ -95,8 +100,18 @@ pure module subroutine backward_2d(self, previous, gradient)
           call this_layer % backward(prev_layer % output, gradient)
         type is(self_attention_layer)
           call this_layer % backward(prev_layer % output, gradient)
+        type is(layernorm_layer)
+          call this_layer % backward(prev_layer % output, gradient)
       end select

+    type is(layernorm_layer)
+
+      select type(prev_layer => previous % p)
+        type is(linear2d_layer)
+          call this_layer % backward(prev_layer % output, gradient)
+        type is(self_attention_layer)
+          call this_layer % backward(prev_layer % output, gradient)
+      end select
   end select

 end subroutine backward_2d
@@ -250,26 +265,40 @@ module subroutine forward(self, input)

     type is(linear2d_layer)

-      ! Upstream layers permitted: input2d, linear2d
+      ! Upstream layers permitted: input2d, linear2d, self_attention, layer_normalization
       select type(prev_layer => input % p)
         type is(input2d_layer)
           call this_layer % forward(prev_layer % output)
         type is(linear2d_layer)
           call this_layer % forward(prev_layer % output)
         type is(self_attention_layer)
           call this_layer % forward(prev_layer % output)
+        type is(layernorm_layer)
+          call this_layer % forward(prev_layer % output)
       end select

     type is(self_attention_layer)

-      ! Upstream layers permitted: input2d, linear2d
+      ! Upstream layers permitted: input2d, linear2d, self_attention, layer_normalization
       select type(prev_layer => input % p)
         type is(input2d_layer)
           call this_layer % forward(prev_layer % output)
         type is(linear2d_layer)
           call this_layer % forward(prev_layer % output)
         type is(self_attention_layer)
           call this_layer % forward(prev_layer % output)
+        type is(layernorm_layer)
+          call this_layer % forward(prev_layer % output)
+      end select
+
+    type is(layernorm_layer)
+
+      ! Upstream layers permitted: linear2d, self_attention
+      select type(prev_layer => input % p)
+        type is(linear2d_layer)
+          call this_layer % forward(prev_layer % output)
+        type is(self_attention_layer)
+          call this_layer % forward(prev_layer % output)
       end select

   end select
@@ -311,6 +340,8 @@ pure module subroutine get_output_2d(self, output)
         allocate(output, source=this_layer % output)
       type is(self_attention_layer)
         allocate(output, source=this_layer % output)
+      type is(layernorm_layer)
+        allocate(output, source=this_layer % output)
       class default
         error stop '2-d output can only be read from an input2d or linear2d layer.'

@@ -354,8 +385,8 @@ impure elemental module subroutine init(self, input)
       call this_layer % init(input % layer_shape)
   end select

-  ! The shape of conv2d, dropout, flatten, linear2d, maxpool2d, or
-  ! self_attention layers is not known until we receive an input layer.
+  ! The shape of conv2d, dropout, flatten, linear2d, maxpool2d,
+  ! self_attention or layernorm layers is not known until we receive an input layer.
   select type(this_layer => self % p)
     type is(conv2d_layer)
       self % layer_shape = shape(this_layer % output)
@@ -367,6 +398,8 @@ impure elemental module subroutine init(self, input)
       self % layer_shape = shape(this_layer % output)
     type is(self_attention_layer)
       self % layer_shape = shape(this_layer % output)
+    type is(layernorm_layer)
+      self % layer_shape = shape(this_layer % output)
     type is(maxpool2d_layer)
       self % layer_shape = shape(this_layer % output)
   end select
@@ -425,6 +458,8 @@ elemental module function get_num_params(self) result(num_params)
       num_params = this_layer % get_num_params()
     type is (self_attention_layer)
       num_params = this_layer % get_num_params()
+    type is (layernorm_layer)
+      num_params = this_layer % get_num_params()
     class default
       error stop 'Unknown layer type.'
   end select
@@ -458,6 +493,8 @@ module function get_params(self) result(params)
       params = this_layer % get_params()
     type is (self_attention_layer)
       params = this_layer % get_params()
+    type is (layernorm_layer)
+      params = this_layer % get_params()
     class default
       error stop 'Unknown layer type.'
   end select
@@ -491,6 +528,8 @@ module function get_gradients(self) result(gradients)
       gradients = this_layer % get_gradients()
     type is (self_attention_layer)
       gradients = this_layer % get_gradients()
+    type is (layernorm_layer)
+      gradients = this_layer % get_gradients()
     class default
       error stop 'Unknown layer type.'
   end select
@@ -549,6 +588,9 @@ module subroutine set_params(self, params)
     type is (self_attention_layer)
       call this_layer % set_params(params)

+    type is (layernorm_layer)
+      call this_layer % set_params(params)
+
     type is (maxpool2d_layer)
       ! No parameters to set.
       write(stderr, '(a)') 'Warning: calling set_params() ' &

src/nf/nf_network_submodule.f90

Lines changed: 3 additions & 0 deletions
@@ -11,6 +11,7 @@
   use nf_reshape_layer, only: reshape3d_layer
   use nf_linear2d_layer, only: linear2d_layer
   use nf_self_attention_layer, only: self_attention_layer
+  use nf_layernorm_layer, only: layernorm_layer
   use nf_layer, only: layer
   use nf_layer_constructors, only: conv2d, dense, flatten, input, maxpool2d, reshape
   use nf_loss, only: quadratic
@@ -163,6 +164,8 @@ module subroutine backward(self, output, loss)
             call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient)
           type is(self_attention_layer)
             call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient)
+          type is(layernorm_layer)
+            call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient)
         end select
       end if
