
Commit eef37e3

multihead_attention: self attention
1 parent 90d3d6c commit eef37e3

2 files changed: +116 -3 lines changed

src/nf/nf_self_attention_layer.f90

Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
module nf_self_attention_layer
  use iso_fortran_env, only: stderr => error_unit
  use nf_activation, only: softmax
  use nf_linear2d_layer, only: linear2d_layer
  use nf_multihead_attention_layer, only: multihead_attention_layer

  implicit none

  type, extends(multihead_attention_layer) :: self_attention_layer
    real, allocatable :: gradient(:, :)
  contains
    procedure :: forward
    procedure :: backward
    procedure :: init
  end type self_attention_layer

  interface self_attention_layer
    module function self_attention_layer_cons(sequence_length, model_dimension, n_heads) result(res)
      !! This function returns the `self_attention_layer` instance.
      integer, intent(in) :: sequence_length, model_dimension, n_heads
      type(self_attention_layer) :: res
    end function self_attention_layer_cons
  end interface self_attention_layer

contains
  module function self_attention_layer_cons(sequence_length, model_dimension, n_heads) result(res)
    !! This function returns the `self_attention_layer` instance.
    integer, intent(in) :: sequence_length, model_dimension, n_heads
    type(self_attention_layer) :: res
    res % sequence_length = sequence_length
    res % model_dimension = model_dimension
    res % n_heads = n_heads

    if (mod(model_dimension, n_heads) /= 0) then
      write(stderr, '(a)') 'Model dimension must be divisible by the number of heads'
      error stop
    end if
    res % head_size = model_dimension / n_heads

    res % query_layer = linear2d_layer(sequence_length, model_dimension, model_dimension)
    res % key_layer = linear2d_layer(sequence_length, model_dimension, model_dimension)
    res % value_layer = linear2d_layer(sequence_length, model_dimension, model_dimension)
    res % output_layer = linear2d_layer(sequence_length, model_dimension, model_dimension)
    call res % query_layer % init([0])
    call res % key_layer % init([0])
    call res % value_layer % init([0])
    call res % output_layer % init([0])

    res % softmax_func = softmax()
  end function self_attention_layer_cons

  module subroutine backward(self, input, gradient)
    class(self_attention_layer), intent(in out) :: self
    real, intent(in) :: input(:, :)
    real, intent(in) :: gradient(:, :)

    call self % common_backward(input, gradient)
    self % gradient = &
        self % query_layer % gradient &
        + self % key_layer % gradient &
        + self % value_layer % gradient
  end subroutine backward

  module subroutine forward(self, input)
    class(self_attention_layer), intent(in out) :: self
    real, intent(in) :: input(:, :)

    call self % common_forward(input, input, input)
  end subroutine forward

  module subroutine init(self, input_shape)
    class(self_attention_layer), intent(in out) :: self
    integer, intent(in) :: input_shape(:)

    call self % init_base(input_shape)
    allocate(self % gradient(self % sequence_length, self % model_dimension))
  end subroutine init
end module nf_self_attention_layer
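Why `backward` sums three gradients: the forward pass feeds the same input to the query, key, and value projections (`common_forward(input, input, input)`), so by the chain rule the gradient with respect to that shared input is the sum of the gradients flowing back through the three projections, which is what the subroutine accumulates into `self % gradient`. Writing X for the shared input and X_Q, X_K, X_V for the inputs seen by the three projections (notation introduced here, not part of the commit):

\[
  \frac{\partial L}{\partial X}
    = \frac{\partial L}{\partial X_Q}
    + \frac{\partial L}{\partial X_K}
    + \frac{\partial L}{\partial X_V},
  \qquad X_Q = X_K = X_V = X .
\]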

test/test_multihead_attention_layer.f90

Lines changed: 38 additions & 3 deletions
@@ -1,6 +1,7 @@
 program test_multihead_attention_layer
   use iso_fortran_env, only: stderr => error_unit
   use nf_multihead_attention_layer, only: multihead_attention_layer
+  use nf_self_attention_layer, only: self_attention_layer
   use nf_linear2d_layer, only: linear2d_layer
   use nf_optimizers, only: sgd
   implicit none
@@ -13,7 +14,7 @@ program test_multihead_attention_layer
   real :: output(3, 2, 2)
 
   attention = multihead_attention_layer(sequence_length=3, model_dimension=4, n_heads=2)
-  call attention % init([0])
+  call attention % init_base([0])
 
   call test_multihead_attention_split_heads(attention, sample_input, ok, split_heads_output)
   call test_multihead_attention_create_attention_matrix(attention, split_heads_output, ok)
@@ -24,6 +25,7 @@ program test_multihead_attention_layer
   call test_multihead_attention_backward(attention, ok)
   call test_multihead_attention_update_gradients(attention, ok)
   call test_multihead_attention_forward_reallife_shape(ok)
+  call test_self_attention(ok)
 
 contains
   subroutine test_multihead_attention_split_heads(attention, input, ok, output)
@@ -139,7 +141,7 @@ subroutine test_multihead_attention_forward(attention, ok)
     type(multihead_attention_layer), intent(in out) :: attention
     logical, intent(in out) :: ok
     real :: input(3, 4) = reshape([0.0, 10.1, 0.2, 10.3, 0.4, 10.5, 0.6, 10.7, 10.8, 0.9, 0.11, 0.12], [3, 4])
-    real :: output(attention % sequence_length, attention % model_dimension, attention % batch_size)
+    real :: output(attention % sequence_length, attention % model_dimension)
     real :: output_flat(12)
     integer :: output_shape(2)
     integer :: attn_weights_shape(3)
@@ -194,7 +196,7 @@ subroutine test_multihead_attention_forward_reallife_shape(ok)
     call random_number(input)
 
     attention = multihead_attention_layer(sequence_length=148, model_dimension=512, n_heads=8)
-    call attention % init([0])
+    call attention % init_base([0])
 
     call attention % common_forward(input, input, input)
 
@@ -283,4 +285,37 @@ subroutine test_multihead_attention_update_gradients(attention, ok)
       write(stderr, '(a)') 'incorrect output after parameters update.. failed'
     end if
   end subroutine test_multihead_attention_update_gradients
+
+  subroutine test_self_attention(ok)
+    logical, intent(in out) :: ok
+    type(self_attention_layer) :: attention
+    real :: input(2, 3) = reshape([-1., 0., 17., .4, 5., .6], [2, 3])
+    real :: output(2, 3)
+    real :: output_flat(6)
+    real :: expected_output_flat(6) = [&
+        0.772716165, 0.577548742, 0.772716165, 0.577548742, 0.772716165, 0.577548742&
+    ]
+    real :: gradient(2, 3) = reshape([1., 2., .17, 4., .5, 6.], [2, 3])
+    real :: gradient_flat(6)
+    real :: expected_gradient_flat(6) = [&
+        0.350671142, 0.607403040, 0.350671142, 0.607403040, 0.350671142, 0.607403040&
+    ]
+
+    attention = self_attention_layer(sequence_length=2, model_dimension=3, n_heads=1)
+    call attention % init([0])
+
+    call attention % forward(input)
+    output_flat = reshape(attention % output, shape(output_flat))
+    if (.not. all(output_flat.eq.expected_output_flat)) then
+      ok = .false.
+      write(stderr, '(a)') 'forward returned incorrect values.. failed'
+    end if
+
+    call attention % backward(input, gradient)
+    gradient_flat = reshape(attention % gradient, shape(gradient_flat))
+    if (.not. all(gradient_flat.eq.expected_gradient_flat)) then
+      ok = .false.
+      write(stderr, '(a)') 'backward returned incorrect values.. failed'
+    end if
+  end subroutine test_self_attention
 end program test_multihead_attention_layer
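For context, a minimal standalone driver in the same spirit as test_self_attention above. This is a sketch, not part of the commit: it assumes the nf_self_attention_layer module from this commit is built and linked, the program name is made up, and the shapes and values are illustrative.

program self_attention_usage_sketch
  ! Minimal sketch of the new layer's API, mirroring test_self_attention above.
  ! Assumes nf_self_attention_layer from this commit is available on the build path.
  use nf_self_attention_layer, only: self_attention_layer
  implicit none

  type(self_attention_layer) :: attention
  real :: input(2, 3)
  real :: gradient(2, 3)

  ! Illustrative values; any (sequence_length, model_dimension) array works.
  input = reshape([-1., 0., 17., .4, 5., .6], [2, 3])
  gradient = reshape([1., 2., .17, 4., .5, 6.], [2, 3])

  ! Construct and initialize; model_dimension must be divisible by n_heads.
  attention = self_attention_layer(sequence_length=2, model_dimension=3, n_heads=1)
  call attention % init([0])

  ! Forward pass uses the same input for query, key, and value.
  call attention % forward(input)
  print *, 'output shape:   ', shape(attention % output)

  ! Backward pass accumulates the input gradient in attention % gradient.
  call attention % backward(input, gradient)
  print *, 'gradient shape: ', shape(attention % gradient)
end program self_attention_usage_sketch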
