Commit be36d93

multihead_attention: add cross attention
1 parent eef37e3

2 files changed (+124, -0 lines)

Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
module nf_cross_attention_layer
  use iso_fortran_env, only: stderr => error_unit
  use nf_activation, only: softmax
  use nf_linear2d_layer, only: linear2d_layer
  use nf_multihead_attention_layer, only: multihead_attention_layer

  implicit none

  type, extends(multihead_attention_layer) :: cross_attention_layer
    real, allocatable :: gradient(:, :, :)
  contains
    procedure :: forward
    procedure :: backward
    procedure :: init
  end type cross_attention_layer

  interface cross_attention_layer
    module function cross_attention_layer_cons(sequence_length, model_dimension, n_heads) result(res)
      !! This function returns the `cross_attention_layer` instance.
      integer, intent(in) :: sequence_length, model_dimension, n_heads
      type(cross_attention_layer) :: res
    end function cross_attention_layer_cons
  end interface cross_attention_layer

contains
  module function cross_attention_layer_cons(sequence_length, model_dimension, n_heads) result(res)
    !! This function returns the `cross_attention_layer` instance.
    integer, intent(in) :: sequence_length, model_dimension, n_heads
    type(cross_attention_layer) :: res
    res % sequence_length = sequence_length
    res % model_dimension = model_dimension
    res % n_heads = n_heads

    if (mod(model_dimension, n_heads) /= 0) then
      write(stderr, '(a)') 'Model dimension must be divisible by the number of heads'
      error stop
    end if
    res % head_size = model_dimension / n_heads

    res % query_layer = linear2d_layer(sequence_length, model_dimension, model_dimension)
    res % key_layer = linear2d_layer(sequence_length, model_dimension, model_dimension)
    res % value_layer = linear2d_layer(sequence_length, model_dimension, model_dimension)
    res % output_layer = linear2d_layer(sequence_length, model_dimension, model_dimension)
    call res % query_layer % init([0])
    call res % key_layer % init([0])
    call res % value_layer % init([0])
    call res % output_layer % init([0])

    res % softmax_func = softmax()
  end function cross_attention_layer_cons

  module subroutine backward(self, input, gradient)
    !! Cross attention back propagation.
    !! The query gradient is written to `self % gradient(1, :, :)`; the key and
    !! value projections share the second input slot, so their gradients are
    !! summed into `self % gradient(2, :, :)`.
    class(cross_attention_layer), intent(in out) :: self
    real, intent(in) :: input(:, :, :)
    real, intent(in) :: gradient(:, :)

    call self % common_backward(input(1, :, :), gradient)
    self % gradient(1, :, :) = self % query_layer % gradient
    self % gradient(2, :, :) = self % key_layer % gradient + self % value_layer % gradient
  end subroutine backward

  module subroutine forward(self, input)
    !! Cross attention forward propagation.
    !! `input(1, :, :)` provides the query sequence and `input(2, :, :)` the
    !! key-value sequence.
    class(cross_attention_layer), intent(in out) :: self
    real, intent(in) :: input(:, :, :)

    call self % common_forward(input(1, :, :), input(2, :, :), input(2, :, :))
  end subroutine forward

  module subroutine init(self, input_shape)
    class(cross_attention_layer), intent(in out) :: self
    integer, intent(in) :: input_shape(:)

    call self % init_base(input_shape)
    allocate(self % gradient(2, self % sequence_length, self % model_dimension))
  end subroutine init
end module nf_cross_attention_layer
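
The new layer packs both sequences into one rank-3 array: input(1, :, :) carries the query sequence and input(2, :, :) the key-value sequence, and backward returns the input gradients in the same layout through the allocatable gradient component. Below is a minimal usage sketch, not part of the commit, assuming the module is compiled together with its nf_* dependencies; the program and variable names (cross_attention_sketch, attn, decoder_seq, encoder_seq, d_output) are illustrative only.

  ! Minimal sketch of driving cross_attention_layer directly.
  program cross_attention_sketch
    use nf_cross_attention_layer, only: cross_attention_layer
    implicit none

    type(cross_attention_layer) :: attn
    real :: decoder_seq(3, 4), encoder_seq(3, 4)  ! (sequence_length, model_dimension)
    real :: packed_input(2, 3, 4)                 ! slot 1: query, slot 2: key-value
    real :: d_output(3, 4)                        ! upstream gradient w.r.t. the layer output

    ! model_dimension must be divisible by n_heads (checked by the constructor).
    attn = cross_attention_layer(sequence_length=3, model_dimension=4, n_heads=2)
    call attn % init([0])

    call random_number(decoder_seq)
    call random_number(encoder_seq)
    d_output = 1.

    ! Queries come from the first slot, keys and values from the second.
    packed_input(1, :, :) = decoder_seq
    packed_input(2, :, :) = encoder_seq

    call attn % forward(packed_input)
    print *, 'output shape:', shape(attn % output)

    call attn % backward(packed_input, d_output)
    print *, 'query gradient sample:    ', attn % gradient(1, 1, 1)
    print *, 'key-value gradient sample:', attn % gradient(2, 1, 1)
  end program cross_attention_sketch

Because the key and value projections both read from the second slot, their gradients are accumulated into a single array, which is why gradient(2, :, :) is assigned the sum of key_layer % gradient and value_layer % gradient in the backward routine above.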

test/test_multihead_attention_layer.f90

Lines changed: 48 additions & 0 deletions
@@ -2,6 +2,7 @@ program test_multihead_attention_layer
   use iso_fortran_env, only: stderr => error_unit
   use nf_multihead_attention_layer, only: multihead_attention_layer
   use nf_self_attention_layer, only: self_attention_layer
+  use nf_cross_attention_layer, only: cross_attention_layer
   use nf_linear2d_layer, only: linear2d_layer
   use nf_optimizers, only: sgd
   implicit none
@@ -26,6 +27,7 @@ program test_multihead_attention_layer
   call test_multihead_attention_update_gradients(attention, ok)
   call test_multihead_attention_forward_reallife_shape(ok)
   call test_self_attention(ok)
+  call test_cross_attention(ok)

 contains
   subroutine test_multihead_attention_split_heads(attention, input, ok, output)
@@ -318,4 +320,50 @@ subroutine test_self_attention(ok)
       write(stderr, '(a)') 'backward returned incorrect values.. failed'
     end if
   end subroutine test_self_attention
+
+  subroutine test_cross_attention(ok)
+    logical, intent(in out) :: ok
+    type(cross_attention_layer) :: attention
+    real :: query(2, 3) = reshape([-1., 0., 17., .4, 5., .6], [2, 3])
+    real :: key_value(2, 3) = reshape([0.1, -.2, 0.3, 4., 15., 0.5], [2, 3])
+    real :: input(2, 2, 3)
+    real :: output(2, 2, 3)
+    real :: output_flat(6)
+    real :: expected_output_flat(6) = [&
+        0.600311756, 0.471662223, 0.600311756, 0.471662223, 0.600311756, 0.471662223&
+    ]
+    real :: gradient(2, 3) = reshape([1., 2., .17, 4., .5, 6.], [2, 3])
+    real :: query_gradient_flat(6)
+    real :: key_value_gradient_flat(6)
+    real :: expected_query_gradient_flat(6) = [&
+        1.48406753E-03, 0.184446245, 1.48406753E-03, 0.184446245, 1.48406753E-03, 0.184446245&
+    ]
+    real :: expected_key_value_gradient_flat(6) = [&
+        0.303095698, 0.107004307, 0.303095698, 0.107004307, 0.303095698, 0.107004307&
+    ]
+    input(1, :, :) = query
+    input(2, :, :) = key_value
+
+    attention = cross_attention_layer(sequence_length=2, model_dimension=3, n_heads=1)
+    call attention % init([0])
+
+    call attention % forward(input)
+    output_flat = reshape(attention % output, shape(output_flat))
+    if (.not. all(output_flat.eq.expected_output_flat)) then
+      ok = .false.
+      write(stderr, '(a)') 'forward returned incorrect values.. failed'
+    end if
+
+    call attention % backward(input, gradient)
+    query_gradient_flat = reshape(attention % gradient(1, :, :), shape(query_gradient_flat))
+    if (.not. all(query_gradient_flat.eq.expected_query_gradient_flat)) then
+      ok = .false.
+      write(stderr, '(a)') 'backward returned incorrect query values.. failed'
+    end if
+    key_value_gradient_flat = reshape(attention % gradient(2, :, :), shape(key_value_gradient_flat))
+    if (.not. all(key_value_gradient_flat.eq.expected_key_value_gradient_flat)) then
+      ok = .false.
+      write(stderr, '(a)') 'backward returned incorrect key-value values.. failed'
+    end if
+  end subroutine test_cross_attention
 end program test_multihead_attention_layer
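
For reference, the quantity the new test exercises is ordinary scaled dot-product attention, with the query taken from the first input slot and the keys and values from the second. The formula below is a sketch in conventional notation; the 1/sqrt(d_head) scaling is assumed to live in the inherited multihead_attention_layer and is not shown in this diff.

  % Cross attention per head: Q is projected from input(1,:,:),
  % K and V from input(2,:,:).
  \[
    \mathrm{CrossAttention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{Q K^{\top}}{\sqrt{d_{\text{head}}}}\right) V
  \]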

0 commit comments
