@@ -7,6 +7,11 @@ module nf_self_attention_layer
   implicit none
 
   type, extends(multihead_attention_layer) :: self_attention_layer
+    !! Self Attention Layer
+    !! Source:
+    !! Parikh, A. P., Taeckstroem, O., Das, D., & Uszkoreit, J. (2016)
+    !! A decomposable attention model for natural language inference.
+    !! https://arxiv.org/pdf/1606.01933
     real, allocatable :: gradient(:, :)
   contains
     procedure :: forward
@@ -50,6 +55,8 @@ module function self_attention_layer_cons(sequence_length, model_dimension, n_he
   end function self_attention_layer_cons
 
   module subroutine backward(self, input, gradient)
+    !! Self Attention back propagation
+    !! Returns sum of Query, Key and Value gradients
     class(self_attention_layer), intent(in out) :: self
     real, intent(in) :: input(:, :)
     real, intent(in) :: gradient(:, :)
@@ -62,6 +69,9 @@ module subroutine backward(self, input, gradient)
   end subroutine backward
 
   module subroutine forward(self, input)
+    !! Self Attention forward propagation
+    !! Passes input three times into MultiHead Attention
+    !! Input Shape: (sequence_length, model_dimension)
     class(self_attention_layer), intent(in out) :: self
     real, intent(in) :: input(:, :)
 
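The forward body itself is not part of this diff. Below is a minimal sketch of what the doc comment describes, assuming the parent multihead_attention_layer exposes a common_forward(query, key, value) procedure; that name and signature are assumptions, not taken from the diff. Self attention simply routes the same input into all three slots.

  ! Sketch only: common_forward and its (query, key, value) argument order
  ! are assumed from the parent multihead_attention_layer.
  module subroutine forward(self, input)
    !! Self Attention forward propagation
    !! Passes input three times into MultiHead Attention
    !! Input Shape: (sequence_length, model_dimension)
    class(self_attention_layer), intent(in out) :: self
    real, intent(in) :: input(:, :)

    ! Self attention: the same input serves as query, key and value.
    call self % common_forward(input, input, input)
  end subroutine forward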
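Likewise, a hedged sketch of the backward body: because forward fans the same input out to the query, key and value projections, the layer's input gradient is the sum of the gradients coming back through the three projections, which is what "Returns sum of Query, Key and Value gradients" refers to. The common_backward call and the query_layer/key_layer/value_layer component names are assumptions about the parent type, not shown in this diff.

  ! Sketch only: common_backward and the per-projection sub-layers are
  ! assumed components of multihead_attention_layer.
  module subroutine backward(self, input, gradient)
    !! Self Attention back propagation
    !! Returns sum of Query, Key and Value gradients
    class(self_attention_layer), intent(in out) :: self
    real, intent(in) :: input(:, :)
    real, intent(in) :: gradient(:, :)

    call self % common_backward(input, gradient)
    ! One input was fanned out to three projections in forward(), so its
    ! gradient is the sum of the gradients from each projection.
    self % gradient = self % query_layer % gradient &
                    + self % key_layer % gradient &
                    + self % value_layer % gradient
  end subroutine backward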