deviate from paper and use softclamping instead for laser
lucidrains committed Dec 3, 2024
1 parent 5b4ddef commit 544c699
Showing 2 changed files with 5 additions and 4 deletions.
setup.py: 2 changes (1 addition, 1 deletion)
@@ -3,7 +3,7 @@
 setup(
   name = 'x-transformers',
   packages = find_packages(exclude=['examples']),
-  version = '1.42.22',
+  version = '1.42.23',
   license='MIT',
   description = 'X-Transformers - Pytorch',
   author = 'Phil Wang',
x_transformers/x_transformers.py: 7 changes (4 additions, 3 deletions)
@@ -1079,6 +1079,7 @@ def __init__(
         neutreno_alpha = 0.4,
         learned_value_residual_mix = False,
         laser = False, # https://arxiv.org/abs/2411.03493v1
+        laser_softclamp_value = 15.,
         onnxable = False,
         attend_sdp_kwargs: dict = dict(
             enable_flash = True,
@@ -1121,6 +1122,7 @@ def __init__(
         # enhancing gradients to attention through exponentiated values

         self.laser = laser
+        self.laser_softclamp_value = laser_softclamp_value

         # relations projection from tp-attention

@@ -1448,8 +1450,7 @@ def forward(
                 attn_bias = pad_at_dim(attn_bias, (num_mem_kv, 0))

         if self.laser:
-            values_max = v.amax(dim = -2, keepdim = True).detach() # numerical stability
-            v = v - values_max
+            v = softclamp(v, self.laser_softclamp_value)
             v = v.exp()

         # attention is all we need
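
For context, softclamp is defined elsewhere in the repository and is not part of this diff. A minimal sketch of the changed value path, assuming the usual tanh-based softclamp definition (an assumption, not confirmed by this commit): the clamp bounds the values so the subsequent .exp() cannot overflow, which is what the removed amax-subtraction was guarding against.

    import torch

    def softclamp(t, value):
        # assumed tanh-based definition: smoothly bounds t to (-value, value)
        return (t / value).tanh() * value

    v = torch.randn(2, 8, 1024, 64) * 100.   # values with large magnitude
    v = softclamp(v, 15.)                    # bounded to roughly (-15, 15)
    v = v.exp()                              # safe: max is about e^15, fine in fp32
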
@@ -1464,7 +1465,7 @@ def forward(
         # laser

         if self.laser:
-            out = log(out) + values_max
+            out = log(out)

         # store the values for resformer or Neutreno

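Putting the two changed sites in forward together, the LASER path exponentiates the values before attention and takes the log of the output afterwards. A hedged end-to-end sketch, using torch's built-in scaled_dot_product_attention in place of the repo's Attend module and the assumed softclamp above:

    import torch
    import torch.nn.functional as F

    def softclamp(t, value):
        return (t / value).tanh() * value

    def laser_attention(q, k, v, softclamp_value = 15.):
        # LASER: attend over exponentiated values, then map back with a log.
        # Softclamping first keeps v.exp() finite, replacing the paper's
        # subtract-the-row-max trick that this commit removes.
        v = softclamp(v, softclamp_value)
        out = F.scaled_dot_product_attention(q, k, v.exp())
        # out is a convex combination of strictly positive values, so log is safe
        return out.log()

    q = k = v = torch.randn(1, 8, 128, 64)
    out = laser_attention(q, k, v)

The trade-off made here: clamping the values to (-15, 15) is not exactly equivalent to the paper's formulation, but it removes the need to carry values_max through to the output, keeping the numerics stateless.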
