@@ -65,19 +65,17 @@ def __init__(self, dim, seq_len, causal = True, heads = 8, dim_head = 64, dropou
     def forward(self, x, mask = None, rotary_pos_emb = None, cache = None, cache_key = None):
         b, n, _, h, device = *x.shape, self.heads, x.device
         softmax = torch.softmax if not self.stable else stable_softmax
-        using_cache = exists(cache) and cache_key in cache
+        offset = cache.get('offset', 0) if exists(cache) else 0

         qkv = self.to_qkv(x).chunk(3, dim = -1)
         q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), qkv)

         if exists(rotary_pos_emb):
-            if using_cache:
-                rotary_pos_emb = rotary_pos_emb[..., n - 1:, :] # FIXME: Fix rotary index here
-            q, k, v = apply_pos_emb(rotary_pos_emb, (q, k, v))
+            q, k, v = apply_pos_emb(rotary_pos_emb[..., offset:, :], (q, k, v))

         q = q * self.scale

-        if using_cache:
+        if offset > 0:
             k_top, v_top = cache[cache_key]
             k = torch.cat([k_top, k], dim = -2)
             v = torch.cat([v_top, v], dim = -2)
@@ -92,7 +90,7 @@ def forward(self, x, mask = None, rotary_pos_emb = None, cache = None, cache_key
             dots.masked_fill_(~mask, mask_value)
             del mask

-        if self.causal and not using_cache: # causality is naturally enforced if we run the cached inference
+        if self.causal and offset == 0: # causality is naturally enforced for the cached inference
             i, j = dots.shape[-2:]
             mask = torch.ones(i, j, device = device).triu_(j - i + 1).bool()
             dots.masked_fill_(mask, mask_value)
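
For context, here is a minimal sketch (not part of the diff) of how a caller might drive the cache convention introduced above: the cache dict is assumed to hold each layer's stored keys/values under its `cache_key`, plus a running `'offset'` counting positions already processed, which the attention uses to slice the rotary embeddings and to skip the explicit causal mask. The `model`, start token, and loop bound below are hypothetical stand-ins.

```python
import torch

# Hypothetical incremental decoding loop illustrating the assumed cache protocol:
# after the first step, only the newest token is fed, and cache['offset'] tells the
# attention layers how many positions are already cached. `model` is a stand-in.
cache = {}
tokens = torch.zeros(1, 1, dtype = torch.long)           # placeholder start token

for _ in range(16):                                       # arbitrary number of decode steps
    inp = tokens if len(cache) == 0 else tokens[:, -1:]   # full prompt first, then one token at a time
    logits = model(inp, cache = cache)                    # each Attention layer reads/writes cache[cache_key]
    next_token = logits[:, -1].argmax(dim = -1, keepdim = True)
    tokens = torch.cat([tokens, next_token], dim = -1)
    cache['offset'] = tokens.shape[1] - 1                 # positions already cached; rotary emb sliced from here
```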