From 1f5537902aece8ad54ab65cc47c57e61918ac31c Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 4 Jul 2020 01:09:36 -0700 Subject: [PATCH 001/112] adding output_attentions arg --- longformer/longformer.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/longformer/longformer.py b/longformer/longformer.py index 81e55d3..89d8d15 100644 --- a/longformer/longformer.py +++ b/longformer/longformer.py @@ -58,7 +58,6 @@ def __init__(self, config, layer_id): raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " "heads (%d)" % (config.hidden_size, config.num_attention_heads)) - self.output_attentions = config.output_attentions self.num_heads = config.num_attention_heads self.head_dim = int(config.hidden_size / config.num_attention_heads) self.embed_dim = config.hidden_size @@ -92,6 +91,7 @@ def forward( head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, + output_attentions=False, ): ''' The `attention_mask` is changed in `BertModel.forward` from 0, 1, 2 to @@ -181,7 +181,6 @@ def forward( if key_padding_mask is not None: # softmax sometimes inserts NaN if all positions are masked, replace them with 0 attn_weights_float = torch.masked_fill(attn_weights_float, key_padding_mask.unsqueeze(-1).unsqueeze(-1), 0.0) - attn_weights = attn_weights_float.type_as(attn_weights) attn_probs = F.dropout(attn_weights_float.type_as(attn_weights), p=self.dropout, training=self.training) v = v.view(seq_len, bsz, self.num_heads, self.head_dim).transpose(0, 1) @@ -240,7 +239,7 @@ def forward( attn[extra_attention_mask_nonzeros[::-1]] = nonzero_selected_attn.view(len(selection_padding_mask_nonzeros[0]), -1).type_as(hidden_states) context_layer = attn.transpose(0, 1) - if self.output_attentions: + if output_attentions: if extra_attention_mask is not None: # With global attention, return global attention probabilities only # batch_size x num_heads x max_num_global_attention_tokens x sequence_length @@ -254,5 +253,5 @@ def forward( # batch_size x num_heads x sequence_length x window_size # which is the attention weights of every token attending to its neighbours attn_weights = attn_weights.permute(0, 2, 1, 3) - outputs = (context_layer, attn_weights) if self.output_attentions else (context_layer,) + outputs = (context_layer, attn_weights) if output_attentions else (context_layer,) return outputs From b98d1912e33e2e28a87a0c0c048a94e9d1bead9e Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 4 Jul 2020 01:36:58 -0700 Subject: [PATCH 002/112] adding gradient_checkpointing config --- longformer/longformer_encoder_decoder.py | 77 ++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 longformer/longformer_encoder_decoder.py diff --git a/longformer/longformer_encoder_decoder.py b/longformer/longformer_encoder_decoder.py new file mode 100644 index 0000000..741939a --- /dev/null +++ b/longformer/longformer_encoder_decoder.py @@ -0,0 +1,77 @@ +from typing import List, Optional, Tuple, Dict +from torch import nn, Tensor +from longformer.longformer import LongformerSelfAttention +from transformers.modeling_bart import BartConfig, BartForConditionalGeneration + + +class LongformerEncoderDecoderForConditionalGeneration(BartForConditionalGeneration): + def __init__(self, config): + super().__init__(config) + for i, layer in enumerate(self.model.encoder.layers): + layer.self_attn = LongformerSelfAttentionForBart(config, layer_id=i) + + +class LongformerEncoderDecoderConfig(BartConfig): + def __init__(self, attention_window: List[int] 
= None, attention_dilation: List[int] = None, + autoregressive: bool = False, attention_mode: str = 'sliding_chunks', + gradient_checkpointing: bool = False, **kwargs): + """ + Args: + attention_window: list of attention window sizes of length = number of layers. + window size = number of attention locations on each side. + For an affective window size of 512, use `attention_window=[256]*num_layers` + which is 256 on each side. + attention_dilation: list of attention dilation of length = number of layers. + attention dilation of `1` means no dilation. + autoregressive: do autoregressive attention or have attention of both sides + attention_mode: 'n2' for regular n^2 self-attention, 'tvm' for TVM implemenation of Longformer + selfattention, 'sliding_chunks' for another implementation of Longformer selfattention + """ + super().__init__(**kwargs) + self.attention_window = attention_window + self.attention_dilation = attention_dilation + self.autoregressive = autoregressive + self.attention_mode = attention_mode + self.gradient_checkpointing = gradient_checkpointing + assert self.attention_mode in ['tvm', 'sliding_chunks', 'n2'] + + +class LongformerSelfAttentionForBart(nn.Module): + def __init__(self, config, layer_id): + super().__init__() + self.embed_dim = config.d_model + self.longformer_self_attn = LongformerSelfAttention(config, layer_id=layer_id) + self.output = nn.Linear(self.embed_dim, self.embed_dim) + + def forward( + self, + query, + key: Optional[Tensor], + key_padding_mask: Optional[Tensor] = None, + layer_state: Optional[Dict[str, Optional[Tensor]]] = None, + attn_mask: Optional[Tensor] = None, + need_weights=False, + output_attentions=False, + ) -> Tuple[Tensor, Optional[Tensor]]: + + tgt_len, bsz, embed_dim = query.size() + assert embed_dim == self.embed_dim + assert list(query.size()) == [tgt_len, bsz, embed_dim] + assert attn_mask is None + + # LongformerSelfAttention expects this shape + query = query.view(bsz, tgt_len, embed_dim) + outputs = self.longformer_self_attn( + query, + attention_mask=key_padding_mask.unsqueeze(dim=1).unsqueeze(dim=1) * -1, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + output_attentions=output_attentions, + ) + + attn_output = outputs[0] + attn_output = attn_output.contiguous().view(tgt_len, bsz, embed_dim) + attn_output = self.output(attn_output) + + return (attn_output,) + outputs[1:] if len(outputs) == 2 else (attn_output, None) From c10277cca8ce58c01bfc04722517a7b019102585 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 4 Jul 2020 01:38:04 -0700 Subject: [PATCH 003/112] convert bart to longformer_encoder_decoder + memory profiler --- ...onvert_bart_to_longformerencoderdecoder.py | 148 ++++++++++++++++++ scripts/mem_profiler.py | 58 +++++++ 2 files changed, 206 insertions(+) create mode 100644 scripts/convert_bart_to_longformerencoderdecoder.py create mode 100644 scripts/mem_profiler.py diff --git a/scripts/convert_bart_to_longformerencoderdecoder.py b/scripts/convert_bart_to_longformerencoderdecoder.py new file mode 100644 index 0000000..e469819 --- /dev/null +++ b/scripts/convert_bart_to_longformerencoderdecoder.py @@ -0,0 +1,148 @@ +import argparse +import logging +import os + +from transformers import BartTokenizer + +from transformers import BartForConditionalGeneration +from longformer.longformer_encoder_decoder import LongformerSelfAttentionForBart, LongformerEncoderDecoderConfig +from longformer.longformer_encoder_decoder import LongformerEncoderDecoderForConditionalGeneration + +logger = 
logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + + +def create_long_model( + save_model_to, + base_model='facebook/bart-large', + tokenizer_name_or_path='facebook/bart-large', + attention_window=512, + max_pos=4096 +): + model = BartForConditionalGeneration.from_pretrained(base_model) + tokenizer = BartTokenizer.from_pretrained(tokenizer_name_or_path, model_max_length=max_pos) + config = LongformerEncoderDecoderConfig.from_pretrained(base_model) + model.config = config + + # in BART attention_probs_dropout_prob is attention_dropout, but LongformerSelfAttention + # expects attention_probs_dropout_prob, so set it here + config.attention_probs_dropout_prob = config.attention_dropout + config.architectures = ['LongformerEncoderDecoderForConditionalGeneration', ] + + # extend position embeddings + tokenizer.model_max_length = max_pos + tokenizer.init_kwargs['model_max_length'] = max_pos + current_max_pos, embed_size = model.model.encoder.embed_positions.weight.shape + assert current_max_pos == config.max_position_embeddings + 2 + + config.max_position_embeddings = max_pos + max_pos += 2 # NOTE: BART has positions 0,1 reserved, so embedding size is max position + 2 + assert max_pos >= current_max_pos + + # allocate a larger position embedding matrix for the encoder + new_encoder_pos_embed = model.model.encoder.embed_positions.weight.new_empty(max_pos, embed_size) + # copy position embeddings over and over to initialize the new position embeddings + k = 2 + step = current_max_pos - 2 + while k < max_pos - 1: + new_encoder_pos_embed[k:(k + step)] = model.model.encoder.embed_positions.weight[2:] + k += step + model.model.encoder.embed_positions.weight.data = new_encoder_pos_embed + + # allocate a larger position embedding matrix for the decoder + new_decoder_pos_embed = model.model.decoder.embed_positions.weight.new_empty(max_pos, embed_size) + # copy position embeddings over and over to initialize the new position embeddings + k = 2 + step = current_max_pos - 2 + while k < max_pos - 1: + new_decoder_pos_embed[k:(k + step)] = model.model.decoder.embed_positions.weight[2:] + k += step + model.model.decoder.embed_positions.weight.data = new_decoder_pos_embed + + # replace the `modeling_bart.SelfAttention` object with `LongformerSelfAttention` + config.attention_window = [attention_window] * config.num_hidden_layers + config.attention_dilation = [1] * config.num_hidden_layers + + for i, layer in enumerate(model.model.encoder.layers): + longformer_self_attn_for_bart = LongformerSelfAttentionForBart(config, layer_id=i) + + longformer_self_attn_for_bart.longformer_self_attn.query = layer.self_attn.q_proj + longformer_self_attn_for_bart.longformer_self_attn.key = layer.self_attn.k_proj + longformer_self_attn_for_bart.longformer_self_attn.value = layer.self_attn.v_proj + + longformer_self_attn_for_bart.longformer_self_attn.query_global = layer.self_attn.q_proj + longformer_self_attn_for_bart.longformer_self_attn.key_global = layer.self_attn.k_proj + longformer_self_attn_for_bart.longformer_self_attn.value_global = layer.self_attn.v_proj + + longformer_self_attn_for_bart.output = layer.self_attn.out_proj + + layer.self_attn = longformer_self_attn_for_bart + logger.info(f'saving model to {save_model_to}') + model.save_pretrained(save_model_to) + tokenizer.save_pretrained(save_model_to) + return model, tokenizer + + +def main(): + parser = argparse.ArgumentParser(description="Convert BART to LongBART. 
Replaces BART encoder's SelfAttnetion with LongformerSelfAttention") + parser.add_argument( + '--base_model', + type=str, + default='facebook/bart-large', + help='The name or path of the base model you want to convert' + ) + parser.add_argument( + '--tokenizer_name_or_path', + type=str, + default='facebook/bart-large', + help='The name or path of the tokenizer' + ) + parser.add_argument( + '--save_model_to', + type=str, + required=True, + help='The path to save the converted model' + ) + parser.add_argument( + '--attention_window', + type=int, + default=512, + help='attention window size for longformer self attention' + ) + parser.add_argument( + '--max_pos', + type=int, + default=4096, + help='maximum encoder positions' + ) + + args = parser.parse_args() + + if not os.path.exists(args.save_model_to): + os.mkdir(args.save_model_to) + + create_long_model( + save_model_to=args.save_model_to, + base_model=args.base_model, + tokenizer_name_or_path=args.tokenizer_name_or_path, + attention_window=args.attention_window, + max_pos=args.max_pos + ) + + tokenizer = BartTokenizer.from_pretrained(args.save_model_to) + TXT = "My friends are but they eat too many carbs." + model = LongformerEncoderDecoderForConditionalGeneration.from_pretrained(args.save_model_to) + model.model.encoder.config.gradient_checkpointing = True + model.model.decoder.config.gradient_checkpointing = True + data = tokenizer([TXT], return_tensors='pt', padding='max_length', max_length=2048) + input_ids = data['input_ids'] + attention_mask = data['attention_mask'] + logits = model(input_ids, attention_mask=attention_mask)[0] + masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() + probs = logits[0, masked_index].softmax(dim=0) + values, predictions = probs.topk(5) + print(tokenizer.decode(predictions).split()) + + +if __name__ == "__main__": + main() diff --git a/scripts/mem_profiler.py b/scripts/mem_profiler.py new file mode 100644 index 0000000..64a6d56 --- /dev/null +++ b/scripts/mem_profiler.py @@ -0,0 +1,58 @@ +from longformer.longformer_encoder_decoder import LongformerEncoderDecoderForConditionalGeneration +from longformer.longformer_encoder_decoder import LongformerEncoderDecoderConfig + +import torch +from torch.utils.data import DataLoader, Dataset +from pytorch_lightning import Trainer +import pytorch_lightning as pl + +seqlen = 1024 * 12 +global_size = 0 # seqlen // 100 +attention_window = 512 # one sided + + +class CoolDataset(Dataset): + def __len__(self): + return 1024 # number of examples + + def __getitem__(self, idx): + tokne_ids = torch.tensor([5] * seqlen) + mask = torch.tensor([1] * seqlen) + # mask[:global_size] = 2 + return tokne_ids, mask + + +class MemoryProfiler(pl.LightningModule): + + def __init__(self, hparams=None): + super().__init__() + self.hparams = hparams + + config = LongformerEncoderDecoderConfig.from_pretrained('bart-long-4096') + config.max_position_embeddings = seqlen + 2 + config.gradient_checkpointing = True + config.attention_mode = 'sliding_chunks' + config.attention_window = [attention_window] * config.num_hidden_layers + self.model = LongformerEncoderDecoderForConditionalGeneration(config) + + def forward(self, x, y): + print(seqlen, global_size, attention_window, torch.cuda.max_memory_allocated(x.device) / 1024 ** 3) + return self.model(x, attention_mask=y) + + def training_step(self, batch, batch_idx): + x, y = batch + y_hat = self(x, y) + loss = y_hat[0].sum() + return {'loss': loss} + + def configure_optimizers(self): + return 
torch.optim.Adam(self.parameters(), lr=0.001) + + def train_dataloader(self): + return DataLoader(CoolDataset(), batch_size=1, num_workers=0) + + +if __name__ == '__main__': + model = MemoryProfiler() + trainer = Trainer(gpus=[0], progress_bar_refresh_rate=1, max_epochs=1, amp_level='O2', use_amp=True) + trainer.fit(model) From e29d7f53697fa755c51eec0a4ebc86e53f520d1e Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 4 Jul 2020 01:48:12 -0700 Subject: [PATCH 004/112] reqs and init --- longformer/__init__.py | 3 +++ requirements.txt | 7 ++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/longformer/__init__.py b/longformer/__init__.py index e69de29..d3e343c 100644 --- a/longformer/__init__.py +++ b/longformer/__init__.py @@ -0,0 +1,3 @@ +from longformer.longformer import Longformer, LongformerForMaskedLM, LongformerConfig +from longformer.longformer_encoder_decoder import LongformerEncoderDecoderConfig +from longformer.longformer_encoder_decoder import LongformerEncoderDecoderForConditionalGeneration \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index f6806dc..bbee48c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ +-e git://github.com/ibeltagy/transformers.git@longformer_encoder_decoder#egg=transformers + torch>=1.2.0 -transformers>=2.2.0 tensorboardX -pytorch-lightning==0.6.0 -test-tube==0.7.5 +pytorch-lightning>=0.7.6 +test-tube From dd0dc0d55827316ccfff6dad3bac5732855537c6 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 4 Jul 2020 10:47:02 -0700 Subject: [PATCH 005/112] fix req --- requirements.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index bbee48c..87a3c78 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ --e git://github.com/ibeltagy/transformers.git@longformer_encoder_decoder#egg=transformers - +transformers @ git://github.com/ibeltagy/transformers.git@longformer_encoder_decoder#egg=transformers torch>=1.2.0 tensorboardX pytorch-lightning>=0.7.6 From 54a13288477fcea9852f19b71b7ad6853611aad8 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 4 Jul 2020 10:49:24 -0700 Subject: [PATCH 006/112] req --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 87a3c78..94a1875 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -transformers @ git://github.com/ibeltagy/transformers.git@longformer_encoder_decoder#egg=transformers +transformers @ git+http://github.com/ibeltagy/transformers.git@longformer_encoder_decoder#egg=transformers torch>=1.2.0 tensorboardX pytorch-lightning>=0.7.6 From 243cfe802096957c866bab14d2a16426fdfbd169 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Mon, 6 Jul 2020 09:12:51 -0700 Subject: [PATCH 007/112] Update README.md --- README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/README.md b/README.md index 5b0fd2c..833e7c9 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,20 @@ #

`Longformer`

 `Longformer` is a BERT-like model for long documents.
+**\*\*\*\*\* Work In Progress: LongformerEncoderDecoder \*\*\*\*\***
+
+A `LongformerEncoderDecoder` model is now available. It is geared towards summarization where the input is long but the output is relatively shorter.
+
+The following code snippet loads a `LongformerEncoderDecoder` checkpoint started from `BartLarge`. With gradient checkpointing, fp16, and a 48GB GPU, the input length can be up to 12K.
+```
+pip install git+https://github.com/allenai/longformer.git@encoderdecoder
+
+# checkpoint: https://ai2-s2-research.s3-us-west-2.amazonaws.com/longformer/longformer-encdec-large-12288.tar.gz
+
+from longformer import LongformerEncoderDecoderForConditionalGeneration
+model = LongformerEncoderDecoderForConditionalGeneration.from_pretrained(downloaded_checkpoint, gradient_checkpointing=True)
+```
+
 **\*\*\*\*\* New June 2nd, 2020: Integrating with Huggingface + Train your own long model + Gradient checkpointing \*\*\*\*\***

 1. `Longformer` is now integrated in the huggingface/transformers [release v2.11.0](https://github.com/huggingface/transformers/tree/v2.11.0). Now you can do

From fbbc770d6e8d160af7a255144173afca50b48486 Mon Sep 17 00:00:00 2001
From: Iz Beltagy
Date: Mon, 6 Jul 2020 09:13:18 -0700
Subject: [PATCH 008/112] Update README.md

---
 README.md | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 833e7c9..8464003 100644
--- a/README.md
+++ b/README.md
@@ -3,9 +3,7 @@

 **\*\*\*\*\* Work In Progress: LongformerEncoderDecoder \*\*\*\*\***

-A `LongformerEncoderDecoder` model is now available. It is geared towards summarization where the input is long but the output is relatively shorter.
-
-The following code snippet loads a `LongformerEncoderDecoder` checkpoint started from `BartLarge`. With gradient checkpointing, fp16, and a 48GB GPU, the input length can be up to 12K.
+A `LongformerEncoderDecoder` model is now available. It is geared towards summarization where the input is long but the output is relatively shorter. The following code snippet loads a `LongformerEncoderDecoder` checkpoint started from `BartLarge`. With gradient checkpointing, fp16, and a 48GB GPU, the input length can be up to 12K. 
``` pip install git+https://github.com/allenai/longformer.git@encoderdecoder From 95296ad89e200b7a7a56f34d4a87cde47162b777 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 15 Jul 2020 23:19:32 -0700 Subject: [PATCH 009/112] pretraining script --- requirements.txt | 6 +- scripts/pretrain.py | 277 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 280 insertions(+), 3 deletions(-) create mode 100644 scripts/pretrain.py diff --git a/requirements.txt b/requirements.txt index f6806dc..75cbab1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -torch>=1.2.0 -transformers>=2.2.0 +torch>=1.5.0 +transformers>=3.0.1 tensorboardX -pytorch-lightning==0.6.0 +pytorch-lightning==0.7.6 test-tube==0.7.5 diff --git a/scripts/pretrain.py b/scripts/pretrain.py new file mode 100644 index 0000000..060ab0e --- /dev/null +++ b/scripts/pretrain.py @@ -0,0 +1,277 @@ +import argparse +import glob +import os +import random +import logging +import numpy as np +from tqdm import tqdm +import torch +from transformers import AutoTokenizer, AutoModelWithLMHead +from transformers import DataCollatorForLanguageModeling +from transformers.optimization import AdamW, get_linear_schedule_with_warmup + +from torch.utils.data import Dataset, DataLoader +import pytorch_lightning as ptl +from pytorch_lightning.logging.test_tube import TestTubeLogger +from pytorch_lightning.callbacks import ModelCheckpoint + + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +class MMapTextDataset(Dataset): + def __init__(self, mmap_filename, chunk_size): + self.num_instances = np.memmap(mmap_filename, mode='r', dtype=np.uint16).shape[0] // chunk_size + # defer loading the token_ids memmap until after the first __getitem__ call. + # when spawning new processes for ddp, there is a hard limit in python < 3.8 that + # pickle files need to be < 4GB. By waiting until after the first __getitem__ we + # don't have to pickle the memmap + self.token_ids = None + self._mmap_filename = mmap_filename + self._chunk_size = chunk_size + + def __len__(self): + return self.num_instances + + def __getitem__(self, i): + if self.token_ids is None: + self.token_ids = np.memmap(self._mmap_filename, mode='r', dtype=np.uint16, + shape=(self.num_instances, self._chunk_size)) + return torch.tensor(self.token_ids[i, :].astype(np.int32), dtype=torch.long) + + @staticmethod + def raw_text_to_mmap(args): + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) + assert len(tokenizer) < 65535 # will use uint16 to store token ids + all_files = glob.glob(f'{args.input_dir}/*.txt') + + if os.path.exists(f'{args.input_dir}/cache/'): + logger.info("Cache already exists. Remove the cache directory to regenerate") + return + os.mkdir(f'{args.input_dir}/cache/') + train_chunks = [] + val_chunks = [] + + # TODO: process each shared in a separate worker + # TODO: support multiple documents in one chunk instead of padding + for fname in tqdm(all_files): + with open(fname, 'r') as fin: + for line in tqdm(fin): + if line.strip() == '': # drop empty lines + continue + chunks_list = train_chunks if random.random() > args.train_dev_split else val_chunks + tokens = tokenizer.tokenize(line) # each line is one document + # generate chunks of length args.seqlen. The last chunk will be padded. 
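+                    # e.g. (hypothetical numbers, for illustration only): with seqlen=8, a 10-token document
+                    # becomes [<s> t1 .. t6 </s>] and [<s> t7 t8 t9 t10 <pad> <pad> </s>].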
+ # padding last chunk is not great for longformer because many chunks will be mostly padding + current_chunk = [tokenizer.bos_token] + for token in tokens: + if len(current_chunk) == args.seqlen - 1: # chunk is full + current_chunk.append(tokenizer.eos_token) + chunks_list.append(current_chunk) + current_chunk = [tokenizer.bos_token] + current_chunk.append(token) + current_chunk.extend([tokenizer.pad_token] * (args.seqlen - len(current_chunk))) + current_chunk[args.seqlen - 1] = tokenizer.eos_token + chunks_list.append(current_chunk) + + def _tokenized_text_to_mmap(output_fname, chunks_list): + random.shuffle(chunks_list) + num_chunks = len(chunks_list) + all_token_ids = np.empty((num_chunks, args.seqlen), dtype=np.uint16) + for k, chunk in enumerate(tqdm(chunks_list)): + token_ids = tokenizer.convert_tokens_to_ids(chunk) + assert len(token_ids) == args.seqlen + all_token_ids[k, :] = [int(t) for t in token_ids] + fp = np.memmap(output_fname, dtype=np.uint16, mode='w+', shape=(num_chunks, args.seqlen)) + fp[:, :] = all_token_ids[:, :] + fp.flush() + del fp + + _tokenized_text_to_mmap(f'{args.input_dir}/cache/train.bin', train_chunks) + _tokenized_text_to_mmap(f'{args.input_dir}/cache/val.bin', val_chunks) + + +class Pretrainer(ptl.LightningModule): + + def __init__(self, hparams): + super().__init__() + + self.args = hparams + self.hparams = self.args + + self.model = AutoModelWithLMHead.from_pretrained(args.model) + self.config = self.model.config + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) + self.pad_token_id = tokenizer.pad_token_id + + logger.info(f'Creating dataset cache from dir {self.args.input_dir}. This could be slow the first time.') + MMapTextDataset.raw_text_to_mmap(args) + + # TODO: add support for other objective functions + self.data_collator = DataCollatorForLanguageModeling( + tokenizer=tokenizer, mlm=True, mlm_probability=self.args.mlm_prob + ) + + def forward(self, input_ids=None, labels=None, loss_only=True): + # get the padding mask - 1 for NOT masked, 0 for MASKED/PAD + attention_mask = (input_ids != self.pad_token_id).int() + + if labels is not None: + # output is loss, prediction_scores, hidden_states + output = self.model(input_ids=input_ids, attention_mask=attention_mask, masked_lm_labels=labels) + if loss_only: + return output[0] + else: + return {"loss": output[0], "hidden_states": output[2]} + else: + # don't need to run the lm_head + assert not loss_only + output = self.model.roberta(input_ids=input_ids, attention_mask=attention_mask) + return {"hidden_states": output[2]} + + def training_step(self, batch, batch_nb): + loss = self(**batch) + tensorboard_logs = { + 'mlm_loss': loss.detach(), + 'mlm_perplexity': torch.exp(loss).detach(), + } + return {'loss': loss, 'log': tensorboard_logs} + + def validation_step(self, batch, batch_nb): + loss = self(**batch) + tensorboard_logs = { + 'val_mlm_loss': loss.detach(), + } + return {'val_loss': tensorboard_logs["val_mlm_loss"], 'log': tensorboard_logs} + + def validation_epoch_end(self, outputs): + avg_loss = torch.stack([x['log']['val_mlm_loss'] for x in outputs if 'val_mlm_loss' in x['log']]).mean() + if self.use_ddp: + avg_loss = torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM) + avg_loss /= torch.distributed.get_world_size() + avg_loss = avg_loss.item() + logs = {'val_mlm_loss': avg_loss} + return {'log': logs, 'progress_bar': logs, "val_loss": avg_loss} + + def configure_optimizers(self): + no_decay = ["bias", "LayerNorm.weight"] + + optimizer_grouped_parameters = [ + { + 
"params": [p for n, p in self.named_parameters() if not any(nd in n for nd in no_decay) and p.requires_grad], + "weight_decay": self.args.weight_decay, + }, + { + "params": [p for n, p in self.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad], + "weight_decay": 0.0, + }, + ] + optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon) + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=self.args.training_steps + ) + + return [optimizer], [{"scheduler": scheduler, "interval": "step"}] + + def _get_loader(self, fname, is_train): + dataset = MMapTextDataset(fname, chunk_size=self.args.seqlen) + + if self.trainer.use_ddp: + sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=is_train) + shuffle = False + else: + sampler = None + shuffle = is_train + + loader = DataLoader( + dataset, + batch_size=self.args.batch_size, + shuffle=shuffle, + sampler=sampler, + num_workers=self.args.num_workers, + collate_fn=self.data_collator, + drop_last=is_train, + ) + return loader + + def train_dataloader(self): + return self._get_loader(f'{self.args.input_dir}/cache/train.bin', True) + + def val_dataloader(self): + return self._get_loader(f'{self.args.input_dir}/cache/val.bin', False) + + @staticmethod + def add_args(parser): + parser.add_argument("--seed", type=int, default=3) + parser.add_argument("--input_dir", type=str, required=True) + parser.add_argument("--save_dir", type=str, default='runs/') + parser.add_argument("--save_prefix", type=str, required=True) + parser.add_argument("--train_dev_split", type=float, default=0.05) + parser.add_argument("--seqlen", type=int, default=512) + parser.add_argument("--tokenizer", type=str, default='roberta-base') + parser.add_argument("--model", type=str, default='roberta-base') + parser.add_argument("--mlm_prob", type=float, default=0.15) + parser.add_argument("--weight_decay", type=float, default=0.01) + parser.add_argument("--learning_rate", type=float, default=1e-5) + parser.add_argument("--adam_epsilon", type=float, default=1e-6) + parser.add_argument("--training_steps", type=int, default=0.01) + parser.add_argument("--warmup_steps", type=int, default=1000) + parser.add_argument("--batch_size", type=int, default=8) + parser.add_argument("--num_workers", type=int, default=0) + parser.add_argument("--grad_accum", type=int, default=1) + parser.add_argument("--gpus", type=str, default='0') + parser.add_argument("--resume", type=str, default=None) + parser.add_argument("--num_tpu_cores", type=int, default=None) + + return parser + + +def main(args): + random.seed(args.seed * 10) + np.random.seed(args.seed * 100) + torch.manual_seed(args.seed * 1000) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(args.seed * 10000) + + pretrainer = Pretrainer(args) + + # logger here is a SummaryWritter for tensorboard + # it is used by the trainer, and certain return variables + # from the model are automatically logged + logger = TestTubeLogger( + save_dir=args.save_dir, + name=args.save_prefix, + version=0 # always use version=0 + ) + + checkpoint_callback = ModelCheckpoint( + # model saved to filepath/prefix_.... 
+ filepath=os.path.join(args.save_dir, args.save_prefix, 'checkpoint'), + prefix='', + save_top_k=3, + verbose=True, + monitor='val_loss', + mode='min', + ) + + args.gpus = [int(x) for x in args.gpus.split(',')] + trainer = ptl.Trainer( + gpus=args.gpus, + num_tpu_cores=args.num_tpu_cores, + distributed_backend='ddp' if len(args.gpus) > 1 else None, + track_grad_norm=-1, + max_epochs=10000, min_epochs=0, max_steps=args.training_steps, # run for many epochs, but stop after max_steps + early_stop_callback=None, + row_log_interval=25, + logger=logger, + checkpoint_callback=checkpoint_callback, + resume_from_checkpoint=args.resume, + ) + trainer.fit(pretrainer) + + +if __name__ == "__main__": + parser = Pretrainer.add_args(argparse.ArgumentParser(description="pretrain")) + args = parser.parse_args() + main(args) From 325693e3ecb6e939da95509c98978ee86cc8fe74 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 16 Jul 2020 14:12:22 -0700 Subject: [PATCH 010/112] wip --- requirements.txt | 4 +-- scripts/pretrain.py | 68 ++++++++++++++++++++++++++++----------------- 2 files changed, 44 insertions(+), 28 deletions(-) diff --git a/requirements.txt b/requirements.txt index 75cbab1..2279015 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ torch>=1.5.0 -transformers>=3.0.1 +transformers==3.0.2 tensorboardX -pytorch-lightning==0.7.6 +pytorch-lightning==0.8.5 test-tube==0.7.5 diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 060ab0e..014c33f 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -5,6 +5,7 @@ import logging import numpy as np from tqdm import tqdm +import time import torch from transformers import AutoTokenizer, AutoModelWithLMHead from transformers import DataCollatorForLanguageModeling @@ -13,7 +14,7 @@ from torch.utils.data import Dataset, DataLoader import pytorch_lightning as ptl from pytorch_lightning.logging.test_tube import TestTubeLogger -from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateLogger logging.basicConfig(level=logging.INFO) @@ -112,33 +113,36 @@ def __init__(self, hparams): self.data_collator = DataCollatorForLanguageModeling( tokenizer=tokenizer, mlm=True, mlm_probability=self.args.mlm_prob ) + self.start_time = 0 - def forward(self, input_ids=None, labels=None, loss_only=True): + def forward(self, input_ids=None, labels=None): # get the padding mask - 1 for NOT masked, 0 for MASKED/PAD attention_mask = (input_ids != self.pad_token_id).int() - if labels is not None: - # output is loss, prediction_scores, hidden_states - output = self.model(input_ids=input_ids, attention_mask=attention_mask, masked_lm_labels=labels) - if loss_only: - return output[0] - else: - return {"loss": output[0], "hidden_states": output[2]} - else: - # don't need to run the lm_head - assert not loss_only - output = self.model.roberta(input_ids=input_ids, attention_mask=attention_mask) - return {"hidden_states": output[2]} + # output is loss, prediction_scores, hidden_states + output = self.model(input_ids=input_ids, attention_mask=attention_mask, masked_lm_labels=labels) + return output[0] # loss def training_step(self, batch, batch_nb): loss = self(**batch) + input_ids = batch['input_ids'] tensorboard_logs = { + 'input_size': input_ids.numel(), + 'memory': torch.cuda.memory_allocated(loss.device) / 1024 ** 3, + 'lr': self.trainer.optimizers[0].param_groups[0]['lr'], 'mlm_loss': loss.detach(), 'mlm_perplexity': torch.exp(loss).detach(), + 'tokens per step': input_ids.numel() * 
self.args.grad_accum * self.trainer.world_size, } + if self.start_time != 0: + elapsed_time = time.time() - self.start_time + tensorboard_logs['time per batch'] = elapsed_time + self.start_time = time.time() + return {'loss': loss, 'log': tensorboard_logs} def validation_step(self, batch, batch_nb): + self.start_time = 0 # reset training_step timer loss = self(**batch) tensorboard_logs = { 'val_mlm_loss': loss.detach(), @@ -148,7 +152,7 @@ def validation_step(self, batch, batch_nb): def validation_epoch_end(self, outputs): avg_loss = torch.stack([x['log']['val_mlm_loss'] for x in outputs if 'val_mlm_loss' in x['log']]).mean() if self.use_ddp: - avg_loss = torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM) + torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM) avg_loss /= torch.distributed.get_world_size() avg_loss = avg_loss.item() logs = {'val_mlm_loss': avg_loss} @@ -169,7 +173,7 @@ def configure_optimizers(self): ] optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( - optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=self.args.training_steps + optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=self.args.train_steps ) return [optimizer], [{"scheduler": scheduler, "interval": "step"}] @@ -215,12 +219,17 @@ def add_args(parser): parser.add_argument("--weight_decay", type=float, default=0.01) parser.add_argument("--learning_rate", type=float, default=1e-5) parser.add_argument("--adam_epsilon", type=float, default=1e-6) - parser.add_argument("--training_steps", type=int, default=0.01) - parser.add_argument("--warmup_steps", type=int, default=1000) + parser.add_argument("--grad_clip", type=float, default=0) + parser.add_argument("--warmup_steps", type=int, default=30, help='# warmup gradient updates') + parser.add_argument("--train_steps", type=int, default=100, help='# training gradient updates') + parser.add_argument("--val_every", type=int, default=25, help='# training gradient updates between evaluations') + parser.add_argument("--val_batches", type=int, default=20, help='# evaluation **batches**') parser.add_argument("--batch_size", type=int, default=8) parser.add_argument("--num_workers", type=int, default=0) parser.add_argument("--grad_accum", type=int, default=1) - parser.add_argument("--gpus", type=str, default='0') + # `--gpus` is reserved. Always set CUDA_VISIBLE_DEVICES to 0,1,2 ... 
n + # FIXME: PTL has a bug in gpu selection and it will always select gpus starting from 0 upward + parser.add_argument("--gpu_count", type=int, default=1) parser.add_argument("--resume", type=str, default=None) parser.add_argument("--num_tpu_cores", type=int, default=None) @@ -255,18 +264,25 @@ def main(args): mode='min', ) - args.gpus = [int(x) for x in args.gpus.split(',')] + # TODO: try gradient accumulation + + args.val_every_batches = args.val_every * args.grad_accum # convert val_every_steps to val_every_batches trainer = ptl.Trainer( - gpus=args.gpus, + gpus=args.gpu_count, + auto_select_gpus=False, num_tpu_cores=args.num_tpu_cores, - distributed_backend='ddp' if len(args.gpus) > 1 else None, - track_grad_norm=-1, - max_epochs=10000, min_epochs=0, max_steps=args.training_steps, # run for many epochs, but stop after max_steps + distributed_backend='ddp' if args.gpu_count > 1 else None, + replace_sampler_ddp=False, + track_grad_norm=-1, # TODO: add logging for gradient norm + max_epochs=10000, min_epochs=0, max_steps=args.train_steps, # run for many epochs, but stop after max_steps + val_check_interval=args.val_every_batches, limit_val_batches=args.val_batches, early_stop_callback=None, - row_log_interval=25, + row_log_interval=10, logger=logger, - checkpoint_callback=checkpoint_callback, + checkpoint_callback=None, # FIXME: checkpoint_callback, resume_from_checkpoint=args.resume, + gradient_clip_val=args.grad_clip, + callbacks=[LearningRateLogger()] ) trainer.fit(pretrainer) From 985acc9d840fa1d0ea2e6d285f9f2501503606e8 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 16 Jul 2020 17:32:04 -0700 Subject: [PATCH 011/112] wip --- scripts/pretrain.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 014c33f..4eb08b5 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -20,6 +20,9 @@ logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) +# TODO: Try on multiple machines +# TODO: try on a single TPU +# TODO: try on a TPU-pod class MMapTextDataset(Dataset): def __init__(self, mmap_filename, chunk_size): @@ -129,19 +132,19 @@ def training_step(self, batch, batch_nb): tensorboard_logs = { 'input_size': input_ids.numel(), 'memory': torch.cuda.memory_allocated(loss.device) / 1024 ** 3, - 'lr': self.trainer.optimizers[0].param_groups[0]['lr'], 'mlm_loss': loss.detach(), 'mlm_perplexity': torch.exp(loss).detach(), - 'tokens per step': input_ids.numel() * self.args.grad_accum * self.trainer.world_size, + 'token_per_step': input_ids.numel() * self.args.grad_accum * self.trainer.world_size, } if self.start_time != 0: elapsed_time = time.time() - self.start_time - tensorboard_logs['time per batch'] = elapsed_time + tensorboard_logs['second_per_batch'] = elapsed_time self.start_time = time.time() return {'loss': loss, 'log': tensorboard_logs} def validation_step(self, batch, batch_nb): + # TODO: log how long evaluation takes self.start_time = 0 # reset training_step timer loss = self(**batch) tensorboard_logs = { @@ -228,7 +231,7 @@ def add_args(parser): parser.add_argument("--num_workers", type=int, default=0) parser.add_argument("--grad_accum", type=int, default=1) # `--gpus` is reserved. Always set CUDA_VISIBLE_DEVICES to 0,1,2 ... 
n - # FIXME: PTL has a bug in gpu selection and it will always select gpus starting from 0 upward + # TODO: PTL has a bug in gpu selection and it will always select gpus starting from 0 upward parser.add_argument("--gpu_count", type=int, default=1) parser.add_argument("--resume", type=str, default=None) parser.add_argument("--num_tpu_cores", type=int, default=None) @@ -264,9 +267,7 @@ def main(args): mode='min', ) - # TODO: try gradient accumulation - - args.val_every_batches = args.val_every * args.grad_accum # convert val_every_steps to val_every_batches + args.val_every *= args.grad_accum # PTL is expecting number of batches_per_gpu trainer = ptl.Trainer( gpus=args.gpu_count, auto_select_gpus=False, @@ -275,11 +276,12 @@ def main(args): replace_sampler_ddp=False, track_grad_norm=-1, # TODO: add logging for gradient norm max_epochs=10000, min_epochs=0, max_steps=args.train_steps, # run for many epochs, but stop after max_steps - val_check_interval=args.val_every_batches, limit_val_batches=args.val_batches, + val_check_interval=args.val_every, limit_val_batches=args.val_batches, early_stop_callback=None, row_log_interval=10, logger=logger, - checkpoint_callback=None, # FIXME: checkpoint_callback, + checkpoint_callback=checkpoint_callback, + accumulate_grad_batches=args.grad_accum, resume_from_checkpoint=args.resume, gradient_clip_val=args.grad_clip, callbacks=[LearningRateLogger()] From 023dd78227e9021b539fa6fc92c5f7d7d8e6c463 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 16 Jul 2020 18:02:36 -0700 Subject: [PATCH 012/112] wip --- scripts/pretrain.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 4eb08b5..34775a7 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -145,6 +145,7 @@ def training_step(self, batch, batch_nb): def validation_step(self, batch, batch_nb): # TODO: log how long evaluation takes + # TODO: reproduce roberta evaluation numbers on the longformer corpus self.start_time = 0 # reset training_step timer loss = self(**batch) tensorboard_logs = { From 08230ac9f81ae2cc9cd094ba8bded56f24a1a66d Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 16 Jul 2020 21:13:52 -0700 Subject: [PATCH 013/112] wip --- scripts/pretrain.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 34775a7..232a4a3 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -46,7 +46,7 @@ def __getitem__(self, i): @staticmethod def raw_text_to_mmap(args): - tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, use_fast=True) assert len(tokenizer) < 65535 # will use uint16 to store token ids all_files = glob.glob(f'{args.input_dir}/*.txt') @@ -59,6 +59,7 @@ def raw_text_to_mmap(args): # TODO: process each shared in a separate worker # TODO: support multiple documents in one chunk instead of padding + # TODO: replace the in memory lists `train_chunks` and `train_chunks` with files for fname in tqdm(all_files): with open(fname, 'r') as fin: for line in tqdm(fin): From fb65d5794765d3b450af27448bbcb85338d54449 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 16 Jul 2020 22:09:32 -0700 Subject: [PATCH 014/112] . 
--- scripts/pretrain.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 232a4a3..a5171ad 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -24,6 +24,7 @@ # TODO: try on a single TPU # TODO: try on a TPU-pod + class MMapTextDataset(Dataset): def __init__(self, mmap_filename, chunk_size): self.num_instances = np.memmap(mmap_filename, mode='r', dtype=np.uint16).shape[0] // chunk_size @@ -57,9 +58,9 @@ def raw_text_to_mmap(args): train_chunks = [] val_chunks = [] - # TODO: process each shared in a separate worker + # TODO: process each shared in a separate worker and save their output to files # TODO: support multiple documents in one chunk instead of padding - # TODO: replace the in memory lists `train_chunks` and `train_chunks` with files + for fname in tqdm(all_files): with open(fname, 'r') as fin: for line in tqdm(fin): From 0e80cde3c5949ea8ec32ca866f812951d6854a06 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 17 Jul 2020 08:11:10 -0700 Subject: [PATCH 015/112] pad chunks or start next doc --- scripts/pretrain.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index a5171ad..c787344 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -55,34 +55,42 @@ def raw_text_to_mmap(args): logger.info("Cache already exists. Remove the cache directory to regenerate") return os.mkdir(f'{args.input_dir}/cache/') - train_chunks = [] - val_chunks = [] # TODO: process each shared in a separate worker and save their output to files - # TODO: support multiple documents in one chunk instead of padding + chunks_list = [] for fname in tqdm(all_files): with open(fname, 'r') as fin: + current_chunk = [tokenizer.bos_token] for line in tqdm(fin): if line.strip() == '': # drop empty lines continue - chunks_list = train_chunks if random.random() > args.train_dev_split else val_chunks tokens = tokenizer.tokenize(line) # each line is one document # generate chunks of length args.seqlen. The last chunk will be padded. 
# padding last chunk is not great for longformer because many chunks will be mostly padding - current_chunk = [tokenizer.bos_token] + for token in tokens: if len(current_chunk) == args.seqlen - 1: # chunk is full current_chunk.append(tokenizer.eos_token) chunks_list.append(current_chunk) current_chunk = [tokenizer.bos_token] current_chunk.append(token) - current_chunk.extend([tokenizer.pad_token] * (args.seqlen - len(current_chunk))) - current_chunk[args.seqlen - 1] = tokenizer.eos_token - chunks_list.append(current_chunk) + if args.padded_chunks: + # fill the rest of the seqlen with pad + current_chunk.extend([tokenizer.pad_token] * (args.seqlen - len(current_chunk))) + current_chunk[args.seqlen - 1] = tokenizer.eos_token + chunks_list.append(current_chunk) + current_chunk = [tokenizer.bos_token] + else: + # one long doc with sep inbetween + if len(current_chunk) < args.seqlen - 1: + current_chunk.append(tokenizer.sep_token) + random.shuffle(chunks_list) + val_count = int(args.train_dev_split * len(chunks_list)) + val_chunks = chunks_list[:val_count] + train_chunks = chunks_list[val_count:] def _tokenized_text_to_mmap(output_fname, chunks_list): - random.shuffle(chunks_list) num_chunks = len(chunks_list) all_token_ids = np.empty((num_chunks, args.seqlen), dtype=np.uint16) for k, chunk in enumerate(tqdm(chunks_list)): @@ -222,6 +230,7 @@ def add_args(parser): parser.add_argument("--tokenizer", type=str, default='roberta-base') parser.add_argument("--model", type=str, default='roberta-base') parser.add_argument("--mlm_prob", type=float, default=0.15) + parser.add_argument("--padded_chunks", type=bool, default=False) parser.add_argument("--weight_decay", type=float, default=0.01) parser.add_argument("--learning_rate", type=float, default=1e-5) parser.add_argument("--adam_epsilon", type=float, default=1e-6) From 6ca7d1b8557a6d0db1bdbd51e87bf785649b49c8 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 17 Jul 2020 08:18:24 -0700 Subject: [PATCH 016/112] todo --- scripts/pretrain.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index c787344..d2c5378 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -166,6 +166,8 @@ def validation_step(self, batch, batch_nb): def validation_epoch_end(self, outputs): avg_loss = torch.stack([x['log']['val_mlm_loss'] for x in outputs if 'val_mlm_loss' in x['log']]).mean() if self.use_ddp: + # TODO: PTL already doing this. Is it still needed here? 
+ # https://github.com/PyTorchLightning/pytorch-lightning/blob/0.8.5/pytorch_lightning/metrics/converters.py#L251 torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM) avg_loss /= torch.distributed.get_world_size() avg_loss = avg_loss.item() @@ -195,6 +197,7 @@ def configure_optimizers(self): def _get_loader(self, fname, is_train): dataset = MMapTextDataset(fname, chunk_size=self.args.seqlen) + # TODO: consider `replace_sampler_ddp=True` and removing the following if statement if self.trainer.use_ddp: sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=is_train) shuffle = False From a2aa4f76630d7433f3dee6fc1d04862efa09bb5e Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 17 Jul 2020 09:22:56 -0700 Subject: [PATCH 017/112] wip --- scripts/pretrain.py | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index d2c5378..95ce577 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -4,6 +4,7 @@ import random import logging import numpy as np +import math from tqdm import tqdm import time import torch @@ -143,6 +144,7 @@ def training_step(self, batch, batch_nb): 'input_size': input_ids.numel(), 'memory': torch.cuda.memory_allocated(loss.device) / 1024 ** 3, 'mlm_loss': loss.detach(), + 'mlm_bpc': loss.detach()/math.log(2), 'mlm_perplexity': torch.exp(loss).detach(), 'token_per_step': input_ids.numel() * self.args.grad_accum * self.trainer.world_size, } @@ -225,30 +227,42 @@ def val_dataloader(self): @staticmethod def add_args(parser): parser.add_argument("--seed", type=int, default=3) + + # Dataset. Some of these params are only useful when generating the dataset cache parser.add_argument("--input_dir", type=str, required=True) - parser.add_argument("--save_dir", type=str, default='runs/') - parser.add_argument("--save_prefix", type=str, required=True) parser.add_argument("--train_dev_split", type=float, default=0.05) + parser.add_argument("--padded_chunks", type=bool, default=False) parser.add_argument("--seqlen", type=int, default=512) + parser.add_argument("--mlm_prob", type=float, default=0.15) + + # HF model loading parser.add_argument("--tokenizer", type=str, default='roberta-base') parser.add_argument("--model", type=str, default='roberta-base') - parser.add_argument("--mlm_prob", type=float, default=0.15) - parser.add_argument("--padded_chunks", type=bool, default=False) - parser.add_argument("--weight_decay", type=float, default=0.01) + + # Checkpointing and logging + parser.add_argument("--save_dir", type=str, default='runs/') + parser.add_argument("--save_prefix", type=str, required=True) + parser.add_argument("--resume", type=str, default=None) + + # Training hyperparams parser.add_argument("--learning_rate", type=float, default=1e-5) + parser.add_argument("--train_steps", type=int, default=3000, help='# training grad. updates') + parser.add_argument("--warmup_steps", type=int, default=1000, help='# warmup grad. updates') + parser.add_argument("--val_every", type=int, default=1000, help='# training grad. 
updates between evaluations') + parser.add_argument("--val_batches", type=int, default=1000, help='# evaluation **batches**') + parser.add_argument("--weight_decay", type=float, default=0.01) parser.add_argument("--adam_epsilon", type=float, default=1e-6) parser.add_argument("--grad_clip", type=float, default=0) - parser.add_argument("--warmup_steps", type=int, default=30, help='# warmup gradient updates') - parser.add_argument("--train_steps", type=int, default=100, help='# training gradient updates') - parser.add_argument("--val_every", type=int, default=25, help='# training gradient updates between evaluations') - parser.add_argument("--val_batches", type=int, default=20, help='# evaluation **batches**') - parser.add_argument("--batch_size", type=int, default=8) + + # RoBERTa's tokens_per_step = 2^18 = 512(seqlen) x 1(gpu_count) x 32(batch_size) x 16(grad_accum) + parser.add_argument("--batch_size", type=int, default=32) + parser.add_argument("--grad_accum", type=int, default=16) + + # Compute resources parser.add_argument("--num_workers", type=int, default=0) - parser.add_argument("--grad_accum", type=int, default=1) # `--gpus` is reserved. Always set CUDA_VISIBLE_DEVICES to 0,1,2 ... n # TODO: PTL has a bug in gpu selection and it will always select gpus starting from 0 upward parser.add_argument("--gpu_count", type=int, default=1) - parser.add_argument("--resume", type=str, default=None) parser.add_argument("--num_tpu_cores", type=int, default=None) return parser From 62a69d594aa0a2a5c02d83458e36b8f91ffda15b Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 17 Jul 2020 13:51:56 -0700 Subject: [PATCH 018/112] wip --- scripts/pretrain.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 95ce577..a715c9c 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -24,6 +24,7 @@ # TODO: Try on multiple machines # TODO: try on a single TPU # TODO: try on a TPU-pod +# TODO: PTL bug: https://github.com/PyTorchLightning/pytorch-lightning/issues/2635 class MMapTextDataset(Dataset): @@ -157,7 +158,6 @@ def training_step(self, batch, batch_nb): def validation_step(self, batch, batch_nb): # TODO: log how long evaluation takes - # TODO: reproduce roberta evaluation numbers on the longformer corpus self.start_time = 0 # reset training_step timer loss = self(**batch) tensorboard_logs = { From 3e3a478317a7d0ecb2ca1983de92014889f4c543 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 17 Jul 2020 17:24:05 -0700 Subject: [PATCH 019/112] wip --- requirements.txt | 3 ++- scripts/pretrain.py | 10 +++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index 2279015..cbce7f0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ +pytorch-lightning @ git+http://github.com/ibeltagy/pytorch-lightning.git@v0.8.5_fixes#egg=pytorch-lightning + torch>=1.5.0 transformers==3.0.2 tensorboardX -pytorch-lightning==0.8.5 test-tube==0.7.5 diff --git a/scripts/pretrain.py b/scripts/pretrain.py index a715c9c..2f6b890 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -21,10 +21,12 @@ logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) +# DONE: reproduce RoBERTa numbers on the Longformer corpus # TODO: Try on multiple machines # TODO: try on a single TPU # TODO: try on a TPU-pod -# TODO: PTL bug: https://github.com/PyTorchLightning/pytorch-lightning/issues/2635 +# TODO: try restarting and double check optimizer, lr and lr scheduler +# TODO: try fp16 
class MMapTextDataset(Dataset): @@ -260,9 +262,8 @@ def add_args(parser): # Compute resources parser.add_argument("--num_workers", type=int, default=0) - # `--gpus` is reserved. Always set CUDA_VISIBLE_DEVICES to 0,1,2 ... n - # TODO: PTL has a bug in gpu selection and it will always select gpus starting from 0 upward - parser.add_argument("--gpu_count", type=int, default=1) + parser.add_argument("--gpu_count", type=int, default=1, # `--gpus` is reserved for internal use by PTL + help="Number of gpus. This respects `CUDA_VISIBLE_DEVICES`") parser.add_argument("--num_tpu_cores", type=int, default=None) return parser @@ -299,7 +300,6 @@ def main(args): args.val_every *= args.grad_accum # PTL is expecting number of batches_per_gpu trainer = ptl.Trainer( gpus=args.gpu_count, - auto_select_gpus=False, num_tpu_cores=args.num_tpu_cores, distributed_backend='ddp' if args.gpu_count > 1 else None, replace_sampler_ddp=False, From 3bc535461451ac37496d66147030ed7fc40429a3 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 17 Jul 2020 20:13:08 -0700 Subject: [PATCH 020/112] wip --- requirements.txt | 2 +- scripts/pretrain.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index cbce7f0..b396708 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ pytorch-lightning @ git+http://github.com/ibeltagy/pytorch-lightning.git@v0.8.5_fixes#egg=pytorch-lightning -torch>=1.5.0 +torch==1.3.0 transformers==3.0.2 tensorboardX test-tube==0.7.5 diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 2f6b890..3263537 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -26,7 +26,6 @@ # TODO: try on a single TPU # TODO: try on a TPU-pod # TODO: try restarting and double check optimizer, lr and lr scheduler -# TODO: try fp16 class MMapTextDataset(Dataset): @@ -313,6 +312,7 @@ def main(args): accumulate_grad_batches=args.grad_accum, resume_from_checkpoint=args.resume, gradient_clip_val=args.grad_clip, + precision=16, amp_level='O2', callbacks=[LearningRateLogger()] ) trainer.fit(pretrainer) From 1a91024e3470ee2c5ad8fa08d3daca1fc68a0e17 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 17 Jul 2020 22:25:48 -0700 Subject: [PATCH 021/112] wip --- scripts/pretrain.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 3263537..e79eb17 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -225,6 +225,12 @@ def train_dataloader(self): def val_dataloader(self): return self._get_loader(f'{self.args.input_dir}/cache/val.bin', False) + def grad_norm(self, norm_type): + # Override PTL `grad_norm` function to only return `total_grad_norm` instead norms of individual params + # TODO: grad_norm reporting needs to take fp16 loss scale into account + all_norms = [float(p.grad.data.norm(float(norm_type))) for p in self.parameters() if p.grad is not None] + return {'total_grad_norm': float(torch.tensor(all_norms).norm(norm_type))} + @staticmethod def add_args(parser): parser.add_argument("--seed", type=int, default=3) @@ -302,7 +308,7 @@ def main(args): num_tpu_cores=args.num_tpu_cores, distributed_backend='ddp' if args.gpu_count > 1 else None, replace_sampler_ddp=False, - track_grad_norm=-1, # TODO: add logging for gradient norm + track_grad_norm=2, max_epochs=10000, min_epochs=0, max_steps=args.train_steps, # run for many epochs, but stop after max_steps val_check_interval=args.val_every, limit_val_batches=args.val_batches, early_stop_callback=None, @@ -313,7 +319,7 @@ 
def main(args): resume_from_checkpoint=args.resume, gradient_clip_val=args.grad_clip, precision=16, amp_level='O2', - callbacks=[LearningRateLogger()] + callbacks=[LearningRateLogger()], ) trainer.fit(pretrainer) From 5fa21f24452a990864a31c65dcf9a780e9f47e2c Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 17 Jul 2020 22:32:46 -0700 Subject: [PATCH 022/112] wip --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index b396708..e47a1d5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ pytorch-lightning @ git+http://github.com/ibeltagy/pytorch-lightning.git@v0.8.5_fixes#egg=pytorch-lightning -torch==1.3.0 +torch==1.3.1 transformers==3.0.2 tensorboardX test-tube==0.7.5 From 18eb0036c49c8452bf109c99d339229688f0e714 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 18 Jul 2020 07:58:49 -0700 Subject: [PATCH 023/112] wip --- scripts/pretrain.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index e79eb17..5022aa8 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -22,7 +22,6 @@ logger = logging.getLogger(__name__) # DONE: reproduce RoBERTa numbers on the Longformer corpus -# TODO: Try on multiple machines # TODO: try on a single TPU # TODO: try on a TPU-pod # TODO: try restarting and double check optimizer, lr and lr scheduler @@ -259,7 +258,7 @@ def add_args(parser): parser.add_argument("--val_batches", type=int, default=1000, help='# evaluation **batches**') parser.add_argument("--weight_decay", type=float, default=0.01) parser.add_argument("--adam_epsilon", type=float, default=1e-6) - parser.add_argument("--grad_clip", type=float, default=0) + parser.add_argument("--grad_clip", type=float, default=0) # TODO: test this with fp16. Likely not working # RoBERTa's tokens_per_step = 2^18 = 512(seqlen) x 1(gpu_count) x 32(batch_size) x 16(grad_accum) parser.add_argument("--batch_size", type=int, default=32) @@ -269,6 +268,21 @@ def add_args(parser): parser.add_argument("--num_workers", type=int, default=0) parser.add_argument("--gpu_count", type=int, default=1, # `--gpus` is reserved for internal use by PTL help="Number of gpus. This respects `CUDA_VISIBLE_DEVICES`") + + # For multi-node training, use the PyTorch launch script. The script and instructions can be found here: + # https://github.com/pytorch/pytorch/blob/master/torch/distributed/launch.py. + # To run PTL in a mode compatible with the launch script, two things are needed: + # - pass the argument `--use_env` to `torch.distributed.launch` + # - make sure `--nproc_per_node` matches `--gpu_count` and `--nnodes` matches `--node_count`. + # For example, to run on 2 nodes, 3 gpus each, the command line on node rank 1 would be like: + # >>>> python -m torch.distributed.launch \ + # --use_env --nnodes 2 --nproc_per_node 3 \ + # --node_rank 1 --master_addr s2-server4 --master_port 12343 \ + # scripts/pretrain.py \ + # --gpu_count 2 --node_count 2 \ + # --input_dir my_data_dir --save_prefix test_multinode + parser.add_argument("--node_count", type=int, default=1, + help="Number of nodes. 
It needs to match --nnodes of torch.distributed.launch") parser.add_argument("--num_tpu_cores", type=int, default=None) return parser @@ -305,8 +319,9 @@ def main(args): args.val_every *= args.grad_accum # PTL is expecting number of batches_per_gpu trainer = ptl.Trainer( gpus=args.gpu_count, + num_nodes=args.node_count, num_tpu_cores=args.num_tpu_cores, - distributed_backend='ddp' if args.gpu_count > 1 else None, + distributed_backend='ddp' if (args.gpu_count > 1 or args.node_count > 1) else None, replace_sampler_ddp=False, track_grad_norm=2, max_epochs=10000, min_epochs=0, max_steps=args.train_steps, # run for many epochs, but stop after max_steps From 607e4465da794d20ef7006a2bf9dddc9250ecbc1 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 18 Jul 2020 09:41:34 -0700 Subject: [PATCH 024/112] wip --- scripts/pretrain.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 5022aa8..10165b0 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -22,9 +22,12 @@ logger = logging.getLogger(__name__) # DONE: reproduce RoBERTa numbers on the Longformer corpus +# DONE: testing ddp single machine +# DONE: testing ddp multiple machines +# DONE: testing resume from checkpoint # TODO: try on a single TPU # TODO: try on a TPU-pod -# TODO: try restarting and double check optimizer, lr and lr scheduler +# TODO: only one checkpoint per epoch is saved class MMapTextDataset(Dataset): @@ -168,7 +171,7 @@ def validation_step(self, batch, batch_nb): def validation_epoch_end(self, outputs): avg_loss = torch.stack([x['log']['val_mlm_loss'] for x in outputs if 'val_mlm_loss' in x['log']]).mean() if self.use_ddp: - # TODO: PTL already doing this. Is it still needed here? + # TODO: PTL is already doing this. Is it still needed here? # https://github.com/PyTorchLightning/pytorch-lightning/blob/0.8.5/pytorch_lightning/metrics/converters.py#L251 torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM) avg_loss /= torch.distributed.get_world_size() @@ -189,11 +192,10 @@ def configure_optimizers(self): "weight_decay": 0.0, }, ] - optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon) + optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.lr, eps=self.args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=self.args.train_steps ) - return [optimizer], [{"scheduler": scheduler, "interval": "step"}] def _get_loader(self, fname, is_train): @@ -247,11 +249,15 @@ def add_args(parser): # Checkpointing and logging parser.add_argument("--save_dir", type=str, default='runs/') - parser.add_argument("--save_prefix", type=str, required=True) - parser.add_argument("--resume", type=str, default=None) + parser.add_argument("--save_prefix", type=str, required=True, + help="path of output directory is --save_dir/--save_prefix") + parser.add_argument("--resume", type=str, default=None, # It is better to use a different output dir. + help="Path to a checkpoint to load model weights and training state. It overwrites args") + parser.add_argument("--resume_model_only", type=str, default=None, + help="Path to a checkpoint to load model weights but not training state") # Training hyperparams - parser.add_argument("--learning_rate", type=float, default=1e-5) + parser.add_argument("--lr", type=float, default=1e-5) parser.add_argument("--train_steps", type=int, default=3000, help='# training grad. 
updates') parser.add_argument("--warmup_steps", type=int, default=1000, help='# warmup grad. updates') parser.add_argument("--val_every", type=int, default=1000, help='# training grad. updates between evaluations') @@ -295,7 +301,10 @@ def main(args): if torch.cuda.is_available(): torch.cuda.manual_seed_all(args.seed * 10000) - pretrainer = Pretrainer(args) + if args.resume_model_only is not None: + pretrainer = Pretrainer.load_from_checkpoint(args.resume_model_only, args) + else: + pretrainer = Pretrainer(args) # logger here is a SummaryWritter for tensorboard # it is used by the trainer, and certain return variables From d4659deeabade71e46da1d2daecaf2e31ac60aab Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 18 Jul 2020 10:24:16 -0700 Subject: [PATCH 025/112] wip --- scripts/pretrain.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 10165b0..6015fa0 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -27,7 +27,6 @@ # DONE: testing resume from checkpoint # TODO: try on a single TPU # TODO: try on a TPU-pod -# TODO: only one checkpoint per epoch is saved class MMapTextDataset(Dataset): @@ -175,7 +174,6 @@ def validation_epoch_end(self, outputs): # https://github.com/PyTorchLightning/pytorch-lightning/blob/0.8.5/pytorch_lightning/metrics/converters.py#L251 torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM) avg_loss /= torch.distributed.get_world_size() - avg_loss = avg_loss.item() logs = {'val_mlm_loss': avg_loss} return {'log': logs, 'progress_bar': logs, "val_loss": avg_loss} @@ -320,9 +318,11 @@ def main(args): filepath=os.path.join(args.save_dir, args.save_prefix, 'checkpoint'), prefix='', save_top_k=3, + save_last=True, verbose=True, monitor='val_loss', mode='min', + period=-1, # to allow multiple checkpoints per epoch ) args.val_every *= args.grad_accum # PTL is expecting number of batches_per_gpu From c7c53cbde9c244fbc6b4f1119aeb61c268f80284 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sun, 19 Jul 2020 09:06:10 -0700 Subject: [PATCH 026/112] wip --- scripts/pretrain.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 6015fa0..683b008 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -27,6 +27,7 @@ # DONE: testing resume from checkpoint # TODO: try on a single TPU # TODO: try on a TPU-pod +# TODO: run on beaker on ai2-server1/2 class MMapTextDataset(Dataset): From 0a07daf8dd0cf2ca24592f4006338b2568cca23b Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 22 Jul 2020 20:16:38 +0000 Subject: [PATCH 027/112] wip --- scripts/cheatsheet.txt | 22 ++++++++++++++++ scripts/test_tpu.py | 57 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 scripts/test_tpu.py diff --git a/scripts/cheatsheet.txt b/scripts/cheatsheet.txt index be4fc3a..d39371e 100644 --- a/scripts/cheatsheet.txt +++ b/scripts/cheatsheet.txt @@ -70,3 +70,25 @@ python -m scripts.triviaqa_utils.evaluation_utils \ --prediction_file predictions.json # Output should be: {'exact_match': 73.07644188665083, 'f1': 77.78523804802242, 'common': 7993, 'denominator': 7993, 'pred_len': 7993, 'gold_len': 7993} + + +# TPU +export XRT_TPU_CONFIG="tpu_worker;0;$TPU_IP_ADDRESS:8470" + +source /anaconda3/bin/activate torch-xla-nightly + +import torch_xla.debug.metrics as met; print(met.metrics_report()) + +curl -X POST http://10.125.212.42:8475/requestversion/pytorch-dev20200722 + 
+/usr/share/torch-xla-nightly/pytorch/xla/scripts/debug_run.py --outfile debug.tar.gz -- python -u scripts/test_tpu.py + + XLA_IR_DEBUG=1 + XRT_TPU_CONFIG=tpu_worker;0;10.125.212.42:8470 + TF_CPP_LOG_THREAD_ID=1 + TF_CPP_MIN_LOG_LEVEL=0 + XLA_HLO_DEBUG=1 + XLA_DUMP_FATAL_STACK=1 + TF_CPP_VMODULE=tensor=5,computation_client=5,xrt_computation_client=5,aten_xla_type=1 + XLA_SAVE_TENSORS_FILE=/tmp/debug_run-beltagy4-beltagy/graphs + XLA_METRICS_FILE=/tmp/debug_run-beltagy4-beltagy/metrics diff --git a/scripts/test_tpu.py b/scripts/test_tpu.py new file mode 100644 index 0000000..618ee6f --- /dev/null +++ b/scripts/test_tpu.py @@ -0,0 +1,57 @@ +import os +import torch +from torch.utils.data import DataLoader, Dataset +from transformers import AutoModel +import pytorch_lightning as pl + +class CoolDataset(Dataset): + def __len__(self): + return 128 * 128 + + def __getitem__(self, idx): + return torch.tensor([1, 2, 3, 4,] * 128), torch.tensor([1, 1, 1, 1] * 128) + +class CoolSystem(pl.LightningModule): + + def __init__(self): + super().__init__() + + # self.model = AutoModel.from_pretrained('allenai/longformer-base-4096') + self.model = AutoModel.from_pretrained('bert-base-uncased') + + def forward(self, x, y): + return self.model(x, attention_mask=None) + + def training_step(self, batch, batch_idx): + x, y = batch + y_hat = self(x, y) + loss = y_hat[0].sum() # F.cross_entropy(y_hat, y) + tensorboard_logs = {'train_loss': loss} + return {'loss': loss, 'log': tensorboard_logs} + + def validation_step(self, batch, batch_idx): + x, y = batch + y_hat = self(x, y) + val_loss = y_hat[0].sum() # F.cross_entropy(y_hat, y) + return {'val_loss': val_loss} + + def validation_epoch_end(self, outputs): + avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean() + tensorboard_logs = {'val_loss': avg_loss} + return {'val_loss': avg_loss, 'log': tensorboard_logs} + + def configure_optimizers(self): + return torch.optim.Adam(self.parameters(), lr=0.001) + + def train_dataloader(self): + loader = DataLoader(CoolDataset(), batch_size=1, num_workers=0) + return loader + + def val_dataloader(self): + loader = DataLoader(CoolDataset(), batch_size=1, num_workers=0) + return loader + +if __name__ == '__main__': + model = CoolSystem() + trainer = pl.Trainer(num_tpu_cores=1, progress_bar_refresh_rate=1, max_epochs=10, num_sanity_val_steps=0) + trainer.fit(model) From 827576cdc958512d66d67c6cf2041f4f5c9a45de Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 22 Jul 2020 20:43:27 +0000 Subject: [PATCH 028/112] wip --- scripts/cheatsheet.txt | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/scripts/cheatsheet.txt b/scripts/cheatsheet.txt index d39371e..6dde8ce 100644 --- a/scripts/cheatsheet.txt +++ b/scripts/cheatsheet.txt @@ -82,13 +82,3 @@ import torch_xla.debug.metrics as met; print(met.metrics_report()) curl -X POST http://10.125.212.42:8475/requestversion/pytorch-dev20200722 /usr/share/torch-xla-nightly/pytorch/xla/scripts/debug_run.py --outfile debug.tar.gz -- python -u scripts/test_tpu.py - - XLA_IR_DEBUG=1 - XRT_TPU_CONFIG=tpu_worker;0;10.125.212.42:8470 - TF_CPP_LOG_THREAD_ID=1 - TF_CPP_MIN_LOG_LEVEL=0 - XLA_HLO_DEBUG=1 - XLA_DUMP_FATAL_STACK=1 - TF_CPP_VMODULE=tensor=5,computation_client=5,xrt_computation_client=5,aten_xla_type=1 - XLA_SAVE_TENSORS_FILE=/tmp/debug_run-beltagy4-beltagy/graphs - XLA_METRICS_FILE=/tmp/debug_run-beltagy4-beltagy/metrics From 5d0c8a2a6dd76b68e0cfa491c50b4714e6befbdd Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 22 Jul 2020 13:52:22 -0700 Subject: [PATCH 
029/112] wip --- longformer/longformer_encoder_decoder.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/longformer/longformer_encoder_decoder.py b/longformer/longformer_encoder_decoder.py index 741939a..67ab3e4 100644 --- a/longformer/longformer_encoder_decoder.py +++ b/longformer/longformer_encoder_decoder.py @@ -7,8 +7,11 @@ class LongformerEncoderDecoderForConditionalGeneration(BartForConditionalGeneration): def __init__(self, config): super().__init__(config) - for i, layer in enumerate(self.model.encoder.layers): - layer.self_attn = LongformerSelfAttentionForBart(config, layer_id=i) + if config.attention_mode == 'n2': + pass # do nothing, use BertSelfAttention instead + else: + for i, layer in enumerate(self.model.encoder.layers): + layer.self_attn = LongformerSelfAttentionForBart(config, layer_id=i) class LongformerEncoderDecoderConfig(BartConfig): From 413258aeb41f0ea6cf081c13be85d23e1e10be5f Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 22 Jul 2020 13:52:41 -0700 Subject: [PATCH 030/112] wip --- scripts/mem_profiler.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/scripts/mem_profiler.py b/scripts/mem_profiler.py index 64a6d56..4edc6b0 100644 --- a/scripts/mem_profiler.py +++ b/scripts/mem_profiler.py @@ -1,14 +1,17 @@ from longformer.longformer_encoder_decoder import LongformerEncoderDecoderForConditionalGeneration from longformer.longformer_encoder_decoder import LongformerEncoderDecoderConfig +from longformer.longformer import LongformerForMaskedLM +from longformer.longformer import LongformerConfig + import torch from torch.utils.data import DataLoader, Dataset from pytorch_lightning import Trainer import pytorch_lightning as pl -seqlen = 1024 * 12 -global_size = 0 # seqlen // 100 -attention_window = 512 # one sided +seqlen = 1024 * 8 +global_size = seqlen // 100 +attention_window = 256 # one sided class CoolDataset(Dataset): @@ -18,7 +21,7 @@ def __len__(self): def __getitem__(self, idx): tokne_ids = torch.tensor([5] * seqlen) mask = torch.tensor([1] * seqlen) - # mask[:global_size] = 2 + mask[:global_size] = 2 return tokne_ids, mask @@ -28,21 +31,29 @@ def __init__(self, hparams=None): super().__init__() self.hparams = hparams - config = LongformerEncoderDecoderConfig.from_pretrained('bart-long-4096') + # config = LongformerEncoderDecoderConfig.from_pretrained('bart-long-4096') + config = LongformerConfig.from_pretrained('roberta-large') config.max_position_embeddings = seqlen + 2 config.gradient_checkpointing = True - config.attention_mode = 'sliding_chunks' + # config.attention_mode = 'sliding_chunks' + config.attention_mode = 'n2' config.attention_window = [attention_window] * config.num_hidden_layers - self.model = LongformerEncoderDecoderForConditionalGeneration(config) + config.attention_dilation = [1] * config.num_hidden_layers + # self.model = LongformerEncoderDecoderForConditionalGeneration(config) + self.model = LongformerForMaskedLM(config) def forward(self, x, y): print(seqlen, global_size, attention_window, torch.cuda.max_memory_allocated(x.device) / 1024 ** 3) + # import ipdb; ipdb.set_trace() + # return self.model(x, attention_mask=y, decoder_input_ids=x[:, :attention_window * 2], use_cache=False) return self.model(x, attention_mask=y) def training_step(self, batch, batch_idx): + # import ipdb; ipdb.set_trace() x, y = batch y_hat = self(x, y) loss = y_hat[0].sum() + # import ipdb; ipdb.set_trace() return {'loss': loss} def configure_optimizers(self): @@ -53,6 +64,6 @@ def 
train_dataloader(self): if __name__ == '__main__': - model = MemoryProfiler() + model = MemoryProfiler(hparams={}) trainer = Trainer(gpus=[0], progress_bar_refresh_rate=1, max_epochs=1, amp_level='O2', use_amp=True) trainer.fit(model) From 1a6498c117e380978151cd0b3e91de9f74640e05 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 22 Jul 2020 21:45:45 +0000 Subject: [PATCH 031/112] tpu --- scripts/pretrain.py | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 683b008..93d83a4 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -30,6 +30,14 @@ # TODO: run on beaker on ai2-server1/2 +try: + import torch_xla.core.xla_model as xm +except ImportError: + XLA_AVAILABLE = False +else: + XLA_AVAILABLE = True + + class MMapTextDataset(Dataset): def __init__(self, mmap_filename, chunk_size): self.num_instances = np.memmap(mmap_filename, mode='r', dtype=np.uint16).shape[0] // chunk_size @@ -146,16 +154,17 @@ def training_step(self, batch, batch_nb): input_ids = batch['input_ids'] tensorboard_logs = { 'input_size': input_ids.numel(), - 'memory': torch.cuda.memory_allocated(loss.device) / 1024 ** 3, - 'mlm_loss': loss.detach(), - 'mlm_bpc': loss.detach()/math.log(2), - 'mlm_perplexity': torch.exp(loss).detach(), + 'mlm_loss': loss, + 'mlm_bpc': loss/math.log(2), + 'mlm_perplexity': torch.exp(loss), 'token_per_step': input_ids.numel() * self.args.grad_accum * self.trainer.world_size, } if self.start_time != 0: elapsed_time = time.time() - self.start_time tensorboard_logs['second_per_batch'] = elapsed_time self.start_time = time.time() + if not XLA_AVAILABLE: + tensorboard_logs['memory'] = torch.cuda.memory_allocated(loss.device) / 1024 ** 3 return {'loss': loss, 'log': tensorboard_logs} @@ -204,6 +213,14 @@ def _get_loader(self, fname, is_train): if self.trainer.use_ddp: sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=is_train) shuffle = False + elif self.trainer.use_tpu: + sampler = torch.utils.data.distributed.DistributedSampler( + dataset, + num_replicas=xm.xrt_world_size(), + rank=xm.get_ordinal(), + shuffle=is_train, + ) + shuffle = False else: sampler = None shuffle = is_train @@ -227,6 +244,10 @@ def val_dataloader(self): def grad_norm(self, norm_type): # Override PTL `grad_norm` function to only return `total_grad_norm` instead norms of individual params + + if XLA_AVAILABLE: + return {} # computing grad_norm one parameter at a time takes forever on TPU + # TODO: grad_norm reporting needs to take fp16 loss scale into account all_norms = [float(p.grad.data.norm(float(norm_type))) for p in self.parameters() if p.grad is not None] return {'total_grad_norm': float(torch.tensor(all_norms).norm(norm_type))} @@ -266,8 +287,8 @@ def add_args(parser): parser.add_argument("--grad_clip", type=float, default=0) # TODO: test this with fp16. Likely not working # RoBERTa's tokens_per_step = 2^18 = 512(seqlen) x 1(gpu_count) x 32(batch_size) x 16(grad_accum) - parser.add_argument("--batch_size", type=int, default=32) - parser.add_argument("--grad_accum", type=int, default=16) + parser.add_argument("--batch_size", type=int, default=8) + parser.add_argument("--grad_accum", type=int, default=1) # Compute resources parser.add_argument("--num_workers", type=int, default=0) @@ -288,7 +309,7 @@ def add_args(parser): # --input_dir my_data_dir --save_prefix test_multinode parser.add_argument("--node_count", type=int, default=1, help="Number of nodes. 
It needs to match --nnodes of torch.distributed.launch") - parser.add_argument("--num_tpu_cores", type=int, default=None) + parser.add_argument("--tpu_core_count", type=int, default=None) return parser @@ -330,20 +351,22 @@ def main(args): trainer = ptl.Trainer( gpus=args.gpu_count, num_nodes=args.node_count, - num_tpu_cores=args.num_tpu_cores, + num_tpu_cores=args.tpu_core_count, distributed_backend='ddp' if (args.gpu_count > 1 or args.node_count > 1) else None, replace_sampler_ddp=False, track_grad_norm=2, max_epochs=10000, min_epochs=0, max_steps=args.train_steps, # run for many epochs, but stop after max_steps val_check_interval=args.val_every, limit_val_batches=args.val_batches, early_stop_callback=None, - row_log_interval=10, + row_log_interval=16, + progress_bar_refresh_rate=16, logger=logger, checkpoint_callback=checkpoint_callback, accumulate_grad_batches=args.grad_accum, resume_from_checkpoint=args.resume, gradient_clip_val=args.grad_clip, precision=16, amp_level='O2', + num_sanity_val_steps=2, callbacks=[LearningRateLogger()], ) trainer.fit(pretrainer) From 3e82548dc8be6af6e582ad80534b944335b31c87 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 22 Jul 2020 15:47:11 -0700 Subject: [PATCH 032/112] wip --- scripts/pretrain.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 93d83a4..2202a39 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -26,6 +26,12 @@ # DONE: testing ddp multiple machines # DONE: testing resume from checkpoint # TODO: try on a single TPU +# - tie weights +# - tensorboard +# - getrank +# - barrier +# - val all_reduce +# - checkpointing # TODO: try on a TPU-pod # TODO: run on beaker on ai2-server1/2 @@ -257,7 +263,7 @@ def add_args(parser): parser.add_argument("--seed", type=int, default=3) # Dataset. Some of these params are only useful when generating the dataset cache - parser.add_argument("--input_dir", type=str, required=True) + parser.add_argument("--input_dir", type=str, default='/net/nfs.corp/s2-research/beltagy/longformer/data/') parser.add_argument("--train_dev_split", type=float, default=0.05) parser.add_argument("--padded_chunks", type=bool, default=False) parser.add_argument("--seqlen", type=int, default=512) @@ -269,7 +275,7 @@ def add_args(parser): # Checkpointing and logging parser.add_argument("--save_dir", type=str, default='runs/') - parser.add_argument("--save_prefix", type=str, required=True, + parser.add_argument("--save_prefix", type=str, default='test', help="path of output directory is --save_dir/--save_prefix") parser.add_argument("--resume", type=str, default=None, # It is better to use a different output dir. help="Path to a checkpoint to load model weights and training state. It overwrites args") @@ -291,6 +297,7 @@ def add_args(parser): parser.add_argument("--grad_accum", type=int, default=1) # Compute resources + parser.add_argument("--fp16", type=bool, default=False) parser.add_argument("--num_workers", type=int, default=0) parser.add_argument("--gpu_count", type=int, default=1, # `--gpus` is reserved for internal use by PTL help="Number of gpus. 
This respects `CUDA_VISIBLE_DEVICES`") @@ -365,7 +372,7 @@ def main(args): accumulate_grad_batches=args.grad_accum, resume_from_checkpoint=args.resume, gradient_clip_val=args.grad_clip, - precision=16, amp_level='O2', + precision=16 if args.fp16 else 32, amp_level='O2', num_sanity_val_steps=2, callbacks=[LearningRateLogger()], ) From adadd425c84c4993bb77d2af950b0af17d10079b Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 23 Jul 2020 02:54:28 +0000 Subject: [PATCH 033/112] wip --- scripts/cheatsheet.txt | 2 ++ scripts/pretrain.py | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/cheatsheet.txt b/scripts/cheatsheet.txt index 6dde8ce..1e77b07 100644 --- a/scripts/cheatsheet.txt +++ b/scripts/cheatsheet.txt @@ -82,3 +82,5 @@ import torch_xla.debug.metrics as met; print(met.metrics_report()) curl -X POST http://10.125.212.42:8475/requestversion/pytorch-dev20200722 /usr/share/torch-xla-nightly/pytorch/xla/scripts/debug_run.py --outfile debug.tar.gz -- python -u scripts/test_tpu.py + +/usr/share/torch-xla-nightly/pytorch/xla/scripts/debug_run.py --outfile debug.tar.gz -- python -u scripts/pretrain.py --input_dir data/ --save_prefix test_xla_2 --gpu_count 0 --tpu_core_count 1 --val_batches 4 --val_every 130 --num_workers 0 --log_rate 1 --model allenai/longformer-base-4096 diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 2202a39..ab1120a 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -281,6 +281,7 @@ def add_args(parser): help="Path to a checkpoint to load model weights and training state. It overwrites args") parser.add_argument("--resume_model_only", type=str, default=None, help="Path to a checkpoint to load model weights but not training state") + parser.add_argument("--log_rate", type=int, default=16) # Training hyperparams parser.add_argument("--lr", type=float, default=1e-5) @@ -365,8 +366,8 @@ def main(args): max_epochs=10000, min_epochs=0, max_steps=args.train_steps, # run for many epochs, but stop after max_steps val_check_interval=args.val_every, limit_val_batches=args.val_batches, early_stop_callback=None, - row_log_interval=16, - progress_bar_refresh_rate=16, + row_log_interval=args.log_rate, + progress_bar_refresh_rate=args.log_rate, logger=logger, checkpoint_callback=checkpoint_callback, accumulate_grad_batches=args.grad_accum, From 9e191a0b4ecd763e862075a6097366561a99523e Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 15 Jul 2020 23:19:32 -0700 Subject: [PATCH 034/112] pretraining script --- requirements.txt | 6 +- scripts/pretrain.py | 277 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 280 insertions(+), 3 deletions(-) create mode 100644 scripts/pretrain.py diff --git a/requirements.txt b/requirements.txt index 5b004e7..75cbab1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -torch>=1.2.0 -transformers>=3.0.2 +torch>=1.5.0 +transformers>=3.0.1 tensorboardX -pytorch-lightning==0.6.0 +pytorch-lightning==0.7.6 test-tube==0.7.5 diff --git a/scripts/pretrain.py b/scripts/pretrain.py new file mode 100644 index 0000000..060ab0e --- /dev/null +++ b/scripts/pretrain.py @@ -0,0 +1,277 @@ +import argparse +import glob +import os +import random +import logging +import numpy as np +from tqdm import tqdm +import torch +from transformers import AutoTokenizer, AutoModelWithLMHead +from transformers import DataCollatorForLanguageModeling +from transformers.optimization import AdamW, get_linear_schedule_with_warmup + +from torch.utils.data import Dataset, DataLoader +import pytorch_lightning as ptl 
+from pytorch_lightning.logging.test_tube import TestTubeLogger +from pytorch_lightning.callbacks import ModelCheckpoint + + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +class MMapTextDataset(Dataset): + def __init__(self, mmap_filename, chunk_size): + self.num_instances = np.memmap(mmap_filename, mode='r', dtype=np.uint16).shape[0] // chunk_size + # defer loading the token_ids memmap until after the first __getitem__ call. + # when spawning new processes for ddp, there is a hard limit in python < 3.8 that + # pickle files need to be < 4GB. By waiting until after the first __getitem__ we + # don't have to pickle the memmap + self.token_ids = None + self._mmap_filename = mmap_filename + self._chunk_size = chunk_size + + def __len__(self): + return self.num_instances + + def __getitem__(self, i): + if self.token_ids is None: + self.token_ids = np.memmap(self._mmap_filename, mode='r', dtype=np.uint16, + shape=(self.num_instances, self._chunk_size)) + return torch.tensor(self.token_ids[i, :].astype(np.int32), dtype=torch.long) + + @staticmethod + def raw_text_to_mmap(args): + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) + assert len(tokenizer) < 65535 # will use uint16 to store token ids + all_files = glob.glob(f'{args.input_dir}/*.txt') + + if os.path.exists(f'{args.input_dir}/cache/'): + logger.info("Cache already exists. Remove the cache directory to regenerate") + return + os.mkdir(f'{args.input_dir}/cache/') + train_chunks = [] + val_chunks = [] + + # TODO: process each shared in a separate worker + # TODO: support multiple documents in one chunk instead of padding + for fname in tqdm(all_files): + with open(fname, 'r') as fin: + for line in tqdm(fin): + if line.strip() == '': # drop empty lines + continue + chunks_list = train_chunks if random.random() > args.train_dev_split else val_chunks + tokens = tokenizer.tokenize(line) # each line is one document + # generate chunks of length args.seqlen. The last chunk will be padded. 
+ # padding last chunk is not great for longformer because many chunks will be mostly padding + current_chunk = [tokenizer.bos_token] + for token in tokens: + if len(current_chunk) == args.seqlen - 1: # chunk is full + current_chunk.append(tokenizer.eos_token) + chunks_list.append(current_chunk) + current_chunk = [tokenizer.bos_token] + current_chunk.append(token) + current_chunk.extend([tokenizer.pad_token] * (args.seqlen - len(current_chunk))) + current_chunk[args.seqlen - 1] = tokenizer.eos_token + chunks_list.append(current_chunk) + + def _tokenized_text_to_mmap(output_fname, chunks_list): + random.shuffle(chunks_list) + num_chunks = len(chunks_list) + all_token_ids = np.empty((num_chunks, args.seqlen), dtype=np.uint16) + for k, chunk in enumerate(tqdm(chunks_list)): + token_ids = tokenizer.convert_tokens_to_ids(chunk) + assert len(token_ids) == args.seqlen + all_token_ids[k, :] = [int(t) for t in token_ids] + fp = np.memmap(output_fname, dtype=np.uint16, mode='w+', shape=(num_chunks, args.seqlen)) + fp[:, :] = all_token_ids[:, :] + fp.flush() + del fp + + _tokenized_text_to_mmap(f'{args.input_dir}/cache/train.bin', train_chunks) + _tokenized_text_to_mmap(f'{args.input_dir}/cache/val.bin', val_chunks) + + +class Pretrainer(ptl.LightningModule): + + def __init__(self, hparams): + super().__init__() + + self.args = hparams + self.hparams = self.args + + self.model = AutoModelWithLMHead.from_pretrained(args.model) + self.config = self.model.config + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) + self.pad_token_id = tokenizer.pad_token_id + + logger.info(f'Creating dataset cache from dir {self.args.input_dir}. This could be slow the first time.') + MMapTextDataset.raw_text_to_mmap(args) + + # TODO: add support for other objective functions + self.data_collator = DataCollatorForLanguageModeling( + tokenizer=tokenizer, mlm=True, mlm_probability=self.args.mlm_prob + ) + + def forward(self, input_ids=None, labels=None, loss_only=True): + # get the padding mask - 1 for NOT masked, 0 for MASKED/PAD + attention_mask = (input_ids != self.pad_token_id).int() + + if labels is not None: + # output is loss, prediction_scores, hidden_states + output = self.model(input_ids=input_ids, attention_mask=attention_mask, masked_lm_labels=labels) + if loss_only: + return output[0] + else: + return {"loss": output[0], "hidden_states": output[2]} + else: + # don't need to run the lm_head + assert not loss_only + output = self.model.roberta(input_ids=input_ids, attention_mask=attention_mask) + return {"hidden_states": output[2]} + + def training_step(self, batch, batch_nb): + loss = self(**batch) + tensorboard_logs = { + 'mlm_loss': loss.detach(), + 'mlm_perplexity': torch.exp(loss).detach(), + } + return {'loss': loss, 'log': tensorboard_logs} + + def validation_step(self, batch, batch_nb): + loss = self(**batch) + tensorboard_logs = { + 'val_mlm_loss': loss.detach(), + } + return {'val_loss': tensorboard_logs["val_mlm_loss"], 'log': tensorboard_logs} + + def validation_epoch_end(self, outputs): + avg_loss = torch.stack([x['log']['val_mlm_loss'] for x in outputs if 'val_mlm_loss' in x['log']]).mean() + if self.use_ddp: + avg_loss = torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM) + avg_loss /= torch.distributed.get_world_size() + avg_loss = avg_loss.item() + logs = {'val_mlm_loss': avg_loss} + return {'log': logs, 'progress_bar': logs, "val_loss": avg_loss} + + def configure_optimizers(self): + no_decay = ["bias", "LayerNorm.weight"] + + optimizer_grouped_parameters = [ + { + 
"params": [p for n, p in self.named_parameters() if not any(nd in n for nd in no_decay) and p.requires_grad], + "weight_decay": self.args.weight_decay, + }, + { + "params": [p for n, p in self.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad], + "weight_decay": 0.0, + }, + ] + optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon) + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=self.args.training_steps + ) + + return [optimizer], [{"scheduler": scheduler, "interval": "step"}] + + def _get_loader(self, fname, is_train): + dataset = MMapTextDataset(fname, chunk_size=self.args.seqlen) + + if self.trainer.use_ddp: + sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=is_train) + shuffle = False + else: + sampler = None + shuffle = is_train + + loader = DataLoader( + dataset, + batch_size=self.args.batch_size, + shuffle=shuffle, + sampler=sampler, + num_workers=self.args.num_workers, + collate_fn=self.data_collator, + drop_last=is_train, + ) + return loader + + def train_dataloader(self): + return self._get_loader(f'{self.args.input_dir}/cache/train.bin', True) + + def val_dataloader(self): + return self._get_loader(f'{self.args.input_dir}/cache/val.bin', False) + + @staticmethod + def add_args(parser): + parser.add_argument("--seed", type=int, default=3) + parser.add_argument("--input_dir", type=str, required=True) + parser.add_argument("--save_dir", type=str, default='runs/') + parser.add_argument("--save_prefix", type=str, required=True) + parser.add_argument("--train_dev_split", type=float, default=0.05) + parser.add_argument("--seqlen", type=int, default=512) + parser.add_argument("--tokenizer", type=str, default='roberta-base') + parser.add_argument("--model", type=str, default='roberta-base') + parser.add_argument("--mlm_prob", type=float, default=0.15) + parser.add_argument("--weight_decay", type=float, default=0.01) + parser.add_argument("--learning_rate", type=float, default=1e-5) + parser.add_argument("--adam_epsilon", type=float, default=1e-6) + parser.add_argument("--training_steps", type=int, default=0.01) + parser.add_argument("--warmup_steps", type=int, default=1000) + parser.add_argument("--batch_size", type=int, default=8) + parser.add_argument("--num_workers", type=int, default=0) + parser.add_argument("--grad_accum", type=int, default=1) + parser.add_argument("--gpus", type=str, default='0') + parser.add_argument("--resume", type=str, default=None) + parser.add_argument("--num_tpu_cores", type=int, default=None) + + return parser + + +def main(args): + random.seed(args.seed * 10) + np.random.seed(args.seed * 100) + torch.manual_seed(args.seed * 1000) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(args.seed * 10000) + + pretrainer = Pretrainer(args) + + # logger here is a SummaryWritter for tensorboard + # it is used by the trainer, and certain return variables + # from the model are automatically logged + logger = TestTubeLogger( + save_dir=args.save_dir, + name=args.save_prefix, + version=0 # always use version=0 + ) + + checkpoint_callback = ModelCheckpoint( + # model saved to filepath/prefix_.... 
+ filepath=os.path.join(args.save_dir, args.save_prefix, 'checkpoint'), + prefix='', + save_top_k=3, + verbose=True, + monitor='val_loss', + mode='min', + ) + + args.gpus = [int(x) for x in args.gpus.split(',')] + trainer = ptl.Trainer( + gpus=args.gpus, + num_tpu_cores=args.num_tpu_cores, + distributed_backend='ddp' if len(args.gpus) > 1 else None, + track_grad_norm=-1, + max_epochs=10000, min_epochs=0, max_steps=args.training_steps, # run for many epochs, but stop after max_steps + early_stop_callback=None, + row_log_interval=25, + logger=logger, + checkpoint_callback=checkpoint_callback, + resume_from_checkpoint=args.resume, + ) + trainer.fit(pretrainer) + + +if __name__ == "__main__": + parser = Pretrainer.add_args(argparse.ArgumentParser(description="pretrain")) + args = parser.parse_args() + main(args) From 9d18808eaa5debf1f8d0b474558c79e5072788fc Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 16 Jul 2020 14:12:22 -0700 Subject: [PATCH 035/112] wip --- requirements.txt | 4 +-- scripts/pretrain.py | 68 ++++++++++++++++++++++++++++----------------- 2 files changed, 44 insertions(+), 28 deletions(-) diff --git a/requirements.txt b/requirements.txt index 75cbab1..2279015 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ torch>=1.5.0 -transformers>=3.0.1 +transformers==3.0.2 tensorboardX -pytorch-lightning==0.7.6 +pytorch-lightning==0.8.5 test-tube==0.7.5 diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 060ab0e..014c33f 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -5,6 +5,7 @@ import logging import numpy as np from tqdm import tqdm +import time import torch from transformers import AutoTokenizer, AutoModelWithLMHead from transformers import DataCollatorForLanguageModeling @@ -13,7 +14,7 @@ from torch.utils.data import Dataset, DataLoader import pytorch_lightning as ptl from pytorch_lightning.logging.test_tube import TestTubeLogger -from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateLogger logging.basicConfig(level=logging.INFO) @@ -112,33 +113,36 @@ def __init__(self, hparams): self.data_collator = DataCollatorForLanguageModeling( tokenizer=tokenizer, mlm=True, mlm_probability=self.args.mlm_prob ) + self.start_time = 0 - def forward(self, input_ids=None, labels=None, loss_only=True): + def forward(self, input_ids=None, labels=None): # get the padding mask - 1 for NOT masked, 0 for MASKED/PAD attention_mask = (input_ids != self.pad_token_id).int() - if labels is not None: - # output is loss, prediction_scores, hidden_states - output = self.model(input_ids=input_ids, attention_mask=attention_mask, masked_lm_labels=labels) - if loss_only: - return output[0] - else: - return {"loss": output[0], "hidden_states": output[2]} - else: - # don't need to run the lm_head - assert not loss_only - output = self.model.roberta(input_ids=input_ids, attention_mask=attention_mask) - return {"hidden_states": output[2]} + # output is loss, prediction_scores, hidden_states + output = self.model(input_ids=input_ids, attention_mask=attention_mask, masked_lm_labels=labels) + return output[0] # loss def training_step(self, batch, batch_nb): loss = self(**batch) + input_ids = batch['input_ids'] tensorboard_logs = { + 'input_size': input_ids.numel(), + 'memory': torch.cuda.memory_allocated(loss.device) / 1024 ** 3, + 'lr': self.trainer.optimizers[0].param_groups[0]['lr'], 'mlm_loss': loss.detach(), 'mlm_perplexity': torch.exp(loss).detach(), + 'tokens per step': input_ids.numel() * 
self.args.grad_accum * self.trainer.world_size, } + if self.start_time != 0: + elapsed_time = time.time() - self.start_time + tensorboard_logs['time per batch'] = elapsed_time + self.start_time = time.time() + return {'loss': loss, 'log': tensorboard_logs} def validation_step(self, batch, batch_nb): + self.start_time = 0 # reset training_step timer loss = self(**batch) tensorboard_logs = { 'val_mlm_loss': loss.detach(), @@ -148,7 +152,7 @@ def validation_step(self, batch, batch_nb): def validation_epoch_end(self, outputs): avg_loss = torch.stack([x['log']['val_mlm_loss'] for x in outputs if 'val_mlm_loss' in x['log']]).mean() if self.use_ddp: - avg_loss = torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM) + torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM) avg_loss /= torch.distributed.get_world_size() avg_loss = avg_loss.item() logs = {'val_mlm_loss': avg_loss} @@ -169,7 +173,7 @@ def configure_optimizers(self): ] optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( - optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=self.args.training_steps + optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=self.args.train_steps ) return [optimizer], [{"scheduler": scheduler, "interval": "step"}] @@ -215,12 +219,17 @@ def add_args(parser): parser.add_argument("--weight_decay", type=float, default=0.01) parser.add_argument("--learning_rate", type=float, default=1e-5) parser.add_argument("--adam_epsilon", type=float, default=1e-6) - parser.add_argument("--training_steps", type=int, default=0.01) - parser.add_argument("--warmup_steps", type=int, default=1000) + parser.add_argument("--grad_clip", type=float, default=0) + parser.add_argument("--warmup_steps", type=int, default=30, help='# warmup gradient updates') + parser.add_argument("--train_steps", type=int, default=100, help='# training gradient updates') + parser.add_argument("--val_every", type=int, default=25, help='# training gradient updates between evaluations') + parser.add_argument("--val_batches", type=int, default=20, help='# evaluation **batches**') parser.add_argument("--batch_size", type=int, default=8) parser.add_argument("--num_workers", type=int, default=0) parser.add_argument("--grad_accum", type=int, default=1) - parser.add_argument("--gpus", type=str, default='0') + # `--gpus` is reserved. Always set CUDA_VISIBLE_DEVICES to 0,1,2 ... 
n + # FIXME: PTL has a bug in gpu selection and it will always select gpus starting from 0 upward + parser.add_argument("--gpu_count", type=int, default=1) parser.add_argument("--resume", type=str, default=None) parser.add_argument("--num_tpu_cores", type=int, default=None) @@ -255,18 +264,25 @@ def main(args): mode='min', ) - args.gpus = [int(x) for x in args.gpus.split(',')] + # TODO: try gradient accumulation + + args.val_every_batches = args.val_every * args.grad_accum # convert val_every_steps to val_every_batches trainer = ptl.Trainer( - gpus=args.gpus, + gpus=args.gpu_count, + auto_select_gpus=False, num_tpu_cores=args.num_tpu_cores, - distributed_backend='ddp' if len(args.gpus) > 1 else None, - track_grad_norm=-1, - max_epochs=10000, min_epochs=0, max_steps=args.training_steps, # run for many epochs, but stop after max_steps + distributed_backend='ddp' if args.gpu_count > 1 else None, + replace_sampler_ddp=False, + track_grad_norm=-1, # TODO: add logging for gradient norm + max_epochs=10000, min_epochs=0, max_steps=args.train_steps, # run for many epochs, but stop after max_steps + val_check_interval=args.val_every_batches, limit_val_batches=args.val_batches, early_stop_callback=None, - row_log_interval=25, + row_log_interval=10, logger=logger, - checkpoint_callback=checkpoint_callback, + checkpoint_callback=None, # FIXME: checkpoint_callback, resume_from_checkpoint=args.resume, + gradient_clip_val=args.grad_clip, + callbacks=[LearningRateLogger()] ) trainer.fit(pretrainer) From 6e24cee0c5f25a2e9d79d8847e74cc2e96618040 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 16 Jul 2020 17:32:04 -0700 Subject: [PATCH 036/112] wip --- scripts/pretrain.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 014c33f..4eb08b5 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -20,6 +20,9 @@ logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) +# TODO: Try on multiple machines +# TODO: try on a single TPU +# TODO: try on a TPU-pod class MMapTextDataset(Dataset): def __init__(self, mmap_filename, chunk_size): @@ -129,19 +132,19 @@ def training_step(self, batch, batch_nb): tensorboard_logs = { 'input_size': input_ids.numel(), 'memory': torch.cuda.memory_allocated(loss.device) / 1024 ** 3, - 'lr': self.trainer.optimizers[0].param_groups[0]['lr'], 'mlm_loss': loss.detach(), 'mlm_perplexity': torch.exp(loss).detach(), - 'tokens per step': input_ids.numel() * self.args.grad_accum * self.trainer.world_size, + 'token_per_step': input_ids.numel() * self.args.grad_accum * self.trainer.world_size, } if self.start_time != 0: elapsed_time = time.time() - self.start_time - tensorboard_logs['time per batch'] = elapsed_time + tensorboard_logs['second_per_batch'] = elapsed_time self.start_time = time.time() return {'loss': loss, 'log': tensorboard_logs} def validation_step(self, batch, batch_nb): + # TODO: log how long evaluation takes self.start_time = 0 # reset training_step timer loss = self(**batch) tensorboard_logs = { @@ -228,7 +231,7 @@ def add_args(parser): parser.add_argument("--num_workers", type=int, default=0) parser.add_argument("--grad_accum", type=int, default=1) # `--gpus` is reserved. Always set CUDA_VISIBLE_DEVICES to 0,1,2 ... 
n - # FIXME: PTL has a bug in gpu selection and it will always select gpus starting from 0 upward + # TODO: PTL has a bug in gpu selection and it will always select gpus starting from 0 upward parser.add_argument("--gpu_count", type=int, default=1) parser.add_argument("--resume", type=str, default=None) parser.add_argument("--num_tpu_cores", type=int, default=None) @@ -264,9 +267,7 @@ def main(args): mode='min', ) - # TODO: try gradient accumulation - - args.val_every_batches = args.val_every * args.grad_accum # convert val_every_steps to val_every_batches + args.val_every *= args.grad_accum # PTL is expecting number of batches_per_gpu trainer = ptl.Trainer( gpus=args.gpu_count, auto_select_gpus=False, @@ -275,11 +276,12 @@ def main(args): replace_sampler_ddp=False, track_grad_norm=-1, # TODO: add logging for gradient norm max_epochs=10000, min_epochs=0, max_steps=args.train_steps, # run for many epochs, but stop after max_steps - val_check_interval=args.val_every_batches, limit_val_batches=args.val_batches, + val_check_interval=args.val_every, limit_val_batches=args.val_batches, early_stop_callback=None, row_log_interval=10, logger=logger, - checkpoint_callback=None, # FIXME: checkpoint_callback, + checkpoint_callback=checkpoint_callback, + accumulate_grad_batches=args.grad_accum, resume_from_checkpoint=args.resume, gradient_clip_val=args.grad_clip, callbacks=[LearningRateLogger()] From a2ab9b353a375cc5e7b8a23721fab86af7666ae5 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 16 Jul 2020 18:02:36 -0700 Subject: [PATCH 037/112] wip --- scripts/pretrain.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 4eb08b5..34775a7 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -145,6 +145,7 @@ def training_step(self, batch, batch_nb): def validation_step(self, batch, batch_nb): # TODO: log how long evaluation takes + # TODO: reproduce roberta evaluation numbers on the longformer corpus self.start_time = 0 # reset training_step timer loss = self(**batch) tensorboard_logs = { From e3f4ba9816e9d99d19b114eb65baf508b9122f7a Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 16 Jul 2020 21:13:52 -0700 Subject: [PATCH 038/112] wip --- scripts/pretrain.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 34775a7..232a4a3 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -46,7 +46,7 @@ def __getitem__(self, i): @staticmethod def raw_text_to_mmap(args): - tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, use_fast=True) assert len(tokenizer) < 65535 # will use uint16 to store token ids all_files = glob.glob(f'{args.input_dir}/*.txt') @@ -59,6 +59,7 @@ def raw_text_to_mmap(args): # TODO: process each shared in a separate worker # TODO: support multiple documents in one chunk instead of padding + # TODO: replace the in memory lists `train_chunks` and `train_chunks` with files for fname in tqdm(all_files): with open(fname, 'r') as fin: for line in tqdm(fin): From f9e654b24c4dbe33901db275bcf52a3ac3271693 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 16 Jul 2020 22:09:32 -0700 Subject: [PATCH 039/112] . 
--- scripts/pretrain.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 232a4a3..a5171ad 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -24,6 +24,7 @@ # TODO: try on a single TPU # TODO: try on a TPU-pod + class MMapTextDataset(Dataset): def __init__(self, mmap_filename, chunk_size): self.num_instances = np.memmap(mmap_filename, mode='r', dtype=np.uint16).shape[0] // chunk_size @@ -57,9 +58,9 @@ def raw_text_to_mmap(args): train_chunks = [] val_chunks = [] - # TODO: process each shared in a separate worker + # TODO: process each shared in a separate worker and save their output to files # TODO: support multiple documents in one chunk instead of padding - # TODO: replace the in memory lists `train_chunks` and `train_chunks` with files + for fname in tqdm(all_files): with open(fname, 'r') as fin: for line in tqdm(fin): From 9c2646da2061f5ac64d1beaac4e61d836748613c Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 17 Jul 2020 08:11:10 -0700 Subject: [PATCH 040/112] pad chunks or start next doc --- scripts/pretrain.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index a5171ad..c787344 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -55,34 +55,42 @@ def raw_text_to_mmap(args): logger.info("Cache already exists. Remove the cache directory to regenerate") return os.mkdir(f'{args.input_dir}/cache/') - train_chunks = [] - val_chunks = [] # TODO: process each shared in a separate worker and save their output to files - # TODO: support multiple documents in one chunk instead of padding + chunks_list = [] for fname in tqdm(all_files): with open(fname, 'r') as fin: + current_chunk = [tokenizer.bos_token] for line in tqdm(fin): if line.strip() == '': # drop empty lines continue - chunks_list = train_chunks if random.random() > args.train_dev_split else val_chunks tokens = tokenizer.tokenize(line) # each line is one document # generate chunks of length args.seqlen. The last chunk will be padded. 
# padding last chunk is not great for longformer because many chunks will be mostly padding - current_chunk = [tokenizer.bos_token] + for token in tokens: if len(current_chunk) == args.seqlen - 1: # chunk is full current_chunk.append(tokenizer.eos_token) chunks_list.append(current_chunk) current_chunk = [tokenizer.bos_token] current_chunk.append(token) - current_chunk.extend([tokenizer.pad_token] * (args.seqlen - len(current_chunk))) - current_chunk[args.seqlen - 1] = tokenizer.eos_token - chunks_list.append(current_chunk) + if args.padded_chunks: + # fill the rest of the seqlen with pad + current_chunk.extend([tokenizer.pad_token] * (args.seqlen - len(current_chunk))) + current_chunk[args.seqlen - 1] = tokenizer.eos_token + chunks_list.append(current_chunk) + current_chunk = [tokenizer.bos_token] + else: + # one long doc with sep inbetween + if len(current_chunk) < args.seqlen - 1: + current_chunk.append(tokenizer.sep_token) + random.shuffle(chunks_list) + val_count = int(args.train_dev_split * len(chunks_list)) + val_chunks = chunks_list[:val_count] + train_chunks = chunks_list[val_count:] def _tokenized_text_to_mmap(output_fname, chunks_list): - random.shuffle(chunks_list) num_chunks = len(chunks_list) all_token_ids = np.empty((num_chunks, args.seqlen), dtype=np.uint16) for k, chunk in enumerate(tqdm(chunks_list)): @@ -222,6 +230,7 @@ def add_args(parser): parser.add_argument("--tokenizer", type=str, default='roberta-base') parser.add_argument("--model", type=str, default='roberta-base') parser.add_argument("--mlm_prob", type=float, default=0.15) + parser.add_argument("--padded_chunks", type=bool, default=False) parser.add_argument("--weight_decay", type=float, default=0.01) parser.add_argument("--learning_rate", type=float, default=1e-5) parser.add_argument("--adam_epsilon", type=float, default=1e-6) From 433a2e29d16f263cec3d9165191d819d87ac1e4b Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 17 Jul 2020 08:18:24 -0700 Subject: [PATCH 041/112] todo --- scripts/pretrain.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index c787344..d2c5378 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -166,6 +166,8 @@ def validation_step(self, batch, batch_nb): def validation_epoch_end(self, outputs): avg_loss = torch.stack([x['log']['val_mlm_loss'] for x in outputs if 'val_mlm_loss' in x['log']]).mean() if self.use_ddp: + # TODO: PTL already doing this. Is it still needed here? 
+ # https://github.com/PyTorchLightning/pytorch-lightning/blob/0.8.5/pytorch_lightning/metrics/converters.py#L251 torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM) avg_loss /= torch.distributed.get_world_size() avg_loss = avg_loss.item() @@ -195,6 +197,7 @@ def configure_optimizers(self): def _get_loader(self, fname, is_train): dataset = MMapTextDataset(fname, chunk_size=self.args.seqlen) + # TODO: consider `replace_sampler_ddp=True` and removing the following if statement if self.trainer.use_ddp: sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=is_train) shuffle = False From ec472709f678e0aeee1d9071cb9919fb5a5a7179 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 17 Jul 2020 09:22:56 -0700 Subject: [PATCH 042/112] wip --- scripts/pretrain.py | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index d2c5378..95ce577 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -4,6 +4,7 @@ import random import logging import numpy as np +import math from tqdm import tqdm import time import torch @@ -143,6 +144,7 @@ def training_step(self, batch, batch_nb): 'input_size': input_ids.numel(), 'memory': torch.cuda.memory_allocated(loss.device) / 1024 ** 3, 'mlm_loss': loss.detach(), + 'mlm_bpc': loss.detach()/math.log(2), 'mlm_perplexity': torch.exp(loss).detach(), 'token_per_step': input_ids.numel() * self.args.grad_accum * self.trainer.world_size, } @@ -225,30 +227,42 @@ def val_dataloader(self): @staticmethod def add_args(parser): parser.add_argument("--seed", type=int, default=3) + + # Dataset. Some of these params are only useful when generating the dataset cache parser.add_argument("--input_dir", type=str, required=True) - parser.add_argument("--save_dir", type=str, default='runs/') - parser.add_argument("--save_prefix", type=str, required=True) parser.add_argument("--train_dev_split", type=float, default=0.05) + parser.add_argument("--padded_chunks", type=bool, default=False) parser.add_argument("--seqlen", type=int, default=512) + parser.add_argument("--mlm_prob", type=float, default=0.15) + + # HF model loading parser.add_argument("--tokenizer", type=str, default='roberta-base') parser.add_argument("--model", type=str, default='roberta-base') - parser.add_argument("--mlm_prob", type=float, default=0.15) - parser.add_argument("--padded_chunks", type=bool, default=False) - parser.add_argument("--weight_decay", type=float, default=0.01) + + # Checkpointing and logging + parser.add_argument("--save_dir", type=str, default='runs/') + parser.add_argument("--save_prefix", type=str, required=True) + parser.add_argument("--resume", type=str, default=None) + + # Training hyperparams parser.add_argument("--learning_rate", type=float, default=1e-5) + parser.add_argument("--train_steps", type=int, default=3000, help='# training grad. updates') + parser.add_argument("--warmup_steps", type=int, default=1000, help='# warmup grad. updates') + parser.add_argument("--val_every", type=int, default=1000, help='# training grad. 
updates between evaluations') + parser.add_argument("--val_batches", type=int, default=1000, help='# evaluation **batches**') + parser.add_argument("--weight_decay", type=float, default=0.01) parser.add_argument("--adam_epsilon", type=float, default=1e-6) parser.add_argument("--grad_clip", type=float, default=0) - parser.add_argument("--warmup_steps", type=int, default=30, help='# warmup gradient updates') - parser.add_argument("--train_steps", type=int, default=100, help='# training gradient updates') - parser.add_argument("--val_every", type=int, default=25, help='# training gradient updates between evaluations') - parser.add_argument("--val_batches", type=int, default=20, help='# evaluation **batches**') - parser.add_argument("--batch_size", type=int, default=8) + + # RoBERTa's tokens_per_step = 2^18 = 512(seqlen) x 1(gpu_count) x 32(batch_size) x 16(grad_accum) + parser.add_argument("--batch_size", type=int, default=32) + parser.add_argument("--grad_accum", type=int, default=16) + + # Compute resources parser.add_argument("--num_workers", type=int, default=0) - parser.add_argument("--grad_accum", type=int, default=1) # `--gpus` is reserved. Always set CUDA_VISIBLE_DEVICES to 0,1,2 ... n # TODO: PTL has a bug in gpu selection and it will always select gpus starting from 0 upward parser.add_argument("--gpu_count", type=int, default=1) - parser.add_argument("--resume", type=str, default=None) parser.add_argument("--num_tpu_cores", type=int, default=None) return parser From 77e105d14a65c17878f07e33bc19979d8c57cf98 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 17 Jul 2020 13:51:56 -0700 Subject: [PATCH 043/112] wip --- scripts/pretrain.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 95ce577..a715c9c 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -24,6 +24,7 @@ # TODO: Try on multiple machines # TODO: try on a single TPU # TODO: try on a TPU-pod +# TODO: PTL bug: https://github.com/PyTorchLightning/pytorch-lightning/issues/2635 class MMapTextDataset(Dataset): @@ -157,7 +158,6 @@ def training_step(self, batch, batch_nb): def validation_step(self, batch, batch_nb): # TODO: log how long evaluation takes - # TODO: reproduce roberta evaluation numbers on the longformer corpus self.start_time = 0 # reset training_step timer loss = self(**batch) tensorboard_logs = { From af08b5a2ef4d72a859860c5f048204c069d56694 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 17 Jul 2020 17:24:05 -0700 Subject: [PATCH 044/112] wip --- requirements.txt | 3 ++- scripts/pretrain.py | 10 +++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index 2279015..cbce7f0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ +pytorch-lightning @ git+http://github.com/ibeltagy/pytorch-lightning.git@v0.8.5_fixes#egg=pytorch-lightning + torch>=1.5.0 transformers==3.0.2 tensorboardX -pytorch-lightning==0.8.5 test-tube==0.7.5 diff --git a/scripts/pretrain.py b/scripts/pretrain.py index a715c9c..2f6b890 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -21,10 +21,12 @@ logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) +# DONE: reproduce RoBERTa numbers on the Longformer corpus # TODO: Try on multiple machines # TODO: try on a single TPU # TODO: try on a TPU-pod -# TODO: PTL bug: https://github.com/PyTorchLightning/pytorch-lightning/issues/2635 +# TODO: try restarting and double check optimizer, lr and lr scheduler +# TODO: try fp16 
class MMapTextDataset(Dataset): @@ -260,9 +262,8 @@ def add_args(parser): # Compute resources parser.add_argument("--num_workers", type=int, default=0) - # `--gpus` is reserved. Always set CUDA_VISIBLE_DEVICES to 0,1,2 ... n - # TODO: PTL has a bug in gpu selection and it will always select gpus starting from 0 upward - parser.add_argument("--gpu_count", type=int, default=1) + parser.add_argument("--gpu_count", type=int, default=1, # `--gpus` is reserved for internal use by PTL + help="Number of gpus. This respects `CUDA_VISIBLE_DEVICES`") parser.add_argument("--num_tpu_cores", type=int, default=None) return parser @@ -299,7 +300,6 @@ def main(args): args.val_every *= args.grad_accum # PTL is expecting number of batches_per_gpu trainer = ptl.Trainer( gpus=args.gpu_count, - auto_select_gpus=False, num_tpu_cores=args.num_tpu_cores, distributed_backend='ddp' if args.gpu_count > 1 else None, replace_sampler_ddp=False, From d1050232a91b8aee85cca032c761543479f8d748 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 17 Jul 2020 20:13:08 -0700 Subject: [PATCH 045/112] wip --- requirements.txt | 2 +- scripts/pretrain.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index cbce7f0..b396708 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ pytorch-lightning @ git+http://github.com/ibeltagy/pytorch-lightning.git@v0.8.5_fixes#egg=pytorch-lightning -torch>=1.5.0 +torch==1.3.0 transformers==3.0.2 tensorboardX test-tube==0.7.5 diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 2f6b890..3263537 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -26,7 +26,6 @@ # TODO: try on a single TPU # TODO: try on a TPU-pod # TODO: try restarting and double check optimizer, lr and lr scheduler -# TODO: try fp16 class MMapTextDataset(Dataset): @@ -313,6 +312,7 @@ def main(args): accumulate_grad_batches=args.grad_accum, resume_from_checkpoint=args.resume, gradient_clip_val=args.grad_clip, + precision=16, amp_level='O2', callbacks=[LearningRateLogger()] ) trainer.fit(pretrainer) From 1183999999500f1043f001a08549bafd18ea84db Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 17 Jul 2020 22:25:48 -0700 Subject: [PATCH 046/112] wip --- scripts/pretrain.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 3263537..e79eb17 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -225,6 +225,12 @@ def train_dataloader(self): def val_dataloader(self): return self._get_loader(f'{self.args.input_dir}/cache/val.bin', False) + def grad_norm(self, norm_type): + # Override PTL `grad_norm` function to only return `total_grad_norm` instead norms of individual params + # TODO: grad_norm reporting needs to take fp16 loss scale into account + all_norms = [float(p.grad.data.norm(float(norm_type))) for p in self.parameters() if p.grad is not None] + return {'total_grad_norm': float(torch.tensor(all_norms).norm(norm_type))} + @staticmethod def add_args(parser): parser.add_argument("--seed", type=int, default=3) @@ -302,7 +308,7 @@ def main(args): num_tpu_cores=args.num_tpu_cores, distributed_backend='ddp' if args.gpu_count > 1 else None, replace_sampler_ddp=False, - track_grad_norm=-1, # TODO: add logging for gradient norm + track_grad_norm=2, max_epochs=10000, min_epochs=0, max_steps=args.train_steps, # run for many epochs, but stop after max_steps val_check_interval=args.val_every, limit_val_batches=args.val_batches, early_stop_callback=None, @@ -313,7 +319,7 @@ 
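The grad_norm override above collapses the per-parameter gradient norms into one scalar; for an L2 norm this equals the norm of all gradient elements taken together, so it matches the quantity gradient clipping sees. A quick standalone check of that identity on a toy model (the model itself is a placeholder):

import torch

model = torch.nn.Sequential(torch.nn.Linear(4, 8), torch.nn.Linear(8, 2))
model(torch.randn(3, 4)).sum().backward()

grads = [p.grad.detach() for p in model.parameters() if p.grad is not None]
norm_of_norms = torch.stack([g.norm(2) for g in grads]).norm(2)   # norm of per-parameter norms
flat_norm = torch.cat([g.reshape(-1) for g in grads]).norm(2)     # single norm over all gradients
print(torch.allclose(norm_of_norms, flat_norm))                   # True
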
def main(args): resume_from_checkpoint=args.resume, gradient_clip_val=args.grad_clip, precision=16, amp_level='O2', - callbacks=[LearningRateLogger()] + callbacks=[LearningRateLogger()], ) trainer.fit(pretrainer) From 20e8208af7617c132bf1d7e2008a4569c97f7beb Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 17 Jul 2020 22:32:46 -0700 Subject: [PATCH 047/112] wip --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index b396708..e47a1d5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ pytorch-lightning @ git+http://github.com/ibeltagy/pytorch-lightning.git@v0.8.5_fixes#egg=pytorch-lightning -torch==1.3.0 +torch==1.3.1 transformers==3.0.2 tensorboardX test-tube==0.7.5 From 224824d9e74a12e7c3a0d7bd6bd1ac858a1a02df Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 18 Jul 2020 07:58:49 -0700 Subject: [PATCH 048/112] wip --- scripts/pretrain.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index e79eb17..5022aa8 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -22,7 +22,6 @@ logger = logging.getLogger(__name__) # DONE: reproduce RoBERTa numbers on the Longformer corpus -# TODO: Try on multiple machines # TODO: try on a single TPU # TODO: try on a TPU-pod # TODO: try restarting and double check optimizer, lr and lr scheduler @@ -259,7 +258,7 @@ def add_args(parser): parser.add_argument("--val_batches", type=int, default=1000, help='# evaluation **batches**') parser.add_argument("--weight_decay", type=float, default=0.01) parser.add_argument("--adam_epsilon", type=float, default=1e-6) - parser.add_argument("--grad_clip", type=float, default=0) + parser.add_argument("--grad_clip", type=float, default=0) # TODO: test this with fp16. Likely not working # RoBERTa's tokens_per_step = 2^18 = 512(seqlen) x 1(gpu_count) x 32(batch_size) x 16(grad_accum) parser.add_argument("--batch_size", type=int, default=32) @@ -269,6 +268,21 @@ def add_args(parser): parser.add_argument("--num_workers", type=int, default=0) parser.add_argument("--gpu_count", type=int, default=1, # `--gpus` is reserved for internal use by PTL help="Number of gpus. This respects `CUDA_VISIBLE_DEVICES`") + + # For multi-node training, use the PyTorch launch script. The script and instructions can be found here: + # https://github.com/pytorch/pytorch/blob/master/torch/distributed/launch.py. + # To run PTL in a mode compatible with the launch script, two things are needed: + # - pass the argument `--use_env` to `torch.distributed.launch` + # - make sure `--nproc_per_node` matches `--gpu_count` and `--nnodes` matches `--node_count`. + # For example, to run on 2 nodes, 3 gpus each, the command line on node rank 1 would be like: + # >>>> python -m torch.distributed.launch \ + # --use_env --nnodes 2 --nproc_per_node 3 \ + # --node_rank 1 --master_addr s2-server4 --master_port 12343 \ + # scripts/pretrain.py \ + # --gpu_count 2 --node_count 2 \ + # --input_dir my_data_dir --save_prefix test_multinode + parser.add_argument("--node_count", type=int, default=1, + help="Number of nodes. 
It needs to match --nnodes of torch.distributed.launch") parser.add_argument("--num_tpu_cores", type=int, default=None) return parser @@ -305,8 +319,9 @@ def main(args): args.val_every *= args.grad_accum # PTL is expecting number of batches_per_gpu trainer = ptl.Trainer( gpus=args.gpu_count, + num_nodes=args.node_count, num_tpu_cores=args.num_tpu_cores, - distributed_backend='ddp' if args.gpu_count > 1 else None, + distributed_backend='ddp' if (args.gpu_count > 1 or args.node_count > 1) else None, replace_sampler_ddp=False, track_grad_norm=2, max_epochs=10000, min_epochs=0, max_steps=args.train_steps, # run for many epochs, but stop after max_steps From 4a1273082f7c33de7392825451b20920d380036f Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 18 Jul 2020 09:41:34 -0700 Subject: [PATCH 049/112] wip --- scripts/pretrain.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 5022aa8..10165b0 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -22,9 +22,12 @@ logger = logging.getLogger(__name__) # DONE: reproduce RoBERTa numbers on the Longformer corpus +# DONE: testing ddp single machine +# DONE: testing ddp multiple machines +# DONE: testing resume from checkpoint # TODO: try on a single TPU # TODO: try on a TPU-pod -# TODO: try restarting and double check optimizer, lr and lr scheduler +# TODO: only one checkpoint per epoch is saved class MMapTextDataset(Dataset): @@ -168,7 +171,7 @@ def validation_step(self, batch, batch_nb): def validation_epoch_end(self, outputs): avg_loss = torch.stack([x['log']['val_mlm_loss'] for x in outputs if 'val_mlm_loss' in x['log']]).mean() if self.use_ddp: - # TODO: PTL already doing this. Is it still needed here? + # TODO: PTL is already doing this. Is it still needed here? # https://github.com/PyTorchLightning/pytorch-lightning/blob/0.8.5/pytorch_lightning/metrics/converters.py#L251 torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM) avg_loss /= torch.distributed.get_world_size() @@ -189,11 +192,10 @@ def configure_optimizers(self): "weight_decay": 0.0, }, ] - optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon) + optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.lr, eps=self.args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=self.args.train_steps ) - return [optimizer], [{"scheduler": scheduler, "interval": "step"}] def _get_loader(self, fname, is_train): @@ -247,11 +249,15 @@ def add_args(parser): # Checkpointing and logging parser.add_argument("--save_dir", type=str, default='runs/') - parser.add_argument("--save_prefix", type=str, required=True) - parser.add_argument("--resume", type=str, default=None) + parser.add_argument("--save_prefix", type=str, required=True, + help="path of output directory is --save_dir/--save_prefix") + parser.add_argument("--resume", type=str, default=None, # It is better to use a different output dir. + help="Path to a checkpoint to load model weights and training state. It overwrites args") + parser.add_argument("--resume_model_only", type=str, default=None, + help="Path to a checkpoint to load model weights but not training state") # Training hyperparams - parser.add_argument("--learning_rate", type=float, default=1e-5) + parser.add_argument("--lr", type=float, default=1e-5) parser.add_argument("--train_steps", type=int, default=3000, help='# training grad. 
updates') parser.add_argument("--warmup_steps", type=int, default=1000, help='# warmup grad. updates') parser.add_argument("--val_every", type=int, default=1000, help='# training grad. updates between evaluations') @@ -295,7 +301,10 @@ def main(args): if torch.cuda.is_available(): torch.cuda.manual_seed_all(args.seed * 10000) - pretrainer = Pretrainer(args) + if args.resume_model_only is not None: + pretrainer = Pretrainer.load_from_checkpoint(args.resume_model_only, args) + else: + pretrainer = Pretrainer(args) # logger here is a SummaryWritter for tensorboard # it is used by the trainer, and certain return variables From c936d24414235405b37689b5e0df06040b5acc77 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 18 Jul 2020 10:24:16 -0700 Subject: [PATCH 050/112] wip --- scripts/pretrain.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 10165b0..6015fa0 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -27,7 +27,6 @@ # DONE: testing resume from checkpoint # TODO: try on a single TPU # TODO: try on a TPU-pod -# TODO: only one checkpoint per epoch is saved class MMapTextDataset(Dataset): @@ -175,7 +174,6 @@ def validation_epoch_end(self, outputs): # https://github.com/PyTorchLightning/pytorch-lightning/blob/0.8.5/pytorch_lightning/metrics/converters.py#L251 torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM) avg_loss /= torch.distributed.get_world_size() - avg_loss = avg_loss.item() logs = {'val_mlm_loss': avg_loss} return {'log': logs, 'progress_bar': logs, "val_loss": avg_loss} @@ -320,9 +318,11 @@ def main(args): filepath=os.path.join(args.save_dir, args.save_prefix, 'checkpoint'), prefix='', save_top_k=3, + save_last=True, verbose=True, monitor='val_loss', mode='min', + period=-1, # to allow multiple checkpoints per epoch ) args.val_every *= args.grad_accum # PTL is expecting number of batches_per_gpu From 510801bd76896e43359eef66979c119e1de237a2 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sun, 19 Jul 2020 09:06:10 -0700 Subject: [PATCH 051/112] wip --- scripts/pretrain.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 6015fa0..683b008 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -27,6 +27,7 @@ # DONE: testing resume from checkpoint # TODO: try on a single TPU # TODO: try on a TPU-pod +# TODO: run on beaker on ai2-server1/2 class MMapTextDataset(Dataset): From 9184b718e61c9b8abf3c0d846f16bbe33bc46f18 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 22 Jul 2020 20:16:38 +0000 Subject: [PATCH 052/112] wip --- scripts/cheatsheet.txt | 22 ++++++++++++++++ scripts/test_tpu.py | 57 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 scripts/test_tpu.py diff --git a/scripts/cheatsheet.txt b/scripts/cheatsheet.txt index be4fc3a..d39371e 100644 --- a/scripts/cheatsheet.txt +++ b/scripts/cheatsheet.txt @@ -70,3 +70,25 @@ python -m scripts.triviaqa_utils.evaluation_utils \ --prediction_file predictions.json # Output should be: {'exact_match': 73.07644188665083, 'f1': 77.78523804802242, 'common': 7993, 'denominator': 7993, 'pred_len': 7993, 'gold_len': 7993} + + +# TPU +export XRT_TPU_CONFIG="tpu_worker;0;$TPU_IP_ADDRESS:8470" + +source /anaconda3/bin/activate torch-xla-nightly + +import torch_xla.debug.metrics as met; print(met.metrics_report()) + +curl -X POST http://10.125.212.42:8475/requestversion/pytorch-dev20200722 + 
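The optimizer setup pairs AdamW with a linear warmup-then-decay schedule that is stepped once per gradient update. A condensed sketch of that configuration using a stand-in module and the default hyperparameter values from the arguments above; the bias/LayerNorm weight-decay grouping shown here is the conventional one and is an assumption about the elided part of configure_optimizers:

import torch
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

model = torch.nn.Linear(4, 2)                    # stand-in for the masked LM
no_decay = ['bias', 'LayerNorm.weight']
grouped = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]
optimizer = AdamW(grouped, lr=1e-5, eps=1e-6)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=1000, num_training_steps=3000)

model(torch.randn(3, 4)).sum().backward()
optimizer.step()
scheduler.step()    # one scheduler step per optimizer step, matching interval='step'
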
+/usr/share/torch-xla-nightly/pytorch/xla/scripts/debug_run.py --outfile debug.tar.gz -- python -u scripts/test_tpu.py + + XLA_IR_DEBUG=1 + XRT_TPU_CONFIG=tpu_worker;0;10.125.212.42:8470 + TF_CPP_LOG_THREAD_ID=1 + TF_CPP_MIN_LOG_LEVEL=0 + XLA_HLO_DEBUG=1 + XLA_DUMP_FATAL_STACK=1 + TF_CPP_VMODULE=tensor=5,computation_client=5,xrt_computation_client=5,aten_xla_type=1 + XLA_SAVE_TENSORS_FILE=/tmp/debug_run-beltagy4-beltagy/graphs + XLA_METRICS_FILE=/tmp/debug_run-beltagy4-beltagy/metrics diff --git a/scripts/test_tpu.py b/scripts/test_tpu.py new file mode 100644 index 0000000..618ee6f --- /dev/null +++ b/scripts/test_tpu.py @@ -0,0 +1,57 @@ +import os +import torch +from torch.utils.data import DataLoader, Dataset +from transformers import AutoModel +import pytorch_lightning as pl + +class CoolDataset(Dataset): + def __len__(self): + return 128 * 128 + + def __getitem__(self, idx): + return torch.tensor([1, 2, 3, 4,] * 128), torch.tensor([1, 1, 1, 1] * 128) + +class CoolSystem(pl.LightningModule): + + def __init__(self): + super().__init__() + + # self.model = AutoModel.from_pretrained('allenai/longformer-base-4096') + self.model = AutoModel.from_pretrained('bert-base-uncased') + + def forward(self, x, y): + return self.model(x, attention_mask=None) + + def training_step(self, batch, batch_idx): + x, y = batch + y_hat = self(x, y) + loss = y_hat[0].sum() # F.cross_entropy(y_hat, y) + tensorboard_logs = {'train_loss': loss} + return {'loss': loss, 'log': tensorboard_logs} + + def validation_step(self, batch, batch_idx): + x, y = batch + y_hat = self(x, y) + val_loss = y_hat[0].sum() # F.cross_entropy(y_hat, y) + return {'val_loss': val_loss} + + def validation_epoch_end(self, outputs): + avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean() + tensorboard_logs = {'val_loss': avg_loss} + return {'val_loss': avg_loss, 'log': tensorboard_logs} + + def configure_optimizers(self): + return torch.optim.Adam(self.parameters(), lr=0.001) + + def train_dataloader(self): + loader = DataLoader(CoolDataset(), batch_size=1, num_workers=0) + return loader + + def val_dataloader(self): + loader = DataLoader(CoolDataset(), batch_size=1, num_workers=0) + return loader + +if __name__ == '__main__': + model = CoolSystem() + trainer = pl.Trainer(num_tpu_cores=1, progress_bar_refresh_rate=1, max_epochs=10, num_sanity_val_steps=0) + trainer.fit(model) From 4ae991a4ffb6470badd593cfbe7f72b8aba0b89f Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 22 Jul 2020 20:43:27 +0000 Subject: [PATCH 053/112] wip --- scripts/cheatsheet.txt | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/scripts/cheatsheet.txt b/scripts/cheatsheet.txt index d39371e..6dde8ce 100644 --- a/scripts/cheatsheet.txt +++ b/scripts/cheatsheet.txt @@ -82,13 +82,3 @@ import torch_xla.debug.metrics as met; print(met.metrics_report()) curl -X POST http://10.125.212.42:8475/requestversion/pytorch-dev20200722 /usr/share/torch-xla-nightly/pytorch/xla/scripts/debug_run.py --outfile debug.tar.gz -- python -u scripts/test_tpu.py - - XLA_IR_DEBUG=1 - XRT_TPU_CONFIG=tpu_worker;0;10.125.212.42:8470 - TF_CPP_LOG_THREAD_ID=1 - TF_CPP_MIN_LOG_LEVEL=0 - XLA_HLO_DEBUG=1 - XLA_DUMP_FATAL_STACK=1 - TF_CPP_VMODULE=tensor=5,computation_client=5,xrt_computation_client=5,aten_xla_type=1 - XLA_SAVE_TENSORS_FILE=/tmp/debug_run-beltagy4-beltagy/graphs - XLA_METRICS_FILE=/tmp/debug_run-beltagy4-beltagy/metrics From aea2a984563026bedc10d63b63307d245822cb1f Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 22 Jul 2020 21:45:45 +0000 Subject: [PATCH 
054/112] tpu --- scripts/pretrain.py | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 683b008..93d83a4 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -30,6 +30,14 @@ # TODO: run on beaker on ai2-server1/2 +try: + import torch_xla.core.xla_model as xm +except ImportError: + XLA_AVAILABLE = False +else: + XLA_AVAILABLE = True + + class MMapTextDataset(Dataset): def __init__(self, mmap_filename, chunk_size): self.num_instances = np.memmap(mmap_filename, mode='r', dtype=np.uint16).shape[0] // chunk_size @@ -146,16 +154,17 @@ def training_step(self, batch, batch_nb): input_ids = batch['input_ids'] tensorboard_logs = { 'input_size': input_ids.numel(), - 'memory': torch.cuda.memory_allocated(loss.device) / 1024 ** 3, - 'mlm_loss': loss.detach(), - 'mlm_bpc': loss.detach()/math.log(2), - 'mlm_perplexity': torch.exp(loss).detach(), + 'mlm_loss': loss, + 'mlm_bpc': loss/math.log(2), + 'mlm_perplexity': torch.exp(loss), 'token_per_step': input_ids.numel() * self.args.grad_accum * self.trainer.world_size, } if self.start_time != 0: elapsed_time = time.time() - self.start_time tensorboard_logs['second_per_batch'] = elapsed_time self.start_time = time.time() + if not XLA_AVAILABLE: + tensorboard_logs['memory'] = torch.cuda.memory_allocated(loss.device) / 1024 ** 3 return {'loss': loss, 'log': tensorboard_logs} @@ -204,6 +213,14 @@ def _get_loader(self, fname, is_train): if self.trainer.use_ddp: sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=is_train) shuffle = False + elif self.trainer.use_tpu: + sampler = torch.utils.data.distributed.DistributedSampler( + dataset, + num_replicas=xm.xrt_world_size(), + rank=xm.get_ordinal(), + shuffle=is_train, + ) + shuffle = False else: sampler = None shuffle = is_train @@ -227,6 +244,10 @@ def val_dataloader(self): def grad_norm(self, norm_type): # Override PTL `grad_norm` function to only return `total_grad_norm` instead norms of individual params + + if XLA_AVAILABLE: + return {} # computing grad_norm one parameter at a time takes forever on TPU + # TODO: grad_norm reporting needs to take fp16 loss scale into account all_norms = [float(p.grad.data.norm(float(norm_type))) for p in self.parameters() if p.grad is not None] return {'total_grad_norm': float(torch.tensor(all_norms).norm(norm_type))} @@ -266,8 +287,8 @@ def add_args(parser): parser.add_argument("--grad_clip", type=float, default=0) # TODO: test this with fp16. Likely not working # RoBERTa's tokens_per_step = 2^18 = 512(seqlen) x 1(gpu_count) x 32(batch_size) x 16(grad_accum) - parser.add_argument("--batch_size", type=int, default=32) - parser.add_argument("--grad_accum", type=int, default=16) + parser.add_argument("--batch_size", type=int, default=8) + parser.add_argument("--grad_accum", type=int, default=1) # Compute resources parser.add_argument("--num_workers", type=int, default=0) @@ -288,7 +309,7 @@ def add_args(parser): # --input_dir my_data_dir --save_prefix test_multinode parser.add_argument("--node_count", type=int, default=1, help="Number of nodes. 
It needs to match --nnodes of torch.distributed.launch") - parser.add_argument("--num_tpu_cores", type=int, default=None) + parser.add_argument("--tpu_core_count", type=int, default=None) return parser @@ -330,20 +351,22 @@ def main(args): trainer = ptl.Trainer( gpus=args.gpu_count, num_nodes=args.node_count, - num_tpu_cores=args.num_tpu_cores, + num_tpu_cores=args.tpu_core_count, distributed_backend='ddp' if (args.gpu_count > 1 or args.node_count > 1) else None, replace_sampler_ddp=False, track_grad_norm=2, max_epochs=10000, min_epochs=0, max_steps=args.train_steps, # run for many epochs, but stop after max_steps val_check_interval=args.val_every, limit_val_batches=args.val_batches, early_stop_callback=None, - row_log_interval=10, + row_log_interval=16, + progress_bar_refresh_rate=16, logger=logger, checkpoint_callback=checkpoint_callback, accumulate_grad_batches=args.grad_accum, resume_from_checkpoint=args.resume, gradient_clip_val=args.grad_clip, precision=16, amp_level='O2', + num_sanity_val_steps=2, callbacks=[LearningRateLogger()], ) trainer.fit(pretrainer) From 69b717a259d6e82376096fb6ec343bd576d58cd3 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 22 Jul 2020 15:47:11 -0700 Subject: [PATCH 055/112] wip --- scripts/pretrain.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 93d83a4..2202a39 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -26,6 +26,12 @@ # DONE: testing ddp multiple machines # DONE: testing resume from checkpoint # TODO: try on a single TPU +# - tie weights +# - tensorboard +# - getrank +# - barrier +# - val all_reduce +# - checkpointing # TODO: try on a TPU-pod # TODO: run on beaker on ai2-server1/2 @@ -257,7 +263,7 @@ def add_args(parser): parser.add_argument("--seed", type=int, default=3) # Dataset. Some of these params are only useful when generating the dataset cache - parser.add_argument("--input_dir", type=str, required=True) + parser.add_argument("--input_dir", type=str, default='/net/nfs.corp/s2-research/beltagy/longformer/data/') parser.add_argument("--train_dev_split", type=float, default=0.05) parser.add_argument("--padded_chunks", type=bool, default=False) parser.add_argument("--seqlen", type=int, default=512) @@ -269,7 +275,7 @@ def add_args(parser): # Checkpointing and logging parser.add_argument("--save_dir", type=str, default='runs/') - parser.add_argument("--save_prefix", type=str, required=True, + parser.add_argument("--save_prefix", type=str, default='test', help="path of output directory is --save_dir/--save_prefix") parser.add_argument("--resume", type=str, default=None, # It is better to use a different output dir. help="Path to a checkpoint to load model weights and training state. It overwrites args") @@ -291,6 +297,7 @@ def add_args(parser): parser.add_argument("--grad_accum", type=int, default=1) # Compute resources + parser.add_argument("--fp16", type=bool, default=False) parser.add_argument("--num_workers", type=int, default=0) parser.add_argument("--gpu_count", type=int, default=1, # `--gpus` is reserved for internal use by PTL help="Number of gpus. 
This respects `CUDA_VISIBLE_DEVICES`") @@ -365,7 +372,7 @@ def main(args): accumulate_grad_batches=args.grad_accum, resume_from_checkpoint=args.resume, gradient_clip_val=args.grad_clip, - precision=16, amp_level='O2', + precision=16 if args.fp16 else 32, amp_level='O2', num_sanity_val_steps=2, callbacks=[LearningRateLogger()], ) From 5f641c05cec9a73406a5911e99a80e80bfb583b1 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 23 Jul 2020 02:54:28 +0000 Subject: [PATCH 056/112] wip --- scripts/cheatsheet.txt | 2 ++ scripts/pretrain.py | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/cheatsheet.txt b/scripts/cheatsheet.txt index 6dde8ce..1e77b07 100644 --- a/scripts/cheatsheet.txt +++ b/scripts/cheatsheet.txt @@ -82,3 +82,5 @@ import torch_xla.debug.metrics as met; print(met.metrics_report()) curl -X POST http://10.125.212.42:8475/requestversion/pytorch-dev20200722 /usr/share/torch-xla-nightly/pytorch/xla/scripts/debug_run.py --outfile debug.tar.gz -- python -u scripts/test_tpu.py + +/usr/share/torch-xla-nightly/pytorch/xla/scripts/debug_run.py --outfile debug.tar.gz -- python -u scripts/pretrain.py --input_dir data/ --save_prefix test_xla_2 --gpu_count 0 --tpu_core_count 1 --val_batches 4 --val_every 130 --num_workers 0 --log_rate 1 --model allenai/longformer-base-4096 diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 2202a39..ab1120a 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -281,6 +281,7 @@ def add_args(parser): help="Path to a checkpoint to load model weights and training state. It overwrites args") parser.add_argument("--resume_model_only", type=str, default=None, help="Path to a checkpoint to load model weights but not training state") + parser.add_argument("--log_rate", type=int, default=16) # Training hyperparams parser.add_argument("--lr", type=float, default=1e-5) @@ -365,8 +366,8 @@ def main(args): max_epochs=10000, min_epochs=0, max_steps=args.train_steps, # run for many epochs, but stop after max_steps val_check_interval=args.val_every, limit_val_batches=args.val_batches, early_stop_callback=None, - row_log_interval=16, - progress_bar_refresh_rate=16, + row_log_interval=args.log_rate, + progress_bar_refresh_rate=args.log_rate, logger=logger, checkpoint_callback=checkpoint_callback, accumulate_grad_batches=args.grad_accum, From e3ddeca7507e91f5659f1d0f480267e08cc4484c Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 22 Jul 2020 22:12:45 -0700 Subject: [PATCH 057/112] wip --- scripts/test_tpu.py | 37 ++++++++++++------------------------- 1 file changed, 12 insertions(+), 25 deletions(-) diff --git a/scripts/test_tpu.py b/scripts/test_tpu.py index 618ee6f..0df4091 100644 --- a/scripts/test_tpu.py +++ b/scripts/test_tpu.py @@ -1,23 +1,25 @@ -import os import torch from torch.utils.data import DataLoader, Dataset from transformers import AutoModel import pytorch_lightning as pl + class CoolDataset(Dataset): - def __len__(self): - return 128 * 128 - def __getitem__(self, idx): - return torch.tensor([1, 2, 3, 4,] * 128), torch.tensor([1, 1, 1, 1] * 128) + def __len__(self): + return 128 * 128 + + def __getitem__(self, idx): + return torch.tensor([1, 2, 3, 4] * 128 * 8), torch.tensor([1, 1, 1, 1] * 128 * 8) + class CoolSystem(pl.LightningModule): def __init__(self): super().__init__() - # self.model = AutoModel.from_pretrained('allenai/longformer-base-4096') - self.model = AutoModel.from_pretrained('bert-base-uncased') + self.model = AutoModel.from_pretrained('allenai/longformer-base-4096') + # self.model = 
AutoModel.from_pretrained('roberta-base') def forward(self, x, y): return self.model(x, attention_mask=None) @@ -25,20 +27,8 @@ def forward(self, x, y): def training_step(self, batch, batch_idx): x, y = batch y_hat = self(x, y) - loss = y_hat[0].sum() # F.cross_entropy(y_hat, y) - tensorboard_logs = {'train_loss': loss} - return {'loss': loss, 'log': tensorboard_logs} - - def validation_step(self, batch, batch_idx): - x, y = batch - y_hat = self(x, y) - val_loss = y_hat[0].sum() # F.cross_entropy(y_hat, y) - return {'val_loss': val_loss} - - def validation_epoch_end(self, outputs): - avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean() - tensorboard_logs = {'val_loss': avg_loss} - return {'val_loss': avg_loss, 'log': tensorboard_logs} + loss = y_hat[0].sum() + return {'loss': loss} def configure_optimizers(self): return torch.optim.Adam(self.parameters(), lr=0.001) @@ -47,11 +37,8 @@ def train_dataloader(self): loader = DataLoader(CoolDataset(), batch_size=1, num_workers=0) return loader - def val_dataloader(self): - loader = DataLoader(CoolDataset(), batch_size=1, num_workers=0) - return loader if __name__ == '__main__': model = CoolSystem() - trainer = pl.Trainer(num_tpu_cores=1, progress_bar_refresh_rate=1, max_epochs=10, num_sanity_val_steps=0) + trainer = pl.Trainer(progress_bar_refresh_rate=1, max_epochs=10, num_sanity_val_steps=0, gpus=0, num_tpu_cores=1) trainer.fit(model) From 00ce1e9635b2e745b81bc7639949d654b6c10a39 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 23 Jul 2020 07:30:13 +0000 Subject: [PATCH 058/112] wip --- scripts/test_tpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/test_tpu.py b/scripts/test_tpu.py index 0df4091..8f50eba 100644 --- a/scripts/test_tpu.py +++ b/scripts/test_tpu.py @@ -40,5 +40,5 @@ def train_dataloader(self): if __name__ == '__main__': model = CoolSystem() - trainer = pl.Trainer(progress_bar_refresh_rate=1, max_epochs=10, num_sanity_val_steps=0, gpus=0, num_tpu_cores=1) + trainer = pl.Trainer(num_tpu_cores=1, progress_bar_refresh_rate=1, max_epochs=10, num_sanity_val_steps=0, gpus=0) trainer.fit(model) From 56b9c6aa544d86d6652a0ecd05f07d183043d449 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 23 Jul 2020 21:08:35 +0000 Subject: [PATCH 059/112] wip --- scripts/cheatsheet.txt | 3 +-- scripts/pretrain.py | 14 ++++++++++---- scripts/test_tpu.py | 2 +- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/scripts/cheatsheet.txt b/scripts/cheatsheet.txt index 1e77b07..e9f3fba 100644 --- a/scripts/cheatsheet.txt +++ b/scripts/cheatsheet.txt @@ -73,12 +73,11 @@ python -m scripts.triviaqa_utils.evaluation_utils \ # TPU +export TPU_IP_ADDRESS=10.125.212.42 export XRT_TPU_CONFIG="tpu_worker;0;$TPU_IP_ADDRESS:8470" - source /anaconda3/bin/activate torch-xla-nightly import torch_xla.debug.metrics as met; print(met.metrics_report()) - curl -X POST http://10.125.212.42:8475/requestversion/pytorch-dev20200722 /usr/share/torch-xla-nightly/pytorch/xla/scripts/debug_run.py --outfile debug.tar.gz -- python -u scripts/test_tpu.py diff --git a/scripts/pretrain.py b/scripts/pretrain.py index ab1120a..9706b19 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -30,8 +30,11 @@ # - tensorboard # - getrank # - barrier -# - val all_reduce -# - checkpointing +# - checkpointing (broken) +# - gradient accumulation +# - set_epoch bug +# - gradient clipping +# TODO: use AutoModelForMaskedLM and remove masked_lm_labels # TODO: try on a TPU-pod # TODO: run on beaker on ai2-server1/2 @@ -169,7 +172,7 @@ def 
training_step(self, batch, batch_nb): elapsed_time = time.time() - self.start_time tensorboard_logs['second_per_batch'] = elapsed_time self.start_time = time.time() - if not XLA_AVAILABLE: + if not self.use_tpu: tensorboard_logs['memory'] = torch.cuda.memory_allocated(loss.device) / 1024 ** 3 return {'loss': loss, 'log': tensorboard_logs} @@ -190,6 +193,9 @@ def validation_epoch_end(self, outputs): # https://github.com/PyTorchLightning/pytorch-lightning/blob/0.8.5/pytorch_lightning/metrics/converters.py#L251 torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM) avg_loss /= torch.distributed.get_world_size() + elif self.use_tpu: + avg_loss = xm.all_reduce(xm.REDUCE_SUM, avg_loss) / xm.xrt_world_size() + logs = {'val_mlm_loss': avg_loss} return {'log': logs, 'progress_bar': logs, "val_loss": avg_loss} @@ -251,7 +257,7 @@ def val_dataloader(self): def grad_norm(self, norm_type): # Override PTL `grad_norm` function to only return `total_grad_norm` instead norms of individual params - if XLA_AVAILABLE: + if self.use_tpu: return {} # computing grad_norm one parameter at a time takes forever on TPU # TODO: grad_norm reporting needs to take fp16 loss scale into account diff --git a/scripts/test_tpu.py b/scripts/test_tpu.py index 8f50eba..e692890 100644 --- a/scripts/test_tpu.py +++ b/scripts/test_tpu.py @@ -40,5 +40,5 @@ def train_dataloader(self): if __name__ == '__main__': model = CoolSystem() - trainer = pl.Trainer(num_tpu_cores=1, progress_bar_refresh_rate=1, max_epochs=10, num_sanity_val_steps=0, gpus=0) + trainer = pl.Trainer(num_tpu_cores=8, progress_bar_refresh_rate=1, max_epochs=10, num_sanity_val_steps=0, gpus=0) trainer.fit(model) From 8fca18703925d062c2557ebb73618e31fd5e0fa8 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 25 Jul 2020 03:47:06 +0000 Subject: [PATCH 060/112] wip --- scripts/pretrain.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 9706b19..6b88e94 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -79,6 +79,7 @@ def raw_text_to_mmap(args): os.mkdir(f'{args.input_dir}/cache/') # TODO: process each shared in a separate worker and save their output to files + # TODO: update the data generation to avoid the need for regeneration if the seqlen changes chunks_list = [] for fname in tqdm(all_files): @@ -353,7 +354,7 @@ def main(args): # model saved to filepath/prefix_.... 
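Both distributed branches of _get_loader only need a global rank and world size to build the DistributedSampler; on GPUs they come from torch.distributed, on TPUs from torch_xla. A minimal sketch of that resolution under the same optional-import guard used in the script (the helper name world_size_and_rank is illustrative):

import torch.distributed as dist

try:
    import torch_xla.core.xla_model as xm
    XLA_AVAILABLE = True
except ImportError:
    XLA_AVAILABLE = False

def world_size_and_rank():
    if XLA_AVAILABLE:
        return xm.xrt_world_size(), xm.get_ordinal()
    if dist.is_available() and dist.is_initialized():
        return dist.get_world_size(), dist.get_rank()
    return 1, 0  # single-process fallback

print(world_size_and_rank())
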
filepath=os.path.join(args.save_dir, args.save_prefix, 'checkpoint'), prefix='', - save_top_k=3, + save_top_k=1, save_last=True, verbose=True, monitor='val_loss', From 9dd76b74ca807bd89027a4d9267fa93c79373dea Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 25 Jul 2020 03:58:56 +0000 Subject: [PATCH 061/112] wip --- scripts/pretrain.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 6b88e94..7cfe0b2 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -28,9 +28,6 @@ # TODO: try on a single TPU # - tie weights # - tensorboard -# - getrank -# - barrier -# - checkpointing (broken) # - gradient accumulation # - set_epoch bug # - gradient clipping From d40983a44c8ff86342b581d659ecc428138f1dda Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 25 Jul 2020 08:22:25 -0700 Subject: [PATCH 062/112] wip --- scripts/pretrain.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 7cfe0b2..888a731 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -27,12 +27,12 @@ # DONE: testing resume from checkpoint # TODO: try on a single TPU # - tie weights -# - tensorboard # - gradient accumulation # - set_epoch bug # - gradient clipping -# TODO: use AutoModelForMaskedLM and remove masked_lm_labels +# - tensorboard: https://github.com/PyTorchLightning/pytorch-lightning/issues/2698 # TODO: try on a TPU-pod +# TODO: use AutoModelForMaskedLM and remove masked_lm_labels # TODO: run on beaker on ai2-server1/2 From f0f6a3033b7ff190999790e6a45664615ea8cc05 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 25 Jul 2020 15:56:03 +0000 Subject: [PATCH 063/112] wip --- scripts/cheatsheet.txt | 10 ++++++---- scripts/pretrain.py | 8 +++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/scripts/cheatsheet.txt b/scripts/cheatsheet.txt index e9f3fba..c0ab4e5 100644 --- a/scripts/cheatsheet.txt +++ b/scripts/cheatsheet.txt @@ -73,13 +73,15 @@ python -m scripts.triviaqa_utils.evaluation_utils \ # TPU -export TPU_IP_ADDRESS=10.125.212.42 -export XRT_TPU_CONFIG="tpu_worker;0;$TPU_IP_ADDRESS:8470" -source /anaconda3/bin/activate torch-xla-nightly - import torch_xla.debug.metrics as met; print(met.metrics_report()) curl -X POST http://10.125.212.42:8475/requestversion/pytorch-dev20200722 /usr/share/torch-xla-nightly/pytorch/xla/scripts/debug_run.py --outfile debug.tar.gz -- python -u scripts/test_tpu.py /usr/share/torch-xla-nightly/pytorch/xla/scripts/debug_run.py --outfile debug.tar.gz -- python -u scripts/pretrain.py --input_dir data/ --save_prefix test_xla_2 --gpu_count 0 --tpu_core_count 1 --val_batches 4 --val_every 130 --num_workers 0 --log_rate 1 --model allenai/longformer-base-4096 + +python scripts/pretrain.py --input_dir data/ --save_prefix test_grad_accum --gpu_count 0 --tpu_core_count 8 --val_batches 30 --val_every 30 --num_workers 0 --log_rate 1 + +export TPU_IP_ADDRESS=10.125.212.42 +export XRT_TPU_CONFIG="tpu_worker;0;$TPU_IP_ADDRESS:8470" +source /anaconda3/bin/activate torch-xla-nightly diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 7cfe0b2..dd5d486 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -8,7 +8,7 @@ from tqdm import tqdm import time import torch -from transformers import AutoTokenizer, AutoModelWithLMHead +from transformers import AutoTokenizer, AutoModelForMaskedLM from transformers import DataCollatorForLanguageModeling from transformers.optimization import AdamW, get_linear_schedule_with_warmup @@ -28,10 +28,8 @@ # 
TODO: try on a single TPU # - tie weights # - tensorboard -# - gradient accumulation # - set_epoch bug # - gradient clipping -# TODO: use AutoModelForMaskedLM and remove masked_lm_labels # TODO: try on a TPU-pod # TODO: run on beaker on ai2-server1/2 @@ -134,7 +132,7 @@ def __init__(self, hparams): self.args = hparams self.hparams = self.args - self.model = AutoModelWithLMHead.from_pretrained(args.model) + self.model = AutoModelForMaskedLM.from_pretrained(args.model) self.config = self.model.config tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) self.pad_token_id = tokenizer.pad_token_id @@ -153,7 +151,7 @@ def forward(self, input_ids=None, labels=None): attention_mask = (input_ids != self.pad_token_id).int() # output is loss, prediction_scores, hidden_states - output = self.model(input_ids=input_ids, attention_mask=attention_mask, masked_lm_labels=labels) + output = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels) return output[0] # loss def training_step(self, batch, batch_nb): From 9eb6fdf5e1c92c1d8b6a54843de88817a543acdb Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 25 Jul 2020 19:58:16 +0000 Subject: [PATCH 064/112] wip --- scripts/pretrain.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 612043d..1181575 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -147,6 +147,17 @@ def __init__(self, hparams): ) self.start_time = 0 + def to(self, *args, **kwargs): + param_count_before_to = len(list(self.parameters())) + super().to(*args, **kwargs) + if self.trainer.use_tpu: + # need to re-tie the weights after moving to XLA! + self.model.tie_weights() + if 'roberta' in self.args.model: + self.model.lm_head.bias = self.model.lm_head.decoder.bias + param_count_after_to = len(list(self.parameters())) + assert param_count_before_to == param_count_after_to + def forward(self, input_ids=None, labels=None): # get the padding mask - 1 for NOT masked, 0 for MASKED/PAD attention_mask = (input_ids != self.pad_token_id).int() From 14b60745f5952cf32acd9474377bbf13e0e45bc2 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 25 Jul 2020 19:58:44 +0000 Subject: [PATCH 065/112] wip --- scripts/pretrain.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 1181575..410c46f 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -26,7 +26,6 @@ # DONE: testing ddp multiple machines # DONE: testing resume from checkpoint # TODO: try on a single TPU -# - tie weights # - set_epoch bug # - gradient clipping # - tensorboard: https://github.com/PyTorchLightning/pytorch-lightning/issues/2698 From 5b97bd6825972c3d536fd664ce373fb20a118de0 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 25 Jul 2020 20:07:24 +0000 Subject: [PATCH 066/112] wip --- scripts/pretrain.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 410c46f..46a4deb 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -30,7 +30,6 @@ # - gradient clipping # - tensorboard: https://github.com/PyTorchLightning/pytorch-lightning/issues/2698 # TODO: try on a TPU-pod -# TODO: use AutoModelForMaskedLM and remove masked_lm_labels # TODO: run on beaker on ai2-server1/2 From 71d7a9dd4534a9eef824a34022ccdd66dd7e375d Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 25 Jul 2020 20:41:20 +0000 Subject: [PATCH 067/112] wip --- scripts/pretrain.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/scripts/pretrain.py 
b/scripts/pretrain.py index 46a4deb..e78ee84 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -25,10 +25,7 @@ # DONE: testing ddp single machine # DONE: testing ddp multiple machines # DONE: testing resume from checkpoint -# TODO: try on a single TPU -# - set_epoch bug -# - gradient clipping -# - tensorboard: https://github.com/PyTorchLightning/pytorch-lightning/issues/2698 +# TODO: check gradient clipping on a single TPU # TODO: try on a TPU-pod # TODO: run on beaker on ai2-server1/2 @@ -294,6 +291,7 @@ def add_args(parser): parser.add_argument("--resume_model_only", type=str, default=None, help="Path to a checkpoint to load model weights but not training state") parser.add_argument("--log_rate", type=int, default=16) + parser.add_argument("--disable_checkpointing", type=bool, default=False) # Training hyperparams parser.add_argument("--lr", type=float, default=1e-5) @@ -381,7 +379,7 @@ def main(args): row_log_interval=args.log_rate, progress_bar_refresh_rate=args.log_rate, logger=logger, - checkpoint_callback=checkpoint_callback, + checkpoint_callback=checkpoint_callback if not args.disable_checkpointing else None, accumulate_grad_batches=args.grad_accum, resume_from_checkpoint=args.resume, gradient_clip_val=args.grad_clip, From 97a126d37520ef2b37e5c5fa82dda9a6cf003182 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 25 Jul 2020 21:24:58 +0000 Subject: [PATCH 068/112] wip --- scripts/pretrain.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index e78ee84..8076273 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -25,7 +25,7 @@ # DONE: testing ddp single machine # DONE: testing ddp multiple machines # DONE: testing resume from checkpoint -# TODO: check gradient clipping on a single TPU +# TODO: enable gradient norm logging on a single TPU # TODO: try on a TPU-pod # TODO: run on beaker on ai2-server1/2 @@ -261,7 +261,7 @@ def grad_norm(self, norm_type): # Override PTL `grad_norm` function to only return `total_grad_norm` instead norms of individual params if self.use_tpu: - return {} # computing grad_norm one parameter at a time takes forever on TPU + return {} # TODO: computing grad_norm one parameter at a time takes forever on TPU # TODO: grad_norm reporting needs to take fp16 loss scale into account all_norms = [float(p.grad.data.norm(float(norm_type))) for p in self.parameters() if p.grad is not None] From c873da2e7ddd8c6a9e068ffede73bb46405661ef Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 25 Jul 2020 15:32:23 -0700 Subject: [PATCH 069/112] wip --- scripts/pretrain.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 8076273..3572fa4 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -136,7 +136,7 @@ def __init__(self, hparams): logger.info(f'Creating dataset cache from dir {self.args.input_dir}. 
This could be slow the first time.') MMapTextDataset.raw_text_to_mmap(args) - # TODO: add support for other objective functions + # TODO: add support for other objective functions (whole word masking, BART objectives) self.data_collator = DataCollatorForLanguageModeling( tokenizer=tokenizer, mlm=True, mlm_probability=self.args.mlm_prob ) From d602869b1b9a5e401e92c8924b43b0cf09fdff17 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Mon, 27 Jul 2020 23:10:56 -0700 Subject: [PATCH 070/112] faster gradnorm --- scripts/pretrain.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 3572fa4..c684a55 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -259,13 +259,16 @@ def val_dataloader(self): def grad_norm(self, norm_type): # Override PTL `grad_norm` function to only return `total_grad_norm` instead norms of individual params - - if self.use_tpu: - return {} # TODO: computing grad_norm one parameter at a time takes forever on TPU - # TODO: grad_norm reporting needs to take fp16 loss scale into account - all_norms = [float(p.grad.data.norm(float(norm_type))) for p in self.parameters() if p.grad is not None] - return {'total_grad_norm': float(torch.tensor(all_norms).norm(norm_type))} + parameters = [p for p in self.parameters() if p.grad is not None] + device = parameters[0].device + total_norm = torch.zeros([], device=device if parameters else None) + norm_type = float(norm_type) + for p in parameters: + param_norm = p.grad.data.pow(norm_type).sum() + total_norm.add_(param_norm) + total_norm = (total_norm ** (1.0 / norm_type)) + return {'total_grad_norm': total_norm} @staticmethod def add_args(parser): @@ -304,7 +307,7 @@ def add_args(parser): parser.add_argument("--grad_clip", type=float, default=0) # TODO: test this with fp16. Likely not working # RoBERTa's tokens_per_step = 2^18 = 512(seqlen) x 1(gpu_count) x 32(batch_size) x 16(grad_accum) - parser.add_argument("--batch_size", type=int, default=8) + parser.add_argument("--batch_size", type=int, default=32) parser.add_argument("--grad_accum", type=int, default=1) # Compute resources From ffd06dd5606d801ca1b7e37c4fc75054f1943739 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Mon, 27 Jul 2020 23:54:27 -0700 Subject: [PATCH 071/112] allow changing seqlen at runtime --- scripts/pretrain.py | 128 ++++++++++++++++++++++++++------------------ 1 file changed, 75 insertions(+), 53 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index c684a55..253cfc5 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -39,8 +39,9 @@ class MMapTextDataset(Dataset): - def __init__(self, mmap_filename, chunk_size): - self.num_instances = np.memmap(mmap_filename, mode='r', dtype=np.uint16).shape[0] // chunk_size + def __init__(self, mmap_filename, chunk_size, bos_token_id, eos_token_id): + # `chunk_size - 2` to reserve space for and + self.num_instances = np.memmap(mmap_filename, mode='r', dtype=np.uint16).shape[0] // (chunk_size - 2) # defer loading the token_ids memmap until after the first __getitem__ call. # when spawning new processes for ddp, there is a hard limit in python < 3.8 that # pickle files need to be < 4GB. 
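With AutoModelForMaskedLM, the masking itself lives entirely in DataCollatorForLanguageModeling: the collator copies the batch into labels, corrupts roughly mlm_prob of the positions in input_ids, and the model returns the loss as its first output when labels is passed. A small end-to-end sketch along those lines; roberta-base (the script's default) and the sample sentences are placeholders:

import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling

tokenizer = AutoTokenizer.from_pretrained('roberta-base')
model = AutoModelForMaskedLM.from_pretrained('roberta-base')
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

examples = [torch.tensor(tokenizer.encode(t)) for t in ['A first tiny document.', 'And a second one.']]
batch = collator(examples)                                           # dict with 'input_ids' and 'labels'
attention_mask = (batch['input_ids'] != tokenizer.pad_token_id).int()
loss = model(input_ids=batch['input_ids'], attention_mask=attention_mask, labels=batch['labels'])[0]
print(float(loss))
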
By waiting until after the first __getitem__ we @@ -48,18 +49,30 @@ def __init__(self, mmap_filename, chunk_size): self.token_ids = None self._mmap_filename = mmap_filename self._chunk_size = chunk_size + self._bos_token_id = bos_token_id + self._eos_token_id = eos_token_id def __len__(self): return self.num_instances def __getitem__(self, i): if self.token_ids is None: - self.token_ids = np.memmap(self._mmap_filename, mode='r', dtype=np.uint16, - shape=(self.num_instances, self._chunk_size)) - return torch.tensor(self.token_ids[i, :].astype(np.int32), dtype=torch.long) + self.token_ids = np.memmap(self._mmap_filename, mode='r', dtype=np.uint16) + from_index = i * (self._chunk_size - 2) + to_index = (i + 1) * (self._chunk_size - 2) + data = np.concatenate(([self._bos_token_id], self.token_ids[from_index:to_index], [self._eos_token_id])) + return torch.tensor(data, dtype=torch.long) @staticmethod def raw_text_to_mmap(args): + """This is the main preprocessing function. It processes all the text files in `args.input_dir` and + outputs two np.memmap files, one for training and one for validation with ratio `args.train_dev_split`. + Processing each input file involves tokenizing it, sharding it into shards of size `args.shard_size`, + then writing each shard as an np.memmap file. The stream of tokens in the memmap file represents documents + separated with `tokenizer.sep_token`. In `__getitem__`, the `tokenizer.bos_token` and `tokenizer.eos_token` + are added. The reason for not adding them at preprocessing time is to allow different sequence lengths + later on. Notice that this is the "FULL-SENTENCES" setting in the RoBERTa paper, Table2. + """ tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, use_fast=True) assert len(tokenizer) < 65535 # will use uint16 to store token ids all_files = glob.glob(f'{args.input_dir}/*.txt') @@ -68,56 +81,62 @@ def raw_text_to_mmap(args): logger.info("Cache already exists. Remove the cache directory to regenerate") return os.mkdir(f'{args.input_dir}/cache/') - - # TODO: process each shared in a separate worker and save their output to files - # TODO: update the data generation to avoid the need for regeneration if the seqlen changes - - chunks_list = [] - for fname in tqdm(all_files): - with open(fname, 'r') as fin: - current_chunk = [tokenizer.bos_token] + os.mkdir(f'{args.input_dir}/shards/') + + # TODO: support continue after a crash + + for full_fname in tqdm(all_files): # TODO: process each input file in a separate worker + fname = full_fname.split('/')[-1] + with open(full_fname, 'r') as fin: + + def _write_shard(data, idx): + if len(data) == 0: + return + shared_filename = f'{args.input_dir}/shards/{fname}-{idx}.bin' + logging.info(f'Writing {len(data)} tokens to shared {shared_filename}') + fp = np.memmap(shared_filename, dtype=np.uint16, mode='w+', shape=len(data)) + fp[:] = data[:] + del fp # flush and close file + token_list = [] + shard_num = 0 for line in tqdm(fin): - if line.strip() == '': # drop empty lines + line = line.strip() + if line == '': # drop empty lines continue - tokens = tokenizer.tokenize(line) # each line is one document - # generate chunks of length args.seqlen. The last chunk will be padded. 
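The indexing in the new __getitem__ is easy to get off by one: every item reads chunk_size - 2 raw tokens from the flat memmap and wraps them with the bos/eos ids at load time, which is what lets the sequence length change without regenerating the cache. A standalone sketch of that arithmetic with made-up values (token_ids, bos_id and eos_id are stand-ins):

import numpy as np
import torch

token_ids = np.arange(100, dtype=np.uint16)      # stand-in for the corpus memmap
chunk_size, bos_id, eos_id = 12, 0, 2            # illustrative values

def get_chunk(i):
    body = chunk_size - 2                        # two slots reserved for bos/eos
    data = np.concatenate(([bos_id], token_ids[i * body:(i + 1) * body], [eos_id]))
    return torch.tensor(data.astype(np.int32), dtype=torch.long)

print(get_chunk(0))                              # length 12: bos, tokens 0..9, eos
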
- # padding last chunk is not great for longformer because many chunks will be mostly padding - - for token in tokens: - if len(current_chunk) == args.seqlen - 1: # chunk is full - current_chunk.append(tokenizer.eos_token) - chunks_list.append(current_chunk) - current_chunk = [tokenizer.bos_token] - current_chunk.append(token) - if args.padded_chunks: - # fill the rest of the seqlen with pad - current_chunk.extend([tokenizer.pad_token] * (args.seqlen - len(current_chunk))) - current_chunk[args.seqlen - 1] = tokenizer.eos_token - chunks_list.append(current_chunk) - current_chunk = [tokenizer.bos_token] + tokens = tokenizer.encode(line, add_special_tokens=False) # Special tokens are in `__getitem__` + token_list.extend(tokens) + if len(token_list) > args.shard_size: + _write_shard(token_list, shard_num) + token_list = [] + shard_num += 1 else: - # one long doc with sep inbetween - if len(current_chunk) < args.seqlen - 1: - current_chunk.append(tokenizer.sep_token) - random.shuffle(chunks_list) - val_count = int(args.train_dev_split * len(chunks_list)) - val_chunks = chunks_list[:val_count] - train_chunks = chunks_list[val_count:] - - def _tokenized_text_to_mmap(output_fname, chunks_list): - num_chunks = len(chunks_list) - all_token_ids = np.empty((num_chunks, args.seqlen), dtype=np.uint16) - for k, chunk in enumerate(tqdm(chunks_list)): - token_ids = tokenizer.convert_tokens_to_ids(chunk) - assert len(token_ids) == args.seqlen - all_token_ids[k, :] = [int(t) for t in token_ids] - fp = np.memmap(output_fname, dtype=np.uint16, mode='w+', shape=(num_chunks, args.seqlen)) - fp[:, :] = all_token_ids[:, :] - fp.flush() + token_list.append(tokenizer.sep_token_id) + _write_shard(token_list, shard_num) + + all_shards = glob.glob(f'{args.input_dir}/shards/*.bin') + random.shuffle(all_shards) # shuffling based on shards not individual lines + val_shards_count = int(args.train_dev_split * len(all_shards)) + val_shards = all_shards[:val_shards_count] + train_shards = all_shards[val_shards_count:] + + def _combine_shards(output_fname, shards_list): + total_size = 0 + for filename in shards_list: + total_size += np.memmap(filename, mode='r', dtype=np.uint16).shape[0] + 1 + total_size -= 1 + logging.info(f'Writing {total_size} tokens to {output_fname}') + all_token_ids = np.empty(total_size, dtype=np.uint16) + last_token_index = 0 + for filename in tqdm(shards_list): + shared = np.memmap(filename, mode='r', dtype=np.uint16) + all_token_ids[last_token_index:last_token_index+len(shared)] = shared[:] + last_token_index += len(shared) + fp = np.memmap(output_fname, dtype=np.uint16, mode='w+', shape=total_size) + fp[:] = all_token_ids[:] del fp - _tokenized_text_to_mmap(f'{args.input_dir}/cache/train.bin', train_chunks) - _tokenized_text_to_mmap(f'{args.input_dir}/cache/val.bin', val_chunks) + _combine_shards(f'{args.input_dir}/cache/val.bin', val_shards) + _combine_shards(f'{args.input_dir}/cache/train.bin', train_shards) class Pretrainer(ptl.LightningModule): @@ -132,6 +151,8 @@ def __init__(self, hparams): self.config = self.model.config tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) self.pad_token_id = tokenizer.pad_token_id + self.eos_token_id = tokenizer.eos_token_id + self.bos_token_id = tokenizer.bos_token_id logger.info(f'Creating dataset cache from dir {self.args.input_dir}. 
This could be slow the first time.') MMapTextDataset.raw_text_to_mmap(args) @@ -222,7 +243,8 @@ def configure_optimizers(self): return [optimizer], [{"scheduler": scheduler, "interval": "step"}] def _get_loader(self, fname, is_train): - dataset = MMapTextDataset(fname, chunk_size=self.args.seqlen) + dataset = MMapTextDataset(fname, chunk_size=self.args.seqlen, + bos_token_id=self.bos_token_id, eos_token_id=self.eos_token_id) # TODO: consider `replace_sampler_ddp=True` and removing the following if statement if self.trainer.use_ddp: @@ -277,7 +299,7 @@ def add_args(parser): # Dataset. Some of these params are only useful when generating the dataset cache parser.add_argument("--input_dir", type=str, default='/net/nfs.corp/s2-research/beltagy/longformer/data/') parser.add_argument("--train_dev_split", type=float, default=0.05) - parser.add_argument("--padded_chunks", type=bool, default=False) + parser.add_argument("--shard_size", type=int, default=2 * 1000 * 1000) parser.add_argument("--seqlen", type=int, default=512) parser.add_argument("--mlm_prob", type=float, default=0.15) @@ -293,7 +315,7 @@ def add_args(parser): help="Path to a checkpoint to load model weights and training state. It overwrites args") parser.add_argument("--resume_model_only", type=str, default=None, help="Path to a checkpoint to load model weights but not training state") - parser.add_argument("--log_rate", type=int, default=16) + parser.add_argument("--log_rate", type=int, default=10) parser.add_argument("--disable_checkpointing", type=bool, default=False) # Training hyperparams From 129a3f954a754a166f5c05cd35eb77860ab03e23 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 30 Jul 2020 14:38:32 -0700 Subject: [PATCH 072/112] log and resume data preprocessing --- scripts/pretrain.py | 57 +++++++++++++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 18 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 253cfc5..f6955cc 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -25,7 +25,6 @@ # DONE: testing ddp single machine # DONE: testing ddp multiple machines # DONE: testing resume from checkpoint -# TODO: enable gradient norm logging on a single TPU # TODO: try on a TPU-pod # TODO: run on beaker on ai2-server1/2 @@ -77,28 +76,42 @@ def raw_text_to_mmap(args): assert len(tokenizer) < 65535 # will use uint16 to store token ids all_files = glob.glob(f'{args.input_dir}/*.txt') - if os.path.exists(f'{args.input_dir}/cache/'): + if os.path.exists(f'{args.input_dir}/cache/train.bin') and os.path.exists(f'{args.input_dir}/cache/val.bin'): logger.info("Cache already exists. Remove the cache directory to regenerate") return - os.mkdir(f'{args.input_dir}/cache/') - os.mkdir(f'{args.input_dir}/shards/') - - # TODO: support continue after a crash + try: + os.mkdir(f'{args.input_dir}/cache/') + except FileExistsError: + pass + try: + os.mkdir(f'{args.input_dir}/shards/') + except FileExistsError: + pass + try: + os.mkdir(f'{args.input_dir}/logs/') # log progrss to be able to resume + except FileExistsError: + pass for full_fname in tqdm(all_files): # TODO: process each input file in a separate worker fname = full_fname.split('/')[-1] + log_filename = f'{args.input_dir}/logs/{fname}.log' + if os.path.isfile(log_filename): + logging.info(f'Skipping {full_fname} ...') + continue # log file already exists. Skip current file. 
+ logging.info(f'Processing {full_fname} ...') with open(full_fname, 'r') as fin: + token_list = [] + shard_count = 0 + tokens_count = 0 - def _write_shard(data, idx): - if len(data) == 0: + def _write_shard(): + if len(token_list) == 0: return - shared_filename = f'{args.input_dir}/shards/{fname}-{idx}.bin' - logging.info(f'Writing {len(data)} tokens to shared {shared_filename}') - fp = np.memmap(shared_filename, dtype=np.uint16, mode='w+', shape=len(data)) - fp[:] = data[:] + shared_filename = f'{args.input_dir}/shards/{fname}-{shard_count}.bin' + logging.info(f'Writing {len(token_list)} tokens to shared {shared_filename}') + fp = np.memmap(shared_filename, dtype=np.uint16, mode='w+', shape=len(token_list)) + fp[:] = token_list[:] del fp # flush and close file - token_list = [] - shard_num = 0 for line in tqdm(fin): line = line.strip() if line == '': # drop empty lines @@ -106,12 +119,16 @@ def _write_shard(data, idx): tokens = tokenizer.encode(line, add_special_tokens=False) # Special tokens are in `__getitem__` token_list.extend(tokens) if len(token_list) > args.shard_size: - _write_shard(token_list, shard_num) + _write_shard() + tokens_count += len(token_list) token_list = [] - shard_num += 1 + shard_count += 1 else: token_list.append(tokenizer.sep_token_id) - _write_shard(token_list, shard_num) + _write_shard() + tokens_count += len(token_list) + with open(log_filename, 'w') as f: + f.write(f'Generated {tokens_count} tokens in {shard_count + 1} shards') all_shards = glob.glob(f'{args.input_dir}/shards/*.bin') random.shuffle(all_shards) # shuffling based on shards not individual lines @@ -119,11 +136,15 @@ def _write_shard(data, idx): val_shards = all_shards[:val_shards_count] train_shards = all_shards[val_shards_count:] + # TODO: if _combining_shards is very slow for large files, it can be skipped then update + # the dataset to read from multiple shards directly def _combine_shards(output_fname, shards_list): total_size = 0 for filename in shards_list: + # The +1 accounts for additional SEP tokens between shards total_size += np.memmap(filename, mode='r', dtype=np.uint16).shape[0] + 1 - total_size -= 1 + print(total_size, filename) + total_size -= 1 # account for an unnecessary SEP token at the every end logging.info(f'Writing {total_size} tokens to {output_fname}') all_token_ids = np.empty(total_size, dtype=np.uint16) last_token_index = 0 From 1c42f964cf487f1df0737b7ffc17c9131b40b7f9 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 30 Jul 2020 16:32:30 -0700 Subject: [PATCH 073/112] multiprocessed preprocessing --- scripts/pretrain.py | 151 +++++++++++++++++++++++++------------------- 1 file changed, 85 insertions(+), 66 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index f6955cc..32d540c 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -62,6 +62,67 @@ def __getitem__(self, i): data = np.concatenate(([self._bos_token_id], self.token_ids[from_index:to_index], [self._eos_token_id])) return torch.tensor(data, dtype=torch.long) + # ========================= preprocessing code ========================= # + @staticmethod + def _process_file(full_fname): + "Step 1: tokenize an input text file then save token ids into `np.memmap` shards of size `args.shard_size`" + fname = full_fname.split('/')[-1] + log_filename = f'{args.input_dir}/logs-{args.shard_size}/{fname}.log' + if os.path.isfile(log_filename): + logging.info(f'Skipping {full_fname} ...') + return # log file already exists. Skip current file. 
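Each shard is nothing more than a flat np.memmap of uint16 token ids, flushed to disk by deleting the handle. A minimal write/read round trip of that format (the file name and token ids are illustrative):

import numpy as np

tokens = [101, 2023, 2003, 1037, 3231, 102]              # made-up token ids
fp = np.memmap('shard-0.bin', dtype=np.uint16, mode='w+', shape=len(tokens))
fp[:] = tokens[:]
del fp                                                   # flush and close the file

shard = np.memmap('shard-0.bin', mode='r', dtype=np.uint16)
print(shard.shape[0], shard[:3])                         # 6 [ 101 2023 2003]
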
+ + logging.info(f'Processing {full_fname} ...') + with open(full_fname, 'r') as fin: + token_list = [] + shard_count = 0 + tokens_count = 0 + + def _write_shard(): + if len(token_list) == 0: + return + if token_list[-1] != MMapTextDataset.tokenizer.sep_token_id: # handle a rare case + token_list.append(MMapTextDataset.tokenizer.sep_token_id) + shared_filename = f'{args.input_dir}/shards-{args.shard_size}/{fname}-{shard_count}.bin' + logging.info(f'Writing {len(token_list)} tokens to shared {shared_filename}') + fp = np.memmap(shared_filename, dtype=np.uint16, mode='w+', shape=len(token_list)) + fp[:] = token_list[:] + del fp # flush and close file + for line in tqdm(fin): + line = line.strip() + if line == '': # drop empty lines + continue + tokens = MMapTextDataset.tokenizer.encode(line, add_special_tokens=False) # `__getitem__` adds special tokens + token_list.extend(tokens) + if len(token_list) > args.shard_size: + _write_shard() + tokens_count += len(token_list) + token_list = [] + shard_count += 1 + else: + token_list.append(MMapTextDataset.tokenizer.sep_token_id) + _write_shard() + tokens_count += len(token_list) + with open(log_filename, 'w') as f: + f.write(f'Generated {tokens_count} tokens in {shard_count + 1} shards') + + @staticmethod + def _combine_shards(output_fname, shards_list): + "Step 2: combining memmap shards into one `train.bin` or `val.bin` file" + total_size = 0 + for filename in shards_list: + total_size += np.memmap(filename, mode='r', dtype=np.uint16).shape[0] + logging.info(f'Writing {total_size} tokens to {output_fname}') + all_token_ids = np.empty(total_size, dtype=np.uint16) + last_token_index = 0 + for filename in tqdm(shards_list): + shared = np.memmap(filename, mode='r', dtype=np.uint16) + all_token_ids[last_token_index:last_token_index+len(shared)] = shared[:] + last_token_index += len(shared) + fp = np.memmap(output_fname, dtype=np.uint16, mode='w+', shape=total_size) + fp[:] = all_token_ids[:] + del fp + @staticmethod def raw_text_to_mmap(args): """This is the main preprocessing function. It processes all the text files in `args.input_dir` and @@ -72,8 +133,8 @@ def raw_text_to_mmap(args): are added. The reason for not adding them at preprocessing time is to allow different sequence lengths later on. Notice that this is the "FULL-SENTENCES" setting in the RoBERTa paper, Table2. """ - tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, use_fast=True) - assert len(tokenizer) < 65535 # will use uint16 to store token ids + MMapTextDataset.tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, use_fast=True) + assert len(MMapTextDataset.tokenizer) < 65535 # will use uint16 to store token ids all_files = glob.glob(f'{args.input_dir}/*.txt') if os.path.exists(f'{args.input_dir}/cache/train.bin') and os.path.exists(f'{args.input_dir}/cache/val.bin'): @@ -84,80 +145,35 @@ def raw_text_to_mmap(args): except FileExistsError: pass try: - os.mkdir(f'{args.input_dir}/shards/') + os.mkdir(f'{args.input_dir}/shards-{args.shard_size}/') except FileExistsError: pass try: - os.mkdir(f'{args.input_dir}/logs/') # log progrss to be able to resume + os.mkdir(f'{args.input_dir}/logs-{args.shard_size}/') # log progrss to be able to resume except FileExistsError: pass - for full_fname in tqdm(all_files): # TODO: process each input file in a separate worker - fname = full_fname.split('/')[-1] - log_filename = f'{args.input_dir}/logs/{fname}.log' - if os.path.isfile(log_filename): - logging.info(f'Skipping {full_fname} ...') - continue # log file already exists. 
Skip current file. - logging.info(f'Processing {full_fname} ...') - with open(full_fname, 'r') as fin: - token_list = [] - shard_count = 0 - tokens_count = 0 - - def _write_shard(): - if len(token_list) == 0: - return - shared_filename = f'{args.input_dir}/shards/{fname}-{shard_count}.bin' - logging.info(f'Writing {len(token_list)} tokens to shared {shared_filename}') - fp = np.memmap(shared_filename, dtype=np.uint16, mode='w+', shape=len(token_list)) - fp[:] = token_list[:] - del fp # flush and close file - for line in tqdm(fin): - line = line.strip() - if line == '': # drop empty lines - continue - tokens = tokenizer.encode(line, add_special_tokens=False) # Special tokens are in `__getitem__` - token_list.extend(tokens) - if len(token_list) > args.shard_size: - _write_shard() - tokens_count += len(token_list) - token_list = [] - shard_count += 1 - else: - token_list.append(tokenizer.sep_token_id) - _write_shard() - tokens_count += len(token_list) - with open(log_filename, 'w') as f: - f.write(f'Generated {tokens_count} tokens in {shard_count + 1} shards') - - all_shards = glob.glob(f'{args.input_dir}/shards/*.bin') + # STEP1: tokenizing and saving to shards + if args.num_preprocessing_workers > 1: + from multiprocessing.pool import Pool + with Pool(args.num_preprocessing_workers) as p: + list(tqdm(p.imap(MMapTextDataset._process_file, all_files), total=len(all_files))) + else: + [MMapTextDataset._process_file(f) for f in tqdm(all_files)] + + # STEP2: shuffling shards and combining them into train.bin and val.bin files + all_shards = glob.glob(f'{args.input_dir}/shards-{args.shard_size}/*.bin') random.shuffle(all_shards) # shuffling based on shards not individual lines val_shards_count = int(args.train_dev_split * len(all_shards)) val_shards = all_shards[:val_shards_count] train_shards = all_shards[val_shards_count:] + # TODO: if MMapTextDataset._combining_shards is very slow for large files, it can be skipped but we nned to + # update the dataset to read from multiple shards directly + MMapTextDataset._combine_shards(f'{args.input_dir}/cache/val.bin', val_shards) + MMapTextDataset._combine_shards(f'{args.input_dir}/cache/train.bin', train_shards) - # TODO: if _combining_shards is very slow for large files, it can be skipped then update - # the dataset to read from multiple shards directly - def _combine_shards(output_fname, shards_list): - total_size = 0 - for filename in shards_list: - # The +1 accounts for additional SEP tokens between shards - total_size += np.memmap(filename, mode='r', dtype=np.uint16).shape[0] + 1 - print(total_size, filename) - total_size -= 1 # account for an unnecessary SEP token at the every end - logging.info(f'Writing {total_size} tokens to {output_fname}') - all_token_ids = np.empty(total_size, dtype=np.uint16) - last_token_index = 0 - for filename in tqdm(shards_list): - shared = np.memmap(filename, mode='r', dtype=np.uint16) - all_token_ids[last_token_index:last_token_index+len(shared)] = shared[:] - last_token_index += len(shared) - fp = np.memmap(output_fname, dtype=np.uint16, mode='w+', shape=total_size) - fp[:] = all_token_ids[:] - del fp - - _combine_shards(f'{args.input_dir}/cache/val.bin', val_shards) - _combine_shards(f'{args.input_dir}/cache/train.bin', train_shards) + del MMapTextDataset.tokenizer + # ========================= end preprocessing code ========================= # class Pretrainer(ptl.LightningModule): @@ -319,8 +335,11 @@ def add_args(parser): # Dataset. 
Some of these params are only useful when generating the dataset cache parser.add_argument("--input_dir", type=str, default='/net/nfs.corp/s2-research/beltagy/longformer/data/') + # Used only at the preprocessing phase parser.add_argument("--train_dev_split", type=float, default=0.05) - parser.add_argument("--shard_size", type=int, default=2 * 1000 * 1000) + parser.add_argument("--shard_size", type=int, default=1024 ** 3 // 4) # 250MB + parser.add_argument("--num_preprocessing_workers", type=int, default=1) + # Used only at the training phase parser.add_argument("--seqlen", type=int, default=512) parser.add_argument("--mlm_prob", type=float, default=0.15) From c20264e6f802ea6fea66dabe5b6aaa3e08f60721 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Mon, 3 Aug 2020 13:23:25 -0700 Subject: [PATCH 074/112] wip --- scripts/pretrain.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 32d540c..b5c581a 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -233,7 +233,7 @@ def training_step(self, batch, batch_nb): elapsed_time = time.time() - self.start_time tensorboard_logs['second_per_batch'] = elapsed_time self.start_time = time.time() - if not self.use_tpu: + if self.on_gpu: tensorboard_logs['memory'] = torch.cuda.memory_allocated(loss.device) / 1024 ** 3 return {'loss': loss, 'log': tensorboard_logs} From ff96351b17c8485f6b558bdff6c2ec0f34d4a364 Mon Sep 17 00:00:00 2001 From: Slater Date: Mon, 3 Aug 2020 15:23:14 -0700 Subject: [PATCH 075/112] Save this directory as a dataset and use it directly on a plain base image. --- experiment.yml | 18 +++++++++++++++ longformer_on_beaker.sh | 51 +++++++++++++++++++++++++++++++++++++++++ scripts/pretrain.py | 2 +- 3 files changed, 70 insertions(+), 1 deletion(-) create mode 100644 experiment.yml create mode 100755 longformer_on_beaker.sh diff --git a/experiment.yml b/experiment.yml new file mode 100644 index 0000000..156faf5 --- /dev/null +++ b/experiment.yml @@ -0,0 +1,18 @@ +tasks: + - cluster: {{.Env.CLUSTER}} + spec: + # This is a python3.7/nvidia base image with basic libraries + image: im_j69gti4atcw9 + resultPath: {{.Env.RESULT_PATH}} + args: + - /bin/bash + - -c + - "cd /longformer_on_beaker && pip install . && {{.Env.ARGS}}" + datasetMounts: + - datasetId: {{.Env.INPUT_DATASET_ID}} + containerPath: /data + - datasetId: {{.Env.SCRIPTS}} + containerPath: /longformer_on_beaker + requirements: + gpuCount: {{.Env.GPU_COUNT}} + cpu: {{.Env.CPU_COUNT}} diff --git a/longformer_on_beaker.sh b/longformer_on_beaker.sh new file mode 100755 index 0000000..6e873a1 --- /dev/null +++ b/longformer_on_beaker.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +export SCRIPTS=$(beaker dataset create -q .) 
+export INPUT_DATASET_ID="ds_6r0phxc5fiap" +export RESULT_SAVE_DIR="/runs" +export RESULT_SAVE_PREFIX="test" +export ARGS="" +export GPU_COUNT=1 +export CPU_COUNT=6 +copy=("$@") +for i in "${!copy[@]}" +do + if [[ "${copy[$i]}" = "--save_dir" ]] + then + export RESULT_SAVE_DIR="${copy[$i+1]}" + fi + + if [[ "${copy[$i]}" = "--input_dir" ]] + then + export INPUT_DATASET_ID=$(beaker dataset create -q ${copy[$i+1]}) + copy[$i+1]="/data" + fi + + if [[ "${copy[$i]}" = "--save_prefix" ]] + then + export RESULT_SAVE_PREFIX="${copy[$i+1]}" + fi + + if [[ "${copy[$i]}" = "--num_workers" ]] + then + export CPU_COUNT="${copy[$i+1]}" + fi + + if [[ "${copy[$i]}" = "--gpu_count" ]] + then + export GPU_COUNT="${copy[$i+1]}" + fi + ARGS="$ARGS ${copy[$i]}" +done + +# If an input dataset was not specified, use the default +if [[ "ds_6r0phxc5fiap" = $INPUT_DATASET_ID ]] +then + ARGS="$ARGS --input_dir /data" +fi + +echo $ARGS + +export RESULT_PATH=$RESULT_SAVE_DIR/$RESULT_SAVE_PREFIX + +beaker experiment create -f experiment.yml diff --git a/scripts/pretrain.py b/scripts/pretrain.py index b5c581a..8de5bbd 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -348,7 +348,7 @@ def add_args(parser): parser.add_argument("--model", type=str, default='roberta-base') # Checkpointing and logging - parser.add_argument("--save_dir", type=str, default='runs/') + parser.add_argument("--save_dir", type=str, default='/runs/') parser.add_argument("--save_prefix", type=str, default='test', help="path of output directory is --save_dir/--save_prefix") parser.add_argument("--resume", type=str, default=None, # It is better to use a different output dir. From 0557e24c8cd654c01be68098126fec66eff89956 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 6 Aug 2020 12:03:48 -0700 Subject: [PATCH 076/112] bug fix --- longformer/longformer_encoder_decoder.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/longformer/longformer_encoder_decoder.py b/longformer/longformer_encoder_decoder.py index 67ab3e4..f29fb49 100644 --- a/longformer/longformer_encoder_decoder.py +++ b/longformer/longformer_encoder_decoder.py @@ -62,10 +62,8 @@ def forward( assert list(query.size()) == [tgt_len, bsz, embed_dim] assert attn_mask is None - # LongformerSelfAttention expects this shape - query = query.view(bsz, tgt_len, embed_dim) outputs = self.longformer_self_attn( - query, + query.transpose(0, 1), # LongformerSelfAttention expects (bsz, seqlen, embd_dim) attention_mask=key_padding_mask.unsqueeze(dim=1).unsqueeze(dim=1) * -1, head_mask=None, encoder_hidden_states=None, From 6ae5051bf704edc9548334d6bb88f321d57a06af Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 7 Aug 2020 11:14:12 -0700 Subject: [PATCH 077/112] fix a bug with the mapping from longformerselfattention to bartselfattention --- longformer/longformer_encoder_decoder.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/longformer/longformer_encoder_decoder.py b/longformer/longformer_encoder_decoder.py index f29fb49..df38224 100644 --- a/longformer/longformer_encoder_decoder.py +++ b/longformer/longformer_encoder_decoder.py @@ -71,8 +71,6 @@ def forward( output_attentions=output_attentions, ) - attn_output = outputs[0] - attn_output = attn_output.contiguous().view(tgt_len, bsz, embed_dim) - attn_output = self.output(attn_output) + attn_output = self.output(outputs[0].transpose(0, 1)) return (attn_output,) + outputs[1:] if len(outputs) == 2 else (attn_output, None) From a1de977980d1cd658d3d8f740a3f108bbf84ea47 Mon Sep 17 00:00:00 2001 
From: Iz Beltagy Date: Fri, 7 Aug 2020 11:14:28 -0700 Subject: [PATCH 078/112] mem_profiler --- scripts/mem_profiler.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/mem_profiler.py b/scripts/mem_profiler.py index 4edc6b0..5d8e2f7 100644 --- a/scripts/mem_profiler.py +++ b/scripts/mem_profiler.py @@ -9,7 +9,7 @@ from pytorch_lightning import Trainer import pytorch_lightning as pl -seqlen = 1024 * 8 +seqlen = 1024 * 2 global_size = seqlen // 100 attention_window = 256 # one sided @@ -31,16 +31,16 @@ def __init__(self, hparams=None): super().__init__() self.hparams = hparams - # config = LongformerEncoderDecoderConfig.from_pretrained('bart-long-4096') - config = LongformerConfig.from_pretrained('roberta-large') + config = LongformerEncoderDecoderConfig.from_pretrained('bart-long-4096') + # config = LongformerConfig.from_pretrained('roberta-large') config.max_position_embeddings = seqlen + 2 config.gradient_checkpointing = True - # config.attention_mode = 'sliding_chunks' - config.attention_mode = 'n2' + config.attention_mode = 'sliding_chunks' + # config.attention_mode = 'n2' config.attention_window = [attention_window] * config.num_hidden_layers config.attention_dilation = [1] * config.num_hidden_layers - # self.model = LongformerEncoderDecoderForConditionalGeneration(config) - self.model = LongformerForMaskedLM(config) + self.model = LongformerEncoderDecoderForConditionalGeneration(config) + # self.model = LongformerForMaskedLM(config) def forward(self, x, y): print(seqlen, global_size, attention_window, torch.cuda.max_memory_allocated(x.device) / 1024 ** 3) @@ -60,7 +60,7 @@ def configure_optimizers(self): return torch.optim.Adam(self.parameters(), lr=0.001) def train_dataloader(self): - return DataLoader(CoolDataset(), batch_size=1, num_workers=0) + return DataLoader(CoolDataset(), batch_size=2, num_workers=0) if __name__ == '__main__': From 1bf6c7c66ebf15a7490e8050df6587f43ea2d047 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 12 Aug 2020 11:10:42 -0700 Subject: [PATCH 079/112] extend encoder only --- ...onvert_bart_to_longformerencoderdecoder.py | 38 ++++++++++--------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/scripts/convert_bart_to_longformerencoderdecoder.py b/scripts/convert_bart_to_longformerencoderdecoder.py index e469819..fc94996 100644 --- a/scripts/convert_bart_to_longformerencoderdecoder.py +++ b/scripts/convert_bart_to_longformerencoderdecoder.py @@ -5,6 +5,7 @@ from transformers import BartTokenizer from transformers import BartForConditionalGeneration +from transformers.modeling_bart import shift_tokens_right from longformer.longformer_encoder_decoder import LongformerSelfAttentionForBart, LongformerEncoderDecoderConfig from longformer.longformer_encoder_decoder import LongformerEncoderDecoderForConditionalGeneration @@ -14,10 +15,10 @@ def create_long_model( save_model_to, - base_model='facebook/bart-large', - tokenizer_name_or_path='facebook/bart-large', - attention_window=512, - max_pos=4096 + base_model, + tokenizer_name_or_path, + attention_window, + max_pos ): model = BartForConditionalGeneration.from_pretrained(base_model) tokenizer = BartTokenizer.from_pretrained(tokenizer_name_or_path, model_max_length=max_pos) @@ -35,7 +36,9 @@ def create_long_model( current_max_pos, embed_size = model.model.encoder.embed_positions.weight.shape assert current_max_pos == config.max_position_embeddings + 2 - config.max_position_embeddings = max_pos + config.max_encoder_position_embeddings = max_pos + 
config.max_decoder_position_embeddings = config.max_position_embeddings + del config.max_position_embeddings max_pos += 2 # NOTE: BART has positions 0,1 reserved, so embedding size is max position + 2 assert max_pos >= current_max_pos @@ -50,14 +53,14 @@ def create_long_model( model.model.encoder.embed_positions.weight.data = new_encoder_pos_embed # allocate a larger position embedding matrix for the decoder - new_decoder_pos_embed = model.model.decoder.embed_positions.weight.new_empty(max_pos, embed_size) - # copy position embeddings over and over to initialize the new position embeddings - k = 2 - step = current_max_pos - 2 - while k < max_pos - 1: - new_decoder_pos_embed[k:(k + step)] = model.model.decoder.embed_positions.weight[2:] - k += step - model.model.decoder.embed_positions.weight.data = new_decoder_pos_embed + # new_decoder_pos_embed = model.model.decoder.embed_positions.weight.new_empty(max_pos, embed_size) + # # copy position embeddings over and over to initialize the new position embeddings + # k = 2 + # step = current_max_pos - 2 + # while k < max_pos - 1: + # new_decoder_pos_embed[k:(k + step)] = model.model.decoder.embed_positions.weight[2:] + # k += step + # model.model.decoder.embed_positions.weight.data = new_decoder_pos_embed # replace the `modeling_bart.SelfAttention` object with `LongformerSelfAttention` config.attention_window = [attention_window] * config.num_hidden_layers @@ -107,12 +110,12 @@ def main(): '--attention_window', type=int, default=512, - help='attention window size for longformer self attention' + help='attention window size for longformer self attention (one sided)' ) parser.add_argument( '--max_pos', type=int, - default=4096, + default=4096 * 4, help='maximum encoder positions' ) @@ -137,11 +140,12 @@ def main(): data = tokenizer([TXT], return_tensors='pt', padding='max_length', max_length=2048) input_ids = data['input_ids'] attention_mask = data['attention_mask'] - logits = model(input_ids, attention_mask=attention_mask)[0] + decoder_input_ids = shift_tokens_right(input_ids[:, :5], tokenizer.pad_token_id) + logits = model(input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids, use_cache=False)[0] masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() probs = logits[0, masked_index].softmax(dim=0) values, predictions = probs.topk(5) - print(tokenizer.decode(predictions).split()) + print(tokenizer.convert_ids_to_tokens(predictions)) if __name__ == "__main__": From 5b31f5e7396210c6c73d1be268f1310e09d85d6e Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 12 Aug 2020 14:29:17 -0700 Subject: [PATCH 080/112] upgrade triviaqa script to PLv0.8.5 --- scripts/triviaqa.py | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/scripts/triviaqa.py b/scripts/triviaqa.py index 281c297..6924c81 100644 --- a/scripts/triviaqa.py +++ b/scripts/triviaqa.py @@ -528,21 +528,14 @@ def test_end(self, outputs): return {'count': len(qid_to_answer_text)} - def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_i, second_order_closure=None): - optimizer.step() - optimizer.zero_grad() - self.scheduler.step(self.global_step) - def configure_optimizers(self): def lr_lambda(current_step): if current_step < self.args.warmup: return float(current_step) / float(max(1, self.args.warmup)) return max(0.0, float(self.args.steps - current_step) / float(max(1, self.args.steps - self.args.warmup))) optimizer = torch.optim.Adam(self.parameters(), lr=self.args.lr) - self.scheduler = 
LambdaLR(optimizer, lr_lambda, last_epoch=-1) # scheduler is not saved in the checkpoint, but global_step is, which is enough to restart - self.scheduler.step(self.global_step) - - return optimizer + scheduler = LambdaLR(optimizer, lr_lambda, last_epoch=-1) + return [optimizer], [{"scheduler": scheduler, "interval": "step"}] @pl.data_loader def train_dataloader(self): @@ -610,8 +603,8 @@ def add_model_specific_args(parser, root_dir): parser.add_argument("--train_dataset", type=str, required=False, help="Path to the training squad-format") parser.add_argument("--dev_dataset", type=str, required=True, help="Path to the dev squad-format") parser.add_argument("--batch_size", type=int, default=8, help="Batch size") - parser.add_argument("--gpus", type=str, default='0', - help="Comma separated list of gpus. Default is gpu 0. To use CPU, use --gpus "" ") + parser.add_argument("--gpus", type=int, default=1, + help="Number of gpus. 0 for CPU") parser.add_argument("--warmup", type=int, default=200, help="Number of warmup steps") parser.add_argument("--lr", type=float, default=0.0001, help="Maximum learning rate") parser.add_argument("--val_every", type=float, default=0.2, help="Number of training steps between validations") @@ -672,15 +665,14 @@ def main(args): prefix='' ) - args.gpus = [int(x) for x in args.gpus.split(',')] if args.gpus is not "" else None # use CPU if no gpu provided print(args) train_set_size = 110648 # hardcode dataset size. Needed to compute number of steps for the lr scheduler - num_devices = 1 or len(args.gpus) - args.steps = args.epochs * train_set_size / (args.batch_size * num_devices) - print(f'>>>>>>> #steps: {args.steps}, #epochs: {args.epochs}, batch_size: {args.batch_size * num_devices} <<<<<<<') + args.steps = args.epochs * train_set_size / (args.batch_size * args.gpus) + print(f'>>>>>>> #steps: {args.steps}, #epochs: {args.epochs}, batch_size: {args.batch_size * args.gpus} <<<<<<<') - trainer = pl.Trainer(gpus=args.gpus, distributed_backend='ddp' if args.gpus and (len(args.gpus) > 1) else None, - track_grad_norm=-1, max_nb_epochs=args.epochs, early_stop_callback=None, + trainer = pl.Trainer(gpus=args.gpus, distributed_backend='ddp' if args.gpus and args.gpus > 1 else None, + track_grad_norm=-1, max_epochs=args.epochs, early_stop_callback=None, + replace_sampler_ddp=False, accumulate_grad_batches=args.batch_size, val_check_interval=args.val_every, val_percent_check=args.val_percent_check, From 405739e735d30048780f06f4cb627a380331eb3a Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 12 Aug 2020 22:47:03 -0700 Subject: [PATCH 081/112] add roberta baseline --- scripts/triviaqa.py | 65 ++++++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 28 deletions(-) diff --git a/scripts/triviaqa.py b/scripts/triviaqa.py index 6924c81..da5ea61 100644 --- a/scripts/triviaqa.py +++ b/scripts/triviaqa.py @@ -9,7 +9,7 @@ from torch.optim.lr_scheduler import LambdaLR from torch.utils.data import DataLoader, Dataset -from transformers import RobertaTokenizer +from transformers import RobertaTokenizer, AutoModel from scripts.triviaqa_utils import evaluation_utils import pytorch_lightning as pl @@ -263,10 +263,13 @@ def __init__(self, args): self.train_dataloader_object = self.val_dataloader_object = self.test_dataloader_object = None def load_model(self): - model = Longformer.from_pretrained(self.args.model_path) - for layer in model.encoder.layer: - layer.attention.self.attention_mode = self.args.attention_mode - self.args.attention_window = 
layer.attention.self.attention_window + if 'longformer' in self.args.model_path: + model = Longformer.from_pretrained(self.args.model_path) + for layer in model.encoder.layer: + layer.attention.self.attention_mode = self.args.attention_mode + self.args.attention_window = layer.attention.self.attention_window + else: + model = AutoModel.from_pretrained(self.args.model_path) print("Loaded model with config:") print(model.config) @@ -277,29 +280,34 @@ def load_model(self): return model def forward(self, input_ids, attention_mask, segment_ids, start_positions, end_positions): - question_end_index = self._get_question_end_index(input_ids) - # Each batch is one document, and each row of the batch is a chunck of the document. - # Make sure all rows have the same question length. - assert (question_end_index[0].float() == question_end_index.float().mean()).item() - - # local attention everywhere - attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) - # global attention for the question tokens - attention_mask[:, :question_end_index.item()] = 2 - - # sliding_chunks implemenation of selfattention requires that seqlen is multiple of window size - input_ids, attention_mask = pad_to_window_size( - input_ids, attention_mask, self.args.attention_window, self.tokenizer.pad_token_id) - - sequence_output = self.model( - input_ids, - attention_mask=attention_mask)[0] - - # The pretrained TriviaQA model wasn't trained with padding, so remove padding tokens - # before computing loss and decoding. - padding_len = input_ids[0].eq(self.tokenizer.pad_token_id).sum() - if padding_len > 0: - sequence_output = sequence_output[:, :-padding_len] + if 'longformer' in self.args.model_path: + question_end_index = self._get_question_end_index(input_ids) + # Each batch is one document, and each row of the batch is a chunck of the document. + # Make sure all rows have the same question length. + assert (question_end_index[0].float() == question_end_index.float().mean()).item() + + # local attention everywhere + attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) + # global attention for the question tokens + attention_mask[:, :question_end_index.item()] = 2 + + # sliding_chunks implemenation of selfattention requires that seqlen is multiple of window size + input_ids, attention_mask = pad_to_window_size( + input_ids, attention_mask, self.args.attention_window, self.tokenizer.pad_token_id) + + sequence_output = self.model( + input_ids, + attention_mask=attention_mask)[0] + + # The pretrained TriviaQA model wasn't trained with padding, so remove padding tokens + # before computing loss and decoding. + padding_len = input_ids[0].eq(self.tokenizer.pad_token_id).sum() + if padding_len > 0: + sequence_output = sequence_output[:, :-padding_len] + else: + sequence_output = self.model( + input_ids, + attention_mask=attention_mask)[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) @@ -637,6 +645,7 @@ def add_model_specific_args(parser, root_dir): parser.add_argument("--attention_mode", type=str, choices=['tvm', 'sliding_chunks'], default='sliding_chunks', help='Which implementation of selfattention to use') parser.add_argument("--fp32", action='store_true', help="default is fp16. 
Use --fp32 to switch to fp32") + # parser.add_argument("--seq2seq", action='store_true', help="Use an answer generation model") return parser From c132d4e3384e1de6c83c09d1ea63260fa30f7604 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Mon, 17 Aug 2020 06:36:05 -0700 Subject: [PATCH 082/112] triviaqa seq2seq + fix bart-base bug --- scripts/triviaqa.py | 85 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 67 insertions(+), 18 deletions(-) diff --git a/scripts/triviaqa.py b/scripts/triviaqa.py index da5ea61..362dfa5 100644 --- a/scripts/triviaqa.py +++ b/scripts/triviaqa.py @@ -9,7 +9,7 @@ from torch.optim.lr_scheduler import LambdaLR from torch.utils.data import DataLoader, Dataset -from transformers import RobertaTokenizer, AutoModel +from transformers import RobertaTokenizer, AutoModel, AutoConfig from scripts.triviaqa_utils import evaluation_utils import pytorch_lightning as pl @@ -110,11 +110,13 @@ def is_whitespace(c): try: start_position = char_to_word_offset[answer_offset] end_position = char_to_word_offset[answer_offset + answer_length - 1] - except: + token_ids = self.tokenizer.encode(orig_answer_text) + except RuntimeError: print(f'Reading example {idx} failed') start_position = 0 end_position = 0 - answer_spans.append({'start': start_position, 'end': end_position}) + answer_spans.append({'start': start_position, 'end': end_position, + 'text': orig_answer_text, 'token_ids': token_ids}) # ===== Given an example, convert it into tensors ============= query_tokens = self.tokenizer.tokenize(question_text) @@ -146,6 +148,7 @@ def is_whitespace(c): segment_ids_list = [] start_positions_list = [] end_positions_list = [] + answer_token_ids_list = [] for slice_start in range(0, len(all_doc_tokens), max_tokens_per_doc_slice - self.doc_stride): slice_end = min(slice_start + max_tokens_per_doc_slice, len(all_doc_tokens)) @@ -172,6 +175,7 @@ def is_whitespace(c): doc_offset = len(query_tokens) + 2 - slice_start start_positions = [] end_positions = [] + answer_token_ids = [] for answer_span in answer_spans: start_position = answer_span['start'] end_position = answer_span['end'] @@ -183,6 +187,7 @@ def is_whitespace(c): continue start_positions.append(tok_start_position_in_doc + doc_offset) end_positions.append(tok_end_position_in_doc + doc_offset) + answer_token_ids.append(answer_span['token_ids']) assert len(start_positions) == len(end_positions) if self.ignore_seq_with_no_answers and len(start_positions) == 0: continue @@ -190,32 +195,58 @@ def is_whitespace(c): # answers from start_positions and end_positions if > self.max_num_answers start_positions = start_positions[:self.max_num_answers] end_positions = end_positions[:self.max_num_answers] + answer_token_ids = answer_token_ids[:self.max_num_answers] # -1 padding up to self.max_num_answers padding_len = self.max_num_answers - len(start_positions) start_positions.extend([-1] * padding_len) end_positions.extend([-1] * padding_len) + answer_token_ids.extend([[]] * padding_len) # replace duplicate start/end positions with `-1` because duplicates can result into -ve loss values found_start_positions = set() found_end_positions = set() - for i, (start_position, end_position) in enumerate(zip(start_positions, end_positions)): + found_answer_token_ids = set() + for i, (start_position, end_position, answer_tokens) in enumerate( + zip(start_positions, end_positions, answer_token_ids) + ): if start_position in found_start_positions: start_positions[i] = -1 if end_position in found_end_positions: end_positions[i] = -1 + 
answer_tokens_as_str = ','.join([str(x) for x in answer_tokens]) + if answer_tokens_as_str in found_answer_token_ids: + answer_token_ids[i] = [] found_start_positions.add(start_position) found_end_positions.add(end_position) + found_answer_token_ids.add(answer_tokens_as_str) input_ids_list.append(input_ids) input_mask_list.append(input_mask) segment_ids_list.append(segment_ids) start_positions_list.append(start_positions) end_positions_list.append(end_positions) + answer_token_ids_list.append(answer_token_ids) + + # pad answers in answer_token_ids_list to the longest answer + max_answer_len = max([len(item) for sublist in answer_token_ids_list for item in sublist]) # flat list + if max_answer_len == 0: + max_answer_len = 2 + for answers_of_one_slice in answer_token_ids_list: + for answer_tokens in answers_of_one_slice: + if len(answer_tokens) == 0: + # TODO: or ? + padding_len = max_answer_len - len(answer_tokens) - 2 + answer_tokens.extend([self.tokenizer.bos_token_id, self.tokenizer.eos_token_id] + + ([self.tokenizer.pad_token_id] * padding_len)) + else: + padding_len = max_answer_len - len(answer_tokens) + answer_tokens.extend([self.tokenizer.pad_token_id] * padding_len) tensors_list.append((torch.tensor(input_ids_list), torch.tensor(input_mask_list), torch.tensor(segment_ids_list), torch.tensor(start_positions_list), torch.tensor(end_positions_list), + torch.tensor(answer_token_ids_list), self._get_qid(qa['id']), qa["aliases"])) # for eval return tensors_list @@ -268,6 +299,20 @@ def load_model(self): for layer in model.encoder.layer: layer.attention.self.attention_mode = self.args.attention_mode self.args.attention_window = layer.attention.self.attention_window + elif self.args.model_path in ['bart.large', 'bart.base']: + model = torch.hub.load('pytorch/fairseq', self.args.model_path) + model.config = model.args + model.config.hidden_size = model.config.decoder_output_dim + elif 'bart' in self.args.model_path and 'base' in self.args.model_path: + config = AutoConfig.from_pretrained(self.args.model_path) + config.encoder_attention_heads = 12 + config.decoder_attention_heads = 12 + config.attention_dropout = 0.1 + model = AutoModel.from_pretrained(self.args.model_path, config=config) + elif 'bart' in self.args.model_path and 'large' in self.args.model_path: + config = AutoConfig.from_pretrained(self.args.model_path) + config.attention_dropout = 0.1 + model = AutoModel.from_pretrained(self.args.model_path, config=config) else: model = AutoModel.from_pretrained(self.args.model_path) @@ -279,7 +324,7 @@ def load_model(self): model.train() return model - def forward(self, input_ids, attention_mask, segment_ids, start_positions, end_positions): + def forward(self, input_ids, attention_mask, segment_ids, start_positions, end_positions, answer_token_ids): if 'longformer' in self.args.model_path: question_end_index = self._get_question_end_index(input_ids) # Each batch is one document, and each row of the batch is a chunck of the document. 
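In the Longformer branch of `forward` shown above, the question tokens are the ones given global attention: the mask uses 1 for ordinary local attention and 2 for global attention, and the inputs are then padded so the sequence length works with the `sliding_chunks` kernels. A condensed sketch of just that mask construction, with illustrative sizes:

import torch
from longformer.sliding_chunks import pad_to_window_size

attention_window = 256     # one-sided window size
pad_token_id = 1           # RoBERTa's <pad> id
question_len = 12          # index right after the last question token (illustrative)

input_ids = torch.full((1, 1000), 42, dtype=torch.long)   # one dummy document chunk
attention_mask = torch.ones_like(input_ids)               # 1 = local attention everywhere
attention_mask[:, :question_len] = 2                      # 2 = global attention on the question

# sliding_chunks expects the sequence length to be a multiple of the two-sided window,
# so pad both tensors; padded positions get pad_token_id and mask value 0 (no attention)
input_ids, attention_mask = pad_to_window_size(
    input_ids, attention_mask, attention_window, pad_token_id)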
@@ -304,6 +349,8 @@ def forward(self, input_ids, attention_mask, segment_ids, start_positions, end_p padding_len = input_ids[0].eq(self.tokenizer.pad_token_id).sum() if padding_len > 0: sequence_output = sequence_output[:, :-padding_len] + elif self.args.model_path in ['bart.large', 'bart.base']: + sequence_output = self.model.extract_features(input_ids) else: sequence_output = self.model( input_ids, @@ -376,8 +423,8 @@ def or_softmax_cross_entropy_loss_one_doc(self, logits, target, ignore_index=-1, return loss[~torch.isinf(loss)].sum() def training_step(self, batch, batch_nb): - input_ids, input_mask, segment_ids, subword_starts, subword_ends, qids, aliases = batch - output = self.forward(input_ids, input_mask, segment_ids, subword_starts, subword_ends) + input_ids, input_mask, segment_ids, subword_starts, subword_ends, answer_token_ids, qids, aliases = batch + output = self.forward(input_ids, input_mask, segment_ids, subword_starts, subword_ends, answer_token_ids) loss = output[0] lr = loss.new_zeros(1) + self.trainer.optimizers[0].param_groups[0]['lr'] tensorboard_logs = {'train_loss': loss, 'lr': lr, @@ -386,8 +433,8 @@ def training_step(self, batch, batch_nb): return {'loss': loss, 'log': tensorboard_logs} def validation_step(self, batch, batch_nb): - input_ids, input_mask, segment_ids, subword_starts, subword_ends, qids, aliases = batch - output = self.forward(input_ids, input_mask, segment_ids, subword_starts, subword_ends) + input_ids, input_mask, segment_ids, subword_starts, subword_ends, answer_token_ids, qids, aliases = batch + output = self.forward(input_ids, input_mask, segment_ids, subword_starts, subword_ends, answer_token_ids) loss, start_logits, end_logits = output[:3] answers = self.decode(input_ids, start_logits, end_logits) @@ -461,8 +508,8 @@ def decode(self, input_ids, start_logits, end_logits): answers.append({'text': text, 'score': score}) return answers - def sync_list_across_gpus(self, l, device, dtype): - l_tensor = torch.tensor(l, device=device, dtype=dtype) + def sync_list_across_gpus(self, list_to_sync, device, dtype): + l_tensor = torch.tensor(list_to_sync, device=device, dtype=dtype) gather_l_tensor = [torch.ones_like(l_tensor) for _ in range(self.trainer.world_size)] torch.distributed.all_gather(gather_l_tensor, l_tensor) return torch.cat(gather_l_tensor).tolist() @@ -507,8 +554,8 @@ def validation_end(self, outputs): return {'avg_val_loss': avg_loss, 'log': logs, 'progress_bar': logs} def test_step(self, batch, batch_nb): - input_ids, input_mask, segment_ids, subword_starts, subword_ends, qids, aliases = batch - output = self.forward(input_ids, input_mask, segment_ids, subword_starts, subword_ends) + input_ids, input_mask, segment_ids, subword_starts, subword_ends, answer_token_ids, qids, aliases = batch + output = self.forward(input_ids, input_mask, segment_ids, subword_starts, subword_ends, answer_token_ids) loss, start_logits, end_logits = output[:3] answers = self.decode(input_ids, start_logits, end_logits) @@ -555,7 +602,7 @@ def train_dataloader(self): max_num_answers=self.args.max_num_answers, max_question_len=self.args.max_question_len, ignore_seq_with_no_answers=self.args.ignore_seq_with_no_answers) - sampler = torch.utils.data.distributed.DistributedSampler(dataset) if self.trainer.use_ddp else None + sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=True) if self.trainer.use_ddp else None dl = DataLoader(dataset, batch_size=1, shuffle=(sampler is None), num_workers=self.args.num_workers, sampler=sampler, 
collate_fn=TriviaQADataset.collate_one_doc_and_lists) @@ -572,8 +619,8 @@ def val_dataloader(self): max_num_answers=self.args.max_num_answers, max_question_len=self.args.max_question_len, ignore_seq_with_no_answers=False) # evaluation data should keep all examples - sampler = torch.utils.data.distributed.DistributedSampler(dataset) if self.trainer.use_ddp else None - dl = DataLoader(dataset, batch_size=1, shuffle=(sampler is None), + sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=False) if self.trainer.use_ddp else None + dl = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=self.args.num_workers, sampler=sampler, collate_fn=TriviaQADataset.collate_one_doc_and_lists) self.val_dataloader_object = dl @@ -637,7 +684,8 @@ def add_model_specific_args(parser, root_dir): help="Number of answer candidates. Used at decoding time") parser.add_argument("--max_answer_length", type=int, default=30, help="maximum num of wordpieces/answer. Used at decoding time") - parser.add_argument("--regular_softmax_loss", action='store_true', help="IF true, use regular softmax. Default is using ORed softmax loss") + parser.add_argument("--regular_softmax_loss", action='store_true', + help="IF true, use regular softmax. Default is using ORed softmax loss") parser.add_argument("--test", action='store_true', help="Test only, no training") parser.add_argument("--model_path", type=str, required=True, help="Path to the checkpoint directory") @@ -645,7 +693,7 @@ def add_model_specific_args(parser, root_dir): parser.add_argument("--attention_mode", type=str, choices=['tvm', 'sliding_chunks'], default='sliding_chunks', help='Which implementation of selfattention to use') parser.add_argument("--fp32", action='store_true', help="default is fp16. Use --fp32 to switch to fp32") - # parser.add_argument("--seq2seq", action='store_true', help="Use an answer generation model") + parser.add_argument("--seq2seq", action='store_true', help="Use an answer generation model") return parser @@ -684,6 +732,7 @@ def main(args): replace_sampler_ddp=False, accumulate_grad_batches=args.batch_size, val_check_interval=args.val_every, + # check_val_every_n_epoch=2, val_percent_check=args.val_percent_check, test_percent_check=args.val_percent_check, logger=logger if not args.disable_checkpointing else False, From d1349e96ded239b0c872e579608a47901a2d38a2 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sun, 23 Aug 2020 18:05:44 +0000 Subject: [PATCH 083/112] beaker --- longformer_on_beaker.sh | 48 +++++------------------------------------ requirements.txt | 3 +-- scripts/triviaqa.py | 3 ++- 3 files changed, 8 insertions(+), 46 deletions(-) diff --git a/longformer_on_beaker.sh b/longformer_on_beaker.sh index 6e873a1..425dcef 100755 --- a/longformer_on_beaker.sh +++ b/longformer_on_beaker.sh @@ -1,51 +1,13 @@ #!/bin/bash export SCRIPTS=$(beaker dataset create -q .) 
-export INPUT_DATASET_ID="ds_6r0phxc5fiap" +export INPUT_DATASET_ID="ds_drt127wv4aun" export RESULT_SAVE_DIR="/runs" export RESULT_SAVE_PREFIX="test" -export ARGS="" -export GPU_COUNT=1 -export CPU_COUNT=6 -copy=("$@") -for i in "${!copy[@]}" -do - if [[ "${copy[$i]}" = "--save_dir" ]] - then - export RESULT_SAVE_DIR="${copy[$i+1]}" - fi - - if [[ "${copy[$i]}" = "--input_dir" ]] - then - export INPUT_DATASET_ID=$(beaker dataset create -q ${copy[$i+1]}) - copy[$i+1]="/data" - fi - - if [[ "${copy[$i]}" = "--save_prefix" ]] - then - export RESULT_SAVE_PREFIX="${copy[$i+1]}" - fi - - if [[ "${copy[$i]}" = "--num_workers" ]] - then - export CPU_COUNT="${copy[$i+1]}" - fi - - if [[ "${copy[$i]}" = "--gpu_count" ]] - then - export GPU_COUNT="${copy[$i+1]}" - fi - ARGS="$ARGS ${copy[$i]}" -done - -# If an input dataset was not specified, use the default -if [[ "ds_6r0phxc5fiap" = $INPUT_DATASET_ID ]] -then - ARGS="$ARGS --input_dir /data" -fi - -echo $ARGS - +export ARGS="$@" +export GPU_COUNT=8 +export CPU_COUNT=32 +export CLUSTER="ai2/on-prem-ai2-server2" export RESULT_PATH=$RESULT_SAVE_DIR/$RESULT_SAVE_PREFIX beaker experiment create -f experiment.yml diff --git a/requirements.txt b/requirements.txt index 54829f1..d91e5de 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,5 @@ transformers @ git+http://github.com/ibeltagy/transformers.git@longformer_encoder_decoder#egg=transformers pytorch-lightning @ git+http://github.com/ibeltagy/pytorch-lightning.git@v0.8.5_fixes#egg=pytorch-lightning -torch>=1.2.0 -transformers==3.0.2 +torch==1.6.0 tensorboardX test-tube==0.7.5 diff --git a/scripts/triviaqa.py b/scripts/triviaqa.py index 362dfa5..e5e488d 100644 --- a/scripts/triviaqa.py +++ b/scripts/triviaqa.py @@ -312,6 +312,7 @@ def load_model(self): elif 'bart' in self.args.model_path and 'large' in self.args.model_path: config = AutoConfig.from_pretrained(self.args.model_path) config.attention_dropout = 0.1 + config.gradient_checkpointing = True model = AutoModel.from_pretrained(self.args.model_path, config=config) else: model = AutoModel.from_pretrained(self.args.model_path) @@ -647,7 +648,7 @@ def configure_ddp(self, model, device_ids): model = LightningDistributedDataParallel( model, device_ids=device_ids, - find_unused_parameters=True + find_unused_parameters=False ) return model From 5a2b9da317c3407293b5069417a3556a5b15e28a Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Tue, 25 Aug 2020 10:40:24 -0700 Subject: [PATCH 084/112] sliding_chunks_no_overlap (#100) --- longformer/longformer.py | 27 ++++++++++++++++------ longformer/sliding_chunks.py | 43 ++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 7 deletions(-) diff --git a/longformer/longformer.py b/longformer/longformer.py index 953bd2c..e239c6e 100644 --- a/longformer/longformer.py +++ b/longformer/longformer.py @@ -5,6 +5,7 @@ import torch.nn.functional as F from longformer.diagonaled_mm_tvm import diagonaled_mm as diagonaled_mm_tvm, mask_invalid_locations from longformer.sliding_chunks import sliding_chunks_matmul_qk, sliding_chunks_matmul_pv +from longformer.sliding_chunks import sliding_chunks_no_overlap_matmul_qk, sliding_chunks_no_overlap_matmul_pv from transformers.modeling_roberta import RobertaConfig, RobertaModel, RobertaForMaskedLM @@ -48,7 +49,7 @@ def __init__(self, attention_window: List[int] = None, attention_dilation: List[ self.attention_dilation = attention_dilation self.autoregressive = autoregressive self.attention_mode = attention_mode - assert self.attention_mode in ['tvm', 
'sliding_chunks', 'n2'] + assert self.attention_mode in ['tvm', 'sliding_chunks', 'n2', 'sliding_chunks_no_overlap'] class LongformerSelfAttention(nn.Module): @@ -80,8 +81,8 @@ def __init__(self, config, layer_id): self.autoregressive = config.autoregressive assert self.attention_window > 0 assert self.attention_dilation > 0 - assert self.attention_mode in ['tvm', 'sliding_chunks'] - if self.attention_mode == 'sliding_chunks': + assert self.attention_mode in ['tvm', 'sliding_chunks', 'sliding_chunks_no_overlap'] + if self.attention_mode in ['sliding_chunks', 'sliding_chunks_no_overlap']: assert not self.autoregressive # not supported assert self.attention_dilation == 1 # dilation is not supported @@ -147,8 +148,12 @@ def forward( q = q.float().contiguous() k = k.float().contiguous() attn_weights = diagonaled_mm_tvm(q, k, self.attention_window, self.attention_dilation, False, 0, False) - else: # "sliding_chunks" + elif self.attention_mode == "sliding_chunks": attn_weights = sliding_chunks_matmul_qk(q, k, self.attention_window, padding_value=0) + elif self.attention_mode == "sliding_chunks_no_overlap": + attn_weights = sliding_chunks_no_overlap_matmul_qk(q, k, self.attention_window, padding_value=0) + else: + raise False mask_invalid_locations(attn_weights, self.attention_window, self.attention_dilation, False) if remove_from_windowed_attention_mask is not None: # This implementation is fast and takes very little memory because num_heads x hidden_size = 1 @@ -162,10 +167,14 @@ def forward( # diagonal mask with zeros everywhere and -inf inplace of padding if self.attention_mode == 'tvm': d_mask = diagonaled_mm_tvm(ones, float_mask, self.attention_window, self.attention_dilation, False, 0, False) - else: + elif self.attention_mode == "sliding_chunks": d_mask = sliding_chunks_matmul_qk(ones, float_mask, self.attention_window, padding_value=0) + elif self.attention_mode == "sliding_chunks_no_overlap": + d_mask = sliding_chunks_no_overlap_matmul_qk(ones, float_mask, self.attention_window, padding_value=0) + attn_weights += d_mask - assert list(attn_weights.size()) == [bsz, seq_len, self.num_heads, self.attention_window * 2 + 1] + assert list(attn_weights.size())[:3] == [bsz, seq_len, self.num_heads] + assert attn_weights.size(dim=3) in [self.attention_window * 2 + 1, self.attention_window * 3] # the extra attention if extra_attention_mask is not None: @@ -199,8 +208,12 @@ def forward( if self.attention_mode == 'tvm': v = v.float().contiguous() attn += diagonaled_mm_tvm(attn_probs, v, self.attention_window, self.attention_dilation, True, 0, False) - else: # "sliding_chunks" + elif self.attention_mode == "sliding_chunks": attn += sliding_chunks_matmul_pv(attn_probs, v, self.attention_window) + elif self.attention_mode == "sliding_chunks_no_overlap": + attn += sliding_chunks_no_overlap_matmul_pv(attn_probs, v, self.attention_window) + else: + raise False attn = attn.type_as(hidden_states) assert list(attn.size()) == [bsz, seq_len, self.num_heads, self.head_dim] diff --git a/longformer/sliding_chunks.py b/longformer/sliding_chunks.py index d39fe9b..4eed8d8 100644 --- a/longformer/sliding_chunks.py +++ b/longformer/sliding_chunks.py @@ -131,3 +131,46 @@ def pad_to_window_size(input_ids: torch.Tensor, attention_mask: torch.Tensor, input_ids = F.pad(input_ids, (0, padding_len), value=pad_token_id) attention_mask = F.pad(attention_mask, (0, padding_len), value=False) # no attention on the padding tokens return input_ids, attention_mask + + +# ========= "sliding_chunks_no_overlap": alternative 
implemenation of the sliding window attention ========= +# This implementation uses non-overlapping chunks (or blocks) of size `w` with number of local attention = 3xw +# To make this implemenation comparable to "sliding_chunks" set w such that +# w_of_sliding_chunks_no_overlap = w_of_sliding_chunks * 2 / 3 +# For example, +# w_of_sliding_chunks = 256 (this is one sided. Total attention size = 512) +# w_of_sliding_chunks_no_overlap = 170 (Total attention size = 510) +# Performance: +# - Speed: 30% faster than "sliding_chunks" +# - Memory: 95% of the memory usage of "sliding_chunks" +# The windows are asymmetric where number of attention on each side of a token ranges between w to 2w +# while "sliding_chunks" has a symmetric window around each token. +# This implementation is roughly similar to the implementation described in the BigBird paper https://arxiv.org/abs/2007.14062 + +def sliding_chunks_no_overlap_matmul_qk(q: torch.Tensor, k: torch.Tensor, w: int, padding_value: float): + bsz, seqlen, num_heads, head_dim = q.size() + assert seqlen % w == 0 + assert q.size() == k.size() + # chunk seqlen into non-overlapping chunks of size w + chunk_q = q.view(bsz, seqlen // w, w, num_heads, head_dim) + chunk_k = k.view(bsz, seqlen // w, w, num_heads, head_dim) + chunk_k_expanded = torch.stack(( + F.pad(chunk_k[:, :-1], (0, 0, 0, 0, 0, 0, 1, 0), value=0.0), + chunk_k, + F.pad(chunk_k[:, 1:], (0, 0, 0, 0, 0, 0, 0, 1), value=0.0), + ), dim=-1) + diagonal_attn = torch.einsum('bcxhd,bcyhde->bcxhey', (chunk_q, chunk_k_expanded)) # multiply + return diagonal_attn.reshape(bsz, seqlen, num_heads, 3 * w) + + +def sliding_chunks_no_overlap_matmul_pv(prob: torch.Tensor, v: torch.Tensor, w: int): + bsz, seqlen, num_heads, head_dim = v.size() + chunk_prob = prob.view(bsz, seqlen // w, w, num_heads, 3, w) + chunk_v = v.view(bsz, seqlen // w, w, num_heads, head_dim) + chunk_v_extended = torch.stack(( + F.pad(chunk_v[:, :-1], (0, 0, 0, 0, 0, 0, 1, 0), value=0.0), + chunk_v, + F.pad(chunk_v[:, 1:], (0, 0, 0, 0, 0, 0, 0, 1), value=0.0), + ), dim=-1) + context = torch.einsum('bcwhpd,bcdhep->bcwhe', (chunk_prob, chunk_v_extended)) + return context.reshape(bsz, seqlen, num_heads, head_dim) From b15607b5ec9f3decbb93bafa3f310d3f3cdd8c53 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 26 Aug 2020 16:17:19 -0700 Subject: [PATCH 085/112] seq2seq --- longformer_on_beaker.sh | 2 +- scripts/triviaqa.py | 62 ++++++++++++++++++++++++++++------------- 2 files changed, 44 insertions(+), 20 deletions(-) diff --git a/longformer_on_beaker.sh b/longformer_on_beaker.sh index 425dcef..bedf8d9 100755 --- a/longformer_on_beaker.sh +++ b/longformer_on_beaker.sh @@ -7,7 +7,7 @@ export RESULT_SAVE_PREFIX="test" export ARGS="$@" export GPU_COUNT=8 export CPU_COUNT=32 -export CLUSTER="ai2/on-prem-ai2-server2" +export CLUSTER="ai2/on-prem-ai2-server3" export RESULT_PATH=$RESULT_SAVE_DIR/$RESULT_SAVE_PREFIX beaker experiment create -f experiment.yml diff --git a/scripts/triviaqa.py b/scripts/triviaqa.py index e5e488d..9dbb6bc 100644 --- a/scripts/triviaqa.py +++ b/scripts/triviaqa.py @@ -9,7 +9,7 @@ from torch.optim.lr_scheduler import LambdaLR from torch.utils.data import DataLoader, Dataset -from transformers import RobertaTokenizer, AutoModel, AutoConfig +from transformers import RobertaTokenizer, AutoModel, AutoConfig, AutoModelWithLMHead from scripts.triviaqa_utils import evaluation_utils import pytorch_lightning as pl @@ -308,12 +308,18 @@ def load_model(self): config.encoder_attention_heads = 12 config.decoder_attention_heads = 12 
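As an aside on PATCH 084 above: the `sliding_chunks_no_overlap` kernels split the sequence into non-overlapping blocks of size `w`, and every query scores against 3*w keys (its own block plus the blocks before and after it). A minimal shape-check sketch of how the two functions compose, assuming random inputs and ignoring the masking of invalid edge positions that `LongformerSelfAttention` adds on top:

import torch
import torch.nn.functional as F
from longformer.sliding_chunks import (
    sliding_chunks_no_overlap_matmul_qk, sliding_chunks_no_overlap_matmul_pv)

bsz, seqlen, num_heads, head_dim, w = 2, 1020, 12, 64, 170   # seqlen must be a multiple of w
q = torch.randn(bsz, seqlen, num_heads, head_dim)
k = torch.randn(bsz, seqlen, num_heads, head_dim)
v = torch.randn(bsz, seqlen, num_heads, head_dim)

# attention scores: one row of 3*w scores per token (previous block, own block, next block)
attn_weights = sliding_chunks_no_overlap_matmul_qk(q / (head_dim ** 0.5), k, w, padding_value=0)
assert attn_weights.shape == (bsz, seqlen, num_heads, 3 * w)

# weighted sum of values with the same block layout
attn_probs = F.softmax(attn_weights, dim=-1)
context = sliding_chunks_no_overlap_matmul_pv(attn_probs, v, w)
assert context.shape == (bsz, seqlen, num_heads, head_dim)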
config.attention_dropout = 0.1 - model = AutoModel.from_pretrained(self.args.model_path, config=config) + if self.args.seq2seq: + model = AutoModelWithLMHead.from_pretrained(self.args.model_path, config=config) + else: + model = AutoModel.from_pretrained(self.args.model_path, config=config) elif 'bart' in self.args.model_path and 'large' in self.args.model_path: config = AutoConfig.from_pretrained(self.args.model_path) config.attention_dropout = 0.1 config.gradient_checkpointing = True - model = AutoModel.from_pretrained(self.args.model_path, config=config) + if self.args.seq2seq: + model = AutoModelWithLMHead.from_pretrained(self.args.model_path, config=config) + else: + model = AutoModel.from_pretrained(self.args.model_path, config=config) else: model = AutoModel.from_pretrained(self.args.model_path) @@ -353,9 +359,20 @@ def forward(self, input_ids, attention_mask, segment_ids, start_positions, end_p elif self.args.model_path in ['bart.large', 'bart.base']: sequence_output = self.model.extract_features(input_ids) else: - sequence_output = self.model( - input_ids, - attention_mask=attention_mask)[0] + if self.args.seq2seq: + decoder_input_ids = answer_token_ids[:, 0, :-1].clone() + decoder_input_ids[decoder_input_ids == self.tokenizer.eos_token_id] = self.tokenizer.pad_token_id + decoder_attention_mask = (decoder_input_ids != self.tokenizer.pad_token_id) + labels = answer_token_ids[:, 0, 1:].contiguous() + loss = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + labels=labels)[0] + return [loss] + else: + sequence_output = self.model(input_ids, attention_mask=attention_mask)[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) @@ -436,6 +453,8 @@ def training_step(self, batch, batch_nb): def validation_step(self, batch, batch_nb): input_ids, input_mask, segment_ids, subword_starts, subword_ends, answer_token_ids, qids, aliases = batch output = self.forward(input_ids, input_mask, segment_ids, subword_starts, subword_ends, answer_token_ids) + if self.args.seq2seq: + return {'vloss': output[0]} loss, start_logits, end_logits = output[:3] answers = self.decode(input_ids, start_logits, end_logits) @@ -517,23 +536,28 @@ def sync_list_across_gpus(self, list_to_sync, device, dtype): def validation_end(self, outputs): avg_loss = torch.stack([x['vloss'] for x in outputs]).mean() - avg_em = torch.stack([x['vem'] for x in outputs]).mean() - string_qids = [item for sublist in outputs for item in sublist['qids']] - int_qids = [self.val_dataloader_object.dataset.val_qid_string_to_int_map[qid] for qid in string_qids] - answer_scores = [item for sublist in outputs for item in sublist['answer_scores']] - f1_scores = [item for sublist in outputs for item in sublist['f1']] - em_scores = [item for sublist in outputs for item in sublist['em']] - print(f'before sync --> sizes: {len(int_qids)}, {len(answer_scores)}, {len(f1_scores)}, {len(em_scores)}') + if not self.args.seq2seq: + avg_em = torch.stack([x['vem'] for x in outputs]).mean() + string_qids = [item for sublist in outputs for item in sublist['qids']] + int_qids = [self.val_dataloader_object.dataset.val_qid_string_to_int_map[qid] for qid in string_qids] + answer_scores = [item for sublist in outputs for item in sublist['answer_scores']] + f1_scores = [item for sublist in outputs for item in sublist['f1']] + em_scores = [item for sublist in outputs for item in sublist['em']] + print(f'before sync --> sizes: 
{len(int_qids)}, {len(answer_scores)}, {len(f1_scores)}, {len(em_scores)}') if self.trainer.use_ddp: torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM) avg_loss /= self.trainer.world_size - torch.distributed.all_reduce(avg_em, op=torch.distributed.ReduceOp.SUM) - avg_em /= self.trainer.world_size + if not self.args.seq2seq: + torch.distributed.all_reduce(avg_em, op=torch.distributed.ReduceOp.SUM) + avg_em /= self.trainer.world_size + + int_qids = self.sync_list_across_gpus(int_qids, avg_loss.device, torch.int) + answer_scores = self.sync_list_across_gpus(answer_scores, avg_loss.device, torch.float) + f1_scores = self.sync_list_across_gpus(f1_scores, avg_loss.device, torch.float) + em_scores = self.sync_list_across_gpus(em_scores, avg_loss.device, torch.int) + if self.args.seq2seq: + return {'avg_val_loss': avg_loss, 'log': {'val_loss': avg_loss}, 'progress_bar': {'val_loss': avg_loss}} - int_qids = self.sync_list_across_gpus(int_qids, avg_loss.device, torch.int) - answer_scores = self.sync_list_across_gpus(answer_scores, avg_loss.device, torch.float) - f1_scores = self.sync_list_across_gpus(f1_scores, avg_loss.device, torch.float) - em_scores = self.sync_list_across_gpus(em_scores, avg_loss.device, torch.int) print(f'after sync --> sizes: {len(int_qids)}, {len(answer_scores)}, {len(f1_scores)}, {len(em_scores)}') # Because of having multiple documents per questions, some questions might have multiple corresponding answers From 82741c3c9ec4107a42d5981eac2ca98cc605a136 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 27 Aug 2020 21:06:12 +0000 Subject: [PATCH 086/112] wip --- scripts/triviaqa.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/triviaqa.py b/scripts/triviaqa.py index 9dbb6bc..8b6c9a7 100644 --- a/scripts/triviaqa.py +++ b/scripts/triviaqa.py @@ -290,7 +290,8 @@ def __init__(self, args): self.tokenizer.model_max_length = self.args.max_seq_len self.model = self.load_model() self.num_labels = 2 - self.qa_outputs = torch.nn.Linear(self.model.config.hidden_size, self.num_labels) + if not self.args.seq2seq: + self.qa_outputs = torch.nn.Linear(self.model.config.hidden_size, self.num_labels) self.train_dataloader_object = self.val_dataloader_object = self.test_dataloader_object = None def load_model(self): @@ -364,6 +365,7 @@ def forward(self, input_ids, attention_mask, segment_ids, start_positions, end_p decoder_input_ids[decoder_input_ids == self.tokenizer.eos_token_id] = self.tokenizer.pad_token_id decoder_attention_mask = (decoder_input_ids != self.tokenizer.pad_token_id) labels = answer_token_ids[:, 0, 1:].contiguous() + labels[answer_token_ids[:, 0, 1:] == self.tokenizer.pad_token_id] = -100 loss = self.model( input_ids, attention_mask=attention_mask, From 5c3a22ac08b5da4c5ce933d18d770061bdfa584b Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 27 Aug 2020 14:26:35 -0700 Subject: [PATCH 087/112] wip --- scripts/triviaqa.py | 62 ++++++++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 23 deletions(-) diff --git a/scripts/triviaqa.py b/scripts/triviaqa.py index 8b6c9a7..8a55462 100644 --- a/scripts/triviaqa.py +++ b/scripts/triviaqa.py @@ -366,13 +366,15 @@ def forward(self, input_ids, attention_mask, segment_ids, start_positions, end_p decoder_attention_mask = (decoder_input_ids != self.tokenizer.pad_token_id) labels = answer_token_ids[:, 0, 1:].contiguous() labels[answer_token_ids[:, 0, 1:] == self.tokenizer.pad_token_id] = -100 - loss = self.model( + outputs = self.model( input_ids, 
attention_mask=attention_mask, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, - labels=labels)[0] - return [loss] + labels=labels) + loss = outputs[0] + logit_scores = outputs[1].softmax(dim=2)[:, :, 0].sum(dim=1) + return [loss, logit_scores] else: sequence_output = self.model(input_ids, attention_mask=attention_mask)[0] @@ -456,7 +458,23 @@ def validation_step(self, batch, batch_nb): input_ids, input_mask, segment_ids, subword_starts, subword_ends, answer_token_ids, qids, aliases = batch output = self.forward(input_ids, input_mask, segment_ids, subword_starts, subword_ends, answer_token_ids) if self.args.seq2seq: - return {'vloss': output[0]} + logit_scores = output[1] + best_answer_score = logit_scores.max() + best_answer_index = logit_scores.argmax().item() + generated_ids = self.model.generate(input_ids=input_ids[best_answer_index:best_answer_index + 1], + attention_mask=input_mask[best_answer_index:best_answer_index + 1], + use_cache=True,) + generated_answer_ids = generated_ids[0] + generated_answer_ids[-1] = self.tokenizer.eos_token_id + index_of_eos_token = (generated_answer_ids == self.tokenizer.eos_token_id).nonzero()[0, 0].item() + generated_answer_ids = generated_answer_ids[1:index_of_eos_token] + answer_text = self.tokenizer.decode(generated_answer_ids) + f1_score = evaluation_utils.metric_max_over_ground_truths(evaluation_utils.f1_score, answer_text, aliases) + em_score = evaluation_utils.metric_max_over_ground_truths(evaluation_utils.exact_match_score, answer_text, aliases) + return {'vloss': output[0], 'vem': generated_answer_ids.new_zeros([1]).float(), + 'qids': [qids], 'answer_scores': [best_answer_score], + 'f1': [f1_score], 'em': [em_score]} + loss, start_logits, end_logits = output[:3] answers = self.decode(input_ids, start_logits, end_logits) @@ -538,28 +556,23 @@ def sync_list_across_gpus(self, list_to_sync, device, dtype): def validation_end(self, outputs): avg_loss = torch.stack([x['vloss'] for x in outputs]).mean() - if not self.args.seq2seq: - avg_em = torch.stack([x['vem'] for x in outputs]).mean() - string_qids = [item for sublist in outputs for item in sublist['qids']] - int_qids = [self.val_dataloader_object.dataset.val_qid_string_to_int_map[qid] for qid in string_qids] - answer_scores = [item for sublist in outputs for item in sublist['answer_scores']] - f1_scores = [item for sublist in outputs for item in sublist['f1']] - em_scores = [item for sublist in outputs for item in sublist['em']] - print(f'before sync --> sizes: {len(int_qids)}, {len(answer_scores)}, {len(f1_scores)}, {len(em_scores)}') + avg_em = torch.stack([x['vem'] for x in outputs]).mean() + string_qids = [item for sublist in outputs for item in sublist['qids']] + int_qids = [self.val_dataloader_object.dataset.val_qid_string_to_int_map[qid] for qid in string_qids] + answer_scores = [item for sublist in outputs for item in sublist['answer_scores']] + f1_scores = [item for sublist in outputs for item in sublist['f1']] + em_scores = [item for sublist in outputs for item in sublist['em']] + print(f'before sync --> sizes: {len(int_qids)}, {len(answer_scores)}, {len(f1_scores)}, {len(em_scores)}') if self.trainer.use_ddp: torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM) avg_loss /= self.trainer.world_size - if not self.args.seq2seq: - torch.distributed.all_reduce(avg_em, op=torch.distributed.ReduceOp.SUM) - avg_em /= self.trainer.world_size - - int_qids = self.sync_list_across_gpus(int_qids, avg_loss.device, torch.int) - answer_scores = 
self.sync_list_across_gpus(answer_scores, avg_loss.device, torch.float) - f1_scores = self.sync_list_across_gpus(f1_scores, avg_loss.device, torch.float) - em_scores = self.sync_list_across_gpus(em_scores, avg_loss.device, torch.int) - if self.args.seq2seq: - return {'avg_val_loss': avg_loss, 'log': {'val_loss': avg_loss}, 'progress_bar': {'val_loss': avg_loss}} + torch.distributed.all_reduce(avg_em, op=torch.distributed.ReduceOp.SUM) + avg_em /= self.trainer.world_size + int_qids = self.sync_list_across_gpus(int_qids, avg_loss.device, torch.int) + answer_scores = self.sync_list_across_gpus(answer_scores, avg_loss.device, torch.float) + f1_scores = self.sync_list_across_gpus(f1_scores, avg_loss.device, torch.float) + em_scores = self.sync_list_across_gpus(em_scores, avg_loss.device, torch.int) print(f'after sync --> sizes: {len(int_qids)}, {len(answer_scores)}, {len(f1_scores)}, {len(em_scores)}') # Because of having multiple documents per questions, some questions might have multiple corresponding answers @@ -583,6 +596,9 @@ def validation_end(self, outputs): def test_step(self, batch, batch_nb): input_ids, input_mask, segment_ids, subword_starts, subword_ends, answer_token_ids, qids, aliases = batch output = self.forward(input_ids, input_mask, segment_ids, subword_starts, subword_ends, answer_token_ids) + if self.args.seq2seq: + raise NotImplemented + loss, start_logits, end_logits = output[:3] answers = self.decode(input_ids, start_logits, end_logits) @@ -689,7 +705,7 @@ def add_model_specific_args(parser, root_dir): help="Number of gpus. 0 for CPU") parser.add_argument("--warmup", type=int, default=200, help="Number of warmup steps") parser.add_argument("--lr", type=float, default=0.0001, help="Maximum learning rate") - parser.add_argument("--val_every", type=float, default=0.2, help="Number of training steps between validations") + parser.add_argument("--val_every", type=float, default=0.5, help="Number of training steps between validations") parser.add_argument("--val_percent_check", default=1.00, type=float, help='Percent of validation data used') parser.add_argument("--num_workers", type=int, default=4, help="Number of data loader workers") parser.add_argument("--seed", type=int, default=1234, help="Seed") From 75aeb4764f661299e5aed05c151132b9ff167db3 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 28 Aug 2020 04:19:06 +0000 Subject: [PATCH 088/112] wip --- scripts/triviaqa.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/scripts/triviaqa.py b/scripts/triviaqa.py index 8a55462..bba1e5f 100644 --- a/scripts/triviaqa.py +++ b/scripts/triviaqa.py @@ -737,6 +737,8 @@ def add_model_specific_args(parser, root_dir): default='sliding_chunks', help='Which implementation of selfattention to use') parser.add_argument("--fp32", action='store_true', help="default is fp16. Use --fp32 to switch to fp32") parser.add_argument("--seq2seq", action='store_true', help="Use an answer generation model") + parser.add_argument("--resume_ckpt", type=str, help="Path of a checkpoint to resume from") + return parser @@ -760,14 +762,16 @@ def main(args): filepath=os.path.join(args.save_dir, args.save_prefix, "checkpoints"), save_top_k=5, verbose=True, - monitor='avg_val_f1', - mode='max', + monitor='avg_val_loss', + # save_last=True, + mode='min', + period=-1, prefix='' ) print(args) train_set_size = 110648 # hardcode dataset size. 
Needed to compute number of steps for the lr scheduler - args.steps = args.epochs * train_set_size / (args.batch_size * args.gpus) + args.steps = args.epochs * train_set_size / (args.batch_size * max(args.gpus, 1)) print(f'>>>>>>> #steps: {args.steps}, #epochs: {args.epochs}, batch_size: {args.batch_size * args.gpus} <<<<<<<') trainer = pl.Trainer(gpus=args.gpus, distributed_backend='ddp' if args.gpus and args.gpus > 1 else None, @@ -775,6 +779,7 @@ def main(args): replace_sampler_ddp=False, accumulate_grad_batches=args.batch_size, val_check_interval=args.val_every, + num_sanity_val_steps=2, # check_val_every_n_epoch=2, val_percent_check=args.val_percent_check, test_percent_check=args.val_percent_check, @@ -782,6 +787,7 @@ def main(args): checkpoint_callback=checkpoint_callback if not args.disable_checkpointing else False, show_progress_bar=not args.no_progress_bar, use_amp=not args.fp32, amp_level='O2', + resume_from_checkpoint=args.resume_ckpt, ) if not args.test: trainer.fit(model) From 391d8de413b094ae3a0b1a5a3935e91cb3c89bed Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Tue, 1 Sep 2020 16:16:08 -0700 Subject: [PATCH 089/112] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5631783..89816c9 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ **\*\*\*\*\* Work In Progress: LongformerEncoderDecoder \*\*\*\*\*** -A `LongformerEncoderDecoder` model is now available. It is geared towards summarization where the input is long but the output is relatively shorter. The following code snippet loads a `LongformerEncoderDecoder` checkpointing started from `BartLarge`. With gradient checkpointing, fp16, and 48GB gpu, the input length be up to 12K. +A `LongformerEncoderDecoder` model is now available. It is geared towards summarization where the input is long but the output is relatively shorter. The following code snippet loads a `LongformerEncoderDecoder` checkpointing started from `BartLarge`. With gradient checkpointing, fp16, and 48GB gpu, the input length be up to 16K. 
``` pip install git+https://github.com/allenai/longformer.git@encoderdecoder From bf9e58a447f3fe8c19513d20b043201fcd24acbc Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 2 Sep 2020 16:50:35 -0700 Subject: [PATCH 090/112] summarization --- scripts/summarization.py | 247 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 247 insertions(+) create mode 100644 scripts/summarization.py diff --git a/scripts/summarization.py b/scripts/summarization.py new file mode 100644 index 0000000..37d4a14 --- /dev/null +++ b/scripts/summarization.py @@ -0,0 +1,247 @@ +import os +import argparse +import random +import numpy as np + +import torch +from torch.utils.data import DataLoader, Dataset +from transformers import AutoTokenizer, AutoModelForSeq2SeqLM +from transformers.optimization import get_linear_schedule_with_warmup +import nlp + +import pytorch_lightning as pl +from pytorch_lightning.logging import TestTubeLogger +from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel + + +class SummarizationDataset(Dataset): + def __init__(self, hf_dataset, tokenizer, max_output_len): + self.hf_dataset = hf_dataset + self.tokenizer = tokenizer + self.max_output_len = max_output_len + + def __len__(self): + return len(self.hf_dataset) + + def __getitem__(self, idx): + entry = self.hf_dataset[idx] + input_ids = self.tokenizer.encode(entry['article'], truncation=True) + output_ids = self.tokenizer.encode(entry['abstract'], truncation=True, max_length=self.max_output_len) + return torch.tensor(input_ids), torch.tensor(output_ids) + + @staticmethod + def collate_fn(batch): + pad_token_id = 1 # AutoTokenizer.from_pretrained('facebook/bart-base').pad_token_id + input_ids, output_ids = list(zip(*batch)) + input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=pad_token_id) + output_ids = torch.nn.utils.rnn.pad_sequence(output_ids, batch_first=True, padding_value=pad_token_id) + return input_ids, output_ids + + +class Summarizer(pl.LightningModule): + + def __init__(self, args): + super().__init__() + self.args = args + self.hparams = args + self.tokenizer = AutoTokenizer.from_pretrained(self.args.tokenizer, use_fast=True) + self.model = AutoModelForSeq2SeqLM.from_pretrained(self.args.model_path) + self.train_dataloader_object = self.val_dataloader_object = self.test_dataloader_object = None + self.rouge = None + + def forward(self, input_ids, output_ids): + attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) + attention_mask[input_ids == self.tokenizer.pad_token_id] = 0 + decoder_input_ids = output_ids[:, :-1] + decoder_attention_mask = (decoder_input_ids != self.tokenizer.pad_token_id) + labels = output_ids[:, 1:].clone() + labels[labels == self.tokenizer.pad_token_id] = -100 + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + labels=labels) + return outputs + + def training_step(self, batch, batch_nb): + output = self.forward(*batch) + loss = output[0] + lr = loss.new_zeros(1) + self.trainer.optimizers[0].param_groups[0]['lr'] + tensorboard_logs = {'train_loss': loss, 'lr': lr, + 'input_size': batch[0].numel(), + 'output_size': batch[1].numel(), + 'mem': torch.cuda.memory_allocated(loss.device) / 1024 ** 3} + return {'loss': loss, 'log': tensorboard_logs} + + def validation_step(self, batch, batch_nb): + outputs = self.forward(*batch) + vloss = 
outputs[0] + input_ids, output_ids = batch + attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) + attention_mask[input_ids == self.tokenizer.pad_token_id] = 0 + generated_ids = self.model.generate(input_ids=input_ids, + attention_mask=attention_mask, + use_cache=True, + max_length=self.args.max_output_len) + generated_str = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + gold_str = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True) + if self.rouge is None: + self.rouge = nlp.load_metric("rouge") + rouge_scores = self.rouge.compute(predictions=generated_str, references=gold_str, rouge_types=['rouge2', 'rouge1', 'rougeL']) + return {'vloss': vloss, + 'rouge1': vloss.new_zeros(1) + rouge_scores['rouge1'].mid.fmeasure, + 'rouge2': vloss.new_zeros(1) + rouge_scores['rouge2'].mid.fmeasure, + 'rougeL': vloss.new_zeros(1) + rouge_scores['rougeL'].mid.fmeasure} + + def validation_epoch_end(self, outputs): + names = ['vloss', 'rouge1', 'rouge2', 'rougeL'] + metrics = [] + for name in names: + metric = torch.stack([x[name] for x in outputs]).mean() + if self.trainer.use_ddp: + torch.distributed.all_reduce(metric, op=torch.distributed.ReduceOp.SUM) + metric /= self.trainer.world_size + metrics.append(metric) + logs = dict(zip(*[names, metrics])) + return {'avg_val_loss': logs['vloss'], 'log': logs, 'progress_bar': logs} + + def test_step(self, batch, batch_nb): + raise NotImplementedError + + def configure_optimizers(self): + optimizer = torch.optim.Adam(self.model.parameters(), lr=self.args.lr) + if self.args.debug: + return optimizer # const LR + num_gpus = torch.cuda.device_count() + num_steps = self.args.dataset_size * self.args.epochs / num_gpus / self.args.grad_accum + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=self.args.warmup, num_training_steps=num_steps + ) + return [optimizer], [{"scheduler": scheduler, "interval": "step"}] + + def _get_dataloader(self, current_dataloader, hf_dataset, is_train): + if current_dataloader is not None: + return current_dataloader + dataset = SummarizationDataset(hf_dataset=hf_dataset, tokenizer=self.tokenizer, max_output_len=self.args.max_output_len) + sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=is_train) if self.trainer.use_ddp else None + return DataLoader(dataset, batch_size=self.args.batch_size, shuffle=(sampler is None), + num_workers=self.args.num_workers, sampler=sampler, + collate_fn=SummarizationDataset.collate_fn) + + @pl.data_loader + def train_dataloader(self): + if self.hf_datasets is None: + self.hf_datasets = nlp.load_dataset('scientific_papers', 'arxiv') + self.train_dataloader_object = self._get_dataloader(self.train_dataloader_object, self.hf_datasets['train'], is_train=True) + return self.train_dataloader_object + + @pl.data_loader + def val_dataloader(self): + if self.hf_datasets is None: + self.hf_datasets = nlp.load_dataset('scientific_papers', 'arxiv') + dataset_split = 'validation' if not self.args.debug else 'train' + self.val_dataloader_object = self._get_dataloader(self.val_dataloader_object, self.hf_datasets[dataset_split], is_train=False) + return self.val_dataloader_object + + @pl.data_loader + def test_dataloader(self): + if self.hf_datasets is None: + self.hf_datasets = nlp.load_dataset('scientific_papers', 'arxiv') + self.test_dataloader_object = self._get_dataloader(self.test_dataloader_object, self.hf_datasets['test'], is_train=False) + return self.test_dataloader_object + + def 
configure_ddp(self, model, device_ids): + model = LightningDistributedDataParallel( + model, + device_ids=device_ids, + find_unused_parameters=False + ) + return model + + @staticmethod + def add_model_specific_args(parser, root_dir): + parser.add_argument("--save_dir", type=str, default='summarization') + parser.add_argument("--save_prefix", type=str, default='test') + parser.add_argument("--batch_size", type=int, default=16, help="Batch size") + parser.add_argument("--grad_accum", type=int, default=1, help="number of gradient accumulation steps") + parser.add_argument("--gpus", type=int, default=-1, + help="Number of gpus. 0 for CPU") + parser.add_argument("--warmup", type=int, default=1000, help="Number of warmup steps") + parser.add_argument("--lr", type=float, default=0.00003, help="Maximum learning rate") + parser.add_argument("--val_every", type=float, default=1.0, help="Number of training steps between validations") + parser.add_argument("--val_percent_check", default=1.00, type=float, help='Percent of validation data used') + parser.add_argument("--num_workers", type=int, default=0, help="Number of data loader workers") + parser.add_argument("--seed", type=int, default=1234, help="Seed") + parser.add_argument("--epochs", type=int, default=5, help="Number of epochs") + parser.add_argument("--disable_checkpointing", action='store_true', help="No logging or checkpointing") + parser.add_argument("--max_output_len", type=int, default=256, + help="maximum num of wordpieces/summary. Used for training and testing") + parser.add_argument("--test", action='store_true', help="Test only, no training") + parser.add_argument("--model_path", type=str, default='facebook/bart-base', + help="Path to the checkpoint directory or model name") + parser.add_argument("--tokenizer", type=str, default='facebook/bart-base') + parser.add_argument("--no_progress_bar", action='store_true', help="no progress bar. Good for printing") + parser.add_argument("--fp32", action='store_true', help="default is fp16. Use --fp32 to switch to fp32") + parser.add_argument("--debug", action='store_true', help="debug run") + parser.add_argument("--resume_ckpt", type=str, help="Path of a checkpoint to resume from") + + return parser + + +def main(args): + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(args.seed) + + model = Summarizer(args) + + logger = TestTubeLogger( + save_dir=args.save_dir, + name=args.save_prefix, + version=0 # always use version=0 + ) + + checkpoint_callback = ModelCheckpoint( + filepath=os.path.join(args.save_dir, args.save_prefix, "checkpoints"), + save_top_k=5, + verbose=True, + monitor='avg_val_loss', + mode='min', + period=-1, + prefix='' + ) + + print(args) + model.hf_datasets = nlp.load_dataset('scientific_papers', 'arxiv') + args.dataset_size = 203037 # hardcode dataset size. 
Needed to compute number of steps for the lr scheduler + + trainer = pl.Trainer(gpus=args.gpus, distributed_backend='ddp' if args.gpus and args.gpus > 1 else None, + track_grad_norm=-1, + max_epochs=args.epochs if not args.debug else 100, + replace_sampler_ddp=False, + accumulate_grad_batches=args.grad_accum, + val_check_interval=args.val_every, + num_sanity_val_steps=2, + check_val_every_n_epoch=1 if not args.debug else 5, + val_percent_check=args.val_percent_check, + test_percent_check=args.val_percent_check, + logger=logger, + checkpoint_callback=checkpoint_callback if not args.disable_checkpointing else False, + show_progress_bar=not args.no_progress_bar, + use_amp=not args.fp32, amp_level='O2', + resume_from_checkpoint=args.resume_ckpt, + ) + if not args.test: + trainer.fit(model) + trainer.test(model) + + +if __name__ == "__main__": + main_arg_parser = argparse.ArgumentParser(description="summarization") + parser = Summarizer.add_model_specific_args(main_arg_parser, os.getcwd()) + args = parser.parse_args() + main(args) From 01581253b921045077a330650412d9925332d0a8 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 2 Sep 2020 17:07:07 -0700 Subject: [PATCH 091/112] fix loading data --- scripts/summarization.py | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/scripts/summarization.py b/scripts/summarization.py index 37d4a14..6bcf956 100644 --- a/scripts/summarization.py +++ b/scripts/summarization.py @@ -121,10 +121,10 @@ def configure_optimizers(self): ) return [optimizer], [{"scheduler": scheduler, "interval": "step"}] - def _get_dataloader(self, current_dataloader, hf_dataset, is_train): + def _get_dataloader(self, current_dataloader, split_name, is_train): if current_dataloader is not None: return current_dataloader - dataset = SummarizationDataset(hf_dataset=hf_dataset, tokenizer=self.tokenizer, max_output_len=self.args.max_output_len) + dataset = SummarizationDataset(hf_dataset=self.hf_datasets[split_name], tokenizer=self.tokenizer, max_output_len=self.args.max_output_len) sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=is_train) if self.trainer.use_ddp else None return DataLoader(dataset, batch_size=self.args.batch_size, shuffle=(sampler is None), num_workers=self.args.num_workers, sampler=sampler, @@ -132,24 +132,18 @@ def _get_dataloader(self, current_dataloader, hf_dataset, is_train): @pl.data_loader def train_dataloader(self): - if self.hf_datasets is None: - self.hf_datasets = nlp.load_dataset('scientific_papers', 'arxiv') - self.train_dataloader_object = self._get_dataloader(self.train_dataloader_object, self.hf_datasets['train'], is_train=True) + self.train_dataloader_object = self._get_dataloader(self.train_dataloader_object, 'train', is_train=True) return self.train_dataloader_object @pl.data_loader def val_dataloader(self): - if self.hf_datasets is None: - self.hf_datasets = nlp.load_dataset('scientific_papers', 'arxiv') - dataset_split = 'validation' if not self.args.debug else 'train' - self.val_dataloader_object = self._get_dataloader(self.val_dataloader_object, self.hf_datasets[dataset_split], is_train=False) + split_name = 'validation' if not self.args.debug else 'train' + self.val_dataloader_object = self._get_dataloader(self.val_dataloader_object, split_name, is_train=False) return self.val_dataloader_object @pl.data_loader def test_dataloader(self): - if self.hf_datasets is None: - self.hf_datasets = nlp.load_dataset('scientific_papers', 'arxiv') - self.test_dataloader_object = 
self._get_dataloader(self.test_dataloader_object, self.hf_datasets['test'], is_train=False) + self.test_dataloader_object = self._get_dataloader(self.test_dataloader_object, 'test', is_train=False) return self.test_dataloader_object def configure_ddp(self, model, device_ids): @@ -198,6 +192,7 @@ def main(args): torch.cuda.manual_seed_all(args.seed) model = Summarizer(args) + model.hf_datasets = nlp.load_dataset('scientific_papers', 'arxiv') logger = TestTubeLogger( save_dir=args.save_dir, @@ -216,10 +211,10 @@ def main(args): ) print(args) - model.hf_datasets = nlp.load_dataset('scientific_papers', 'arxiv') + args.dataset_size = 203037 # hardcode dataset size. Needed to compute number of steps for the lr scheduler - trainer = pl.Trainer(gpus=args.gpus, distributed_backend='ddp' if args.gpus and args.gpus > 1 else None, + trainer = pl.Trainer(gpus=args.gpus, distributed_backend='ddp', track_grad_norm=-1, max_epochs=args.epochs if not args.debug else 100, replace_sampler_ddp=False, From 9fdef528053364fcd6316b7bfbf7fcb8654d84a1 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 3 Sep 2020 07:35:23 -0700 Subject: [PATCH 092/112] wip --- scripts/summarization.py | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/scripts/summarization.py b/scripts/summarization.py index 6bcf956..f9e3b30 100644 --- a/scripts/summarization.py +++ b/scripts/summarization.py @@ -14,6 +14,8 @@ from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel +from rouge_score import rouge_scorer + class SummarizationDataset(Dataset): def __init__(self, hf_dataset, tokenizer, max_output_len): @@ -26,7 +28,7 @@ def __len__(self): def __getitem__(self, idx): entry = self.hf_dataset[idx] - input_ids = self.tokenizer.encode(entry['article'], truncation=True) + input_ids = self.tokenizer.encode(entry['article'], truncation=True, max_length=self.max_input_len) output_ids = self.tokenizer.encode(entry['abstract'], truncation=True, max_length=self.max_output_len) return torch.tensor(input_ids), torch.tensor(output_ids) @@ -48,7 +50,6 @@ def __init__(self, args): self.tokenizer = AutoTokenizer.from_pretrained(self.args.tokenizer, use_fast=True) self.model = AutoModelForSeq2SeqLM.from_pretrained(self.args.model_path) self.train_dataloader_object = self.val_dataloader_object = self.test_dataloader_object = None - self.rouge = None def forward(self, input_ids, output_ids): attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) @@ -87,13 +88,21 @@ def validation_step(self, batch, batch_nb): max_length=self.args.max_output_len) generated_str = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True) gold_str = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True) - if self.rouge is None: - self.rouge = nlp.load_metric("rouge") - rouge_scores = self.rouge.compute(predictions=generated_str, references=gold_str, rouge_types=['rouge2', 'rouge1', 'rougeL']) + scorer = rouge_scorer.RougeScorer(rouge_types=['rouge1', 'rouge2', 'rougeL'], use_stemmer=False) + rouge1 = rouge2 = rougel = 0.0 + for ref, pred in zip(gold_str, generated_str): + score = scorer.score(ref, pred) + rouge1 += score['rouge1'].fmeasure + rouge2 += score['rouge2'].fmeasure + rougel += score['rougeL'].fmeasure + rouge1 /= len(generated_str) + rouge2 /= len(generated_str) + rougel /= len(generated_str) + return {'vloss': vloss, - 'rouge1': vloss.new_zeros(1) + 
rouge_scores['rouge1'].mid.fmeasure, - 'rouge2': vloss.new_zeros(1) + rouge_scores['rouge2'].mid.fmeasure, - 'rougeL': vloss.new_zeros(1) + rouge_scores['rougeL'].mid.fmeasure} + 'rouge1': vloss.new_zeros(1) + rouge1, + 'rouge2': vloss.new_zeros(1) + rouge2, + 'rougeL': vloss.new_zeros(1) + rougel, } def validation_epoch_end(self, outputs): names = ['vloss', 'rouge1', 'rouge2', 'rougeL'] @@ -108,7 +117,11 @@ def validation_epoch_end(self, outputs): return {'avg_val_loss': logs['vloss'], 'log': logs, 'progress_bar': logs} def test_step(self, batch, batch_nb): - raise NotImplementedError + return self.validation_step(batch, batch_nb) + + def test_epoch_end(self, outputs): + result = self.validation_epoch_end(outputs) + print(result) def configure_optimizers(self): optimizer = torch.optim.Adam(self.model.parameters(), lr=self.args.lr) @@ -172,6 +185,8 @@ def add_model_specific_args(parser, root_dir): parser.add_argument("--disable_checkpointing", action='store_true', help="No logging or checkpointing") parser.add_argument("--max_output_len", type=int, default=256, help="maximum num of wordpieces/summary. Used for training and testing") + parser.add_argument("--max_input_len", type=int, default=512, + help="maximum num of wordpieces/summary. Used for training and testing") parser.add_argument("--test", action='store_true', help="Test only, no training") parser.add_argument("--model_path", type=str, default='facebook/bart-base', help="Path to the checkpoint directory or model name") From cbb407d6ae13202dffd9c436303f3f5ec4d6e920 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 3 Sep 2020 07:41:40 -0700 Subject: [PATCH 093/112] wip --- scripts/summarization.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/summarization.py b/scripts/summarization.py index f9e3b30..a6df4f3 100644 --- a/scripts/summarization.py +++ b/scripts/summarization.py @@ -18,9 +18,10 @@ class SummarizationDataset(Dataset): - def __init__(self, hf_dataset, tokenizer, max_output_len): + def __init__(self, hf_dataset, tokenizer, max_input_len, max_output_len): self.hf_dataset = hf_dataset self.tokenizer = tokenizer + self.max_input_len = max_input_len self.max_output_len = max_output_len def __len__(self): @@ -137,7 +138,8 @@ def configure_optimizers(self): def _get_dataloader(self, current_dataloader, split_name, is_train): if current_dataloader is not None: return current_dataloader - dataset = SummarizationDataset(hf_dataset=self.hf_datasets[split_name], tokenizer=self.tokenizer, max_output_len=self.args.max_output_len) + dataset = SummarizationDataset(hf_dataset=self.hf_datasets[split_name], tokenizer=self.tokenizer, + max_input_len=self.args.max_input_len, max_output_len=self.args.max_output_len) sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=is_train) if self.trainer.use_ddp else None return DataLoader(dataset, batch_size=self.args.batch_size, shuffle=(sampler is None), num_workers=self.args.num_workers, sampler=sampler, From eb34cc0b880617e8fd952e3ecfb1bcf906e7d9b8 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 3 Sep 2020 16:55:51 -0700 Subject: [PATCH 094/112] grad_ckpt + reqs + long --- requirements.txt | 2 ++ scripts/summarization.py | 19 ++++++++++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index d91e5de..a98ef2a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,5 @@ pytorch-lightning @ git+http://github.com/ibeltagy/pytorch-lightning.git@v0.8.5_ torch==1.6.0 
tensorboardX test-tube==0.7.5 +nlp +rouge_score diff --git a/scripts/summarization.py b/scripts/summarization.py index a6df4f3..a57bab4 100644 --- a/scripts/summarization.py +++ b/scripts/summarization.py @@ -8,13 +8,15 @@ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM from transformers.optimization import get_linear_schedule_with_warmup import nlp +from rouge_score import rouge_scorer import pytorch_lightning as pl from pytorch_lightning.logging import TestTubeLogger from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel -from rouge_score import rouge_scorer +from longformer import LongformerEncoderDecoderForConditionalGeneration +from longformer.sliding_chunks import pad_to_window_size class SummarizationDataset(Dataset): @@ -49,16 +51,26 @@ def __init__(self, args): self.args = args self.hparams = args self.tokenizer = AutoTokenizer.from_pretrained(self.args.tokenizer, use_fast=True) - self.model = AutoModelForSeq2SeqLM.from_pretrained(self.args.model_path) + if 'long' in self.args.model_path: + # TODO: remember to set attention_dropout = 0.1 + self.model = LongformerEncoderDecoderForConditionalGeneration.from_pretrained( + self.args.model_path, gradient_checkpointing=self.args.grad_ckpt,) + else: + self.model = AutoModelForSeq2SeqLM.from_pretrained(self.args.model_path) self.train_dataloader_object = self.val_dataloader_object = self.test_dataloader_object = None def forward(self, input_ids, output_ids): attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) attention_mask[input_ids == self.tokenizer.pad_token_id] = 0 + if isinstance(self.model, LongformerEncoderDecoderForConditionalGeneration): + attention_mask[:, 0] = 2 # global attention on one token for all model params to be used, which is important for gradient checkpointing to work + input_ids, attention_mask = pad_to_window_size( # ideally, should be moved inside the LongformerModel + input_ids, attention_mask, self.model.config.attention_window[0], self.tokenizer.pad_token_id) decoder_input_ids = output_ids[:, :-1] decoder_attention_mask = (decoder_input_ids != self.tokenizer.pad_token_id) labels = output_ids[:, 1:].clone() labels[labels == self.tokenizer.pad_token_id] = -100 + outputs = self.model( input_ids, attention_mask=attention_mask, @@ -129,7 +141,7 @@ def configure_optimizers(self): if self.args.debug: return optimizer # const LR num_gpus = torch.cuda.device_count() - num_steps = self.args.dataset_size * self.args.epochs / num_gpus / self.args.grad_accum + num_steps = self.args.dataset_size * self.args.epochs / num_gpus / self.args.grad_accum / self.args.batch_size scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=self.args.warmup, num_training_steps=num_steps ) @@ -197,6 +209,7 @@ def add_model_specific_args(parser, root_dir): parser.add_argument("--fp32", action='store_true', help="default is fp16. 
Use --fp32 to switch to fp32") parser.add_argument("--debug", action='store_true', help="debug run") parser.add_argument("--resume_ckpt", type=str, help="Path of a checkpoint to resume from") + parser.add_argument('--grad_ckpt', action='store_true', help='Enable gradient checkpointing to save memory') return parser From 42481fd427879d11b3c332f91c8e26206003d11c Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 4 Sep 2020 02:44:23 +0000 Subject: [PATCH 095/112] ignore empty answers --- scripts/triviaqa.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/scripts/triviaqa.py b/scripts/triviaqa.py index bba1e5f..967f97a 100644 --- a/scripts/triviaqa.py +++ b/scripts/triviaqa.py @@ -459,16 +459,19 @@ def validation_step(self, batch, batch_nb): output = self.forward(input_ids, input_mask, segment_ids, subword_starts, subword_ends, answer_token_ids) if self.args.seq2seq: logit_scores = output[1] - best_answer_score = logit_scores.max() - best_answer_index = logit_scores.argmax().item() - generated_ids = self.model.generate(input_ids=input_ids[best_answer_index:best_answer_index + 1], - attention_mask=input_mask[best_answer_index:best_answer_index + 1], - use_cache=True,) - generated_answer_ids = generated_ids[0] - generated_answer_ids[-1] = self.tokenizer.eos_token_id - index_of_eos_token = (generated_answer_ids == self.tokenizer.eos_token_id).nonzero()[0, 0].item() - generated_answer_ids = generated_answer_ids[1:index_of_eos_token] - answer_text = self.tokenizer.decode(generated_answer_ids) + answer_score_indices = logit_scores.sort().indices + generated_ids = self.model.generate(input_ids=input_ids, attention_mask=input_mask, use_cache=True,) + answer_text = '' + best_answer_score = 0 + for i in answer_score_indices: + generated_answer_ids = generated_ids[answer_score_indices[i]] + generated_answer_ids[-1] = self.tokenizer.eos_token_id + index_of_eos_token = (generated_answer_ids == self.tokenizer.eos_token_id).nonzero()[0, 0].item() + generated_answer_ids = generated_answer_ids[1:index_of_eos_token] + answer_text = self.tokenizer.decode(generated_answer_ids) + if answer_text != '': + best_answer_score = logit_scores[answer_score_indices[i]] + break f1_score = evaluation_utils.metric_max_over_ground_truths(evaluation_utils.f1_score, answer_text, aliases) em_score = evaluation_utils.metric_max_over_ground_truths(evaluation_utils.exact_match_score, answer_text, aliases) return {'vloss': output[0], 'vem': generated_answer_ids.new_zeros([1]).float(), From c6f23353f151dad213f3dc9a0d992ec5c19ab599 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 3 Sep 2020 20:52:58 -0700 Subject: [PATCH 096/112] attention dropout --- scripts/summarization.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/scripts/summarization.py b/scripts/summarization.py index a57bab4..64d42c1 100644 --- a/scripts/summarization.py +++ b/scripts/summarization.py @@ -5,7 +5,7 @@ import torch from torch.utils.data import DataLoader, Dataset -from transformers import AutoTokenizer, AutoModelForSeq2SeqLM +from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoConfig from transformers.optimization import get_linear_schedule_with_warmup import nlp from rouge_score import rouge_scorer @@ -15,7 +15,7 @@ from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel -from longformer import LongformerEncoderDecoderForConditionalGeneration +from longformer import 
LongformerEncoderDecoderForConditionalGeneration, LongformerEncoderDecoderConfig from longformer.sliding_chunks import pad_to_window_size @@ -51,12 +51,18 @@ def __init__(self, args): self.args = args self.hparams = args self.tokenizer = AutoTokenizer.from_pretrained(self.args.tokenizer, use_fast=True) + if 'long' in self.args.model_path: - # TODO: remember to set attention_dropout = 0.1 + config = LongformerEncoderDecoderConfig.from_pretrained(self.args.model_path) + config.attention_dropout = self.args.attention_dropout self.model = LongformerEncoderDecoderForConditionalGeneration.from_pretrained( - self.args.model_path, gradient_checkpointing=self.args.grad_ckpt,) + self.args.model_path, gradient_checkpointing=self.args.grad_ckpt, + config=config) else: - self.model = AutoModelForSeq2SeqLM.from_pretrained(self.args.model_path) + config = AutoConfig.from_pretrained(self.args.model_path) + config.attention_dropout = self.args.attention_dropout + self.model = AutoModelForSeq2SeqLM.from_pretrained( + self.args.model_path, config=config) self.train_dataloader_object = self.val_dataloader_object = self.test_dataloader_object = None def forward(self, input_ids, output_ids): @@ -210,6 +216,8 @@ def add_model_specific_args(parser, root_dir): parser.add_argument("--debug", action='store_true', help="debug run") parser.add_argument("--resume_ckpt", type=str, help="Path of a checkpoint to resume from") parser.add_argument('--grad_ckpt', action='store_true', help='Enable gradient checkpointing to save memory') + parser.add_argument("--attention_dropout", type=float, default=0.1, + help="attention dropout") return parser From f5a798d083f18b3ec2ce5def37307ab66c6098c1 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 3 Sep 2020 22:22:59 -0700 Subject: [PATCH 097/112] model.generate takes a lot of memory. 
Set requires_grad=False --- scripts/summarization.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/scripts/summarization.py b/scripts/summarization.py index 64d42c1..dac4b08 100644 --- a/scripts/summarization.py +++ b/scripts/summarization.py @@ -55,9 +55,9 @@ def __init__(self, args): if 'long' in self.args.model_path: config = LongformerEncoderDecoderConfig.from_pretrained(self.args.model_path) config.attention_dropout = self.args.attention_dropout + config.gradient_checkpointing = self.args.grad_ckpt self.model = LongformerEncoderDecoderForConditionalGeneration.from_pretrained( - self.args.model_path, gradient_checkpointing=self.args.grad_ckpt, - config=config) + self.args.model_path, config=config) else: config = AutoConfig.from_pretrained(self.args.model_path) config.attention_dropout = self.args.attention_dropout @@ -96,15 +96,18 @@ def training_step(self, batch, batch_nb): return {'loss': loss, 'log': tensorboard_logs} def validation_step(self, batch, batch_nb): + for p in self.model.parameters(): + p.requires_grad = False + outputs = self.forward(*batch) vloss = outputs[0] input_ids, output_ids = batch attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) attention_mask[input_ids == self.tokenizer.pad_token_id] = 0 - generated_ids = self.model.generate(input_ids=input_ids, - attention_mask=attention_mask, - use_cache=True, - max_length=self.args.max_output_len) + + generated_ids = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, + use_cache=True, max_length=self.args.max_output_len, + num_beams=1) generated_str = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True) gold_str = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True) scorer = rouge_scorer.RougeScorer(rouge_types=['rouge1', 'rouge2', 'rougeL'], use_stemmer=False) @@ -124,6 +127,9 @@ def validation_step(self, batch, batch_nb): 'rougeL': vloss.new_zeros(1) + rougel, } def validation_epoch_end(self, outputs): + for p in self.model.parameters(): + p.requires_grad = True + names = ['vloss', 'rouge1', 'rouge2', 'rougeL'] metrics = [] for name in names: From 274d017a1687e6c90bd6a324e53d1c28ef8d7f44 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 4 Sep 2020 23:16:23 -0700 Subject: [PATCH 098/112] wip --- scripts/summarization.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/summarization.py b/scripts/summarization.py index dac4b08..7d338e6 100644 --- a/scripts/summarization.py +++ b/scripts/summarization.py @@ -139,6 +139,7 @@ def validation_epoch_end(self, outputs): metric /= self.trainer.world_size metrics.append(metric) logs = dict(zip(*[names, metrics])) + print(logs) return {'avg_val_loss': logs['vloss'], 'log': logs, 'progress_bar': logs} def test_step(self, batch, batch_nb): @@ -176,8 +177,7 @@ def train_dataloader(self): @pl.data_loader def val_dataloader(self): - split_name = 'validation' if not self.args.debug else 'train' - self.val_dataloader_object = self._get_dataloader(self.val_dataloader_object, split_name, is_train=False) + self.val_dataloader_object = self._get_dataloader(self.val_dataloader_object, 'validation', is_train=False) return self.val_dataloader_object @pl.data_loader @@ -263,9 +263,9 @@ def main(args): max_epochs=args.epochs if not args.debug else 100, replace_sampler_ddp=False, accumulate_grad_batches=args.grad_accum, - val_check_interval=args.val_every, + val_check_interval=args.val_every if not args.debug else 1, 
num_sanity_val_steps=2, - check_val_every_n_epoch=1 if not args.debug else 5, + check_val_every_n_epoch=1 if not args.debug else 1, val_percent_check=args.val_percent_check, test_percent_check=args.val_percent_check, logger=logger, From 5f765b9021ab5abb082443964d54bf9265179022 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 5 Sep 2020 02:08:05 -0700 Subject: [PATCH 099/112] wip --- scripts/summarization.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/scripts/summarization.py b/scripts/summarization.py index 7d338e6..91ecb87 100644 --- a/scripts/summarization.py +++ b/scripts/summarization.py @@ -65,13 +65,17 @@ def __init__(self, args): self.args.model_path, config=config) self.train_dataloader_object = self.val_dataloader_object = self.test_dataloader_object = None - def forward(self, input_ids, output_ids): + def _prepare_input(self, input_ids): attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) attention_mask[input_ids == self.tokenizer.pad_token_id] = 0 if isinstance(self.model, LongformerEncoderDecoderForConditionalGeneration): attention_mask[:, 0] = 2 # global attention on one token for all model params to be used, which is important for gradient checkpointing to work input_ids, attention_mask = pad_to_window_size( # ideally, should be moved inside the LongformerModel input_ids, attention_mask, self.model.config.attention_window[0], self.tokenizer.pad_token_id) + return input_ids, attention_mask + + def forward(self, input_ids, output_ids): + input_ids, attention_mask = self._prepare_input(input_ids) decoder_input_ids = output_ids[:, :-1] decoder_attention_mask = (decoder_input_ids != self.tokenizer.pad_token_id) labels = output_ids[:, 1:].clone() @@ -102,9 +106,7 @@ def validation_step(self, batch, batch_nb): outputs = self.forward(*batch) vloss = outputs[0] input_ids, output_ids = batch - attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) - attention_mask[input_ids == self.tokenizer.pad_token_id] = 0 - + input_ids, attention_mask = self._prepare_input(input_ids) generated_ids = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, use_cache=True, max_length=self.args.max_output_len, num_beams=1) From 5784aee109b7e7bd379b8b33c36466b21dec8f22 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 5 Sep 2020 21:09:47 -0700 Subject: [PATCH 100/112] attention_mode --- longformer/sliding_chunks.py | 2 +- scripts/summarization.py | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/longformer/sliding_chunks.py b/longformer/sliding_chunks.py index 4eed8d8..8ee30a1 100644 --- a/longformer/sliding_chunks.py +++ b/longformer/sliding_chunks.py @@ -125,7 +125,7 @@ def pad_to_window_size(input_ids: torch.Tensor, attention_mask: torch.Tensor, Returns (input_ids, attention_mask) padded to length divisible by 2 * one_sided_window_size ''' - w = 2 * one_sided_window_size + w = int(2 * one_sided_window_size) seqlen = input_ids.size(1) padding_len = (w - seqlen % w) % w input_ids = F.pad(input_ids, (0, padding_len), value=pad_token_id) diff --git a/scripts/summarization.py b/scripts/summarization.py index 91ecb87..454d2bc 100644 --- a/scripts/summarization.py +++ b/scripts/summarization.py @@ -56,6 +56,8 @@ def __init__(self, args): config = LongformerEncoderDecoderConfig.from_pretrained(self.args.model_path) config.attention_dropout = self.args.attention_dropout config.gradient_checkpointing = self.args.grad_ckpt + config.attention_mode = 
self.args.attention_mode + config.attention_window = [self.args.attention_window] * config.encoder_layers self.model = LongformerEncoderDecoderForConditionalGeneration.from_pretrained( self.args.model_path, config=config) else: @@ -70,8 +72,14 @@ def _prepare_input(self, input_ids): attention_mask[input_ids == self.tokenizer.pad_token_id] = 0 if isinstance(self.model, LongformerEncoderDecoderForConditionalGeneration): attention_mask[:, 0] = 2 # global attention on one token for all model params to be used, which is important for gradient checkpointing to work + if self.args.attention_mode == 'sliding_chunks': + half_padding_mod = self.model.config.attention_window[0] + elif self.args.attention_mode == 'sliding_chunks_no_overlap': + half_padding_mod = self.model.config.attention_window[0] / 2 + else: + raise NotImplementedError input_ids, attention_mask = pad_to_window_size( # ideally, should be moved inside the LongformerModel - input_ids, attention_mask, self.model.config.attention_window[0], self.tokenizer.pad_token_id) + input_ids, attention_mask, half_padding_mod, self.tokenizer.pad_token_id) return input_ids, attention_mask def forward(self, input_ids, output_ids): @@ -224,8 +232,9 @@ def add_model_specific_args(parser, root_dir): parser.add_argument("--debug", action='store_true', help="debug run") parser.add_argument("--resume_ckpt", type=str, help="Path of a checkpoint to resume from") parser.add_argument('--grad_ckpt', action='store_true', help='Enable gradient checkpointing to save memory') - parser.add_argument("--attention_dropout", type=float, default=0.1, - help="attention dropout") + parser.add_argument("--attention_dropout", type=float, default=0.1, help="attention dropout") + parser.add_argument("--attention_mode", type=str, default='sliding_chunks', help="Longformer attention mode") + parser.add_argument("--attention_window", type=int, default=512, help="Attention window") return parser From b78384a826635a2bc656295ffbd2beb17e44fb4e Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Mon, 7 Sep 2020 04:15:32 +0000 Subject: [PATCH 101/112] wip --- scripts/summarization.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/scripts/summarization.py b/scripts/summarization.py index dac4b08..3486685 100644 --- a/scripts/summarization.py +++ b/scripts/summarization.py @@ -37,7 +37,14 @@ def __getitem__(self, idx): @staticmethod def collate_fn(batch): - pad_token_id = 1 # AutoTokenizer.from_pretrained('facebook/bart-base').pad_token_id + # A hack to know if this is bart or pegasus. 
DDP doesn't like global variables nor class-level memebr variables + if batch[0][0][-1].item() == 2: + pad_token_id = 1 # AutoTokenizer.from_pretrained('facebook/bart-base').pad_token_id + elif batch[0][0][-1].item() == 1: + pad_token_id = 0 # AutoTokenizer.from_pretrained('google/pegasus-large').pad_token_id + else: + assert False + input_ids, output_ids = list(zip(*batch)) input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=pad_token_id) output_ids = torch.nn.utils.rnn.pad_sequence(output_ids, batch_first=True, padding_value=pad_token_id) @@ -76,7 +83,6 @@ def forward(self, input_ids, output_ids): decoder_attention_mask = (decoder_input_ids != self.tokenizer.pad_token_id) labels = output_ids[:, 1:].clone() labels[labels == self.tokenizer.pad_token_id] = -100 - outputs = self.model( input_ids, attention_mask=attention_mask, @@ -108,8 +114,8 @@ def validation_step(self, batch, batch_nb): generated_ids = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, use_cache=True, max_length=self.args.max_output_len, num_beams=1) - generated_str = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - gold_str = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True) + generated_str = self.tokenizer.batch_decode(generated_ids.tolist(), skip_special_tokens=True) + gold_str = self.tokenizer.batch_decode(output_ids.tolist(), skip_special_tokens=True) scorer = rouge_scorer.RougeScorer(rouge_types=['rouge1', 'rouge2', 'rougeL'], use_stemmer=False) rouge1 = rouge2 = rougel = 0.0 for ref, pred in zip(gold_str, generated_str): From 327b72932f27dd608f6a5619508db27b35ea0c06 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Mon, 7 Sep 2020 06:36:42 +0000 Subject: [PATCH 102/112] pegasus bug --- scripts/summarization.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/summarization.py b/scripts/summarization.py index 35773d0..9885001 100644 --- a/scripts/summarization.py +++ b/scripts/summarization.py @@ -33,6 +33,8 @@ def __getitem__(self, idx): entry = self.hf_dataset[idx] input_ids = self.tokenizer.encode(entry['article'], truncation=True, max_length=self.max_input_len) output_ids = self.tokenizer.encode(entry['abstract'], truncation=True, max_length=self.max_output_len) + if self.tokenizer.bos_token_id is None: # pegasus + output_ids = [self.tokenizer.pad_token_id] + output_ids return torch.tensor(input_ids), torch.tensor(output_ids) @staticmethod From 4944fb851ad5a02d55d676c6f8c0801c71db0944 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Mon, 7 Sep 2020 06:47:07 +0000 Subject: [PATCH 103/112] run on cpu --- scripts/summarization.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/summarization.py b/scripts/summarization.py index 9885001..e477d9d 100644 --- a/scripts/summarization.py +++ b/scripts/summarization.py @@ -112,7 +112,7 @@ def training_step(self, batch, batch_nb): tensorboard_logs = {'train_loss': loss, 'lr': lr, 'input_size': batch[0].numel(), 'output_size': batch[1].numel(), - 'mem': torch.cuda.memory_allocated(loss.device) / 1024 ** 3} + 'mem': torch.cuda.memory_allocated(loss.device) / 1024 ** 3 if torch.cuda.is_available() else 0} return {'loss': loss, 'log': tensorboard_logs} def validation_step(self, batch, batch_nb): @@ -171,7 +171,7 @@ def configure_optimizers(self): optimizer = torch.optim.Adam(self.model.parameters(), lr=self.args.lr) if self.args.debug: return optimizer # const LR - num_gpus = torch.cuda.device_count() + num_gpus = torch.cuda.device_count() if 
torch.cuda.is_available() else 1 num_steps = self.args.dataset_size * self.args.epochs / num_gpus / self.args.grad_accum / self.args.batch_size scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=self.args.warmup, num_training_steps=num_steps @@ -277,7 +277,7 @@ def main(args): args.dataset_size = 203037 # hardcode dataset size. Needed to compute number of steps for the lr scheduler - trainer = pl.Trainer(gpus=args.gpus, distributed_backend='ddp', + trainer = pl.Trainer(gpus=args.gpus, distributed_backend='ddp' if torch.cuda.is_available() else None, track_grad_norm=-1, max_epochs=args.epochs if not args.debug else 100, replace_sampler_ddp=False, From 36252c07bbc9adcd506e88d80bad76095df750ed Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Mon, 7 Sep 2020 00:20:32 -0700 Subject: [PATCH 104/112] readme --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 4693102..b2e6611 100644 --- a/README.md +++ b/README.md @@ -4,15 +4,17 @@ **\*\*\*\*\* Work In Progress: LongformerEncoderDecoder \*\*\*\*\*** -A `LongformerEncoderDecoder` model is now available. It is geared towards summarization where the input is long but the output is relatively shorter. The following code snippet loads a `LongformerEncoderDecoder` checkpointing started from `BartLarge`. With gradient checkpointing, fp16, and 48GB gpu, the input length be up to 16K. +A `LongformerEncoderDecoder` model is now available. It is geared towards summarization where the input is long but the output is relatively shorter. The following code snippet loads a `LongformerEncoderDecoder` checkpointing started from `BART`. With gradient checkpointing, fp16, and 48GB gpu, the input length can be up to 16K tokens. ``` -pip install git+https://github.com/allenai/longformer.git@encoderdecoder +pip install git+https://github.com/allenai/longformer.git@encoderdecoder -# checkpoint: https://ai2-s2-research.s3-us-west-2.amazonaws.com/longformer/longformer-encdec-large-12288.tar.gz +# checkpoint-base: https://ai2-s2-research.s3-us-west-2.amazonaws.com/longformer/longformer-encdec-base-16384.tar.gz +# checkpoint-large: https://ai2-s2-research.s3-us-west-2.amazonaws.com/longformer/longformer-encdec-large-16384.tar.gz from longformer import LongformerEncoderDecoderForConditionalGeneration model = LongformerEncoderDecoderForConditionalGeneration.from_pretrained(downloaded_checkpoint, gradient_checkpointing=True) ``` +Check the script `scripts/summarization.py` for an example of how to use the model. **\*\*\*\*\* New July 23rd, 2020: Speed degradation \*\*\*\*\*** From 281999fdf4a449e2eb4d738e4b56a03b12c89f18 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Mon, 7 Sep 2020 00:22:35 -0700 Subject: [PATCH 105/112] readme --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b2e6611..aed7e73 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,10 @@ pip install git+https://github.com/allenai/longformer.git@encoderdecoder from longformer import LongformerEncoderDecoderForConditionalGeneration model = LongformerEncoderDecoderForConditionalGeneration.from_pretrained(downloaded_checkpoint, gradient_checkpointing=True) ``` -Check the script `scripts/summarization.py` for an example of how to use the model. + +- Check the script `scripts/summarization.py` for an example of how to use the model. + +- Make sure to use the huggingface/transformers fork specified in `requirements.txt`. 
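+
+For quick reference, a minimal inference sketch in the same spirit as `scripts/summarization.py` (the local checkpoint path, the `facebook/bart-base` tokenizer, and the generation settings below are assumptions; adjust them to your setup):
+
+```
+import torch
+from transformers import AutoTokenizer
+from longformer import LongformerEncoderDecoderForConditionalGeneration
+from longformer.sliding_chunks import pad_to_window_size
+
+model_path = 'longformer-encdec-base-16384'  # assumed: directory extracted from the base checkpoint tarball above
+tokenizer = AutoTokenizer.from_pretrained('facebook/bart-base', use_fast=True)
+model = LongformerEncoderDecoderForConditionalGeneration.from_pretrained(model_path)
+model.eval()
+
+input_ids = torch.tensor([tokenizer.encode('a very long document ...', truncation=True, max_length=16384)])
+attention_mask = torch.ones_like(input_ids)  # 1: local (windowed) attention
+attention_mask[:, 0] = 2                     # 2: global attention on the first token
+# pad the sequence to a multiple of the attention window (sliding_chunks attention mode)
+input_ids, attention_mask = pad_to_window_size(
+    input_ids, attention_mask, model.config.attention_window[0], tokenizer.pad_token_id)
+
+summary_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask,
+                             use_cache=True, num_beams=1, max_length=256)
+print(tokenizer.decode(summary_ids[0].tolist(), skip_special_tokens=True))
+```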
**\*\*\*\*\* New July 23rd, 2020: Speed degradation \*\*\*\*\*** From dee3daf1a6feea8dab056ade780c661effa9045f Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Tue, 8 Sep 2020 21:14:15 -0700 Subject: [PATCH 106/112] adafactor and label smoothing --- scripts/summarization.py | 49 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 5 deletions(-) diff --git a/scripts/summarization.py b/scripts/summarization.py index e477d9d..614ab5a 100644 --- a/scripts/summarization.py +++ b/scripts/summarization.py @@ -6,7 +6,7 @@ import torch from torch.utils.data import DataLoader, Dataset from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoConfig -from transformers.optimization import get_linear_schedule_with_warmup +from transformers.optimization import get_linear_schedule_with_warmup, Adafactor import nlp from rouge_score import rouge_scorer @@ -15,10 +15,34 @@ from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel + from longformer import LongformerEncoderDecoderForConditionalGeneration, LongformerEncoderDecoderConfig from longformer.sliding_chunks import pad_to_window_size +def label_smoothed_nll_loss(lprobs, target, epsilon, ignore_index=-100): + """From fairseq""" + if target.dim() == lprobs.dim() - 1: + target = target.unsqueeze(-1) + nll_loss = -lprobs.gather(dim=-1, index=target) + smooth_loss = -lprobs.sum(dim=-1, keepdim=True) + if ignore_index is not None: + pad_mask = target.eq(ignore_index) + nll_loss.masked_fill_(pad_mask, 0.0) + smooth_loss.masked_fill_(pad_mask, 0.0) + count = (~pad_mask).sum() + else: + nll_loss = nll_loss.squeeze(-1) + smooth_loss = smooth_loss.squeeze(-1) + count = nll_loss.numel() + + nll_loss = nll_loss.sum() / count + smooth_loss = smooth_loss.sum() / count + eps_i = epsilon / lprobs.size(-1) + loss = (1.0 - epsilon) * nll_loss + eps_i * smooth_loss + return loss, nll_loss + + class SummarizationDataset(Dataset): def __init__(self, hf_dataset, tokenizer, max_input_len, max_output_len): self.hf_dataset = hf_dataset @@ -96,14 +120,24 @@ def forward(self, input_ids, output_ids): decoder_input_ids = output_ids[:, :-1] decoder_attention_mask = (decoder_input_ids != self.tokenizer.pad_token_id) labels = output_ids[:, 1:].clone() - labels[labels == self.tokenizer.pad_token_id] = -100 outputs = self.model( input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, - labels=labels) - return outputs + use_cache=False,) + lm_logits = outputs[0] + if self.args.label_smoothing == 0: + # Same behavior as modeling_bart.py, besides ignoring pad_token_id + ce_loss_fct = torch.nn.CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id) + assert lm_logits.shape[-1] == self.model.config.vocab_size + loss = ce_loss_fct(lm_logits.view(-1, lm_logits.shape[-1]), labels.view(-1)) + else: + lprobs = torch.nn.functional.log_softmax(lm_logits, dim=-1) + loss, nll_loss = label_smoothed_nll_loss( + lprobs, labels, self.args.label_smoothing, ignore_index=self.tokenizer.pad_token_id + ) + return [loss] def training_step(self, batch, batch_nb): output = self.forward(*batch) @@ -168,7 +202,10 @@ def test_epoch_end(self, outputs): print(result) def configure_optimizers(self): - optimizer = torch.optim.Adam(self.model.parameters(), lr=self.args.lr) + if self.args.adafactor: + optimizer = Adafactor(self.model.parameters(), lr=self.args.lr, scale_parameter=False, relative_step=False) + else: + optimizer = 
torch.optim.Adam(self.model.parameters(), lr=self.args.lr) if self.args.debug: return optimizer # const LR num_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 1 @@ -243,6 +280,8 @@ def add_model_specific_args(parser, root_dir): parser.add_argument("--attention_dropout", type=float, default=0.1, help="attention dropout") parser.add_argument("--attention_mode", type=str, default='sliding_chunks', help="Longformer attention mode") parser.add_argument("--attention_window", type=int, default=512, help="Attention window") + parser.add_argument("--label_smoothing", type=float, default=0.0, required=False) + parser.add_argument("--adafactor", action='store_true', help="Use adafactor optimizer") return parser From 498ca0408049c7b61724d135538f6680c8f9be1a Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sun, 13 Sep 2020 04:54:20 +0000 Subject: [PATCH 107/112] add rougeLsum --- scripts/summarization.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/scripts/summarization.py b/scripts/summarization.py index e477d9d..26264b6 100644 --- a/scripts/summarization.py +++ b/scripts/summarization.py @@ -128,27 +128,30 @@ def validation_step(self, batch, batch_nb): num_beams=1) generated_str = self.tokenizer.batch_decode(generated_ids.tolist(), skip_special_tokens=True) gold_str = self.tokenizer.batch_decode(output_ids.tolist(), skip_special_tokens=True) - scorer = rouge_scorer.RougeScorer(rouge_types=['rouge1', 'rouge2', 'rougeL'], use_stemmer=False) - rouge1 = rouge2 = rougel = 0.0 + scorer = rouge_scorer.RougeScorer(rouge_types=['rouge1', 'rouge2', 'rougeL', 'rougeLsum'], use_stemmer=False) + rouge1 = rouge2 = rougel = rougelsum = 0.0 for ref, pred in zip(gold_str, generated_str): score = scorer.score(ref, pred) rouge1 += score['rouge1'].fmeasure rouge2 += score['rouge2'].fmeasure rougel += score['rougeL'].fmeasure + rougelsum += score['rougeLsum'].fmeasure rouge1 /= len(generated_str) rouge2 /= len(generated_str) rougel /= len(generated_str) + rougelsum /= len(generated_str) return {'vloss': vloss, 'rouge1': vloss.new_zeros(1) + rouge1, 'rouge2': vloss.new_zeros(1) + rouge2, - 'rougeL': vloss.new_zeros(1) + rougel, } + 'rougeL': vloss.new_zeros(1) + rougel, + 'rougeLsum': vloss.new_zeros(1) + rougelsum, } def validation_epoch_end(self, outputs): for p in self.model.parameters(): p.requires_grad = True - names = ['vloss', 'rouge1', 'rouge2', 'rougeL'] + names = ['vloss', 'rouge1', 'rouge2', 'rougeL', 'rougeLsum'] metrics = [] for name in names: metric = torch.stack([x[name] for x in outputs]).mean() @@ -280,10 +283,11 @@ def main(args): trainer = pl.Trainer(gpus=args.gpus, distributed_backend='ddp' if torch.cuda.is_available() else None, track_grad_norm=-1, max_epochs=args.epochs if not args.debug else 100, + max_steps=None if not args.debug else 1, replace_sampler_ddp=False, accumulate_grad_batches=args.grad_accum, val_check_interval=args.val_every if not args.debug else 1, - num_sanity_val_steps=2, + num_sanity_val_steps=2 if not args.debug else 0, check_val_every_n_epoch=1 if not args.debug else 1, val_percent_check=args.val_percent_check, test_percent_check=args.val_percent_check, From 0f3875fa875ca3220cb01801ad6c1ef9be86e6a5 Mon Sep 17 00:00:00 2001 From: Akshita Bhagia Date: Thu, 12 Nov 2020 02:19:40 -0800 Subject: [PATCH 108/112] wip code for LongT5 --- longformer/longformer_t5_encoder_decoder.py | 395 ++++++++++++++++++ requirements.txt | 2 +- .../convert_t5_to_longformerencoderdecoder.py | 156 +++++++ 3 files changed, 552 insertions(+), 1 
deletion(-) create mode 100644 longformer/longformer_t5_encoder_decoder.py create mode 100644 scripts/convert_t5_to_longformerencoderdecoder.py diff --git a/longformer/longformer_t5_encoder_decoder.py b/longformer/longformer_t5_encoder_decoder.py new file mode 100644 index 0000000..b02cb36 --- /dev/null +++ b/longformer/longformer_t5_encoder_decoder.py @@ -0,0 +1,395 @@ +import math +from typing import List, Optional, Tuple, Dict +from torch import nn, Tensor +from longformer.longformer import LongformerSelfAttention +from longformer.sliding_chunks import * +from transformers.modeling_t5 import T5Config, T5ForConditionalGeneration + + +class LongformerEncoderDecoderForConditionalGenerationT5(T5ForConditionalGeneration): + def __init__(self, config): + super().__init__(config) + if config.attention_mode == 'n2': + pass # do nothing, use BertSelfAttention instead + else: + for i, layer in enumerate(self.encoder.block): + layer.layer[0].SelfAttention = LongformerSelfAttentionForT5(config, layer_id=i) + + +class LongformerEncoderDecoderConfigT5(T5Config): + def __init__(self, attention_window: List[int] = None, attention_dilation: List[int] = None, + autoregressive: bool = False, attention_mode: str = 'sliding_chunks', + gradient_checkpointing: bool = False, **kwargs): + """ + Args: + attention_window: list of attention window sizes of length = number of layers. + window size = number of attention locations on each side. + For an effective window size of 512, use `attention_window=[256]*num_layers` + which is 256 on each side. + attention_dilation: list of attention dilation of length = number of layers. + attention dilation of `1` means no dilation. + autoregressive: do autoregressive attention or have attention on both sides + attention_mode: 'n2' for regular n^2 self-attention, 'tvm' for TVM implementation of Longformer + selfattention, 'sliding_chunks' for another implementation of Longformer selfattention + """ + super().__init__(**kwargs) + self.attention_window = attention_window + self.attention_dilation = attention_dilation + self.autoregressive = autoregressive + self.attention_mode = attention_mode + self.gradient_checkpointing = gradient_checkpointing + # self.attention_probs_dropout_prob = self.dropout_rate + assert self.attention_mode in ['tvm', 'sliding_chunks', 'n2'] + +class LongformerSelfAttentionT5Basic(nn.Module): + def __init__(self, config, layer_id, has_relative_attention_bias=False): + super(LongformerSelfAttentionT5Basic, self).__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads)) + self.num_heads = config.num_attention_heads + self.head_dim = int(config.hidden_size / config.num_attention_heads) + self.embed_dim = config.hidden_size + + self.query = nn.Linear(config.hidden_size, self.embed_dim) + self.key = nn.Linear(config.hidden_size, self.embed_dim) + self.value = nn.Linear(config.hidden_size, self.embed_dim) + + # this is for the T5 setting + self.is_decoder = config.is_decoder + self.relative_attention_num_buckets = config.relative_attention_num_buckets + self.has_relative_attention_bias = has_relative_attention_bias + + if self.has_relative_attention_bias: + self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.num_heads) + + self.query_global = nn.Linear(config.hidden_size, self.embed_dim) + self.key_global = nn.Linear(config.hidden_size, self.embed_dim) + 
self.value_global = nn.Linear(config.hidden_size, self.embed_dim) + + self.dropout = config.attention_probs_dropout_prob + + self.layer_id = layer_id + self.attention_window = config.attention_window[self.layer_id] + self.attention_dilation = config.attention_dilation[self.layer_id] + self.attention_mode = config.attention_mode + self.autoregressive = config.autoregressive + assert self.attention_window > 0 + assert self.attention_dilation > 0 + assert self.attention_mode in ['tvm', 'sliding_chunks', 'sliding_chunks_no_overlap'] + if self.attention_mode in ['sliding_chunks', 'sliding_chunks_no_overlap']: + assert not self.autoregressive # not supported + assert self.attention_dilation == 1 # dilation is not supported + + @staticmethod + def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128): + """ + Adapted from Mesh Tensorflow: + https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 + Translate relative position to a bucket number for relative attention. + The relative position is defined as memory_position - query_position, i.e. + the distance in tokens from the attending position to the attended-to + position. If bidirectional=False, then positive relative positions are + invalid. + We use smaller buckets for small absolute relative_position and larger buckets + for larger absolute relative_positions. All relative positions >=max_distance + map to the same bucket. All relative positions <=-max_distance map to the + same bucket. This should allow for more graceful generalization to longer + sequences than the model has been trained on. + Args: + relative_position: an int32 Tensor + bidirectional: a boolean - whether the attention is bidirectional + num_buckets: an integer + max_distance: an integer + Returns: + a Tensor with the same shape as relative_position, containing int32 + values in the range [0, num_buckets) + """ + ret = 0 + n = -relative_position + if bidirectional: + num_buckets //= 2 + ret += (n < 0).to(torch.long) * num_buckets # mtf.to_int32(mtf.less(n, 0)) * num_buckets + n = torch.abs(n) + else: + n = torch.max(n, torch.zeros_like(n)) + # now n is in the range [0, inf) + + # half of the buckets are for exact increments in positions + max_exact = num_buckets // 2 + is_small = n < max_exact + + # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance + val_if_large = max_exact + ( + torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact) + ).to(torch.long) + val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1)) + + ret += torch.where(is_small, n, val_if_large) + return ret + + def compute_bias(self, qlen, klen): + """ Compute binned relative position bias """ + context_position = torch.arange(qlen, dtype=torch.long)[:, None] + memory_position = torch.arange(klen, dtype=torch.long)[None, :] + relative_position = memory_position - context_position # shape (qlen, klen) + rp_bucket = self._relative_position_bucket( + relative_position, # shape (qlen, klen) + bidirectional=not self.is_decoder, + num_buckets=self.relative_attention_num_buckets, + ) + rp_bucket = rp_bucket.to(self.relative_attention_bias.weight.device) + values = self.relative_attention_bias(rp_bucket) # shape (qlen, klen, num_heads) + # values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, qlen, klen) + # Changing the shape to below because that's what 
LongformerSelfAttention's attn_weights need. + values = values.permute([0, 2, 1]).unsqueeze(0) # shape (1, qlen, num_heads, klen) + return values + + def forward( + self, + hidden_states, + attention_mask=None, + position_bias=None, + past_key_value_state=None, + head_mask=None, + output_attentions=False, + ): + ''' + The `attention_mask` is changed in `BertModel.forward` from 0, 1, 2 to + -ve: no attention + 0: local attention + +ve: global attention + ''' + + if attention_mask is not None: + attention_mask = attention_mask.squeeze(dim=2).squeeze(dim=1) + key_padding_mask = attention_mask < 0 + extra_attention_mask = attention_mask > 0 + remove_from_windowed_attention_mask = attention_mask != 0 + + num_extra_indices_per_batch = extra_attention_mask.long().sum(dim=1) + max_num_extra_indices_per_batch = num_extra_indices_per_batch.max() + if max_num_extra_indices_per_batch <= 0: + extra_attention_mask = None + else: + # To support the case of variable number of global attention in the rows of a batch, + # we use the following three selection masks to select global attention embeddings + # in a 3d tensor and pad it to `max_num_extra_indices_per_batch` + # 1) selecting embeddings that correspond to global attention + extra_attention_mask_nonzeros = extra_attention_mask.nonzero(as_tuple=True) + zero_to_max_range = torch.arange(0, max_num_extra_indices_per_batch, + device=num_extra_indices_per_batch.device) + # mask indicating which values are actually going to be padding + selection_padding_mask = zero_to_max_range < num_extra_indices_per_batch.unsqueeze(dim=-1) + # 2) location of the non-padding values in the selected global attention + selection_padding_mask_nonzeros = selection_padding_mask.nonzero(as_tuple=True) + # 3) location of the padding values in the selected global attention + selection_padding_mask_zeros = (selection_padding_mask == 0).nonzero(as_tuple=True) + else: + remove_from_windowed_attention_mask = None + extra_attention_mask = None + key_padding_mask = None + + hidden_states = hidden_states.transpose(0, 1) + seq_len, bsz, embed_dim = hidden_states.size() + assert embed_dim == self.embed_dim + q = self.query(hidden_states) + k = self.key(hidden_states) + v = self.value(hidden_states) + q /= math.sqrt(self.head_dim) + + q = q.view(seq_len, bsz, self.num_heads, self.head_dim).transpose(0, 1) + k = k.view(seq_len, bsz, self.num_heads, self.head_dim).transpose(0, 1) + # attn_weights = (bsz, seq_len, num_heads, window*2+1) + if self.attention_mode == 'tvm': + q = q.float().contiguous() + k = k.float().contiguous() + attn_weights = diagonaled_mm_tvm(q, k, self.attention_window, self.attention_dilation, False, 0, False) + elif self.attention_mode == "sliding_chunks": + attn_weights = sliding_chunks_matmul_qk(q, k, self.attention_window, padding_value=0) + elif self.attention_mode == "sliding_chunks_no_overlap": + attn_weights = sliding_chunks_no_overlap_matmul_qk(q, k, self.attention_window, padding_value=0) + else: + raise False + mask_invalid_locations(attn_weights, self.attention_window, self.attention_dilation, False) + if remove_from_windowed_attention_mask is not None: + # This implementation is fast and takes very little memory because num_heads x hidden_size = 1 + # from (bsz x seq_len) to (bsz x seq_len x num_heads x hidden_size) + remove_from_windowed_attention_mask = remove_from_windowed_attention_mask.unsqueeze(dim=-1).unsqueeze(dim=-1) + # cast to float/half then replace 1's with -inf + float_mask = 
remove_from_windowed_attention_mask.type_as(q).masked_fill(remove_from_windowed_attention_mask, -10000.0) + repeat_size = 1 if isinstance(self.attention_dilation, int) else len(self.attention_dilation) + float_mask = float_mask.repeat(1, 1, repeat_size, 1) + ones = float_mask.new_ones(size=float_mask.size()) # tensor of ones + # diagonal mask with zeros everywhere and -inf inplace of padding + if self.attention_mode == 'tvm': + d_mask = diagonaled_mm_tvm(ones, float_mask, self.attention_window, self.attention_dilation, False, 0, False) + elif self.attention_mode == "sliding_chunks": + d_mask = sliding_chunks_matmul_qk(ones, float_mask, self.attention_window, padding_value=0) + elif self.attention_mode == "sliding_chunks_no_overlap": + d_mask = sliding_chunks_no_overlap_matmul_qk(ones, float_mask, self.attention_window, padding_value=0) + + attn_weights += d_mask + assert list(attn_weights.size())[:3] == [bsz, seq_len, self.num_heads] + assert attn_weights.size(dim=3) in [self.attention_window * 2 + 1, self.attention_window * 3] + + # the extra attention + if extra_attention_mask is not None: + selected_k = k.new_zeros(bsz, max_num_extra_indices_per_batch, self.num_heads, self.head_dim) + selected_k[selection_padding_mask_nonzeros] = k[extra_attention_mask_nonzeros] + # (bsz, seq_len, num_heads, max_num_extra_indices_per_batch) + selected_attn_weights = torch.einsum('blhd,bshd->blhs', (q, selected_k)) + selected_attn_weights[selection_padding_mask_zeros[0], :, :, selection_padding_mask_zeros[1]] = -10000 + # concat to attn_weights + # (bsz, seq_len, num_heads, extra attention count + 2*window+1) + attn_weights = torch.cat((selected_attn_weights, attn_weights), dim=-1) + + # TODO: added position_bias for T5 + if position_bias is None: + if not self.has_relative_attention_bias: + raise ValueError("No position_bias provided and no weights to compute position_bias") + + position_bias = self.compute_bias(seq_len, 2*self.attention_window + 1) + + # if key and values are already calculated + # we want only the last query position bias + if past_key_value_state is not None: + position_bias = position_bias[:, :, -1:, :] + + # TODO: what should be the attn_mask added here? 
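The TODO above is essentially a shape question, so a small stand-alone illustration of the tensors involved at this point may help (sizes are made up and this is not part of the patch):

```
# Shape bookkeeping for `attn_weights += position_bias` below, with hypothetical sizes.
import torch
bsz, seq_len, num_heads, w = 2, 8, 4, 2
attn_weights = torch.zeros(bsz, seq_len, num_heads, 2 * w + 1)   # banded local scores
position_bias = torch.zeros(1, seq_len, num_heads, 2 * w + 1)    # compute_bias output, broadcasts over batch
print((attn_weights + position_bias).shape)                      # torch.Size([2, 8, 4, 5])
# A padding/global mask would have to be brought into this same banded layout first
# (the code above already does that once, via `d_mask`); note also that when global-attention
# columns are concatenated, the last dimensions no longer line up for this addition.
```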
+ # if attention_mask is not None: + # position_bias = position_bias + attention_mask # (1, num_heads, seq_len, 2*window+1) + + # ipdb.set_trace() + attn_weights += position_bias + + attn_weights_float = F.softmax(attn_weights, dim=-1, dtype=torch.float32) # use fp32 for numerical stability + if key_padding_mask is not None: + # softmax sometimes inserts NaN if all positions are masked, replace them with 0 + attn_weights_float = torch.masked_fill(attn_weights_float, key_padding_mask.unsqueeze(-1).unsqueeze(-1), 0.0) + attn_weights = attn_weights_float.type_as(attn_weights) + attn_probs = F.dropout(attn_weights_float.type_as(attn_weights), p=self.dropout, training=self.training) + v = v.view(seq_len, bsz, self.num_heads, self.head_dim).transpose(0, 1) + attn = 0 + if extra_attention_mask is not None: + selected_attn_probs = attn_probs.narrow(-1, 0, max_num_extra_indices_per_batch) + selected_v = v.new_zeros(bsz, max_num_extra_indices_per_batch, self.num_heads, self.head_dim) + selected_v[selection_padding_mask_nonzeros] = v[extra_attention_mask_nonzeros] + # use `matmul` because `einsum` crashes sometimes with fp16 + # attn = torch.einsum('blhs,bshd->blhd', (selected_attn_probs, selected_v)) + attn = torch.matmul(selected_attn_probs.transpose(1, 2), selected_v.transpose(1, 2).type_as(selected_attn_probs)).transpose(1, 2) + attn_probs = attn_probs.narrow(-1, max_num_extra_indices_per_batch, attn_probs.size(-1) - max_num_extra_indices_per_batch).contiguous() + + if self.attention_mode == 'tvm': + v = v.float().contiguous() + attn += diagonaled_mm_tvm(attn_probs, v, self.attention_window, self.attention_dilation, True, 0, False) + elif self.attention_mode == "sliding_chunks": + attn += sliding_chunks_matmul_pv(attn_probs, v, self.attention_window) + elif self.attention_mode == "sliding_chunks_no_overlap": + attn += sliding_chunks_no_overlap_matmul_pv(attn_probs, v, self.attention_window) + else: + raise False + + attn = attn.type_as(hidden_states) + assert list(attn.size()) == [bsz, seq_len, self.num_heads, self.head_dim] + attn = attn.transpose(0, 1).reshape(seq_len, bsz, embed_dim).contiguous() + + # For this case, we'll just recompute the attention for these indices + # and overwrite the attn tensor. 
TODO: remove the redundant computation + if extra_attention_mask is not None: + selected_hidden_states = hidden_states.new_zeros(max_num_extra_indices_per_batch, bsz, embed_dim) + selected_hidden_states[selection_padding_mask_nonzeros[::-1]] = hidden_states[extra_attention_mask_nonzeros[::-1]] + + q = self.query_global(selected_hidden_states) + k = self.key_global(hidden_states) + v = self.value_global(hidden_states) + q /= math.sqrt(self.head_dim) + + q = q.contiguous().view(max_num_extra_indices_per_batch, bsz * self.num_heads, self.head_dim).transpose(0, 1) # (bsz*self.num_heads, max_num_extra_indices_per_batch, head_dim) + k = k.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1) # bsz * self.num_heads, seq_len, head_dim) + v = v.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1) # bsz * self.num_heads, seq_len, head_dim) + attn_weights = torch.bmm(q, k.transpose(1, 2)) + assert list(attn_weights.size()) == [bsz * self.num_heads, max_num_extra_indices_per_batch, seq_len] + + attn_weights = attn_weights.view(bsz, self.num_heads, max_num_extra_indices_per_batch, seq_len) + attn_weights[selection_padding_mask_zeros[0], :, selection_padding_mask_zeros[1], :] = -10000.0 + if key_padding_mask is not None: + attn_weights = attn_weights.masked_fill( + key_padding_mask.unsqueeze(1).unsqueeze(2), + -10000.0, + ) + attn_weights = attn_weights.view(bsz * self.num_heads, max_num_extra_indices_per_batch, seq_len) + attn_weights_float = F.softmax(attn_weights, dim=-1, dtype=torch.float32) # use fp32 for numerical stability + attn_probs = F.dropout(attn_weights_float.type_as(attn_weights), p=self.dropout, training=self.training) + selected_attn = torch.bmm(attn_probs, v) + assert list(selected_attn.size()) == [bsz * self.num_heads, max_num_extra_indices_per_batch, self.head_dim] + + selected_attn_4d = selected_attn.view(bsz, self.num_heads, max_num_extra_indices_per_batch, self.head_dim) + nonzero_selected_attn = selected_attn_4d[selection_padding_mask_nonzeros[0], :, selection_padding_mask_nonzeros[1]] + attn[extra_attention_mask_nonzeros[::-1]] = nonzero_selected_attn.view(len(selection_padding_mask_nonzeros[0]), -1).type_as(hidden_states) + + context_layer = attn.transpose(0, 1) + if output_attentions: + if extra_attention_mask is not None: + # With global attention, return global attention probabilities only + # batch_size x num_heads x max_num_global_attention_tokens x sequence_length + # which is the attention weights from tokens with global attention to all tokens + # It doesn't not return local attention + # In case of variable number of global attantion in the rows of a batch, + # attn_weights are padded with -10000.0 attention scores + attn_weights = attn_weights.view(bsz, self.num_heads, max_num_extra_indices_per_batch, seq_len) + else: + # without global attention, return local attention probabilities + # batch_size x num_heads x sequence_length x window_size + # which is the attention weights of every token attending to its neighbours + attn_weights = attn_weights.permute(0, 2, 1, 3) + outputs = (context_layer, attn_weights) if output_attentions else (context_layer,) + return outputs + + +class LongformerSelfAttentionForT5(nn.Module): + def __init__(self, config, layer_id): + super().__init__() + self.embed_dim = config.d_model + self.longformer_self_attn = LongformerSelfAttentionT5Basic(config, layer_id=layer_id, + has_relative_attention_bias=True) #config.has_relative_attention_bias) + self.output = nn.Linear(self.embed_dim, self.embed_dim) 
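Before the wrapper class continues below, a quick illustration of the mask convention that `LongformerSelfAttentionT5Basic.forward` documents. Sizes are hypothetical; in the real code the mask arrives as T5's extended mask and is squeezed back to `(bsz, seq_len)` at the top of `forward`:

```
# Hypothetical example of the -ve / 0 / +ve mask convention used above.
import torch
bsz, seq_len = 2, 10
attention_mask = torch.zeros(bsz, seq_len)   # 0   -> local sliding-window attention
attention_mask[:, 0] = 1                     # +ve -> global attention (e.g. the first token)
attention_mask[0, 7:] = -1                   # -ve -> padding, no attention
# inside forward: key_padding_mask = attention_mask < 0, extra_attention_mask = attention_mask > 0
```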
+ + # def forward( + # self, + # query, + # key: Optional[Tensor], + # key_padding_mask: Optional[Tensor] = None, + # layer_state: Optional[Dict[str, Optional[Tensor]]] = None, + # attn_mask: Optional[Tensor] = None, + # need_weights=False, + # output_attentions=False, + # ) -> Tuple[Tensor, Optional[Tensor]]: + def forward( + self, + query, + mask=None, + kv=None, + position_bias=None, + past_key_value_state=None, + head_mask=None, + query_length=None, + use_cache=False, + output_attentions=False, + ): + + tgt_len, bsz, embed_dim = query.size() + assert embed_dim == self.embed_dim + assert list(query.size()) == [tgt_len, bsz, embed_dim] + + outputs = self.longformer_self_attn( + query, #.transpose(0, 1), # LongformerSelfAttention expects (bsz, seqlen, embd_dim) + #attention_mask=key_padding_mask.unsqueeze(dim=1).unsqueeze(dim=1) * -1, + attention_mask=mask, #.unsqueeze(dim=1).unsqueeze(dim=1)*-1, + output_attentions=output_attentions, + ) + + attn_output = self.output(outputs[0].transpose(0, 1)) + + return (attn_output,) + outputs[1:] if len(outputs) == 2 else (attn_output, None) diff --git a/requirements.txt b/requirements.txt index a98ef2a..3eb9122 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,5 +3,5 @@ pytorch-lightning @ git+http://github.com/ibeltagy/pytorch-lightning.git@v0.8.5_ torch==1.6.0 tensorboardX test-tube==0.7.5 -nlp +nlp==0.3.0 rouge_score diff --git a/scripts/convert_t5_to_longformerencoderdecoder.py b/scripts/convert_t5_to_longformerencoderdecoder.py new file mode 100644 index 0000000..6415cc0 --- /dev/null +++ b/scripts/convert_t5_to_longformerencoderdecoder.py @@ -0,0 +1,156 @@ +import argparse +import logging +import os + +from transformers import T5Tokenizer + +from transformers import T5ForConditionalGeneration +from transformers.modeling_bart import shift_tokens_right +from longformer.longformer_t5_encoder_decoder import LongformerSelfAttentionForT5, LongformerEncoderDecoderConfigT5 +from longformer.longformer_t5_encoder_decoder import LongformerEncoderDecoderForConditionalGenerationT5 + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + + +def create_long_model( + save_model_to, + base_model, + tokenizer_name_or_path, + attention_window, + max_pos +): + model = T5ForConditionalGeneration.from_pretrained(base_model) + tokenizer = T5Tokenizer.from_pretrained(tokenizer_name_or_path, model_max_length=max_pos) + config = LongformerEncoderDecoderConfigT5.from_pretrained(base_model) + model.config = config + + # in T5 attention_probs_dropout_prob is dropout_rate, but LongformerSelfAttention + # expects attention_probs_dropout_prob, so set it here + config.attention_probs_dropout_prob = config.dropout_rate + config.architectures = ['LongformerEncoderDecoderForConditionalGenerationT5', ] + + # extend position embeddings + tokenizer.model_max_length = max_pos + tokenizer.init_kwargs['model_max_length'] = max_pos + # current_max_pos, embed_size = model.model.embed_positions.weight.shape + # assert current_max_pos == config.max_position_embeddings + 2 + + # config.max_encoder_position_embeddings = max_pos + # config.max_decoder_position_embeddings = config.max_position_embeddings + # del config.max_position_embeddings + # # TODO: check what's the deal with T5 here. 
+ # max_pos += 2 # NOTE: BART has positions 0,1 reserved, so embedding size is max position + 2 + # assert max_pos >= current_max_pos + + # # allocate a larger position embedding matrix for the encoder + # new_encoder_pos_embed = model.model.encoder.embed_positions.weight.new_empty(max_pos, embed_size) + # # copy position embeddings over and over to initialize the new position embeddings + # k = 2 + # step = current_max_pos - 2 + # while k < max_pos - 1: + # new_encoder_pos_embed[k:(k + step)] = model.model.encoder.embed_positions.weight[2:] + # k += step + # model.model.encoder.embed_positions.weight.data = new_encoder_pos_embed + + # allocate a larger position embedding matrix for the decoder + # new_decoder_pos_embed = model.model.decoder.embed_positions.weight.new_empty(max_pos, embed_size) + # # copy position embeddings over and over to initialize the new position embeddings + # k = 2 + # step = current_max_pos - 2 + # while k < max_pos - 1: + # new_decoder_pos_embed[k:(k + step)] = model.model.decoder.embed_positions.weight[2:] + # k += step + # model.model.decoder.embed_positions.weight.data = new_decoder_pos_embed + + # replace the `modeling_t5.T5Attention` object with `LongformerSelfAttention` + config.attention_window = [attention_window] * config.num_hidden_layers + config.attention_dilation = [1] * config.num_hidden_layers + + for i, layer in enumerate(model.encoder.block): + self_attn = layer.layer[0].SelfAttention + + longformer_self_attn_for_t5 = LongformerSelfAttentionForT5(config, layer_id=i) + + longformer_self_attn_for_t5.longformer_self_attn.query = self_attn.q + longformer_self_attn_for_t5.longformer_self_attn.key = self_attn.k + longformer_self_attn_for_t5.longformer_self_attn.value = self_attn.v + + longformer_self_attn_for_t5.longformer_self_attn.query_global = self_attn.q + longformer_self_attn_for_t5.longformer_self_attn.key_global = self_attn.k + longformer_self_attn_for_t5.longformer_self_attn.value_global = self_attn.v + + longformer_self_attn_for_t5.output = self_attn.o + + layer.layer[0].SelfAttention = longformer_self_attn_for_t5 + + logger.info(f'saving model to {save_model_to}') + model.save_pretrained(save_model_to) + tokenizer.save_pretrained(save_model_to) + return model, tokenizer + + +def main(): + parser = argparse.ArgumentParser(description="Convert T5 to LongT5. Replaces T5 encoder's T5Attention with LongformerSelfAttention") + parser.add_argument( + '--base_model', + type=str, + default='t5-large', + help='The name or path of the base model you want to convert' + ) + parser.add_argument( + '--tokenizer_name_or_path', + type=str, + default='t5-large', + help='The name or path of the tokenizer' + ) + parser.add_argument( + '--save_model_to', + type=str, + required=True, + help='The path to save the converted model' + ) + parser.add_argument( + '--attention_window', + type=int, + default=512, + help='attention window size for longformer self attention (one sided)' + ) + parser.add_argument( + '--max_pos', + type=int, + default=4096 * 4, + help='maximum encoder positions' + ) + + args = parser.parse_args() + + if not os.path.exists(args.save_model_to): + os.mkdir(args.save_model_to) + + create_long_model( + save_model_to=args.save_model_to, + base_model=args.base_model, + tokenizer_name_or_path=args.tokenizer_name_or_path, + attention_window=args.attention_window, + max_pos=args.max_pos + ) + + tokenizer = T5Tokenizer.from_pretrained(args.save_model_to) + TXT = "My friends are but they eat too many carbs." 
+ model = LongformerEncoderDecoderForConditionalGenerationT5.from_pretrained(args.save_model_to) + model.encoder.config.gradient_checkpointing = True + model.decoder.config.gradient_checkpointing = True + data = tokenizer([TXT], return_tensors='pt', padding='max_length', max_length=2048) + input_ids = data['input_ids'] + attention_mask = data['attention_mask'] + decoder_input_ids = shift_tokens_right(input_ids[:, :5], tokenizer.pad_token_id) + logits = model(input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids, use_cache=False)[0] + masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() + probs = logits[0, masked_index].softmax(dim=0) + values, predictions = probs.topk(5) + print(tokenizer.convert_ids_to_tokens(predictions)) + + +if __name__ == "__main__": + main() From adc92cabc8e8c6fc89a4ccab03e887a3d53e0d97 Mon Sep 17 00:00:00 2001 From: Akshita Bhagia Date: Mon, 23 Nov 2020 15:09:29 -0800 Subject: [PATCH 109/112] naive code for smaller score matrix --- longformer/longformer_t5_encoder_decoder.py | 44 ++++++++++++--------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/longformer/longformer_t5_encoder_decoder.py b/longformer/longformer_t5_encoder_decoder.py index b02cb36..ce46228 100644 --- a/longformer/longformer_t5_encoder_decoder.py +++ b/longformer/longformer_t5_encoder_decoder.py @@ -38,7 +38,7 @@ def __init__(self, attention_window: List[int] = None, attention_dilation: List[ self.autoregressive = autoregressive self.attention_mode = attention_mode self.gradient_checkpointing = gradient_checkpointing - # self.attention_probs_dropout_prob = self.dropout_rate + self.attention_probs_dropout_prob = self.dropout_rate assert self.attention_mode in ['tvm', 'sliding_chunks', 'n2'] class LongformerSelfAttentionT5Basic(nn.Module): @@ -129,6 +129,26 @@ def _relative_position_bucket(relative_position, bidirectional=True, num_buckets ret += torch.where(is_small, n, val_if_large) return ret + @staticmethod + def _smaller_score_matrix(matrix, seq_len, w, bidirectional): + + diag_sums = torch.zeros(seq_len, 2*w+1) + #diag_sums.fill_(float('-inf')) + last = w+1 if bidirectional else 1 + + c = 0 + for k in range(-w, last): + d = torch.diagonal(matrix, offset=k, dim1=-2, dim2=-1) + if d.nelement(): + if k <= 0: + diag_sums[abs(k):seq_len, c] = d + else: + diag_sums[0:seq_len-k, c] = d + c += 1 + + # mask_invalid_locations(diag_sums.unsqueeze(0).unsqueeze(2).float(), w, 1, True) + return diag_sums.long() + def compute_bias(self, qlen, klen): """ Compute binned relative position bias """ context_position = torch.arange(qlen, dtype=torch.long)[:, None] @@ -139,9 +159,10 @@ def compute_bias(self, qlen, klen): bidirectional=not self.is_decoder, num_buckets=self.relative_attention_num_buckets, ) + rp_bucket = self._smaller_score_matrix(rp_bucket, qlen, w=self.attention_window, bidirectional=not self.is_decoder) rp_bucket = rp_bucket.to(self.relative_attention_bias.weight.device) values = self.relative_attention_bias(rp_bucket) # shape (qlen, klen, num_heads) - # values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, qlen, klen) +# values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, qlen, klen) # Changing the shape to below because that's what LongformerSelfAttention's attn_weights need. 
values = values.permute([0, 2, 1]).unsqueeze(0) # shape (1, qlen, num_heads, klen) return values @@ -245,23 +266,20 @@ def forward( # (bsz, seq_len, num_heads, extra attention count + 2*window+1) attn_weights = torch.cat((selected_attn_weights, attn_weights), dim=-1) - # TODO: added position_bias for T5 if position_bias is None: if not self.has_relative_attention_bias: raise ValueError("No position_bias provided and no weights to compute position_bias") - position_bias = self.compute_bias(seq_len, 2*self.attention_window + 1) + position_bias = self.compute_bias(seq_len, seq_len) # if key and values are already calculated # we want only the last query position bias if past_key_value_state is not None: position_bias = position_bias[:, :, -1:, :] - # TODO: what should be the attn_mask added here? - # if attention_mask is not None: - # position_bias = position_bias + attention_mask # (1, num_heads, seq_len, 2*window+1) + if attention_mask is not None: + position_bias = position_bias + attention_mask # (1, num_heads, seq_len, 2*window+1) - # ipdb.set_trace() attn_weights += position_bias attn_weights_float = F.softmax(attn_weights, dim=-1, dtype=torch.float32) # use fp32 for numerical stability @@ -356,16 +374,6 @@ def __init__(self, config, layer_id): has_relative_attention_bias=True) #config.has_relative_attention_bias) self.output = nn.Linear(self.embed_dim, self.embed_dim) - # def forward( - # self, - # query, - # key: Optional[Tensor], - # key_padding_mask: Optional[Tensor] = None, - # layer_state: Optional[Dict[str, Optional[Tensor]]] = None, - # attn_mask: Optional[Tensor] = None, - # need_weights=False, - # output_attentions=False, - # ) -> Tuple[Tensor, Optional[Tensor]]: def forward( self, query, From 02f9c3f50f821c2947dc6d24e58654149a9f35e3 Mon Sep 17 00:00:00 2001 From: Akshita Bhagia Date: Mon, 23 Nov 2020 15:28:35 -0800 Subject: [PATCH 110/112] commenting attn_mask --- longformer/longformer_t5_encoder_decoder.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/longformer/longformer_t5_encoder_decoder.py b/longformer/longformer_t5_encoder_decoder.py index ce46228..df797e3 100644 --- a/longformer/longformer_t5_encoder_decoder.py +++ b/longformer/longformer_t5_encoder_decoder.py @@ -277,8 +277,9 @@ def forward( if past_key_value_state is not None: position_bias = position_bias[:, :, -1:, :] - if attention_mask is not None: - position_bias = position_bias + attention_mask # (1, num_heads, seq_len, 2*window+1) + # attention_mask is not the right size; should it even be added? + # if attention_mask is not None: + # position_bias = position_bias + attention_mask # (1, num_heads, seq_len, 2*window+1) attn_weights += position_bias From de147decc6e05d5e95f4523e4fce02ff95880daa Mon Sep 17 00:00:00 2001 From: Akshita Bhagia Date: Mon, 23 Nov 2020 17:50:06 -0800 Subject: [PATCH 111/112] fixing order and issue with inf --- longformer/longformer_t5_encoder_decoder.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/longformer/longformer_t5_encoder_decoder.py b/longformer/longformer_t5_encoder_decoder.py index df797e3..3ab9caa 100644 --- a/longformer/longformer_t5_encoder_decoder.py +++ b/longformer/longformer_t5_encoder_decoder.py @@ -108,12 +108,17 @@ def _relative_position_bucket(relative_position, bidirectional=True, num_buckets """ ret = 0 n = -relative_position + # Since torch.abs() will not work correctly with converted inf values, explicitly set to a lower value. + # TODO: check this!! 
+ n[n==float('inf')] = max_distance + n[n==float('-inf')] = -max_distance if bidirectional: num_buckets //= 2 ret += (n < 0).to(torch.long) * num_buckets # mtf.to_int32(mtf.less(n, 0)) * num_buckets n = torch.abs(n) else: n = torch.max(n, torch.zeros_like(n)) + n = n.long() # now n is in the range [0, inf) # half of the buckets are for exact increments in positions @@ -134,6 +139,8 @@ def _smaller_score_matrix(matrix, seq_len, w, bidirectional): diag_sums = torch.zeros(seq_len, 2*w+1) #diag_sums.fill_(float('-inf')) + diag_sums[:, 0:w].fill_(float('-inf')) + diag_sums[:, w+1:].fill_(float('inf')) last = w+1 if bidirectional else 1 c = 0 @@ -147,19 +154,19 @@ def _smaller_score_matrix(matrix, seq_len, w, bidirectional): c += 1 # mask_invalid_locations(diag_sums.unsqueeze(0).unsqueeze(2).float(), w, 1, True) - return diag_sums.long() + return diag_sums def compute_bias(self, qlen, klen): """ Compute binned relative position bias """ context_position = torch.arange(qlen, dtype=torch.long)[:, None] memory_position = torch.arange(klen, dtype=torch.long)[None, :] relative_position = memory_position - context_position # shape (qlen, klen) + relative_position = self._smaller_score_matrix(relative_position, qlen, w=self.attention_window, bidirectional=not self.is_decoder) rp_bucket = self._relative_position_bucket( relative_position, # shape (qlen, klen) bidirectional=not self.is_decoder, num_buckets=self.relative_attention_num_buckets, ) - rp_bucket = self._smaller_score_matrix(rp_bucket, qlen, w=self.attention_window, bidirectional=not self.is_decoder) rp_bucket = rp_bucket.to(self.relative_attention_bias.weight.device) values = self.relative_attention_bias(rp_bucket) # shape (qlen, klen, num_heads) # values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, qlen, klen) From 1ba5286d6f4126491263fe27cb15806a56dfab8a Mon Sep 17 00:00:00 2001 From: Akshita Bhagia Date: Mon, 7 Dec 2020 23:16:48 -0800 Subject: [PATCH 112/112] fixing compute_bias --- longformer/longformer_t5_encoder_decoder.py | 82 ++++++------------- .../convert_t5_to_longformerencoderdecoder.py | 1 + 2 files changed, 26 insertions(+), 57 deletions(-) diff --git a/longformer/longformer_t5_encoder_decoder.py b/longformer/longformer_t5_encoder_decoder.py index 3ab9caa..85cf786 100644 --- a/longformer/longformer_t5_encoder_decoder.py +++ b/longformer/longformer_t5_encoder_decoder.py @@ -87,81 +87,49 @@ def _relative_position_bucket(relative_position, bidirectional=True, num_buckets """ Adapted from Mesh Tensorflow: https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 - Translate relative position to a bucket number for relative attention. - The relative position is defined as memory_position - query_position, i.e. - the distance in tokens from the attending position to the attended-to - position. If bidirectional=False, then positive relative positions are - invalid. - We use smaller buckets for small absolute relative_position and larger buckets - for larger absolute relative_positions. All relative positions >=max_distance - map to the same bucket. All relative positions <=-max_distance map to the - same bucket. This should allow for more graceful generalization to longer - sequences than the model has been trained on. + Translate relative position to a bucket number for relative attention. The relative position is defined as + memory_position - query_position, i.e. 
the distance in tokens from the attending position to the attended-to + position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for + small absolute relative_position and larger buckets for larger absolute relative_positions. All relative + positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket. + This should allow for more graceful generalization to longer sequences than the model has been trained on Args: relative_position: an int32 Tensor bidirectional: a boolean - whether the attention is bidirectional num_buckets: an integer max_distance: an integer Returns: - a Tensor with the same shape as relative_position, containing int32 - values in the range [0, num_buckets) + a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets) """ - ret = 0 - n = -relative_position - # Since torch.abs() will not work correctly with converted inf values, explicitly set to a lower value. - # TODO: check this!! - n[n==float('inf')] = max_distance - n[n==float('-inf')] = -max_distance + relative_buckets = 0 if bidirectional: num_buckets //= 2 - ret += (n < 0).to(torch.long) * num_buckets # mtf.to_int32(mtf.less(n, 0)) * num_buckets - n = torch.abs(n) + relative_buckets += (relative_position > 0).to(torch.long) * num_buckets + relative_position = torch.abs(relative_position) else: - n = torch.max(n, torch.zeros_like(n)) - n = n.long() - # now n is in the range [0, inf) + relative_position = -torch.min(relative_position, torch.zeros_like(relative_position)) + # now relative_position is in the range [0, inf) # half of the buckets are for exact increments in positions max_exact = num_buckets // 2 - is_small = n < max_exact + is_small = relative_position < max_exact # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance - val_if_large = max_exact + ( - torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact) + relative_postion_if_large = max_exact + ( + torch.log(relative_position.float() / max_exact) + / math.log(max_distance / max_exact) + * (num_buckets - max_exact) ).to(torch.long) - val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1)) - - ret += torch.where(is_small, n, val_if_large) - return ret + relative_postion_if_large = torch.min( + relative_postion_if_large, torch.full_like(relative_postion_if_large, num_buckets - 1) + ) - @staticmethod - def _smaller_score_matrix(matrix, seq_len, w, bidirectional): - - diag_sums = torch.zeros(seq_len, 2*w+1) - #diag_sums.fill_(float('-inf')) - diag_sums[:, 0:w].fill_(float('-inf')) - diag_sums[:, w+1:].fill_(float('inf')) - last = w+1 if bidirectional else 1 - - c = 0 - for k in range(-w, last): - d = torch.diagonal(matrix, offset=k, dim1=-2, dim2=-1) - if d.nelement(): - if k <= 0: - diag_sums[abs(k):seq_len, c] = d - else: - diag_sums[0:seq_len-k, c] = d - c += 1 - - # mask_invalid_locations(diag_sums.unsqueeze(0).unsqueeze(2).float(), w, 1, True) - return diag_sums + relative_buckets += torch.where(is_small, relative_position, relative_postion_if_large) + return relative_buckets def compute_bias(self, qlen, klen): """ Compute binned relative position bias """ - context_position = torch.arange(qlen, dtype=torch.long)[:, None] - memory_position = torch.arange(klen, dtype=torch.long)[None, :] - relative_position = memory_position - context_position # shape (qlen, klen) - relative_position = 
self._smaller_score_matrix(relative_position, qlen, w=self.attention_window, bidirectional=not self.is_decoder) + relative_position = torch.tensor([[i-self.attention_window for i in range(2*self.attention_window+1)]]) rp_bucket = self._relative_position_bucket( relative_position, # shape (qlen, klen) bidirectional=not self.is_decoder, @@ -189,7 +157,6 @@ def forward( 0: local attention +ve: global attention ''' - if attention_mask is not None: attention_mask = attention_mask.squeeze(dim=2).squeeze(dim=1) key_padding_mask = attention_mask < 0 @@ -284,7 +251,8 @@ def forward( if past_key_value_state is not None: position_bias = position_bias[:, :, -1:, :] - # attention_mask is not the right size; should it even be added? + # TODO: attention_mask should also be the same shape as position_bias. + # Sliding attention window?? # if attention_mask is not None: # position_bias = position_bias + attention_mask # (1, num_heads, seq_len, 2*window+1) diff --git a/scripts/convert_t5_to_longformerencoderdecoder.py b/scripts/convert_t5_to_longformerencoderdecoder.py index 6415cc0..c06930b 100644 --- a/scripts/convert_t5_to_longformerencoderdecoder.py +++ b/scripts/convert_t5_to_longformerencoderdecoder.py @@ -66,6 +66,7 @@ def create_long_model( # replace the `modeling_t5.T5Attention` object with `LongformerSelfAttention` config.attention_window = [attention_window] * config.num_hidden_layers config.attention_dilation = [1] * config.num_hidden_layers + # model.encoder.block = model.encoder.block[:1] for i, layer in enumerate(model.encoder.block): self_attn = layer.layer[0].SelfAttention
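To see what the fixed `compute_bias` in this last patch produces, here is a self-contained shape sketch; the sizes are made up and a random tensor stands in for `_relative_position_bucket`:

```
# Every query position shares the same 2*w+1 relative offsets [-w, ..., w], so one row of
# buckets is enough; after the permute/unsqueeze the bias broadcasts over batch and seq_len.
import torch
from torch import nn

w, num_heads, num_buckets = 4, 8, 32
relative_position = torch.tensor([[i - w for i in range(2 * w + 1)]])   # (1, 2*w+1)
rp_bucket = torch.randint(0, num_buckets, relative_position.shape)      # stand-in for _relative_position_bucket
relative_attention_bias = nn.Embedding(num_buckets, num_heads)
values = relative_attention_bias(rp_bucket)                             # (1, 2*w+1, num_heads)
values = values.permute([0, 2, 1]).unsqueeze(0)                         # (1, 1, num_heads, 2*w+1)
print(values.shape)   # broadcasts against attn_weights of shape (bsz, seq_len, num_heads, 2*w+1)
```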