From 1f5537902aece8ad54ab65cc47c57e61918ac31c Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 4 Jul 2020 01:09:36 -0700 Subject: [PATCH 001/112] adding output_attentions arg --- longformer/longformer.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/longformer/longformer.py b/longformer/longformer.py index 81e55d3..89d8d15 100644 --- a/longformer/longformer.py +++ b/longformer/longformer.py @@ -58,7 +58,6 @@ def __init__(self, config, layer_id): raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " "heads (%d)" % (config.hidden_size, config.num_attention_heads)) - self.output_attentions = config.output_attentions self.num_heads = config.num_attention_heads self.head_dim = int(config.hidden_size / config.num_attention_heads) self.embed_dim = config.hidden_size @@ -92,6 +91,7 @@ def forward( head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, + output_attentions=False, ): ''' The `attention_mask` is changed in `BertModel.forward` from 0, 1, 2 to @@ -181,7 +181,6 @@ def forward( if key_padding_mask is not None: # softmax sometimes inserts NaN if all positions are masked, replace them with 0 attn_weights_float = torch.masked_fill(attn_weights_float, key_padding_mask.unsqueeze(-1).unsqueeze(-1), 0.0) - attn_weights = attn_weights_float.type_as(attn_weights) attn_probs = F.dropout(attn_weights_float.type_as(attn_weights), p=self.dropout, training=self.training) v = v.view(seq_len, bsz, self.num_heads, self.head_dim).transpose(0, 1) @@ -240,7 +239,7 @@ def forward( attn[extra_attention_mask_nonzeros[::-1]] = nonzero_selected_attn.view(len(selection_padding_mask_nonzeros[0]), -1).type_as(hidden_states) context_layer = attn.transpose(0, 1) - if self.output_attentions: + if output_attentions: if extra_attention_mask is not None: # With global attention, return global attention probabilities only # batch_size x num_heads x max_num_global_attention_tokens x sequence_length @@ -254,5 +253,5 @@ def forward( # batch_size x num_heads x sequence_length x window_size # which is the attention weights of every token attending to its neighbours attn_weights = attn_weights.permute(0, 2, 1, 3) - outputs = (context_layer, attn_weights) if self.output_attentions else (context_layer,) + outputs = (context_layer, attn_weights) if output_attentions else (context_layer,) return outputs From b98d1912e33e2e28a87a0c0c048a94e9d1bead9e Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 4 Jul 2020 01:36:58 -0700 Subject: [PATCH 002/112] adding gradient_checkpointing config --- longformer/longformer_encoder_decoder.py | 77 ++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 longformer/longformer_encoder_decoder.py diff --git a/longformer/longformer_encoder_decoder.py b/longformer/longformer_encoder_decoder.py new file mode 100644 index 0000000..741939a --- /dev/null +++ b/longformer/longformer_encoder_decoder.py @@ -0,0 +1,77 @@ +from typing import List, Optional, Tuple, Dict +from torch import nn, Tensor +from longformer.longformer import LongformerSelfAttention +from transformers.modeling_bart import BartConfig, BartForConditionalGeneration + + +class LongformerEncoderDecoderForConditionalGeneration(BartForConditionalGeneration): + def __init__(self, config): + super().__init__(config) + for i, layer in enumerate(self.model.encoder.layers): + layer.self_attn = LongformerSelfAttentionForBart(config, layer_id=i) + + +class LongformerEncoderDecoderConfig(BartConfig): + def __init__(self, attention_window: List[int] 
= None, attention_dilation: List[int] = None, + autoregressive: bool = False, attention_mode: str = 'sliding_chunks', + gradient_checkpointing: bool = False, **kwargs): + """ + Args: + attention_window: list of attention window sizes of length = number of layers. + window size = number of attention locations on each side. + For an affective window size of 512, use `attention_window=[256]*num_layers` + which is 256 on each side. + attention_dilation: list of attention dilation of length = number of layers. + attention dilation of `1` means no dilation. + autoregressive: do autoregressive attention or have attention of both sides + attention_mode: 'n2' for regular n^2 self-attention, 'tvm' for TVM implemenation of Longformer + selfattention, 'sliding_chunks' for another implementation of Longformer selfattention + """ + super().__init__(**kwargs) + self.attention_window = attention_window + self.attention_dilation = attention_dilation + self.autoregressive = autoregressive + self.attention_mode = attention_mode + self.gradient_checkpointing = gradient_checkpointing + assert self.attention_mode in ['tvm', 'sliding_chunks', 'n2'] + + +class LongformerSelfAttentionForBart(nn.Module): + def __init__(self, config, layer_id): + super().__init__() + self.embed_dim = config.d_model + self.longformer_self_attn = LongformerSelfAttention(config, layer_id=layer_id) + self.output = nn.Linear(self.embed_dim, self.embed_dim) + + def forward( + self, + query, + key: Optional[Tensor], + key_padding_mask: Optional[Tensor] = None, + layer_state: Optional[Dict[str, Optional[Tensor]]] = None, + attn_mask: Optional[Tensor] = None, + need_weights=False, + output_attentions=False, + ) -> Tuple[Tensor, Optional[Tensor]]: + + tgt_len, bsz, embed_dim = query.size() + assert embed_dim == self.embed_dim + assert list(query.size()) == [tgt_len, bsz, embed_dim] + assert attn_mask is None + + # LongformerSelfAttention expects this shape + query = query.view(bsz, tgt_len, embed_dim) + outputs = self.longformer_self_attn( + query, + attention_mask=key_padding_mask.unsqueeze(dim=1).unsqueeze(dim=1) * -1, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + output_attentions=output_attentions, + ) + + attn_output = outputs[0] + attn_output = attn_output.contiguous().view(tgt_len, bsz, embed_dim) + attn_output = self.output(attn_output) + + return (attn_output,) + outputs[1:] if len(outputs) == 2 else (attn_output, None) From c10277cca8ce58c01bfc04722517a7b019102585 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 4 Jul 2020 01:38:04 -0700 Subject: [PATCH 003/112] convert bart to longformer_encoder_decoder + memory profiler --- ...onvert_bart_to_longformerencoderdecoder.py | 148 ++++++++++++++++++ scripts/mem_profiler.py | 58 +++++++ 2 files changed, 206 insertions(+) create mode 100644 scripts/convert_bart_to_longformerencoderdecoder.py create mode 100644 scripts/mem_profiler.py diff --git a/scripts/convert_bart_to_longformerencoderdecoder.py b/scripts/convert_bart_to_longformerencoderdecoder.py new file mode 100644 index 0000000..e469819 --- /dev/null +++ b/scripts/convert_bart_to_longformerencoderdecoder.py @@ -0,0 +1,148 @@ +import argparse +import logging +import os + +from transformers import BartTokenizer + +from transformers import BartForConditionalGeneration +from longformer.longformer_encoder_decoder import LongformerSelfAttentionForBart, LongformerEncoderDecoderConfig +from longformer.longformer_encoder_decoder import LongformerEncoderDecoderForConditionalGeneration + +logger = 
logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + + +def create_long_model( + save_model_to, + base_model='facebook/bart-large', + tokenizer_name_or_path='facebook/bart-large', + attention_window=512, + max_pos=4096 +): + model = BartForConditionalGeneration.from_pretrained(base_model) + tokenizer = BartTokenizer.from_pretrained(tokenizer_name_or_path, model_max_length=max_pos) + config = LongformerEncoderDecoderConfig.from_pretrained(base_model) + model.config = config + + # in BART attention_probs_dropout_prob is attention_dropout, but LongformerSelfAttention + # expects attention_probs_dropout_prob, so set it here + config.attention_probs_dropout_prob = config.attention_dropout + config.architectures = ['LongformerEncoderDecoderForConditionalGeneration', ] + + # extend position embeddings + tokenizer.model_max_length = max_pos + tokenizer.init_kwargs['model_max_length'] = max_pos + current_max_pos, embed_size = model.model.encoder.embed_positions.weight.shape + assert current_max_pos == config.max_position_embeddings + 2 + + config.max_position_embeddings = max_pos + max_pos += 2 # NOTE: BART has positions 0,1 reserved, so embedding size is max position + 2 + assert max_pos >= current_max_pos + + # allocate a larger position embedding matrix for the encoder + new_encoder_pos_embed = model.model.encoder.embed_positions.weight.new_empty(max_pos, embed_size) + # copy position embeddings over and over to initialize the new position embeddings + k = 2 + step = current_max_pos - 2 + while k < max_pos - 1: + new_encoder_pos_embed[k:(k + step)] = model.model.encoder.embed_positions.weight[2:] + k += step + model.model.encoder.embed_positions.weight.data = new_encoder_pos_embed + + # allocate a larger position embedding matrix for the decoder + new_decoder_pos_embed = model.model.decoder.embed_positions.weight.new_empty(max_pos, embed_size) + # copy position embeddings over and over to initialize the new position embeddings + k = 2 + step = current_max_pos - 2 + while k < max_pos - 1: + new_decoder_pos_embed[k:(k + step)] = model.model.decoder.embed_positions.weight[2:] + k += step + model.model.decoder.embed_positions.weight.data = new_decoder_pos_embed + + # replace the `modeling_bart.SelfAttention` object with `LongformerSelfAttention` + config.attention_window = [attention_window] * config.num_hidden_layers + config.attention_dilation = [1] * config.num_hidden_layers + + for i, layer in enumerate(model.model.encoder.layers): + longformer_self_attn_for_bart = LongformerSelfAttentionForBart(config, layer_id=i) + + longformer_self_attn_for_bart.longformer_self_attn.query = layer.self_attn.q_proj + longformer_self_attn_for_bart.longformer_self_attn.key = layer.self_attn.k_proj + longformer_self_attn_for_bart.longformer_self_attn.value = layer.self_attn.v_proj + + longformer_self_attn_for_bart.longformer_self_attn.query_global = layer.self_attn.q_proj + longformer_self_attn_for_bart.longformer_self_attn.key_global = layer.self_attn.k_proj + longformer_self_attn_for_bart.longformer_self_attn.value_global = layer.self_attn.v_proj + + longformer_self_attn_for_bart.output = layer.self_attn.out_proj + + layer.self_attn = longformer_self_attn_for_bart + logger.info(f'saving model to {save_model_to}') + model.save_pretrained(save_model_to) + tokenizer.save_pretrained(save_model_to) + return model, tokenizer + + +def main(): + parser = argparse.ArgumentParser(description="Convert BART to LongBART. 
Replaces BART encoder's SelfAttnetion with LongformerSelfAttention") + parser.add_argument( + '--base_model', + type=str, + default='facebook/bart-large', + help='The name or path of the base model you want to convert' + ) + parser.add_argument( + '--tokenizer_name_or_path', + type=str, + default='facebook/bart-large', + help='The name or path of the tokenizer' + ) + parser.add_argument( + '--save_model_to', + type=str, + required=True, + help='The path to save the converted model' + ) + parser.add_argument( + '--attention_window', + type=int, + default=512, + help='attention window size for longformer self attention' + ) + parser.add_argument( + '--max_pos', + type=int, + default=4096, + help='maximum encoder positions' + ) + + args = parser.parse_args() + + if not os.path.exists(args.save_model_to): + os.mkdir(args.save_model_to) + + create_long_model( + save_model_to=args.save_model_to, + base_model=args.base_model, + tokenizer_name_or_path=args.tokenizer_name_or_path, + attention_window=args.attention_window, + max_pos=args.max_pos + ) + + tokenizer = BartTokenizer.from_pretrained(args.save_model_to) + TXT = "My friends are but they eat too many carbs." + model = LongformerEncoderDecoderForConditionalGeneration.from_pretrained(args.save_model_to) + model.model.encoder.config.gradient_checkpointing = True + model.model.decoder.config.gradient_checkpointing = True + data = tokenizer([TXT], return_tensors='pt', padding='max_length', max_length=2048) + input_ids = data['input_ids'] + attention_mask = data['attention_mask'] + logits = model(input_ids, attention_mask=attention_mask)[0] + masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() + probs = logits[0, masked_index].softmax(dim=0) + values, predictions = probs.topk(5) + print(tokenizer.decode(predictions).split()) + + +if __name__ == "__main__": + main() diff --git a/scripts/mem_profiler.py b/scripts/mem_profiler.py new file mode 100644 index 0000000..64a6d56 --- /dev/null +++ b/scripts/mem_profiler.py @@ -0,0 +1,58 @@ +from longformer.longformer_encoder_decoder import LongformerEncoderDecoderForConditionalGeneration +from longformer.longformer_encoder_decoder import LongformerEncoderDecoderConfig + +import torch +from torch.utils.data import DataLoader, Dataset +from pytorch_lightning import Trainer +import pytorch_lightning as pl + +seqlen = 1024 * 12 +global_size = 0 # seqlen // 100 +attention_window = 512 # one sided + + +class CoolDataset(Dataset): + def __len__(self): + return 1024 # number of examples + + def __getitem__(self, idx): + tokne_ids = torch.tensor([5] * seqlen) + mask = torch.tensor([1] * seqlen) + # mask[:global_size] = 2 + return tokne_ids, mask + + +class MemoryProfiler(pl.LightningModule): + + def __init__(self, hparams=None): + super().__init__() + self.hparams = hparams + + config = LongformerEncoderDecoderConfig.from_pretrained('bart-long-4096') + config.max_position_embeddings = seqlen + 2 + config.gradient_checkpointing = True + config.attention_mode = 'sliding_chunks' + config.attention_window = [attention_window] * config.num_hidden_layers + self.model = LongformerEncoderDecoderForConditionalGeneration(config) + + def forward(self, x, y): + print(seqlen, global_size, attention_window, torch.cuda.max_memory_allocated(x.device) / 1024 ** 3) + return self.model(x, attention_mask=y) + + def training_step(self, batch, batch_idx): + x, y = batch + y_hat = self(x, y) + loss = y_hat[0].sum() + return {'loss': loss} + + def configure_optimizers(self): + return 
torch.optim.Adam(self.parameters(), lr=0.001) + + def train_dataloader(self): + return DataLoader(CoolDataset(), batch_size=1, num_workers=0) + + +if __name__ == '__main__': + model = MemoryProfiler() + trainer = Trainer(gpus=[0], progress_bar_refresh_rate=1, max_epochs=1, amp_level='O2', use_amp=True) + trainer.fit(model) From e29d7f53697fa755c51eec0a4ebc86e53f520d1e Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 4 Jul 2020 01:48:12 -0700 Subject: [PATCH 004/112] reqs and init --- longformer/__init__.py | 3 +++ requirements.txt | 7 ++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/longformer/__init__.py b/longformer/__init__.py index e69de29..d3e343c 100644 --- a/longformer/__init__.py +++ b/longformer/__init__.py @@ -0,0 +1,3 @@ +from longformer.longformer import Longformer, LongformerForMaskedLM, LongformerConfig +from longformer.longformer_encoder_decoder import LongformerEncoderDecoderConfig +from longformer.longformer_encoder_decoder import LongformerEncoderDecoderForConditionalGeneration \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index f6806dc..bbee48c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ +-e git://github.com/ibeltagy/transformers.git@longformer_encoder_decoder#egg=transformers + torch>=1.2.0 -transformers>=2.2.0 tensorboardX -pytorch-lightning==0.6.0 -test-tube==0.7.5 +pytorch-lightning>=0.7.6 +test-tube From dd0dc0d55827316ccfff6dad3bac5732855537c6 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 4 Jul 2020 10:47:02 -0700 Subject: [PATCH 005/112] fix req --- requirements.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index bbee48c..87a3c78 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ --e git://github.com/ibeltagy/transformers.git@longformer_encoder_decoder#egg=transformers - +transformers @ git://github.com/ibeltagy/transformers.git@longformer_encoder_decoder#egg=transformers torch>=1.2.0 tensorboardX pytorch-lightning>=0.7.6 From 54a13288477fcea9852f19b71b7ad6853611aad8 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 4 Jul 2020 10:49:24 -0700 Subject: [PATCH 006/112] req --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 87a3c78..94a1875 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -transformers @ git://github.com/ibeltagy/transformers.git@longformer_encoder_decoder#egg=transformers +transformers @ git+http://github.com/ibeltagy/transformers.git@longformer_encoder_decoder#egg=transformers torch>=1.2.0 tensorboardX pytorch-lightning>=0.7.6 From 243cfe802096957c866bab14d2a16426fdfbd169 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Mon, 6 Jul 2020 09:12:51 -0700 Subject: [PATCH 007/112] Update README.md --- README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/README.md b/README.md index 5b0fd2c..833e7c9 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,20 @@ #

`Longformer`

 `Longformer` is a BERT-like model for long documents.
+**\*\*\*\*\* Work In Progress: LongformerEncoderDecoder \*\*\*\*\***
+
+A `LongformerEncoderDecoder` model is now available. It is geared towards summarization where the input is long but the output is relatively shorter.
+
+The following code snippet loads a `LongformerEncoderDecoder` checkpoint started from `BartLarge`. With gradient checkpointing, fp16, and a 48GB GPU, the input length can be up to 12K.
+```
+pip install git+https://github.com/allenai/longformer.git@encoderdecoder
+
+# checkpoint: https://ai2-s2-research.s3-us-west-2.amazonaws.com/longformer/longformer-encdec-large-12288.tar.gz
+
+from longformer import LongformerEncoderDecoderForConditionalGeneration
+model = LongformerEncoderDecoderForConditionalGeneration.from_pretrained(downloaded_checkpoint, gradient_checkpointing=True)
+```
+
 **\*\*\*\*\* New June 2nd, 2020: Integrating with Huggingface + Train your own long model + Gradient checkpointing \*\*\*\*\***

 1. `Longformer` is now integrated in the huggingface/transformers [release v2.11.0](https://github.com/huggingface/transformers/tree/v2.11.0). Now you can do

From fbbc770d6e8d160af7a255144173afca50b48486 Mon Sep 17 00:00:00 2001
From: Iz Beltagy
Date: Mon, 6 Jul 2020 09:13:18 -0700
Subject: [PATCH 008/112] Update README.md

---
 README.md | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 833e7c9..8464003 100644
--- a/README.md
+++ b/README.md
@@ -3,9 +3,7 @@

 **\*\*\*\*\* Work In Progress: LongformerEncoderDecoder \*\*\*\*\***

-A `LongformerEncoderDecoder` model is now available. It is geared towards summarization where the input is long but the output is relatively shorter.
-
-The following code snippet loads a `LongformerEncoderDecoder` checkpoint started from `BartLarge`. With gradient checkpointing, fp16, and a 48GB GPU, the input length can be up to 12K.
+A `LongformerEncoderDecoder` model is now available. It is geared towards summarization where the input is long but the output is relatively shorter. The following code snippet loads a `LongformerEncoderDecoder` checkpoint started from `BartLarge`. With gradient checkpointing, fp16, and a 48GB GPU, the input length can be up to 12K. 
``` pip install git+https://github.com/allenai/longformer.git@encoderdecoder From 95296ad89e200b7a7a56f34d4a87cde47162b777 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 15 Jul 2020 23:19:32 -0700 Subject: [PATCH 009/112] pretraining script --- requirements.txt | 6 +- scripts/pretrain.py | 277 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 280 insertions(+), 3 deletions(-) create mode 100644 scripts/pretrain.py diff --git a/requirements.txt b/requirements.txt index f6806dc..75cbab1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -torch>=1.2.0 -transformers>=2.2.0 +torch>=1.5.0 +transformers>=3.0.1 tensorboardX -pytorch-lightning==0.6.0 +pytorch-lightning==0.7.6 test-tube==0.7.5 diff --git a/scripts/pretrain.py b/scripts/pretrain.py new file mode 100644 index 0000000..060ab0e --- /dev/null +++ b/scripts/pretrain.py @@ -0,0 +1,277 @@ +import argparse +import glob +import os +import random +import logging +import numpy as np +from tqdm import tqdm +import torch +from transformers import AutoTokenizer, AutoModelWithLMHead +from transformers import DataCollatorForLanguageModeling +from transformers.optimization import AdamW, get_linear_schedule_with_warmup + +from torch.utils.data import Dataset, DataLoader +import pytorch_lightning as ptl +from pytorch_lightning.logging.test_tube import TestTubeLogger +from pytorch_lightning.callbacks import ModelCheckpoint + + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +class MMapTextDataset(Dataset): + def __init__(self, mmap_filename, chunk_size): + self.num_instances = np.memmap(mmap_filename, mode='r', dtype=np.uint16).shape[0] // chunk_size + # defer loading the token_ids memmap until after the first __getitem__ call. + # when spawning new processes for ddp, there is a hard limit in python < 3.8 that + # pickle files need to be < 4GB. By waiting until after the first __getitem__ we + # don't have to pickle the memmap + self.token_ids = None + self._mmap_filename = mmap_filename + self._chunk_size = chunk_size + + def __len__(self): + return self.num_instances + + def __getitem__(self, i): + if self.token_ids is None: + self.token_ids = np.memmap(self._mmap_filename, mode='r', dtype=np.uint16, + shape=(self.num_instances, self._chunk_size)) + return torch.tensor(self.token_ids[i, :].astype(np.int32), dtype=torch.long) + + @staticmethod + def raw_text_to_mmap(args): + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) + assert len(tokenizer) < 65535 # will use uint16 to store token ids + all_files = glob.glob(f'{args.input_dir}/*.txt') + + if os.path.exists(f'{args.input_dir}/cache/'): + logger.info("Cache already exists. Remove the cache directory to regenerate") + return + os.mkdir(f'{args.input_dir}/cache/') + train_chunks = [] + val_chunks = [] + + # TODO: process each shared in a separate worker + # TODO: support multiple documents in one chunk instead of padding + for fname in tqdm(all_files): + with open(fname, 'r') as fin: + for line in tqdm(fin): + if line.strip() == '': # drop empty lines + continue + chunks_list = train_chunks if random.random() > args.train_dev_split else val_chunks + tokens = tokenizer.tokenize(line) # each line is one document + # generate chunks of length args.seqlen. The last chunk will be padded. 
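+                    # e.g. (hypothetical numbers, for illustration only): with seqlen=8, a 10-token document
+                    # becomes [<s> t1 .. t6 </s>] and [<s> t7 t8 t9 t10 <pad> <pad> </s>].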
+ # padding last chunk is not great for longformer because many chunks will be mostly padding + current_chunk = [tokenizer.bos_token] + for token in tokens: + if len(current_chunk) == args.seqlen - 1: # chunk is full + current_chunk.append(tokenizer.eos_token) + chunks_list.append(current_chunk) + current_chunk = [tokenizer.bos_token] + current_chunk.append(token) + current_chunk.extend([tokenizer.pad_token] * (args.seqlen - len(current_chunk))) + current_chunk[args.seqlen - 1] = tokenizer.eos_token + chunks_list.append(current_chunk) + + def _tokenized_text_to_mmap(output_fname, chunks_list): + random.shuffle(chunks_list) + num_chunks = len(chunks_list) + all_token_ids = np.empty((num_chunks, args.seqlen), dtype=np.uint16) + for k, chunk in enumerate(tqdm(chunks_list)): + token_ids = tokenizer.convert_tokens_to_ids(chunk) + assert len(token_ids) == args.seqlen + all_token_ids[k, :] = [int(t) for t in token_ids] + fp = np.memmap(output_fname, dtype=np.uint16, mode='w+', shape=(num_chunks, args.seqlen)) + fp[:, :] = all_token_ids[:, :] + fp.flush() + del fp + + _tokenized_text_to_mmap(f'{args.input_dir}/cache/train.bin', train_chunks) + _tokenized_text_to_mmap(f'{args.input_dir}/cache/val.bin', val_chunks) + + +class Pretrainer(ptl.LightningModule): + + def __init__(self, hparams): + super().__init__() + + self.args = hparams + self.hparams = self.args + + self.model = AutoModelWithLMHead.from_pretrained(args.model) + self.config = self.model.config + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) + self.pad_token_id = tokenizer.pad_token_id + + logger.info(f'Creating dataset cache from dir {self.args.input_dir}. This could be slow the first time.') + MMapTextDataset.raw_text_to_mmap(args) + + # TODO: add support for other objective functions + self.data_collator = DataCollatorForLanguageModeling( + tokenizer=tokenizer, mlm=True, mlm_probability=self.args.mlm_prob + ) + + def forward(self, input_ids=None, labels=None, loss_only=True): + # get the padding mask - 1 for NOT masked, 0 for MASKED/PAD + attention_mask = (input_ids != self.pad_token_id).int() + + if labels is not None: + # output is loss, prediction_scores, hidden_states + output = self.model(input_ids=input_ids, attention_mask=attention_mask, masked_lm_labels=labels) + if loss_only: + return output[0] + else: + return {"loss": output[0], "hidden_states": output[2]} + else: + # don't need to run the lm_head + assert not loss_only + output = self.model.roberta(input_ids=input_ids, attention_mask=attention_mask) + return {"hidden_states": output[2]} + + def training_step(self, batch, batch_nb): + loss = self(**batch) + tensorboard_logs = { + 'mlm_loss': loss.detach(), + 'mlm_perplexity': torch.exp(loss).detach(), + } + return {'loss': loss, 'log': tensorboard_logs} + + def validation_step(self, batch, batch_nb): + loss = self(**batch) + tensorboard_logs = { + 'val_mlm_loss': loss.detach(), + } + return {'val_loss': tensorboard_logs["val_mlm_loss"], 'log': tensorboard_logs} + + def validation_epoch_end(self, outputs): + avg_loss = torch.stack([x['log']['val_mlm_loss'] for x in outputs if 'val_mlm_loss' in x['log']]).mean() + if self.use_ddp: + avg_loss = torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM) + avg_loss /= torch.distributed.get_world_size() + avg_loss = avg_loss.item() + logs = {'val_mlm_loss': avg_loss} + return {'log': logs, 'progress_bar': logs, "val_loss": avg_loss} + + def configure_optimizers(self): + no_decay = ["bias", "LayerNorm.weight"] + + optimizer_grouped_parameters = [ + { + 
"params": [p for n, p in self.named_parameters() if not any(nd in n for nd in no_decay) and p.requires_grad], + "weight_decay": self.args.weight_decay, + }, + { + "params": [p for n, p in self.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad], + "weight_decay": 0.0, + }, + ] + optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon) + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=self.args.training_steps + ) + + return [optimizer], [{"scheduler": scheduler, "interval": "step"}] + + def _get_loader(self, fname, is_train): + dataset = MMapTextDataset(fname, chunk_size=self.args.seqlen) + + if self.trainer.use_ddp: + sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=is_train) + shuffle = False + else: + sampler = None + shuffle = is_train + + loader = DataLoader( + dataset, + batch_size=self.args.batch_size, + shuffle=shuffle, + sampler=sampler, + num_workers=self.args.num_workers, + collate_fn=self.data_collator, + drop_last=is_train, + ) + return loader + + def train_dataloader(self): + return self._get_loader(f'{self.args.input_dir}/cache/train.bin', True) + + def val_dataloader(self): + return self._get_loader(f'{self.args.input_dir}/cache/val.bin', False) + + @staticmethod + def add_args(parser): + parser.add_argument("--seed", type=int, default=3) + parser.add_argument("--input_dir", type=str, required=True) + parser.add_argument("--save_dir", type=str, default='runs/') + parser.add_argument("--save_prefix", type=str, required=True) + parser.add_argument("--train_dev_split", type=float, default=0.05) + parser.add_argument("--seqlen", type=int, default=512) + parser.add_argument("--tokenizer", type=str, default='roberta-base') + parser.add_argument("--model", type=str, default='roberta-base') + parser.add_argument("--mlm_prob", type=float, default=0.15) + parser.add_argument("--weight_decay", type=float, default=0.01) + parser.add_argument("--learning_rate", type=float, default=1e-5) + parser.add_argument("--adam_epsilon", type=float, default=1e-6) + parser.add_argument("--training_steps", type=int, default=0.01) + parser.add_argument("--warmup_steps", type=int, default=1000) + parser.add_argument("--batch_size", type=int, default=8) + parser.add_argument("--num_workers", type=int, default=0) + parser.add_argument("--grad_accum", type=int, default=1) + parser.add_argument("--gpus", type=str, default='0') + parser.add_argument("--resume", type=str, default=None) + parser.add_argument("--num_tpu_cores", type=int, default=None) + + return parser + + +def main(args): + random.seed(args.seed * 10) + np.random.seed(args.seed * 100) + torch.manual_seed(args.seed * 1000) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(args.seed * 10000) + + pretrainer = Pretrainer(args) + + # logger here is a SummaryWritter for tensorboard + # it is used by the trainer, and certain return variables + # from the model are automatically logged + logger = TestTubeLogger( + save_dir=args.save_dir, + name=args.save_prefix, + version=0 # always use version=0 + ) + + checkpoint_callback = ModelCheckpoint( + # model saved to filepath/prefix_.... 
+ filepath=os.path.join(args.save_dir, args.save_prefix, 'checkpoint'), + prefix='', + save_top_k=3, + verbose=True, + monitor='val_loss', + mode='min', + ) + + args.gpus = [int(x) for x in args.gpus.split(',')] + trainer = ptl.Trainer( + gpus=args.gpus, + num_tpu_cores=args.num_tpu_cores, + distributed_backend='ddp' if len(args.gpus) > 1 else None, + track_grad_norm=-1, + max_epochs=10000, min_epochs=0, max_steps=args.training_steps, # run for many epochs, but stop after max_steps + early_stop_callback=None, + row_log_interval=25, + logger=logger, + checkpoint_callback=checkpoint_callback, + resume_from_checkpoint=args.resume, + ) + trainer.fit(pretrainer) + + +if __name__ == "__main__": + parser = Pretrainer.add_args(argparse.ArgumentParser(description="pretrain")) + args = parser.parse_args() + main(args) From 325693e3ecb6e939da95509c98978ee86cc8fe74 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 16 Jul 2020 14:12:22 -0700 Subject: [PATCH 010/112] wip --- requirements.txt | 4 +-- scripts/pretrain.py | 68 ++++++++++++++++++++++++++++----------------- 2 files changed, 44 insertions(+), 28 deletions(-) diff --git a/requirements.txt b/requirements.txt index 75cbab1..2279015 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ torch>=1.5.0 -transformers>=3.0.1 +transformers==3.0.2 tensorboardX -pytorch-lightning==0.7.6 +pytorch-lightning==0.8.5 test-tube==0.7.5 diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 060ab0e..014c33f 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -5,6 +5,7 @@ import logging import numpy as np from tqdm import tqdm +import time import torch from transformers import AutoTokenizer, AutoModelWithLMHead from transformers import DataCollatorForLanguageModeling @@ -13,7 +14,7 @@ from torch.utils.data import Dataset, DataLoader import pytorch_lightning as ptl from pytorch_lightning.logging.test_tube import TestTubeLogger -from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateLogger logging.basicConfig(level=logging.INFO) @@ -112,33 +113,36 @@ def __init__(self, hparams): self.data_collator = DataCollatorForLanguageModeling( tokenizer=tokenizer, mlm=True, mlm_probability=self.args.mlm_prob ) + self.start_time = 0 - def forward(self, input_ids=None, labels=None, loss_only=True): + def forward(self, input_ids=None, labels=None): # get the padding mask - 1 for NOT masked, 0 for MASKED/PAD attention_mask = (input_ids != self.pad_token_id).int() - if labels is not None: - # output is loss, prediction_scores, hidden_states - output = self.model(input_ids=input_ids, attention_mask=attention_mask, masked_lm_labels=labels) - if loss_only: - return output[0] - else: - return {"loss": output[0], "hidden_states": output[2]} - else: - # don't need to run the lm_head - assert not loss_only - output = self.model.roberta(input_ids=input_ids, attention_mask=attention_mask) - return {"hidden_states": output[2]} + # output is loss, prediction_scores, hidden_states + output = self.model(input_ids=input_ids, attention_mask=attention_mask, masked_lm_labels=labels) + return output[0] # loss def training_step(self, batch, batch_nb): loss = self(**batch) + input_ids = batch['input_ids'] tensorboard_logs = { + 'input_size': input_ids.numel(), + 'memory': torch.cuda.memory_allocated(loss.device) / 1024 ** 3, + 'lr': self.trainer.optimizers[0].param_groups[0]['lr'], 'mlm_loss': loss.detach(), 'mlm_perplexity': torch.exp(loss).detach(), + 'tokens per step': input_ids.numel() * 
self.args.grad_accum * self.trainer.world_size, } + if self.start_time != 0: + elapsed_time = time.time() - self.start_time + tensorboard_logs['time per batch'] = elapsed_time + self.start_time = time.time() + return {'loss': loss, 'log': tensorboard_logs} def validation_step(self, batch, batch_nb): + self.start_time = 0 # reset training_step timer loss = self(**batch) tensorboard_logs = { 'val_mlm_loss': loss.detach(), @@ -148,7 +152,7 @@ def validation_step(self, batch, batch_nb): def validation_epoch_end(self, outputs): avg_loss = torch.stack([x['log']['val_mlm_loss'] for x in outputs if 'val_mlm_loss' in x['log']]).mean() if self.use_ddp: - avg_loss = torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM) + torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM) avg_loss /= torch.distributed.get_world_size() avg_loss = avg_loss.item() logs = {'val_mlm_loss': avg_loss} @@ -169,7 +173,7 @@ def configure_optimizers(self): ] optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( - optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=self.args.training_steps + optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=self.args.train_steps ) return [optimizer], [{"scheduler": scheduler, "interval": "step"}] @@ -215,12 +219,17 @@ def add_args(parser): parser.add_argument("--weight_decay", type=float, default=0.01) parser.add_argument("--learning_rate", type=float, default=1e-5) parser.add_argument("--adam_epsilon", type=float, default=1e-6) - parser.add_argument("--training_steps", type=int, default=0.01) - parser.add_argument("--warmup_steps", type=int, default=1000) + parser.add_argument("--grad_clip", type=float, default=0) + parser.add_argument("--warmup_steps", type=int, default=30, help='# warmup gradient updates') + parser.add_argument("--train_steps", type=int, default=100, help='# training gradient updates') + parser.add_argument("--val_every", type=int, default=25, help='# training gradient updates between evaluations') + parser.add_argument("--val_batches", type=int, default=20, help='# evaluation **batches**') parser.add_argument("--batch_size", type=int, default=8) parser.add_argument("--num_workers", type=int, default=0) parser.add_argument("--grad_accum", type=int, default=1) - parser.add_argument("--gpus", type=str, default='0') + # `--gpus` is reserved. Always set CUDA_VISIBLE_DEVICES to 0,1,2 ... 
n + # FIXME: PTL has a bug in gpu selection and it will always select gpus starting from 0 upward + parser.add_argument("--gpu_count", type=int, default=1) parser.add_argument("--resume", type=str, default=None) parser.add_argument("--num_tpu_cores", type=int, default=None) @@ -255,18 +264,25 @@ def main(args): mode='min', ) - args.gpus = [int(x) for x in args.gpus.split(',')] + # TODO: try gradient accumulation + + args.val_every_batches = args.val_every * args.grad_accum # convert val_every_steps to val_every_batches trainer = ptl.Trainer( - gpus=args.gpus, + gpus=args.gpu_count, + auto_select_gpus=False, num_tpu_cores=args.num_tpu_cores, - distributed_backend='ddp' if len(args.gpus) > 1 else None, - track_grad_norm=-1, - max_epochs=10000, min_epochs=0, max_steps=args.training_steps, # run for many epochs, but stop after max_steps + distributed_backend='ddp' if args.gpu_count > 1 else None, + replace_sampler_ddp=False, + track_grad_norm=-1, # TODO: add logging for gradient norm + max_epochs=10000, min_epochs=0, max_steps=args.train_steps, # run for many epochs, but stop after max_steps + val_check_interval=args.val_every_batches, limit_val_batches=args.val_batches, early_stop_callback=None, - row_log_interval=25, + row_log_interval=10, logger=logger, - checkpoint_callback=checkpoint_callback, + checkpoint_callback=None, # FIXME: checkpoint_callback, resume_from_checkpoint=args.resume, + gradient_clip_val=args.grad_clip, + callbacks=[LearningRateLogger()] ) trainer.fit(pretrainer) From 985acc9d840fa1d0ea2e6d285f9f2501503606e8 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 16 Jul 2020 17:32:04 -0700 Subject: [PATCH 011/112] wip --- scripts/pretrain.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 014c33f..4eb08b5 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -20,6 +20,9 @@ logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) +# TODO: Try on multiple machines +# TODO: try on a single TPU +# TODO: try on a TPU-pod class MMapTextDataset(Dataset): def __init__(self, mmap_filename, chunk_size): @@ -129,19 +132,19 @@ def training_step(self, batch, batch_nb): tensorboard_logs = { 'input_size': input_ids.numel(), 'memory': torch.cuda.memory_allocated(loss.device) / 1024 ** 3, - 'lr': self.trainer.optimizers[0].param_groups[0]['lr'], 'mlm_loss': loss.detach(), 'mlm_perplexity': torch.exp(loss).detach(), - 'tokens per step': input_ids.numel() * self.args.grad_accum * self.trainer.world_size, + 'token_per_step': input_ids.numel() * self.args.grad_accum * self.trainer.world_size, } if self.start_time != 0: elapsed_time = time.time() - self.start_time - tensorboard_logs['time per batch'] = elapsed_time + tensorboard_logs['second_per_batch'] = elapsed_time self.start_time = time.time() return {'loss': loss, 'log': tensorboard_logs} def validation_step(self, batch, batch_nb): + # TODO: log how long evaluation takes self.start_time = 0 # reset training_step timer loss = self(**batch) tensorboard_logs = { @@ -228,7 +231,7 @@ def add_args(parser): parser.add_argument("--num_workers", type=int, default=0) parser.add_argument("--grad_accum", type=int, default=1) # `--gpus` is reserved. Always set CUDA_VISIBLE_DEVICES to 0,1,2 ... 
n - # FIXME: PTL has a bug in gpu selection and it will always select gpus starting from 0 upward + # TODO: PTL has a bug in gpu selection and it will always select gpus starting from 0 upward parser.add_argument("--gpu_count", type=int, default=1) parser.add_argument("--resume", type=str, default=None) parser.add_argument("--num_tpu_cores", type=int, default=None) @@ -264,9 +267,7 @@ def main(args): mode='min', ) - # TODO: try gradient accumulation - - args.val_every_batches = args.val_every * args.grad_accum # convert val_every_steps to val_every_batches + args.val_every *= args.grad_accum # PTL is expecting number of batches_per_gpu trainer = ptl.Trainer( gpus=args.gpu_count, auto_select_gpus=False, @@ -275,11 +276,12 @@ def main(args): replace_sampler_ddp=False, track_grad_norm=-1, # TODO: add logging for gradient norm max_epochs=10000, min_epochs=0, max_steps=args.train_steps, # run for many epochs, but stop after max_steps - val_check_interval=args.val_every_batches, limit_val_batches=args.val_batches, + val_check_interval=args.val_every, limit_val_batches=args.val_batches, early_stop_callback=None, row_log_interval=10, logger=logger, - checkpoint_callback=None, # FIXME: checkpoint_callback, + checkpoint_callback=checkpoint_callback, + accumulate_grad_batches=args.grad_accum, resume_from_checkpoint=args.resume, gradient_clip_val=args.grad_clip, callbacks=[LearningRateLogger()] From 023dd78227e9021b539fa6fc92c5f7d7d8e6c463 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 16 Jul 2020 18:02:36 -0700 Subject: [PATCH 012/112] wip --- scripts/pretrain.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 4eb08b5..34775a7 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -145,6 +145,7 @@ def training_step(self, batch, batch_nb): def validation_step(self, batch, batch_nb): # TODO: log how long evaluation takes + # TODO: reproduce roberta evaluation numbers on the longformer corpus self.start_time = 0 # reset training_step timer loss = self(**batch) tensorboard_logs = { From 08230ac9f81ae2cc9cd094ba8bded56f24a1a66d Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 16 Jul 2020 21:13:52 -0700 Subject: [PATCH 013/112] wip --- scripts/pretrain.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 34775a7..232a4a3 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -46,7 +46,7 @@ def __getitem__(self, i): @staticmethod def raw_text_to_mmap(args): - tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, use_fast=True) assert len(tokenizer) < 65535 # will use uint16 to store token ids all_files = glob.glob(f'{args.input_dir}/*.txt') @@ -59,6 +59,7 @@ def raw_text_to_mmap(args): # TODO: process each shared in a separate worker # TODO: support multiple documents in one chunk instead of padding + # TODO: replace the in memory lists `train_chunks` and `train_chunks` with files for fname in tqdm(all_files): with open(fname, 'r') as fin: for line in tqdm(fin): From fb65d5794765d3b450af27448bbcb85338d54449 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 16 Jul 2020 22:09:32 -0700 Subject: [PATCH 014/112] . 
--- scripts/pretrain.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 232a4a3..a5171ad 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -24,6 +24,7 @@ # TODO: try on a single TPU # TODO: try on a TPU-pod + class MMapTextDataset(Dataset): def __init__(self, mmap_filename, chunk_size): self.num_instances = np.memmap(mmap_filename, mode='r', dtype=np.uint16).shape[0] // chunk_size @@ -57,9 +58,9 @@ def raw_text_to_mmap(args): train_chunks = [] val_chunks = [] - # TODO: process each shared in a separate worker + # TODO: process each shared in a separate worker and save their output to files # TODO: support multiple documents in one chunk instead of padding - # TODO: replace the in memory lists `train_chunks` and `train_chunks` with files + for fname in tqdm(all_files): with open(fname, 'r') as fin: for line in tqdm(fin): From 0e80cde3c5949ea8ec32ca866f812951d6854a06 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 17 Jul 2020 08:11:10 -0700 Subject: [PATCH 015/112] pad chunks or start next doc --- scripts/pretrain.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index a5171ad..c787344 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -55,34 +55,42 @@ def raw_text_to_mmap(args): logger.info("Cache already exists. Remove the cache directory to regenerate") return os.mkdir(f'{args.input_dir}/cache/') - train_chunks = [] - val_chunks = [] # TODO: process each shared in a separate worker and save their output to files - # TODO: support multiple documents in one chunk instead of padding + chunks_list = [] for fname in tqdm(all_files): with open(fname, 'r') as fin: + current_chunk = [tokenizer.bos_token] for line in tqdm(fin): if line.strip() == '': # drop empty lines continue - chunks_list = train_chunks if random.random() > args.train_dev_split else val_chunks tokens = tokenizer.tokenize(line) # each line is one document # generate chunks of length args.seqlen. The last chunk will be padded. 
# padding last chunk is not great for longformer because many chunks will be mostly padding - current_chunk = [tokenizer.bos_token] + for token in tokens: if len(current_chunk) == args.seqlen - 1: # chunk is full current_chunk.append(tokenizer.eos_token) chunks_list.append(current_chunk) current_chunk = [tokenizer.bos_token] current_chunk.append(token) - current_chunk.extend([tokenizer.pad_token] * (args.seqlen - len(current_chunk))) - current_chunk[args.seqlen - 1] = tokenizer.eos_token - chunks_list.append(current_chunk) + if args.padded_chunks: + # fill the rest of the seqlen with pad + current_chunk.extend([tokenizer.pad_token] * (args.seqlen - len(current_chunk))) + current_chunk[args.seqlen - 1] = tokenizer.eos_token + chunks_list.append(current_chunk) + current_chunk = [tokenizer.bos_token] + else: + # one long doc with sep inbetween + if len(current_chunk) < args.seqlen - 1: + current_chunk.append(tokenizer.sep_token) + random.shuffle(chunks_list) + val_count = int(args.train_dev_split * len(chunks_list)) + val_chunks = chunks_list[:val_count] + train_chunks = chunks_list[val_count:] def _tokenized_text_to_mmap(output_fname, chunks_list): - random.shuffle(chunks_list) num_chunks = len(chunks_list) all_token_ids = np.empty((num_chunks, args.seqlen), dtype=np.uint16) for k, chunk in enumerate(tqdm(chunks_list)): @@ -222,6 +230,7 @@ def add_args(parser): parser.add_argument("--tokenizer", type=str, default='roberta-base') parser.add_argument("--model", type=str, default='roberta-base') parser.add_argument("--mlm_prob", type=float, default=0.15) + parser.add_argument("--padded_chunks", type=bool, default=False) parser.add_argument("--weight_decay", type=float, default=0.01) parser.add_argument("--learning_rate", type=float, default=1e-5) parser.add_argument("--adam_epsilon", type=float, default=1e-6) From 6ca7d1b8557a6d0db1bdbd51e87bf785649b49c8 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 17 Jul 2020 08:18:24 -0700 Subject: [PATCH 016/112] todo --- scripts/pretrain.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index c787344..d2c5378 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -166,6 +166,8 @@ def validation_step(self, batch, batch_nb): def validation_epoch_end(self, outputs): avg_loss = torch.stack([x['log']['val_mlm_loss'] for x in outputs if 'val_mlm_loss' in x['log']]).mean() if self.use_ddp: + # TODO: PTL already doing this. Is it still needed here? 
+ # https://github.com/PyTorchLightning/pytorch-lightning/blob/0.8.5/pytorch_lightning/metrics/converters.py#L251 torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM) avg_loss /= torch.distributed.get_world_size() avg_loss = avg_loss.item() @@ -195,6 +197,7 @@ def configure_optimizers(self): def _get_loader(self, fname, is_train): dataset = MMapTextDataset(fname, chunk_size=self.args.seqlen) + # TODO: consider `replace_sampler_ddp=True` and removing the following if statement if self.trainer.use_ddp: sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=is_train) shuffle = False From a2aa4f76630d7433f3dee6fc1d04862efa09bb5e Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 17 Jul 2020 09:22:56 -0700 Subject: [PATCH 017/112] wip --- scripts/pretrain.py | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index d2c5378..95ce577 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -4,6 +4,7 @@ import random import logging import numpy as np +import math from tqdm import tqdm import time import torch @@ -143,6 +144,7 @@ def training_step(self, batch, batch_nb): 'input_size': input_ids.numel(), 'memory': torch.cuda.memory_allocated(loss.device) / 1024 ** 3, 'mlm_loss': loss.detach(), + 'mlm_bpc': loss.detach()/math.log(2), 'mlm_perplexity': torch.exp(loss).detach(), 'token_per_step': input_ids.numel() * self.args.grad_accum * self.trainer.world_size, } @@ -225,30 +227,42 @@ def val_dataloader(self): @staticmethod def add_args(parser): parser.add_argument("--seed", type=int, default=3) + + # Dataset. Some of these params are only useful when generating the dataset cache parser.add_argument("--input_dir", type=str, required=True) - parser.add_argument("--save_dir", type=str, default='runs/') - parser.add_argument("--save_prefix", type=str, required=True) parser.add_argument("--train_dev_split", type=float, default=0.05) + parser.add_argument("--padded_chunks", type=bool, default=False) parser.add_argument("--seqlen", type=int, default=512) + parser.add_argument("--mlm_prob", type=float, default=0.15) + + # HF model loading parser.add_argument("--tokenizer", type=str, default='roberta-base') parser.add_argument("--model", type=str, default='roberta-base') - parser.add_argument("--mlm_prob", type=float, default=0.15) - parser.add_argument("--padded_chunks", type=bool, default=False) - parser.add_argument("--weight_decay", type=float, default=0.01) + + # Checkpointing and logging + parser.add_argument("--save_dir", type=str, default='runs/') + parser.add_argument("--save_prefix", type=str, required=True) + parser.add_argument("--resume", type=str, default=None) + + # Training hyperparams parser.add_argument("--learning_rate", type=float, default=1e-5) + parser.add_argument("--train_steps", type=int, default=3000, help='# training grad. updates') + parser.add_argument("--warmup_steps", type=int, default=1000, help='# warmup grad. updates') + parser.add_argument("--val_every", type=int, default=1000, help='# training grad. 
updates between evaluations') + parser.add_argument("--val_batches", type=int, default=1000, help='# evaluation **batches**') + parser.add_argument("--weight_decay", type=float, default=0.01) parser.add_argument("--adam_epsilon", type=float, default=1e-6) parser.add_argument("--grad_clip", type=float, default=0) - parser.add_argument("--warmup_steps", type=int, default=30, help='# warmup gradient updates') - parser.add_argument("--train_steps", type=int, default=100, help='# training gradient updates') - parser.add_argument("--val_every", type=int, default=25, help='# training gradient updates between evaluations') - parser.add_argument("--val_batches", type=int, default=20, help='# evaluation **batches**') - parser.add_argument("--batch_size", type=int, default=8) + + # RoBERTa's tokens_per_step = 2^18 = 512(seqlen) x 1(gpu_count) x 32(batch_size) x 16(grad_accum) + parser.add_argument("--batch_size", type=int, default=32) + parser.add_argument("--grad_accum", type=int, default=16) + + # Compute resources parser.add_argument("--num_workers", type=int, default=0) - parser.add_argument("--grad_accum", type=int, default=1) # `--gpus` is reserved. Always set CUDA_VISIBLE_DEVICES to 0,1,2 ... n # TODO: PTL has a bug in gpu selection and it will always select gpus starting from 0 upward parser.add_argument("--gpu_count", type=int, default=1) - parser.add_argument("--resume", type=str, default=None) parser.add_argument("--num_tpu_cores", type=int, default=None) return parser From 62a69d594aa0a2a5c02d83458e36b8f91ffda15b Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 17 Jul 2020 13:51:56 -0700 Subject: [PATCH 018/112] wip --- scripts/pretrain.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 95ce577..a715c9c 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -24,6 +24,7 @@ # TODO: Try on multiple machines # TODO: try on a single TPU # TODO: try on a TPU-pod +# TODO: PTL bug: https://github.com/PyTorchLightning/pytorch-lightning/issues/2635 class MMapTextDataset(Dataset): @@ -157,7 +158,6 @@ def training_step(self, batch, batch_nb): def validation_step(self, batch, batch_nb): # TODO: log how long evaluation takes - # TODO: reproduce roberta evaluation numbers on the longformer corpus self.start_time = 0 # reset training_step timer loss = self(**batch) tensorboard_logs = { From 3e3a478317a7d0ecb2ca1983de92014889f4c543 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 17 Jul 2020 17:24:05 -0700 Subject: [PATCH 019/112] wip --- requirements.txt | 3 ++- scripts/pretrain.py | 10 +++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index 2279015..cbce7f0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ +pytorch-lightning @ git+http://github.com/ibeltagy/pytorch-lightning.git@v0.8.5_fixes#egg=pytorch-lightning + torch>=1.5.0 transformers==3.0.2 tensorboardX -pytorch-lightning==0.8.5 test-tube==0.7.5 diff --git a/scripts/pretrain.py b/scripts/pretrain.py index a715c9c..2f6b890 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -21,10 +21,12 @@ logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) +# DONE: reproduce RoBERTa numbers on the Longformer corpus # TODO: Try on multiple machines # TODO: try on a single TPU # TODO: try on a TPU-pod -# TODO: PTL bug: https://github.com/PyTorchLightning/pytorch-lightning/issues/2635 +# TODO: try restarting and double check optimizer, lr and lr scheduler +# TODO: try fp16 
class MMapTextDataset(Dataset): @@ -260,9 +262,8 @@ def add_args(parser): # Compute resources parser.add_argument("--num_workers", type=int, default=0) - # `--gpus` is reserved. Always set CUDA_VISIBLE_DEVICES to 0,1,2 ... n - # TODO: PTL has a bug in gpu selection and it will always select gpus starting from 0 upward - parser.add_argument("--gpu_count", type=int, default=1) + parser.add_argument("--gpu_count", type=int, default=1, # `--gpus` is reserved for internal use by PTL + help="Number of gpus. This respects `CUDA_VISIBLE_DEVICES`") parser.add_argument("--num_tpu_cores", type=int, default=None) return parser @@ -299,7 +300,6 @@ def main(args): args.val_every *= args.grad_accum # PTL is expecting number of batches_per_gpu trainer = ptl.Trainer( gpus=args.gpu_count, - auto_select_gpus=False, num_tpu_cores=args.num_tpu_cores, distributed_backend='ddp' if args.gpu_count > 1 else None, replace_sampler_ddp=False, From 3bc535461451ac37496d66147030ed7fc40429a3 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 17 Jul 2020 20:13:08 -0700 Subject: [PATCH 020/112] wip --- requirements.txt | 2 +- scripts/pretrain.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index cbce7f0..b396708 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ pytorch-lightning @ git+http://github.com/ibeltagy/pytorch-lightning.git@v0.8.5_fixes#egg=pytorch-lightning -torch>=1.5.0 +torch==1.3.0 transformers==3.0.2 tensorboardX test-tube==0.7.5 diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 2f6b890..3263537 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -26,7 +26,6 @@ # TODO: try on a single TPU # TODO: try on a TPU-pod # TODO: try restarting and double check optimizer, lr and lr scheduler -# TODO: try fp16 class MMapTextDataset(Dataset): @@ -313,6 +312,7 @@ def main(args): accumulate_grad_batches=args.grad_accum, resume_from_checkpoint=args.resume, gradient_clip_val=args.grad_clip, + precision=16, amp_level='O2', callbacks=[LearningRateLogger()] ) trainer.fit(pretrainer) From 1a91024e3470ee2c5ad8fa08d3daca1fc68a0e17 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 17 Jul 2020 22:25:48 -0700 Subject: [PATCH 021/112] wip --- scripts/pretrain.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 3263537..e79eb17 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -225,6 +225,12 @@ def train_dataloader(self): def val_dataloader(self): return self._get_loader(f'{self.args.input_dir}/cache/val.bin', False) + def grad_norm(self, norm_type): + # Override PTL `grad_norm` function to only return `total_grad_norm` instead norms of individual params + # TODO: grad_norm reporting needs to take fp16 loss scale into account + all_norms = [float(p.grad.data.norm(float(norm_type))) for p in self.parameters() if p.grad is not None] + return {'total_grad_norm': float(torch.tensor(all_norms).norm(norm_type))} + @staticmethod def add_args(parser): parser.add_argument("--seed", type=int, default=3) @@ -302,7 +308,7 @@ def main(args): num_tpu_cores=args.num_tpu_cores, distributed_backend='ddp' if args.gpu_count > 1 else None, replace_sampler_ddp=False, - track_grad_norm=-1, # TODO: add logging for gradient norm + track_grad_norm=2, max_epochs=10000, min_epochs=0, max_steps=args.train_steps, # run for many epochs, but stop after max_steps val_check_interval=args.val_every, limit_val_batches=args.val_batches, early_stop_callback=None, @@ -313,7 +319,7 @@ 
def main(args): resume_from_checkpoint=args.resume, gradient_clip_val=args.grad_clip, precision=16, amp_level='O2', - callbacks=[LearningRateLogger()] + callbacks=[LearningRateLogger()], ) trainer.fit(pretrainer) From 5fa21f24452a990864a31c65dcf9a780e9f47e2c Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 17 Jul 2020 22:32:46 -0700 Subject: [PATCH 022/112] wip --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index b396708..e47a1d5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ pytorch-lightning @ git+http://github.com/ibeltagy/pytorch-lightning.git@v0.8.5_fixes#egg=pytorch-lightning -torch==1.3.0 +torch==1.3.1 transformers==3.0.2 tensorboardX test-tube==0.7.5 From 18eb0036c49c8452bf109c99d339229688f0e714 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 18 Jul 2020 07:58:49 -0700 Subject: [PATCH 023/112] wip --- scripts/pretrain.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index e79eb17..5022aa8 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -22,7 +22,6 @@ logger = logging.getLogger(__name__) # DONE: reproduce RoBERTa numbers on the Longformer corpus -# TODO: Try on multiple machines # TODO: try on a single TPU # TODO: try on a TPU-pod # TODO: try restarting and double check optimizer, lr and lr scheduler @@ -259,7 +258,7 @@ def add_args(parser): parser.add_argument("--val_batches", type=int, default=1000, help='# evaluation **batches**') parser.add_argument("--weight_decay", type=float, default=0.01) parser.add_argument("--adam_epsilon", type=float, default=1e-6) - parser.add_argument("--grad_clip", type=float, default=0) + parser.add_argument("--grad_clip", type=float, default=0) # TODO: test this with fp16. Likely not working # RoBERTa's tokens_per_step = 2^18 = 512(seqlen) x 1(gpu_count) x 32(batch_size) x 16(grad_accum) parser.add_argument("--batch_size", type=int, default=32) @@ -269,6 +268,21 @@ def add_args(parser): parser.add_argument("--num_workers", type=int, default=0) parser.add_argument("--gpu_count", type=int, default=1, # `--gpus` is reserved for internal use by PTL help="Number of gpus. This respects `CUDA_VISIBLE_DEVICES`") + + # For multi-node training, use the PyTorch launch script. The script and instructions can be found here: + # https://github.com/pytorch/pytorch/blob/master/torch/distributed/launch.py. + # To run PTL in a mode compatible with the launch script, two things are needed: + # - pass the argument `--use_env` to `torch.distributed.launch` + # - make sure `--nproc_per_node` matches `--gpu_count` and `--nnodes` matches `--node_count`. + # For example, to run on 2 nodes, 3 gpus each, the command line on node rank 1 would be like: + # >>>> python -m torch.distributed.launch \ + # --use_env --nnodes 2 --nproc_per_node 3 \ + # --node_rank 1 --master_addr s2-server4 --master_port 12343 \ + # scripts/pretrain.py \ + # --gpu_count 2 --node_count 2 \ + # --input_dir my_data_dir --save_prefix test_multinode + parser.add_argument("--node_count", type=int, default=1, + help="Number of nodes. 
It needs to match --nnodes of torch.distributed.launch") parser.add_argument("--num_tpu_cores", type=int, default=None) return parser @@ -305,8 +319,9 @@ def main(args): args.val_every *= args.grad_accum # PTL is expecting number of batches_per_gpu trainer = ptl.Trainer( gpus=args.gpu_count, + num_nodes=args.node_count, num_tpu_cores=args.num_tpu_cores, - distributed_backend='ddp' if args.gpu_count > 1 else None, + distributed_backend='ddp' if (args.gpu_count > 1 or args.node_count > 1) else None, replace_sampler_ddp=False, track_grad_norm=2, max_epochs=10000, min_epochs=0, max_steps=args.train_steps, # run for many epochs, but stop after max_steps From 607e4465da794d20ef7006a2bf9dddc9250ecbc1 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 18 Jul 2020 09:41:34 -0700 Subject: [PATCH 024/112] wip --- scripts/pretrain.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 5022aa8..10165b0 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -22,9 +22,12 @@ logger = logging.getLogger(__name__) # DONE: reproduce RoBERTa numbers on the Longformer corpus +# DONE: testing ddp single machine +# DONE: testing ddp multiple machines +# DONE: testing resume from checkpoint # TODO: try on a single TPU # TODO: try on a TPU-pod -# TODO: try restarting and double check optimizer, lr and lr scheduler +# TODO: only one checkpoint per epoch is saved class MMapTextDataset(Dataset): @@ -168,7 +171,7 @@ def validation_step(self, batch, batch_nb): def validation_epoch_end(self, outputs): avg_loss = torch.stack([x['log']['val_mlm_loss'] for x in outputs if 'val_mlm_loss' in x['log']]).mean() if self.use_ddp: - # TODO: PTL already doing this. Is it still needed here? + # TODO: PTL is already doing this. Is it still needed here? # https://github.com/PyTorchLightning/pytorch-lightning/blob/0.8.5/pytorch_lightning/metrics/converters.py#L251 torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM) avg_loss /= torch.distributed.get_world_size() @@ -189,11 +192,10 @@ def configure_optimizers(self): "weight_decay": 0.0, }, ] - optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon) + optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.lr, eps=self.args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=self.args.train_steps ) - return [optimizer], [{"scheduler": scheduler, "interval": "step"}] def _get_loader(self, fname, is_train): @@ -247,11 +249,15 @@ def add_args(parser): # Checkpointing and logging parser.add_argument("--save_dir", type=str, default='runs/') - parser.add_argument("--save_prefix", type=str, required=True) - parser.add_argument("--resume", type=str, default=None) + parser.add_argument("--save_prefix", type=str, required=True, + help="path of output directory is --save_dir/--save_prefix") + parser.add_argument("--resume", type=str, default=None, # It is better to use a different output dir. + help="Path to a checkpoint to load model weights and training state. It overwrites args") + parser.add_argument("--resume_model_only", type=str, default=None, + help="Path to a checkpoint to load model weights but not training state") # Training hyperparams - parser.add_argument("--learning_rate", type=float, default=1e-5) + parser.add_argument("--lr", type=float, default=1e-5) parser.add_argument("--train_steps", type=int, default=3000, help='# training grad. 
updates') parser.add_argument("--warmup_steps", type=int, default=1000, help='# warmup grad. updates') parser.add_argument("--val_every", type=int, default=1000, help='# training grad. updates between evaluations') @@ -295,7 +301,10 @@ def main(args): if torch.cuda.is_available(): torch.cuda.manual_seed_all(args.seed * 10000) - pretrainer = Pretrainer(args) + if args.resume_model_only is not None: + pretrainer = Pretrainer.load_from_checkpoint(args.resume_model_only, args) + else: + pretrainer = Pretrainer(args) # logger here is a SummaryWritter for tensorboard # it is used by the trainer, and certain return variables From d4659deeabade71e46da1d2daecaf2e31ac60aab Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 18 Jul 2020 10:24:16 -0700 Subject: [PATCH 025/112] wip --- scripts/pretrain.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 10165b0..6015fa0 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -27,7 +27,6 @@ # DONE: testing resume from checkpoint # TODO: try on a single TPU # TODO: try on a TPU-pod -# TODO: only one checkpoint per epoch is saved class MMapTextDataset(Dataset): @@ -175,7 +174,6 @@ def validation_epoch_end(self, outputs): # https://github.com/PyTorchLightning/pytorch-lightning/blob/0.8.5/pytorch_lightning/metrics/converters.py#L251 torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM) avg_loss /= torch.distributed.get_world_size() - avg_loss = avg_loss.item() logs = {'val_mlm_loss': avg_loss} return {'log': logs, 'progress_bar': logs, "val_loss": avg_loss} @@ -320,9 +318,11 @@ def main(args): filepath=os.path.join(args.save_dir, args.save_prefix, 'checkpoint'), prefix='', save_top_k=3, + save_last=True, verbose=True, monitor='val_loss', mode='min', + period=-1, # to allow multiple checkpoints per epoch ) args.val_every *= args.grad_accum # PTL is expecting number of batches_per_gpu From c7c53cbde9c244fbc6b4f1119aeb61c268f80284 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sun, 19 Jul 2020 09:06:10 -0700 Subject: [PATCH 026/112] wip --- scripts/pretrain.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 6015fa0..683b008 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -27,6 +27,7 @@ # DONE: testing resume from checkpoint # TODO: try on a single TPU # TODO: try on a TPU-pod +# TODO: run on beaker on ai2-server1/2 class MMapTextDataset(Dataset): From 0a07daf8dd0cf2ca24592f4006338b2568cca23b Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 22 Jul 2020 20:16:38 +0000 Subject: [PATCH 027/112] wip --- scripts/cheatsheet.txt | 22 ++++++++++++++++ scripts/test_tpu.py | 57 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 scripts/test_tpu.py diff --git a/scripts/cheatsheet.txt b/scripts/cheatsheet.txt index be4fc3a..d39371e 100644 --- a/scripts/cheatsheet.txt +++ b/scripts/cheatsheet.txt @@ -70,3 +70,25 @@ python -m scripts.triviaqa_utils.evaluation_utils \ --prediction_file predictions.json # Output should be: {'exact_match': 73.07644188665083, 'f1': 77.78523804802242, 'common': 7993, 'denominator': 7993, 'pred_len': 7993, 'gold_len': 7993} + + +# TPU +export XRT_TPU_CONFIG="tpu_worker;0;$TPU_IP_ADDRESS:8470" + +source /anaconda3/bin/activate torch-xla-nightly + +import torch_xla.debug.metrics as met; print(met.metrics_report()) + +curl -X POST http://10.125.212.42:8475/requestversion/pytorch-dev20200722 + 
+/usr/share/torch-xla-nightly/pytorch/xla/scripts/debug_run.py --outfile debug.tar.gz -- python -u scripts/test_tpu.py + + XLA_IR_DEBUG=1 + XRT_TPU_CONFIG=tpu_worker;0;10.125.212.42:8470 + TF_CPP_LOG_THREAD_ID=1 + TF_CPP_MIN_LOG_LEVEL=0 + XLA_HLO_DEBUG=1 + XLA_DUMP_FATAL_STACK=1 + TF_CPP_VMODULE=tensor=5,computation_client=5,xrt_computation_client=5,aten_xla_type=1 + XLA_SAVE_TENSORS_FILE=/tmp/debug_run-beltagy4-beltagy/graphs + XLA_METRICS_FILE=/tmp/debug_run-beltagy4-beltagy/metrics diff --git a/scripts/test_tpu.py b/scripts/test_tpu.py new file mode 100644 index 0000000..618ee6f --- /dev/null +++ b/scripts/test_tpu.py @@ -0,0 +1,57 @@ +import os +import torch +from torch.utils.data import DataLoader, Dataset +from transformers import AutoModel +import pytorch_lightning as pl + +class CoolDataset(Dataset): + def __len__(self): + return 128 * 128 + + def __getitem__(self, idx): + return torch.tensor([1, 2, 3, 4,] * 128), torch.tensor([1, 1, 1, 1] * 128) + +class CoolSystem(pl.LightningModule): + + def __init__(self): + super().__init__() + + # self.model = AutoModel.from_pretrained('allenai/longformer-base-4096') + self.model = AutoModel.from_pretrained('bert-base-uncased') + + def forward(self, x, y): + return self.model(x, attention_mask=None) + + def training_step(self, batch, batch_idx): + x, y = batch + y_hat = self(x, y) + loss = y_hat[0].sum() # F.cross_entropy(y_hat, y) + tensorboard_logs = {'train_loss': loss} + return {'loss': loss, 'log': tensorboard_logs} + + def validation_step(self, batch, batch_idx): + x, y = batch + y_hat = self(x, y) + val_loss = y_hat[0].sum() # F.cross_entropy(y_hat, y) + return {'val_loss': val_loss} + + def validation_epoch_end(self, outputs): + avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean() + tensorboard_logs = {'val_loss': avg_loss} + return {'val_loss': avg_loss, 'log': tensorboard_logs} + + def configure_optimizers(self): + return torch.optim.Adam(self.parameters(), lr=0.001) + + def train_dataloader(self): + loader = DataLoader(CoolDataset(), batch_size=1, num_workers=0) + return loader + + def val_dataloader(self): + loader = DataLoader(CoolDataset(), batch_size=1, num_workers=0) + return loader + +if __name__ == '__main__': + model = CoolSystem() + trainer = pl.Trainer(num_tpu_cores=1, progress_bar_refresh_rate=1, max_epochs=10, num_sanity_val_steps=0) + trainer.fit(model) From 827576cdc958512d66d67c6cf2041f4f5c9a45de Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 22 Jul 2020 20:43:27 +0000 Subject: [PATCH 028/112] wip --- scripts/cheatsheet.txt | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/scripts/cheatsheet.txt b/scripts/cheatsheet.txt index d39371e..6dde8ce 100644 --- a/scripts/cheatsheet.txt +++ b/scripts/cheatsheet.txt @@ -82,13 +82,3 @@ import torch_xla.debug.metrics as met; print(met.metrics_report()) curl -X POST http://10.125.212.42:8475/requestversion/pytorch-dev20200722 /usr/share/torch-xla-nightly/pytorch/xla/scripts/debug_run.py --outfile debug.tar.gz -- python -u scripts/test_tpu.py - - XLA_IR_DEBUG=1 - XRT_TPU_CONFIG=tpu_worker;0;10.125.212.42:8470 - TF_CPP_LOG_THREAD_ID=1 - TF_CPP_MIN_LOG_LEVEL=0 - XLA_HLO_DEBUG=1 - XLA_DUMP_FATAL_STACK=1 - TF_CPP_VMODULE=tensor=5,computation_client=5,xrt_computation_client=5,aten_xla_type=1 - XLA_SAVE_TENSORS_FILE=/tmp/debug_run-beltagy4-beltagy/graphs - XLA_METRICS_FILE=/tmp/debug_run-beltagy4-beltagy/metrics From 5d0c8a2a6dd76b68e0cfa491c50b4714e6befbdd Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 22 Jul 2020 13:52:22 -0700 Subject: [PATCH 
029/112] wip --- longformer/longformer_encoder_decoder.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/longformer/longformer_encoder_decoder.py b/longformer/longformer_encoder_decoder.py index 741939a..67ab3e4 100644 --- a/longformer/longformer_encoder_decoder.py +++ b/longformer/longformer_encoder_decoder.py @@ -7,8 +7,11 @@ class LongformerEncoderDecoderForConditionalGeneration(BartForConditionalGeneration): def __init__(self, config): super().__init__(config) - for i, layer in enumerate(self.model.encoder.layers): - layer.self_attn = LongformerSelfAttentionForBart(config, layer_id=i) + if config.attention_mode == 'n2': + pass # do nothing, use BertSelfAttention instead + else: + for i, layer in enumerate(self.model.encoder.layers): + layer.self_attn = LongformerSelfAttentionForBart(config, layer_id=i) class LongformerEncoderDecoderConfig(BartConfig): From 413258aeb41f0ea6cf081c13be85d23e1e10be5f Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 22 Jul 2020 13:52:41 -0700 Subject: [PATCH 030/112] wip --- scripts/mem_profiler.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/scripts/mem_profiler.py b/scripts/mem_profiler.py index 64a6d56..4edc6b0 100644 --- a/scripts/mem_profiler.py +++ b/scripts/mem_profiler.py @@ -1,14 +1,17 @@ from longformer.longformer_encoder_decoder import LongformerEncoderDecoderForConditionalGeneration from longformer.longformer_encoder_decoder import LongformerEncoderDecoderConfig +from longformer.longformer import LongformerForMaskedLM +from longformer.longformer import LongformerConfig + import torch from torch.utils.data import DataLoader, Dataset from pytorch_lightning import Trainer import pytorch_lightning as pl -seqlen = 1024 * 12 -global_size = 0 # seqlen // 100 -attention_window = 512 # one sided +seqlen = 1024 * 8 +global_size = seqlen // 100 +attention_window = 256 # one sided class CoolDataset(Dataset): @@ -18,7 +21,7 @@ def __len__(self): def __getitem__(self, idx): tokne_ids = torch.tensor([5] * seqlen) mask = torch.tensor([1] * seqlen) - # mask[:global_size] = 2 + mask[:global_size] = 2 return tokne_ids, mask @@ -28,21 +31,29 @@ def __init__(self, hparams=None): super().__init__() self.hparams = hparams - config = LongformerEncoderDecoderConfig.from_pretrained('bart-long-4096') + # config = LongformerEncoderDecoderConfig.from_pretrained('bart-long-4096') + config = LongformerConfig.from_pretrained('roberta-large') config.max_position_embeddings = seqlen + 2 config.gradient_checkpointing = True - config.attention_mode = 'sliding_chunks' + # config.attention_mode = 'sliding_chunks' + config.attention_mode = 'n2' config.attention_window = [attention_window] * config.num_hidden_layers - self.model = LongformerEncoderDecoderForConditionalGeneration(config) + config.attention_dilation = [1] * config.num_hidden_layers + # self.model = LongformerEncoderDecoderForConditionalGeneration(config) + self.model = LongformerForMaskedLM(config) def forward(self, x, y): print(seqlen, global_size, attention_window, torch.cuda.max_memory_allocated(x.device) / 1024 ** 3) + # import ipdb; ipdb.set_trace() + # return self.model(x, attention_mask=y, decoder_input_ids=x[:, :attention_window * 2], use_cache=False) return self.model(x, attention_mask=y) def training_step(self, batch, batch_idx): + # import ipdb; ipdb.set_trace() x, y = batch y_hat = self(x, y) loss = y_hat[0].sum() + # import ipdb; ipdb.set_trace() return {'loss': loss} def configure_optimizers(self): @@ -53,6 +64,6 @@ def 
train_dataloader(self): if __name__ == '__main__': - model = MemoryProfiler() + model = MemoryProfiler(hparams={}) trainer = Trainer(gpus=[0], progress_bar_refresh_rate=1, max_epochs=1, amp_level='O2', use_amp=True) trainer.fit(model) From 1a6498c117e380978151cd0b3e91de9f74640e05 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 22 Jul 2020 21:45:45 +0000 Subject: [PATCH 031/112] tpu --- scripts/pretrain.py | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 683b008..93d83a4 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -30,6 +30,14 @@ # TODO: run on beaker on ai2-server1/2 +try: + import torch_xla.core.xla_model as xm +except ImportError: + XLA_AVAILABLE = False +else: + XLA_AVAILABLE = True + + class MMapTextDataset(Dataset): def __init__(self, mmap_filename, chunk_size): self.num_instances = np.memmap(mmap_filename, mode='r', dtype=np.uint16).shape[0] // chunk_size @@ -146,16 +154,17 @@ def training_step(self, batch, batch_nb): input_ids = batch['input_ids'] tensorboard_logs = { 'input_size': input_ids.numel(), - 'memory': torch.cuda.memory_allocated(loss.device) / 1024 ** 3, - 'mlm_loss': loss.detach(), - 'mlm_bpc': loss.detach()/math.log(2), - 'mlm_perplexity': torch.exp(loss).detach(), + 'mlm_loss': loss, + 'mlm_bpc': loss/math.log(2), + 'mlm_perplexity': torch.exp(loss), 'token_per_step': input_ids.numel() * self.args.grad_accum * self.trainer.world_size, } if self.start_time != 0: elapsed_time = time.time() - self.start_time tensorboard_logs['second_per_batch'] = elapsed_time self.start_time = time.time() + if not XLA_AVAILABLE: + tensorboard_logs['memory'] = torch.cuda.memory_allocated(loss.device) / 1024 ** 3 return {'loss': loss, 'log': tensorboard_logs} @@ -204,6 +213,14 @@ def _get_loader(self, fname, is_train): if self.trainer.use_ddp: sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=is_train) shuffle = False + elif self.trainer.use_tpu: + sampler = torch.utils.data.distributed.DistributedSampler( + dataset, + num_replicas=xm.xrt_world_size(), + rank=xm.get_ordinal(), + shuffle=is_train, + ) + shuffle = False else: sampler = None shuffle = is_train @@ -227,6 +244,10 @@ def val_dataloader(self): def grad_norm(self, norm_type): # Override PTL `grad_norm` function to only return `total_grad_norm` instead norms of individual params + + if XLA_AVAILABLE: + return {} # computing grad_norm one parameter at a time takes forever on TPU + # TODO: grad_norm reporting needs to take fp16 loss scale into account all_norms = [float(p.grad.data.norm(float(norm_type))) for p in self.parameters() if p.grad is not None] return {'total_grad_norm': float(torch.tensor(all_norms).norm(norm_type))} @@ -266,8 +287,8 @@ def add_args(parser): parser.add_argument("--grad_clip", type=float, default=0) # TODO: test this with fp16. Likely not working # RoBERTa's tokens_per_step = 2^18 = 512(seqlen) x 1(gpu_count) x 32(batch_size) x 16(grad_accum) - parser.add_argument("--batch_size", type=int, default=32) - parser.add_argument("--grad_accum", type=int, default=16) + parser.add_argument("--batch_size", type=int, default=8) + parser.add_argument("--grad_accum", type=int, default=1) # Compute resources parser.add_argument("--num_workers", type=int, default=0) @@ -288,7 +309,7 @@ def add_args(parser): # --input_dir my_data_dir --save_prefix test_multinode parser.add_argument("--node_count", type=int, default=1, help="Number of nodes. 
It needs to match --nnodes of torch.distributed.launch") - parser.add_argument("--num_tpu_cores", type=int, default=None) + parser.add_argument("--tpu_core_count", type=int, default=None) return parser @@ -330,20 +351,22 @@ def main(args): trainer = ptl.Trainer( gpus=args.gpu_count, num_nodes=args.node_count, - num_tpu_cores=args.num_tpu_cores, + num_tpu_cores=args.tpu_core_count, distributed_backend='ddp' if (args.gpu_count > 1 or args.node_count > 1) else None, replace_sampler_ddp=False, track_grad_norm=2, max_epochs=10000, min_epochs=0, max_steps=args.train_steps, # run for many epochs, but stop after max_steps val_check_interval=args.val_every, limit_val_batches=args.val_batches, early_stop_callback=None, - row_log_interval=10, + row_log_interval=16, + progress_bar_refresh_rate=16, logger=logger, checkpoint_callback=checkpoint_callback, accumulate_grad_batches=args.grad_accum, resume_from_checkpoint=args.resume, gradient_clip_val=args.grad_clip, precision=16, amp_level='O2', + num_sanity_val_steps=2, callbacks=[LearningRateLogger()], ) trainer.fit(pretrainer) From 3e82548dc8be6af6e582ad80534b944335b31c87 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 22 Jul 2020 15:47:11 -0700 Subject: [PATCH 032/112] wip --- scripts/pretrain.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 93d83a4..2202a39 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -26,6 +26,12 @@ # DONE: testing ddp multiple machines # DONE: testing resume from checkpoint # TODO: try on a single TPU +# - tie weights +# - tensorboard +# - getrank +# - barrier +# - val all_reduce +# - checkpointing # TODO: try on a TPU-pod # TODO: run on beaker on ai2-server1/2 @@ -257,7 +263,7 @@ def add_args(parser): parser.add_argument("--seed", type=int, default=3) # Dataset. Some of these params are only useful when generating the dataset cache - parser.add_argument("--input_dir", type=str, required=True) + parser.add_argument("--input_dir", type=str, default='/net/nfs.corp/s2-research/beltagy/longformer/data/') parser.add_argument("--train_dev_split", type=float, default=0.05) parser.add_argument("--padded_chunks", type=bool, default=False) parser.add_argument("--seqlen", type=int, default=512) @@ -269,7 +275,7 @@ def add_args(parser): # Checkpointing and logging parser.add_argument("--save_dir", type=str, default='runs/') - parser.add_argument("--save_prefix", type=str, required=True, + parser.add_argument("--save_prefix", type=str, default='test', help="path of output directory is --save_dir/--save_prefix") parser.add_argument("--resume", type=str, default=None, # It is better to use a different output dir. help="Path to a checkpoint to load model weights and training state. It overwrites args") @@ -291,6 +297,7 @@ def add_args(parser): parser.add_argument("--grad_accum", type=int, default=1) # Compute resources + parser.add_argument("--fp16", type=bool, default=False) parser.add_argument("--num_workers", type=int, default=0) parser.add_argument("--gpu_count", type=int, default=1, # `--gpus` is reserved for internal use by PTL help="Number of gpus. 
This respects `CUDA_VISIBLE_DEVICES`") @@ -365,7 +372,7 @@ def main(args): accumulate_grad_batches=args.grad_accum, resume_from_checkpoint=args.resume, gradient_clip_val=args.grad_clip, - precision=16, amp_level='O2', + precision=16 if args.fp16 else 32, amp_level='O2', num_sanity_val_steps=2, callbacks=[LearningRateLogger()], ) From adadd425c84c4993bb77d2af950b0af17d10079b Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 23 Jul 2020 02:54:28 +0000 Subject: [PATCH 033/112] wip --- scripts/cheatsheet.txt | 2 ++ scripts/pretrain.py | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/cheatsheet.txt b/scripts/cheatsheet.txt index 6dde8ce..1e77b07 100644 --- a/scripts/cheatsheet.txt +++ b/scripts/cheatsheet.txt @@ -82,3 +82,5 @@ import torch_xla.debug.metrics as met; print(met.metrics_report()) curl -X POST http://10.125.212.42:8475/requestversion/pytorch-dev20200722 /usr/share/torch-xla-nightly/pytorch/xla/scripts/debug_run.py --outfile debug.tar.gz -- python -u scripts/test_tpu.py + +/usr/share/torch-xla-nightly/pytorch/xla/scripts/debug_run.py --outfile debug.tar.gz -- python -u scripts/pretrain.py --input_dir data/ --save_prefix test_xla_2 --gpu_count 0 --tpu_core_count 1 --val_batches 4 --val_every 130 --num_workers 0 --log_rate 1 --model allenai/longformer-base-4096 diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 2202a39..ab1120a 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -281,6 +281,7 @@ def add_args(parser): help="Path to a checkpoint to load model weights and training state. It overwrites args") parser.add_argument("--resume_model_only", type=str, default=None, help="Path to a checkpoint to load model weights but not training state") + parser.add_argument("--log_rate", type=int, default=16) # Training hyperparams parser.add_argument("--lr", type=float, default=1e-5) @@ -365,8 +366,8 @@ def main(args): max_epochs=10000, min_epochs=0, max_steps=args.train_steps, # run for many epochs, but stop after max_steps val_check_interval=args.val_every, limit_val_batches=args.val_batches, early_stop_callback=None, - row_log_interval=16, - progress_bar_refresh_rate=16, + row_log_interval=args.log_rate, + progress_bar_refresh_rate=args.log_rate, logger=logger, checkpoint_callback=checkpoint_callback, accumulate_grad_batches=args.grad_accum, From 9e191a0b4ecd763e862075a6097366561a99523e Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 15 Jul 2020 23:19:32 -0700 Subject: [PATCH 034/112] pretraining script --- requirements.txt | 6 +- scripts/pretrain.py | 277 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 280 insertions(+), 3 deletions(-) create mode 100644 scripts/pretrain.py diff --git a/requirements.txt b/requirements.txt index 5b004e7..75cbab1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -torch>=1.2.0 -transformers>=3.0.2 +torch>=1.5.0 +transformers>=3.0.1 tensorboardX -pytorch-lightning==0.6.0 +pytorch-lightning==0.7.6 test-tube==0.7.5 diff --git a/scripts/pretrain.py b/scripts/pretrain.py new file mode 100644 index 0000000..060ab0e --- /dev/null +++ b/scripts/pretrain.py @@ -0,0 +1,277 @@ +import argparse +import glob +import os +import random +import logging +import numpy as np +from tqdm import tqdm +import torch +from transformers import AutoTokenizer, AutoModelWithLMHead +from transformers import DataCollatorForLanguageModeling +from transformers.optimization import AdamW, get_linear_schedule_with_warmup + +from torch.utils.data import Dataset, DataLoader +import pytorch_lightning as ptl 
+from pytorch_lightning.logging.test_tube import TestTubeLogger +from pytorch_lightning.callbacks import ModelCheckpoint + + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +class MMapTextDataset(Dataset): + def __init__(self, mmap_filename, chunk_size): + self.num_instances = np.memmap(mmap_filename, mode='r', dtype=np.uint16).shape[0] // chunk_size + # defer loading the token_ids memmap until after the first __getitem__ call. + # when spawning new processes for ddp, there is a hard limit in python < 3.8 that + # pickle files need to be < 4GB. By waiting until after the first __getitem__ we + # don't have to pickle the memmap + self.token_ids = None + self._mmap_filename = mmap_filename + self._chunk_size = chunk_size + + def __len__(self): + return self.num_instances + + def __getitem__(self, i): + if self.token_ids is None: + self.token_ids = np.memmap(self._mmap_filename, mode='r', dtype=np.uint16, + shape=(self.num_instances, self._chunk_size)) + return torch.tensor(self.token_ids[i, :].astype(np.int32), dtype=torch.long) + + @staticmethod + def raw_text_to_mmap(args): + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) + assert len(tokenizer) < 65535 # will use uint16 to store token ids + all_files = glob.glob(f'{args.input_dir}/*.txt') + + if os.path.exists(f'{args.input_dir}/cache/'): + logger.info("Cache already exists. Remove the cache directory to regenerate") + return + os.mkdir(f'{args.input_dir}/cache/') + train_chunks = [] + val_chunks = [] + + # TODO: process each shared in a separate worker + # TODO: support multiple documents in one chunk instead of padding + for fname in tqdm(all_files): + with open(fname, 'r') as fin: + for line in tqdm(fin): + if line.strip() == '': # drop empty lines + continue + chunks_list = train_chunks if random.random() > args.train_dev_split else val_chunks + tokens = tokenizer.tokenize(line) # each line is one document + # generate chunks of length args.seqlen. The last chunk will be padded. 
+ # padding last chunk is not great for longformer because many chunks will be mostly padding + current_chunk = [tokenizer.bos_token] + for token in tokens: + if len(current_chunk) == args.seqlen - 1: # chunk is full + current_chunk.append(tokenizer.eos_token) + chunks_list.append(current_chunk) + current_chunk = [tokenizer.bos_token] + current_chunk.append(token) + current_chunk.extend([tokenizer.pad_token] * (args.seqlen - len(current_chunk))) + current_chunk[args.seqlen - 1] = tokenizer.eos_token + chunks_list.append(current_chunk) + + def _tokenized_text_to_mmap(output_fname, chunks_list): + random.shuffle(chunks_list) + num_chunks = len(chunks_list) + all_token_ids = np.empty((num_chunks, args.seqlen), dtype=np.uint16) + for k, chunk in enumerate(tqdm(chunks_list)): + token_ids = tokenizer.convert_tokens_to_ids(chunk) + assert len(token_ids) == args.seqlen + all_token_ids[k, :] = [int(t) for t in token_ids] + fp = np.memmap(output_fname, dtype=np.uint16, mode='w+', shape=(num_chunks, args.seqlen)) + fp[:, :] = all_token_ids[:, :] + fp.flush() + del fp + + _tokenized_text_to_mmap(f'{args.input_dir}/cache/train.bin', train_chunks) + _tokenized_text_to_mmap(f'{args.input_dir}/cache/val.bin', val_chunks) + + +class Pretrainer(ptl.LightningModule): + + def __init__(self, hparams): + super().__init__() + + self.args = hparams + self.hparams = self.args + + self.model = AutoModelWithLMHead.from_pretrained(args.model) + self.config = self.model.config + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) + self.pad_token_id = tokenizer.pad_token_id + + logger.info(f'Creating dataset cache from dir {self.args.input_dir}. This could be slow the first time.') + MMapTextDataset.raw_text_to_mmap(args) + + # TODO: add support for other objective functions + self.data_collator = DataCollatorForLanguageModeling( + tokenizer=tokenizer, mlm=True, mlm_probability=self.args.mlm_prob + ) + + def forward(self, input_ids=None, labels=None, loss_only=True): + # get the padding mask - 1 for NOT masked, 0 for MASKED/PAD + attention_mask = (input_ids != self.pad_token_id).int() + + if labels is not None: + # output is loss, prediction_scores, hidden_states + output = self.model(input_ids=input_ids, attention_mask=attention_mask, masked_lm_labels=labels) + if loss_only: + return output[0] + else: + return {"loss": output[0], "hidden_states": output[2]} + else: + # don't need to run the lm_head + assert not loss_only + output = self.model.roberta(input_ids=input_ids, attention_mask=attention_mask) + return {"hidden_states": output[2]} + + def training_step(self, batch, batch_nb): + loss = self(**batch) + tensorboard_logs = { + 'mlm_loss': loss.detach(), + 'mlm_perplexity': torch.exp(loss).detach(), + } + return {'loss': loss, 'log': tensorboard_logs} + + def validation_step(self, batch, batch_nb): + loss = self(**batch) + tensorboard_logs = { + 'val_mlm_loss': loss.detach(), + } + return {'val_loss': tensorboard_logs["val_mlm_loss"], 'log': tensorboard_logs} + + def validation_epoch_end(self, outputs): + avg_loss = torch.stack([x['log']['val_mlm_loss'] for x in outputs if 'val_mlm_loss' in x['log']]).mean() + if self.use_ddp: + avg_loss = torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM) + avg_loss /= torch.distributed.get_world_size() + avg_loss = avg_loss.item() + logs = {'val_mlm_loss': avg_loss} + return {'log': logs, 'progress_bar': logs, "val_loss": avg_loss} + + def configure_optimizers(self): + no_decay = ["bias", "LayerNorm.weight"] + + optimizer_grouped_parameters = [ + { + 
"params": [p for n, p in self.named_parameters() if not any(nd in n for nd in no_decay) and p.requires_grad], + "weight_decay": self.args.weight_decay, + }, + { + "params": [p for n, p in self.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad], + "weight_decay": 0.0, + }, + ] + optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon) + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=self.args.training_steps + ) + + return [optimizer], [{"scheduler": scheduler, "interval": "step"}] + + def _get_loader(self, fname, is_train): + dataset = MMapTextDataset(fname, chunk_size=self.args.seqlen) + + if self.trainer.use_ddp: + sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=is_train) + shuffle = False + else: + sampler = None + shuffle = is_train + + loader = DataLoader( + dataset, + batch_size=self.args.batch_size, + shuffle=shuffle, + sampler=sampler, + num_workers=self.args.num_workers, + collate_fn=self.data_collator, + drop_last=is_train, + ) + return loader + + def train_dataloader(self): + return self._get_loader(f'{self.args.input_dir}/cache/train.bin', True) + + def val_dataloader(self): + return self._get_loader(f'{self.args.input_dir}/cache/val.bin', False) + + @staticmethod + def add_args(parser): + parser.add_argument("--seed", type=int, default=3) + parser.add_argument("--input_dir", type=str, required=True) + parser.add_argument("--save_dir", type=str, default='runs/') + parser.add_argument("--save_prefix", type=str, required=True) + parser.add_argument("--train_dev_split", type=float, default=0.05) + parser.add_argument("--seqlen", type=int, default=512) + parser.add_argument("--tokenizer", type=str, default='roberta-base') + parser.add_argument("--model", type=str, default='roberta-base') + parser.add_argument("--mlm_prob", type=float, default=0.15) + parser.add_argument("--weight_decay", type=float, default=0.01) + parser.add_argument("--learning_rate", type=float, default=1e-5) + parser.add_argument("--adam_epsilon", type=float, default=1e-6) + parser.add_argument("--training_steps", type=int, default=0.01) + parser.add_argument("--warmup_steps", type=int, default=1000) + parser.add_argument("--batch_size", type=int, default=8) + parser.add_argument("--num_workers", type=int, default=0) + parser.add_argument("--grad_accum", type=int, default=1) + parser.add_argument("--gpus", type=str, default='0') + parser.add_argument("--resume", type=str, default=None) + parser.add_argument("--num_tpu_cores", type=int, default=None) + + return parser + + +def main(args): + random.seed(args.seed * 10) + np.random.seed(args.seed * 100) + torch.manual_seed(args.seed * 1000) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(args.seed * 10000) + + pretrainer = Pretrainer(args) + + # logger here is a SummaryWritter for tensorboard + # it is used by the trainer, and certain return variables + # from the model are automatically logged + logger = TestTubeLogger( + save_dir=args.save_dir, + name=args.save_prefix, + version=0 # always use version=0 + ) + + checkpoint_callback = ModelCheckpoint( + # model saved to filepath/prefix_.... 
+ filepath=os.path.join(args.save_dir, args.save_prefix, 'checkpoint'), + prefix='', + save_top_k=3, + verbose=True, + monitor='val_loss', + mode='min', + ) + + args.gpus = [int(x) for x in args.gpus.split(',')] + trainer = ptl.Trainer( + gpus=args.gpus, + num_tpu_cores=args.num_tpu_cores, + distributed_backend='ddp' if len(args.gpus) > 1 else None, + track_grad_norm=-1, + max_epochs=10000, min_epochs=0, max_steps=args.training_steps, # run for many epochs, but stop after max_steps + early_stop_callback=None, + row_log_interval=25, + logger=logger, + checkpoint_callback=checkpoint_callback, + resume_from_checkpoint=args.resume, + ) + trainer.fit(pretrainer) + + +if __name__ == "__main__": + parser = Pretrainer.add_args(argparse.ArgumentParser(description="pretrain")) + args = parser.parse_args() + main(args) From 9d18808eaa5debf1f8d0b474558c79e5072788fc Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 16 Jul 2020 14:12:22 -0700 Subject: [PATCH 035/112] wip --- requirements.txt | 4 +-- scripts/pretrain.py | 68 ++++++++++++++++++++++++++++----------------- 2 files changed, 44 insertions(+), 28 deletions(-) diff --git a/requirements.txt b/requirements.txt index 75cbab1..2279015 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ torch>=1.5.0 -transformers>=3.0.1 +transformers==3.0.2 tensorboardX -pytorch-lightning==0.7.6 +pytorch-lightning==0.8.5 test-tube==0.7.5 diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 060ab0e..014c33f 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -5,6 +5,7 @@ import logging import numpy as np from tqdm import tqdm +import time import torch from transformers import AutoTokenizer, AutoModelWithLMHead from transformers import DataCollatorForLanguageModeling @@ -13,7 +14,7 @@ from torch.utils.data import Dataset, DataLoader import pytorch_lightning as ptl from pytorch_lightning.logging.test_tube import TestTubeLogger -from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateLogger logging.basicConfig(level=logging.INFO) @@ -112,33 +113,36 @@ def __init__(self, hparams): self.data_collator = DataCollatorForLanguageModeling( tokenizer=tokenizer, mlm=True, mlm_probability=self.args.mlm_prob ) + self.start_time = 0 - def forward(self, input_ids=None, labels=None, loss_only=True): + def forward(self, input_ids=None, labels=None): # get the padding mask - 1 for NOT masked, 0 for MASKED/PAD attention_mask = (input_ids != self.pad_token_id).int() - if labels is not None: - # output is loss, prediction_scores, hidden_states - output = self.model(input_ids=input_ids, attention_mask=attention_mask, masked_lm_labels=labels) - if loss_only: - return output[0] - else: - return {"loss": output[0], "hidden_states": output[2]} - else: - # don't need to run the lm_head - assert not loss_only - output = self.model.roberta(input_ids=input_ids, attention_mask=attention_mask) - return {"hidden_states": output[2]} + # output is loss, prediction_scores, hidden_states + output = self.model(input_ids=input_ids, attention_mask=attention_mask, masked_lm_labels=labels) + return output[0] # loss def training_step(self, batch, batch_nb): loss = self(**batch) + input_ids = batch['input_ids'] tensorboard_logs = { + 'input_size': input_ids.numel(), + 'memory': torch.cuda.memory_allocated(loss.device) / 1024 ** 3, + 'lr': self.trainer.optimizers[0].param_groups[0]['lr'], 'mlm_loss': loss.detach(), 'mlm_perplexity': torch.exp(loss).detach(), + 'tokens per step': input_ids.numel() * 
self.args.grad_accum * self.trainer.world_size, } + if self.start_time != 0: + elapsed_time = time.time() - self.start_time + tensorboard_logs['time per batch'] = elapsed_time + self.start_time = time.time() + return {'loss': loss, 'log': tensorboard_logs} def validation_step(self, batch, batch_nb): + self.start_time = 0 # reset training_step timer loss = self(**batch) tensorboard_logs = { 'val_mlm_loss': loss.detach(), @@ -148,7 +152,7 @@ def validation_step(self, batch, batch_nb): def validation_epoch_end(self, outputs): avg_loss = torch.stack([x['log']['val_mlm_loss'] for x in outputs if 'val_mlm_loss' in x['log']]).mean() if self.use_ddp: - avg_loss = torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM) + torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM) avg_loss /= torch.distributed.get_world_size() avg_loss = avg_loss.item() logs = {'val_mlm_loss': avg_loss} @@ -169,7 +173,7 @@ def configure_optimizers(self): ] optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( - optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=self.args.training_steps + optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=self.args.train_steps ) return [optimizer], [{"scheduler": scheduler, "interval": "step"}] @@ -215,12 +219,17 @@ def add_args(parser): parser.add_argument("--weight_decay", type=float, default=0.01) parser.add_argument("--learning_rate", type=float, default=1e-5) parser.add_argument("--adam_epsilon", type=float, default=1e-6) - parser.add_argument("--training_steps", type=int, default=0.01) - parser.add_argument("--warmup_steps", type=int, default=1000) + parser.add_argument("--grad_clip", type=float, default=0) + parser.add_argument("--warmup_steps", type=int, default=30, help='# warmup gradient updates') + parser.add_argument("--train_steps", type=int, default=100, help='# training gradient updates') + parser.add_argument("--val_every", type=int, default=25, help='# training gradient updates between evaluations') + parser.add_argument("--val_batches", type=int, default=20, help='# evaluation **batches**') parser.add_argument("--batch_size", type=int, default=8) parser.add_argument("--num_workers", type=int, default=0) parser.add_argument("--grad_accum", type=int, default=1) - parser.add_argument("--gpus", type=str, default='0') + # `--gpus` is reserved. Always set CUDA_VISIBLE_DEVICES to 0,1,2 ... 
n + # FIXME: PTL has a bug in gpu selection and it will always select gpus starting from 0 upward + parser.add_argument("--gpu_count", type=int, default=1) parser.add_argument("--resume", type=str, default=None) parser.add_argument("--num_tpu_cores", type=int, default=None) @@ -255,18 +264,25 @@ def main(args): mode='min', ) - args.gpus = [int(x) for x in args.gpus.split(',')] + # TODO: try gradient accumulation + + args.val_every_batches = args.val_every * args.grad_accum # convert val_every_steps to val_every_batches trainer = ptl.Trainer( - gpus=args.gpus, + gpus=args.gpu_count, + auto_select_gpus=False, num_tpu_cores=args.num_tpu_cores, - distributed_backend='ddp' if len(args.gpus) > 1 else None, - track_grad_norm=-1, - max_epochs=10000, min_epochs=0, max_steps=args.training_steps, # run for many epochs, but stop after max_steps + distributed_backend='ddp' if args.gpu_count > 1 else None, + replace_sampler_ddp=False, + track_grad_norm=-1, # TODO: add logging for gradient norm + max_epochs=10000, min_epochs=0, max_steps=args.train_steps, # run for many epochs, but stop after max_steps + val_check_interval=args.val_every_batches, limit_val_batches=args.val_batches, early_stop_callback=None, - row_log_interval=25, + row_log_interval=10, logger=logger, - checkpoint_callback=checkpoint_callback, + checkpoint_callback=None, # FIXME: checkpoint_callback, resume_from_checkpoint=args.resume, + gradient_clip_val=args.grad_clip, + callbacks=[LearningRateLogger()] ) trainer.fit(pretrainer) From 6e24cee0c5f25a2e9d79d8847e74cc2e96618040 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 16 Jul 2020 17:32:04 -0700 Subject: [PATCH 036/112] wip --- scripts/pretrain.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 014c33f..4eb08b5 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -20,6 +20,9 @@ logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) +# TODO: Try on multiple machines +# TODO: try on a single TPU +# TODO: try on a TPU-pod class MMapTextDataset(Dataset): def __init__(self, mmap_filename, chunk_size): @@ -129,19 +132,19 @@ def training_step(self, batch, batch_nb): tensorboard_logs = { 'input_size': input_ids.numel(), 'memory': torch.cuda.memory_allocated(loss.device) / 1024 ** 3, - 'lr': self.trainer.optimizers[0].param_groups[0]['lr'], 'mlm_loss': loss.detach(), 'mlm_perplexity': torch.exp(loss).detach(), - 'tokens per step': input_ids.numel() * self.args.grad_accum * self.trainer.world_size, + 'token_per_step': input_ids.numel() * self.args.grad_accum * self.trainer.world_size, } if self.start_time != 0: elapsed_time = time.time() - self.start_time - tensorboard_logs['time per batch'] = elapsed_time + tensorboard_logs['second_per_batch'] = elapsed_time self.start_time = time.time() return {'loss': loss, 'log': tensorboard_logs} def validation_step(self, batch, batch_nb): + # TODO: log how long evaluation takes self.start_time = 0 # reset training_step timer loss = self(**batch) tensorboard_logs = { @@ -228,7 +231,7 @@ def add_args(parser): parser.add_argument("--num_workers", type=int, default=0) parser.add_argument("--grad_accum", type=int, default=1) # `--gpus` is reserved. Always set CUDA_VISIBLE_DEVICES to 0,1,2 ... 
n - # FIXME: PTL has a bug in gpu selection and it will always select gpus starting from 0 upward + # TODO: PTL has a bug in gpu selection and it will always select gpus starting from 0 upward parser.add_argument("--gpu_count", type=int, default=1) parser.add_argument("--resume", type=str, default=None) parser.add_argument("--num_tpu_cores", type=int, default=None) @@ -264,9 +267,7 @@ def main(args): mode='min', ) - # TODO: try gradient accumulation - - args.val_every_batches = args.val_every * args.grad_accum # convert val_every_steps to val_every_batches + args.val_every *= args.grad_accum # PTL is expecting number of batches_per_gpu trainer = ptl.Trainer( gpus=args.gpu_count, auto_select_gpus=False, @@ -275,11 +276,12 @@ def main(args): replace_sampler_ddp=False, track_grad_norm=-1, # TODO: add logging for gradient norm max_epochs=10000, min_epochs=0, max_steps=args.train_steps, # run for many epochs, but stop after max_steps - val_check_interval=args.val_every_batches, limit_val_batches=args.val_batches, + val_check_interval=args.val_every, limit_val_batches=args.val_batches, early_stop_callback=None, row_log_interval=10, logger=logger, - checkpoint_callback=None, # FIXME: checkpoint_callback, + checkpoint_callback=checkpoint_callback, + accumulate_grad_batches=args.grad_accum, resume_from_checkpoint=args.resume, gradient_clip_val=args.grad_clip, callbacks=[LearningRateLogger()] From a2ab9b353a375cc5e7b8a23721fab86af7666ae5 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 16 Jul 2020 18:02:36 -0700 Subject: [PATCH 037/112] wip --- scripts/pretrain.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 4eb08b5..34775a7 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -145,6 +145,7 @@ def training_step(self, batch, batch_nb): def validation_step(self, batch, batch_nb): # TODO: log how long evaluation takes + # TODO: reproduce roberta evaluation numbers on the longformer corpus self.start_time = 0 # reset training_step timer loss = self(**batch) tensorboard_logs = { From e3f4ba9816e9d99d19b114eb65baf508b9122f7a Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 16 Jul 2020 21:13:52 -0700 Subject: [PATCH 038/112] wip --- scripts/pretrain.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 34775a7..232a4a3 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -46,7 +46,7 @@ def __getitem__(self, i): @staticmethod def raw_text_to_mmap(args): - tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, use_fast=True) assert len(tokenizer) < 65535 # will use uint16 to store token ids all_files = glob.glob(f'{args.input_dir}/*.txt') @@ -59,6 +59,7 @@ def raw_text_to_mmap(args): # TODO: process each shared in a separate worker # TODO: support multiple documents in one chunk instead of padding + # TODO: replace the in memory lists `train_chunks` and `train_chunks` with files for fname in tqdm(all_files): with open(fname, 'r') as fin: for line in tqdm(fin): From f9e654b24c4dbe33901db275bcf52a3ac3271693 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 16 Jul 2020 22:09:32 -0700 Subject: [PATCH 039/112] . 
--- scripts/pretrain.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 232a4a3..a5171ad 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -24,6 +24,7 @@ # TODO: try on a single TPU # TODO: try on a TPU-pod + class MMapTextDataset(Dataset): def __init__(self, mmap_filename, chunk_size): self.num_instances = np.memmap(mmap_filename, mode='r', dtype=np.uint16).shape[0] // chunk_size @@ -57,9 +58,9 @@ def raw_text_to_mmap(args): train_chunks = [] val_chunks = [] - # TODO: process each shared in a separate worker + # TODO: process each shared in a separate worker and save their output to files # TODO: support multiple documents in one chunk instead of padding - # TODO: replace the in memory lists `train_chunks` and `train_chunks` with files + for fname in tqdm(all_files): with open(fname, 'r') as fin: for line in tqdm(fin): From 9c2646da2061f5ac64d1beaac4e61d836748613c Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 17 Jul 2020 08:11:10 -0700 Subject: [PATCH 040/112] pad chunks or start next doc --- scripts/pretrain.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index a5171ad..c787344 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -55,34 +55,42 @@ def raw_text_to_mmap(args): logger.info("Cache already exists. Remove the cache directory to regenerate") return os.mkdir(f'{args.input_dir}/cache/') - train_chunks = [] - val_chunks = [] # TODO: process each shared in a separate worker and save their output to files - # TODO: support multiple documents in one chunk instead of padding + chunks_list = [] for fname in tqdm(all_files): with open(fname, 'r') as fin: + current_chunk = [tokenizer.bos_token] for line in tqdm(fin): if line.strip() == '': # drop empty lines continue - chunks_list = train_chunks if random.random() > args.train_dev_split else val_chunks tokens = tokenizer.tokenize(line) # each line is one document # generate chunks of length args.seqlen. The last chunk will be padded. 
# padding last chunk is not great for longformer because many chunks will be mostly padding - current_chunk = [tokenizer.bos_token] + for token in tokens: if len(current_chunk) == args.seqlen - 1: # chunk is full current_chunk.append(tokenizer.eos_token) chunks_list.append(current_chunk) current_chunk = [tokenizer.bos_token] current_chunk.append(token) - current_chunk.extend([tokenizer.pad_token] * (args.seqlen - len(current_chunk))) - current_chunk[args.seqlen - 1] = tokenizer.eos_token - chunks_list.append(current_chunk) + if args.padded_chunks: + # fill the rest of the seqlen with pad + current_chunk.extend([tokenizer.pad_token] * (args.seqlen - len(current_chunk))) + current_chunk[args.seqlen - 1] = tokenizer.eos_token + chunks_list.append(current_chunk) + current_chunk = [tokenizer.bos_token] + else: + # one long doc with sep inbetween + if len(current_chunk) < args.seqlen - 1: + current_chunk.append(tokenizer.sep_token) + random.shuffle(chunks_list) + val_count = int(args.train_dev_split * len(chunks_list)) + val_chunks = chunks_list[:val_count] + train_chunks = chunks_list[val_count:] def _tokenized_text_to_mmap(output_fname, chunks_list): - random.shuffle(chunks_list) num_chunks = len(chunks_list) all_token_ids = np.empty((num_chunks, args.seqlen), dtype=np.uint16) for k, chunk in enumerate(tqdm(chunks_list)): @@ -222,6 +230,7 @@ def add_args(parser): parser.add_argument("--tokenizer", type=str, default='roberta-base') parser.add_argument("--model", type=str, default='roberta-base') parser.add_argument("--mlm_prob", type=float, default=0.15) + parser.add_argument("--padded_chunks", type=bool, default=False) parser.add_argument("--weight_decay", type=float, default=0.01) parser.add_argument("--learning_rate", type=float, default=1e-5) parser.add_argument("--adam_epsilon", type=float, default=1e-6) From 433a2e29d16f263cec3d9165191d819d87ac1e4b Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 17 Jul 2020 08:18:24 -0700 Subject: [PATCH 041/112] todo --- scripts/pretrain.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index c787344..d2c5378 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -166,6 +166,8 @@ def validation_step(self, batch, batch_nb): def validation_epoch_end(self, outputs): avg_loss = torch.stack([x['log']['val_mlm_loss'] for x in outputs if 'val_mlm_loss' in x['log']]).mean() if self.use_ddp: + # TODO: PTL already doing this. Is it still needed here? 
+ # https://github.com/PyTorchLightning/pytorch-lightning/blob/0.8.5/pytorch_lightning/metrics/converters.py#L251 torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM) avg_loss /= torch.distributed.get_world_size() avg_loss = avg_loss.item() @@ -195,6 +197,7 @@ def configure_optimizers(self): def _get_loader(self, fname, is_train): dataset = MMapTextDataset(fname, chunk_size=self.args.seqlen) + # TODO: consider `replace_sampler_ddp=True` and removing the following if statement if self.trainer.use_ddp: sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=is_train) shuffle = False From ec472709f678e0aeee1d9071cb9919fb5a5a7179 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 17 Jul 2020 09:22:56 -0700 Subject: [PATCH 042/112] wip --- scripts/pretrain.py | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index d2c5378..95ce577 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -4,6 +4,7 @@ import random import logging import numpy as np +import math from tqdm import tqdm import time import torch @@ -143,6 +144,7 @@ def training_step(self, batch, batch_nb): 'input_size': input_ids.numel(), 'memory': torch.cuda.memory_allocated(loss.device) / 1024 ** 3, 'mlm_loss': loss.detach(), + 'mlm_bpc': loss.detach()/math.log(2), 'mlm_perplexity': torch.exp(loss).detach(), 'token_per_step': input_ids.numel() * self.args.grad_accum * self.trainer.world_size, } @@ -225,30 +227,42 @@ def val_dataloader(self): @staticmethod def add_args(parser): parser.add_argument("--seed", type=int, default=3) + + # Dataset. Some of these params are only useful when generating the dataset cache parser.add_argument("--input_dir", type=str, required=True) - parser.add_argument("--save_dir", type=str, default='runs/') - parser.add_argument("--save_prefix", type=str, required=True) parser.add_argument("--train_dev_split", type=float, default=0.05) + parser.add_argument("--padded_chunks", type=bool, default=False) parser.add_argument("--seqlen", type=int, default=512) + parser.add_argument("--mlm_prob", type=float, default=0.15) + + # HF model loading parser.add_argument("--tokenizer", type=str, default='roberta-base') parser.add_argument("--model", type=str, default='roberta-base') - parser.add_argument("--mlm_prob", type=float, default=0.15) - parser.add_argument("--padded_chunks", type=bool, default=False) - parser.add_argument("--weight_decay", type=float, default=0.01) + + # Checkpointing and logging + parser.add_argument("--save_dir", type=str, default='runs/') + parser.add_argument("--save_prefix", type=str, required=True) + parser.add_argument("--resume", type=str, default=None) + + # Training hyperparams parser.add_argument("--learning_rate", type=float, default=1e-5) + parser.add_argument("--train_steps", type=int, default=3000, help='# training grad. updates') + parser.add_argument("--warmup_steps", type=int, default=1000, help='# warmup grad. updates') + parser.add_argument("--val_every", type=int, default=1000, help='# training grad. 
updates between evaluations') + parser.add_argument("--val_batches", type=int, default=1000, help='# evaluation **batches**') + parser.add_argument("--weight_decay", type=float, default=0.01) parser.add_argument("--adam_epsilon", type=float, default=1e-6) parser.add_argument("--grad_clip", type=float, default=0) - parser.add_argument("--warmup_steps", type=int, default=30, help='# warmup gradient updates') - parser.add_argument("--train_steps", type=int, default=100, help='# training gradient updates') - parser.add_argument("--val_every", type=int, default=25, help='# training gradient updates between evaluations') - parser.add_argument("--val_batches", type=int, default=20, help='# evaluation **batches**') - parser.add_argument("--batch_size", type=int, default=8) + + # RoBERTa's tokens_per_step = 2^18 = 512(seqlen) x 1(gpu_count) x 32(batch_size) x 16(grad_accum) + parser.add_argument("--batch_size", type=int, default=32) + parser.add_argument("--grad_accum", type=int, default=16) + + # Compute resources parser.add_argument("--num_workers", type=int, default=0) - parser.add_argument("--grad_accum", type=int, default=1) # `--gpus` is reserved. Always set CUDA_VISIBLE_DEVICES to 0,1,2 ... n # TODO: PTL has a bug in gpu selection and it will always select gpus starting from 0 upward parser.add_argument("--gpu_count", type=int, default=1) - parser.add_argument("--resume", type=str, default=None) parser.add_argument("--num_tpu_cores", type=int, default=None) return parser From 77e105d14a65c17878f07e33bc19979d8c57cf98 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 17 Jul 2020 13:51:56 -0700 Subject: [PATCH 043/112] wip --- scripts/pretrain.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 95ce577..a715c9c 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -24,6 +24,7 @@ # TODO: Try on multiple machines # TODO: try on a single TPU # TODO: try on a TPU-pod +# TODO: PTL bug: https://github.com/PyTorchLightning/pytorch-lightning/issues/2635 class MMapTextDataset(Dataset): @@ -157,7 +158,6 @@ def training_step(self, batch, batch_nb): def validation_step(self, batch, batch_nb): # TODO: log how long evaluation takes - # TODO: reproduce roberta evaluation numbers on the longformer corpus self.start_time = 0 # reset training_step timer loss = self(**batch) tensorboard_logs = { From af08b5a2ef4d72a859860c5f048204c069d56694 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 17 Jul 2020 17:24:05 -0700 Subject: [PATCH 044/112] wip --- requirements.txt | 3 ++- scripts/pretrain.py | 10 +++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index 2279015..cbce7f0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ +pytorch-lightning @ git+http://github.com/ibeltagy/pytorch-lightning.git@v0.8.5_fixes#egg=pytorch-lightning + torch>=1.5.0 transformers==3.0.2 tensorboardX -pytorch-lightning==0.8.5 test-tube==0.7.5 diff --git a/scripts/pretrain.py b/scripts/pretrain.py index a715c9c..2f6b890 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -21,10 +21,12 @@ logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) +# DONE: reproduce RoBERTa numbers on the Longformer corpus # TODO: Try on multiple machines # TODO: try on a single TPU # TODO: try on a TPU-pod -# TODO: PTL bug: https://github.com/PyTorchLightning/pytorch-lightning/issues/2635 +# TODO: try restarting and double check optimizer, lr and lr scheduler +# TODO: try fp16 
class MMapTextDataset(Dataset): @@ -260,9 +262,8 @@ def add_args(parser): # Compute resources parser.add_argument("--num_workers", type=int, default=0) - # `--gpus` is reserved. Always set CUDA_VISIBLE_DEVICES to 0,1,2 ... n - # TODO: PTL has a bug in gpu selection and it will always select gpus starting from 0 upward - parser.add_argument("--gpu_count", type=int, default=1) + parser.add_argument("--gpu_count", type=int, default=1, # `--gpus` is reserved for internal use by PTL + help="Number of gpus. This respects `CUDA_VISIBLE_DEVICES`") parser.add_argument("--num_tpu_cores", type=int, default=None) return parser @@ -299,7 +300,6 @@ def main(args): args.val_every *= args.grad_accum # PTL is expecting number of batches_per_gpu trainer = ptl.Trainer( gpus=args.gpu_count, - auto_select_gpus=False, num_tpu_cores=args.num_tpu_cores, distributed_backend='ddp' if args.gpu_count > 1 else None, replace_sampler_ddp=False, From d1050232a91b8aee85cca032c761543479f8d748 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 17 Jul 2020 20:13:08 -0700 Subject: [PATCH 045/112] wip --- requirements.txt | 2 +- scripts/pretrain.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index cbce7f0..b396708 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ pytorch-lightning @ git+http://github.com/ibeltagy/pytorch-lightning.git@v0.8.5_fixes#egg=pytorch-lightning -torch>=1.5.0 +torch==1.3.0 transformers==3.0.2 tensorboardX test-tube==0.7.5 diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 2f6b890..3263537 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -26,7 +26,6 @@ # TODO: try on a single TPU # TODO: try on a TPU-pod # TODO: try restarting and double check optimizer, lr and lr scheduler -# TODO: try fp16 class MMapTextDataset(Dataset): @@ -313,6 +312,7 @@ def main(args): accumulate_grad_batches=args.grad_accum, resume_from_checkpoint=args.resume, gradient_clip_val=args.grad_clip, + precision=16, amp_level='O2', callbacks=[LearningRateLogger()] ) trainer.fit(pretrainer) From 1183999999500f1043f001a08549bafd18ea84db Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 17 Jul 2020 22:25:48 -0700 Subject: [PATCH 046/112] wip --- scripts/pretrain.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 3263537..e79eb17 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -225,6 +225,12 @@ def train_dataloader(self): def val_dataloader(self): return self._get_loader(f'{self.args.input_dir}/cache/val.bin', False) + def grad_norm(self, norm_type): + # Override PTL `grad_norm` function to only return `total_grad_norm` instead norms of individual params + # TODO: grad_norm reporting needs to take fp16 loss scale into account + all_norms = [float(p.grad.data.norm(float(norm_type))) for p in self.parameters() if p.grad is not None] + return {'total_grad_norm': float(torch.tensor(all_norms).norm(norm_type))} + @staticmethod def add_args(parser): parser.add_argument("--seed", type=int, default=3) @@ -302,7 +308,7 @@ def main(args): num_tpu_cores=args.num_tpu_cores, distributed_backend='ddp' if args.gpu_count > 1 else None, replace_sampler_ddp=False, - track_grad_norm=-1, # TODO: add logging for gradient norm + track_grad_norm=2, max_epochs=10000, min_epochs=0, max_steps=args.train_steps, # run for many epochs, but stop after max_steps val_check_interval=args.val_every, limit_val_batches=args.val_batches, early_stop_callback=None, @@ -313,7 +319,7 @@ 
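The grad_norm override above collapses the per-parameter gradient norms into one scalar; for an L2 norm this equals the norm of all gradient elements taken together, so it matches the quantity gradient clipping sees. A quick standalone check of that identity on a toy model (the model itself is a placeholder):

import torch

model = torch.nn.Sequential(torch.nn.Linear(4, 8), torch.nn.Linear(8, 2))
model(torch.randn(3, 4)).sum().backward()

grads = [p.grad.detach() for p in model.parameters() if p.grad is not None]
norm_of_norms = torch.stack([g.norm(2) for g in grads]).norm(2)   # norm of per-parameter norms
flat_norm = torch.cat([g.reshape(-1) for g in grads]).norm(2)     # single norm over all gradients
print(torch.allclose(norm_of_norms, flat_norm))                   # True
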
def main(args): resume_from_checkpoint=args.resume, gradient_clip_val=args.grad_clip, precision=16, amp_level='O2', - callbacks=[LearningRateLogger()] + callbacks=[LearningRateLogger()], ) trainer.fit(pretrainer) From 20e8208af7617c132bf1d7e2008a4569c97f7beb Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 17 Jul 2020 22:32:46 -0700 Subject: [PATCH 047/112] wip --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index b396708..e47a1d5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ pytorch-lightning @ git+http://github.com/ibeltagy/pytorch-lightning.git@v0.8.5_fixes#egg=pytorch-lightning -torch==1.3.0 +torch==1.3.1 transformers==3.0.2 tensorboardX test-tube==0.7.5 From 224824d9e74a12e7c3a0d7bd6bd1ac858a1a02df Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 18 Jul 2020 07:58:49 -0700 Subject: [PATCH 048/112] wip --- scripts/pretrain.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index e79eb17..5022aa8 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -22,7 +22,6 @@ logger = logging.getLogger(__name__) # DONE: reproduce RoBERTa numbers on the Longformer corpus -# TODO: Try on multiple machines # TODO: try on a single TPU # TODO: try on a TPU-pod # TODO: try restarting and double check optimizer, lr and lr scheduler @@ -259,7 +258,7 @@ def add_args(parser): parser.add_argument("--val_batches", type=int, default=1000, help='# evaluation **batches**') parser.add_argument("--weight_decay", type=float, default=0.01) parser.add_argument("--adam_epsilon", type=float, default=1e-6) - parser.add_argument("--grad_clip", type=float, default=0) + parser.add_argument("--grad_clip", type=float, default=0) # TODO: test this with fp16. Likely not working # RoBERTa's tokens_per_step = 2^18 = 512(seqlen) x 1(gpu_count) x 32(batch_size) x 16(grad_accum) parser.add_argument("--batch_size", type=int, default=32) @@ -269,6 +268,21 @@ def add_args(parser): parser.add_argument("--num_workers", type=int, default=0) parser.add_argument("--gpu_count", type=int, default=1, # `--gpus` is reserved for internal use by PTL help="Number of gpus. This respects `CUDA_VISIBLE_DEVICES`") + + # For multi-node training, use the PyTorch launch script. The script and instructions can be found here: + # https://github.com/pytorch/pytorch/blob/master/torch/distributed/launch.py. + # To run PTL in a mode compatible with the launch script, two things are needed: + # - pass the argument `--use_env` to `torch.distributed.launch` + # - make sure `--nproc_per_node` matches `--gpu_count` and `--nnodes` matches `--node_count`. + # For example, to run on 2 nodes, 3 gpus each, the command line on node rank 1 would be like: + # >>>> python -m torch.distributed.launch \ + # --use_env --nnodes 2 --nproc_per_node 3 \ + # --node_rank 1 --master_addr s2-server4 --master_port 12343 \ + # scripts/pretrain.py \ + # --gpu_count 2 --node_count 2 \ + # --input_dir my_data_dir --save_prefix test_multinode + parser.add_argument("--node_count", type=int, default=1, + help="Number of nodes. 
It needs to match --nnodes of torch.distributed.launch") parser.add_argument("--num_tpu_cores", type=int, default=None) return parser @@ -305,8 +319,9 @@ def main(args): args.val_every *= args.grad_accum # PTL is expecting number of batches_per_gpu trainer = ptl.Trainer( gpus=args.gpu_count, + num_nodes=args.node_count, num_tpu_cores=args.num_tpu_cores, - distributed_backend='ddp' if args.gpu_count > 1 else None, + distributed_backend='ddp' if (args.gpu_count > 1 or args.node_count > 1) else None, replace_sampler_ddp=False, track_grad_norm=2, max_epochs=10000, min_epochs=0, max_steps=args.train_steps, # run for many epochs, but stop after max_steps From 4a1273082f7c33de7392825451b20920d380036f Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 18 Jul 2020 09:41:34 -0700 Subject: [PATCH 049/112] wip --- scripts/pretrain.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 5022aa8..10165b0 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -22,9 +22,12 @@ logger = logging.getLogger(__name__) # DONE: reproduce RoBERTa numbers on the Longformer corpus +# DONE: testing ddp single machine +# DONE: testing ddp multiple machines +# DONE: testing resume from checkpoint # TODO: try on a single TPU # TODO: try on a TPU-pod -# TODO: try restarting and double check optimizer, lr and lr scheduler +# TODO: only one checkpoint per epoch is saved class MMapTextDataset(Dataset): @@ -168,7 +171,7 @@ def validation_step(self, batch, batch_nb): def validation_epoch_end(self, outputs): avg_loss = torch.stack([x['log']['val_mlm_loss'] for x in outputs if 'val_mlm_loss' in x['log']]).mean() if self.use_ddp: - # TODO: PTL already doing this. Is it still needed here? + # TODO: PTL is already doing this. Is it still needed here? # https://github.com/PyTorchLightning/pytorch-lightning/blob/0.8.5/pytorch_lightning/metrics/converters.py#L251 torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM) avg_loss /= torch.distributed.get_world_size() @@ -189,11 +192,10 @@ def configure_optimizers(self): "weight_decay": 0.0, }, ] - optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon) + optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.lr, eps=self.args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=self.args.train_steps ) - return [optimizer], [{"scheduler": scheduler, "interval": "step"}] def _get_loader(self, fname, is_train): @@ -247,11 +249,15 @@ def add_args(parser): # Checkpointing and logging parser.add_argument("--save_dir", type=str, default='runs/') - parser.add_argument("--save_prefix", type=str, required=True) - parser.add_argument("--resume", type=str, default=None) + parser.add_argument("--save_prefix", type=str, required=True, + help="path of output directory is --save_dir/--save_prefix") + parser.add_argument("--resume", type=str, default=None, # It is better to use a different output dir. + help="Path to a checkpoint to load model weights and training state. It overwrites args") + parser.add_argument("--resume_model_only", type=str, default=None, + help="Path to a checkpoint to load model weights but not training state") # Training hyperparams - parser.add_argument("--learning_rate", type=float, default=1e-5) + parser.add_argument("--lr", type=float, default=1e-5) parser.add_argument("--train_steps", type=int, default=3000, help='# training grad. 
updates') parser.add_argument("--warmup_steps", type=int, default=1000, help='# warmup grad. updates') parser.add_argument("--val_every", type=int, default=1000, help='# training grad. updates between evaluations') @@ -295,7 +301,10 @@ def main(args): if torch.cuda.is_available(): torch.cuda.manual_seed_all(args.seed * 10000) - pretrainer = Pretrainer(args) + if args.resume_model_only is not None: + pretrainer = Pretrainer.load_from_checkpoint(args.resume_model_only, args) + else: + pretrainer = Pretrainer(args) # logger here is a SummaryWritter for tensorboard # it is used by the trainer, and certain return variables From c936d24414235405b37689b5e0df06040b5acc77 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 18 Jul 2020 10:24:16 -0700 Subject: [PATCH 050/112] wip --- scripts/pretrain.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 10165b0..6015fa0 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -27,7 +27,6 @@ # DONE: testing resume from checkpoint # TODO: try on a single TPU # TODO: try on a TPU-pod -# TODO: only one checkpoint per epoch is saved class MMapTextDataset(Dataset): @@ -175,7 +174,6 @@ def validation_epoch_end(self, outputs): # https://github.com/PyTorchLightning/pytorch-lightning/blob/0.8.5/pytorch_lightning/metrics/converters.py#L251 torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM) avg_loss /= torch.distributed.get_world_size() - avg_loss = avg_loss.item() logs = {'val_mlm_loss': avg_loss} return {'log': logs, 'progress_bar': logs, "val_loss": avg_loss} @@ -320,9 +318,11 @@ def main(args): filepath=os.path.join(args.save_dir, args.save_prefix, 'checkpoint'), prefix='', save_top_k=3, + save_last=True, verbose=True, monitor='val_loss', mode='min', + period=-1, # to allow multiple checkpoints per epoch ) args.val_every *= args.grad_accum # PTL is expecting number of batches_per_gpu From 510801bd76896e43359eef66979c119e1de237a2 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sun, 19 Jul 2020 09:06:10 -0700 Subject: [PATCH 051/112] wip --- scripts/pretrain.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 6015fa0..683b008 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -27,6 +27,7 @@ # DONE: testing resume from checkpoint # TODO: try on a single TPU # TODO: try on a TPU-pod +# TODO: run on beaker on ai2-server1/2 class MMapTextDataset(Dataset): From 9184b718e61c9b8abf3c0d846f16bbe33bc46f18 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 22 Jul 2020 20:16:38 +0000 Subject: [PATCH 052/112] wip --- scripts/cheatsheet.txt | 22 ++++++++++++++++ scripts/test_tpu.py | 57 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 scripts/test_tpu.py diff --git a/scripts/cheatsheet.txt b/scripts/cheatsheet.txt index be4fc3a..d39371e 100644 --- a/scripts/cheatsheet.txt +++ b/scripts/cheatsheet.txt @@ -70,3 +70,25 @@ python -m scripts.triviaqa_utils.evaluation_utils \ --prediction_file predictions.json # Output should be: {'exact_match': 73.07644188665083, 'f1': 77.78523804802242, 'common': 7993, 'denominator': 7993, 'pred_len': 7993, 'gold_len': 7993} + + +# TPU +export XRT_TPU_CONFIG="tpu_worker;0;$TPU_IP_ADDRESS:8470" + +source /anaconda3/bin/activate torch-xla-nightly + +import torch_xla.debug.metrics as met; print(met.metrics_report()) + +curl -X POST http://10.125.212.42:8475/requestversion/pytorch-dev20200722 + 
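The optimizer setup pairs AdamW with a linear warmup-then-decay schedule that is stepped once per gradient update. A condensed sketch of that configuration using a stand-in module and the default hyperparameter values from the arguments above; the bias/LayerNorm weight-decay grouping shown here is the conventional one and is an assumption about the elided part of configure_optimizers:

import torch
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

model = torch.nn.Linear(4, 2)                    # stand-in for the masked LM
no_decay = ['bias', 'LayerNorm.weight']
grouped = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]
optimizer = AdamW(grouped, lr=1e-5, eps=1e-6)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=1000, num_training_steps=3000)

model(torch.randn(3, 4)).sum().backward()
optimizer.step()
scheduler.step()    # one scheduler step per optimizer step, matching interval='step'
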
+/usr/share/torch-xla-nightly/pytorch/xla/scripts/debug_run.py --outfile debug.tar.gz -- python -u scripts/test_tpu.py + + XLA_IR_DEBUG=1 + XRT_TPU_CONFIG=tpu_worker;0;10.125.212.42:8470 + TF_CPP_LOG_THREAD_ID=1 + TF_CPP_MIN_LOG_LEVEL=0 + XLA_HLO_DEBUG=1 + XLA_DUMP_FATAL_STACK=1 + TF_CPP_VMODULE=tensor=5,computation_client=5,xrt_computation_client=5,aten_xla_type=1 + XLA_SAVE_TENSORS_FILE=/tmp/debug_run-beltagy4-beltagy/graphs + XLA_METRICS_FILE=/tmp/debug_run-beltagy4-beltagy/metrics diff --git a/scripts/test_tpu.py b/scripts/test_tpu.py new file mode 100644 index 0000000..618ee6f --- /dev/null +++ b/scripts/test_tpu.py @@ -0,0 +1,57 @@ +import os +import torch +from torch.utils.data import DataLoader, Dataset +from transformers import AutoModel +import pytorch_lightning as pl + +class CoolDataset(Dataset): + def __len__(self): + return 128 * 128 + + def __getitem__(self, idx): + return torch.tensor([1, 2, 3, 4,] * 128), torch.tensor([1, 1, 1, 1] * 128) + +class CoolSystem(pl.LightningModule): + + def __init__(self): + super().__init__() + + # self.model = AutoModel.from_pretrained('allenai/longformer-base-4096') + self.model = AutoModel.from_pretrained('bert-base-uncased') + + def forward(self, x, y): + return self.model(x, attention_mask=None) + + def training_step(self, batch, batch_idx): + x, y = batch + y_hat = self(x, y) + loss = y_hat[0].sum() # F.cross_entropy(y_hat, y) + tensorboard_logs = {'train_loss': loss} + return {'loss': loss, 'log': tensorboard_logs} + + def validation_step(self, batch, batch_idx): + x, y = batch + y_hat = self(x, y) + val_loss = y_hat[0].sum() # F.cross_entropy(y_hat, y) + return {'val_loss': val_loss} + + def validation_epoch_end(self, outputs): + avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean() + tensorboard_logs = {'val_loss': avg_loss} + return {'val_loss': avg_loss, 'log': tensorboard_logs} + + def configure_optimizers(self): + return torch.optim.Adam(self.parameters(), lr=0.001) + + def train_dataloader(self): + loader = DataLoader(CoolDataset(), batch_size=1, num_workers=0) + return loader + + def val_dataloader(self): + loader = DataLoader(CoolDataset(), batch_size=1, num_workers=0) + return loader + +if __name__ == '__main__': + model = CoolSystem() + trainer = pl.Trainer(num_tpu_cores=1, progress_bar_refresh_rate=1, max_epochs=10, num_sanity_val_steps=0) + trainer.fit(model) From 4ae991a4ffb6470badd593cfbe7f72b8aba0b89f Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 22 Jul 2020 20:43:27 +0000 Subject: [PATCH 053/112] wip --- scripts/cheatsheet.txt | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/scripts/cheatsheet.txt b/scripts/cheatsheet.txt index d39371e..6dde8ce 100644 --- a/scripts/cheatsheet.txt +++ b/scripts/cheatsheet.txt @@ -82,13 +82,3 @@ import torch_xla.debug.metrics as met; print(met.metrics_report()) curl -X POST http://10.125.212.42:8475/requestversion/pytorch-dev20200722 /usr/share/torch-xla-nightly/pytorch/xla/scripts/debug_run.py --outfile debug.tar.gz -- python -u scripts/test_tpu.py - - XLA_IR_DEBUG=1 - XRT_TPU_CONFIG=tpu_worker;0;10.125.212.42:8470 - TF_CPP_LOG_THREAD_ID=1 - TF_CPP_MIN_LOG_LEVEL=0 - XLA_HLO_DEBUG=1 - XLA_DUMP_FATAL_STACK=1 - TF_CPP_VMODULE=tensor=5,computation_client=5,xrt_computation_client=5,aten_xla_type=1 - XLA_SAVE_TENSORS_FILE=/tmp/debug_run-beltagy4-beltagy/graphs - XLA_METRICS_FILE=/tmp/debug_run-beltagy4-beltagy/metrics From aea2a984563026bedc10d63b63307d245822cb1f Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 22 Jul 2020 21:45:45 +0000 Subject: [PATCH 
054/112] tpu --- scripts/pretrain.py | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 683b008..93d83a4 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -30,6 +30,14 @@ # TODO: run on beaker on ai2-server1/2 +try: + import torch_xla.core.xla_model as xm +except ImportError: + XLA_AVAILABLE = False +else: + XLA_AVAILABLE = True + + class MMapTextDataset(Dataset): def __init__(self, mmap_filename, chunk_size): self.num_instances = np.memmap(mmap_filename, mode='r', dtype=np.uint16).shape[0] // chunk_size @@ -146,16 +154,17 @@ def training_step(self, batch, batch_nb): input_ids = batch['input_ids'] tensorboard_logs = { 'input_size': input_ids.numel(), - 'memory': torch.cuda.memory_allocated(loss.device) / 1024 ** 3, - 'mlm_loss': loss.detach(), - 'mlm_bpc': loss.detach()/math.log(2), - 'mlm_perplexity': torch.exp(loss).detach(), + 'mlm_loss': loss, + 'mlm_bpc': loss/math.log(2), + 'mlm_perplexity': torch.exp(loss), 'token_per_step': input_ids.numel() * self.args.grad_accum * self.trainer.world_size, } if self.start_time != 0: elapsed_time = time.time() - self.start_time tensorboard_logs['second_per_batch'] = elapsed_time self.start_time = time.time() + if not XLA_AVAILABLE: + tensorboard_logs['memory'] = torch.cuda.memory_allocated(loss.device) / 1024 ** 3 return {'loss': loss, 'log': tensorboard_logs} @@ -204,6 +213,14 @@ def _get_loader(self, fname, is_train): if self.trainer.use_ddp: sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=is_train) shuffle = False + elif self.trainer.use_tpu: + sampler = torch.utils.data.distributed.DistributedSampler( + dataset, + num_replicas=xm.xrt_world_size(), + rank=xm.get_ordinal(), + shuffle=is_train, + ) + shuffle = False else: sampler = None shuffle = is_train @@ -227,6 +244,10 @@ def val_dataloader(self): def grad_norm(self, norm_type): # Override PTL `grad_norm` function to only return `total_grad_norm` instead norms of individual params + + if XLA_AVAILABLE: + return {} # computing grad_norm one parameter at a time takes forever on TPU + # TODO: grad_norm reporting needs to take fp16 loss scale into account all_norms = [float(p.grad.data.norm(float(norm_type))) for p in self.parameters() if p.grad is not None] return {'total_grad_norm': float(torch.tensor(all_norms).norm(norm_type))} @@ -266,8 +287,8 @@ def add_args(parser): parser.add_argument("--grad_clip", type=float, default=0) # TODO: test this with fp16. Likely not working # RoBERTa's tokens_per_step = 2^18 = 512(seqlen) x 1(gpu_count) x 32(batch_size) x 16(grad_accum) - parser.add_argument("--batch_size", type=int, default=32) - parser.add_argument("--grad_accum", type=int, default=16) + parser.add_argument("--batch_size", type=int, default=8) + parser.add_argument("--grad_accum", type=int, default=1) # Compute resources parser.add_argument("--num_workers", type=int, default=0) @@ -288,7 +309,7 @@ def add_args(parser): # --input_dir my_data_dir --save_prefix test_multinode parser.add_argument("--node_count", type=int, default=1, help="Number of nodes. 
It needs to match --nnodes of torch.distributed.launch") - parser.add_argument("--num_tpu_cores", type=int, default=None) + parser.add_argument("--tpu_core_count", type=int, default=None) return parser @@ -330,20 +351,22 @@ def main(args): trainer = ptl.Trainer( gpus=args.gpu_count, num_nodes=args.node_count, - num_tpu_cores=args.num_tpu_cores, + num_tpu_cores=args.tpu_core_count, distributed_backend='ddp' if (args.gpu_count > 1 or args.node_count > 1) else None, replace_sampler_ddp=False, track_grad_norm=2, max_epochs=10000, min_epochs=0, max_steps=args.train_steps, # run for many epochs, but stop after max_steps val_check_interval=args.val_every, limit_val_batches=args.val_batches, early_stop_callback=None, - row_log_interval=10, + row_log_interval=16, + progress_bar_refresh_rate=16, logger=logger, checkpoint_callback=checkpoint_callback, accumulate_grad_batches=args.grad_accum, resume_from_checkpoint=args.resume, gradient_clip_val=args.grad_clip, precision=16, amp_level='O2', + num_sanity_val_steps=2, callbacks=[LearningRateLogger()], ) trainer.fit(pretrainer) From 69b717a259d6e82376096fb6ec343bd576d58cd3 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 22 Jul 2020 15:47:11 -0700 Subject: [PATCH 055/112] wip --- scripts/pretrain.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 93d83a4..2202a39 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -26,6 +26,12 @@ # DONE: testing ddp multiple machines # DONE: testing resume from checkpoint # TODO: try on a single TPU +# - tie weights +# - tensorboard +# - getrank +# - barrier +# - val all_reduce +# - checkpointing # TODO: try on a TPU-pod # TODO: run on beaker on ai2-server1/2 @@ -257,7 +263,7 @@ def add_args(parser): parser.add_argument("--seed", type=int, default=3) # Dataset. Some of these params are only useful when generating the dataset cache - parser.add_argument("--input_dir", type=str, required=True) + parser.add_argument("--input_dir", type=str, default='/net/nfs.corp/s2-research/beltagy/longformer/data/') parser.add_argument("--train_dev_split", type=float, default=0.05) parser.add_argument("--padded_chunks", type=bool, default=False) parser.add_argument("--seqlen", type=int, default=512) @@ -269,7 +275,7 @@ def add_args(parser): # Checkpointing and logging parser.add_argument("--save_dir", type=str, default='runs/') - parser.add_argument("--save_prefix", type=str, required=True, + parser.add_argument("--save_prefix", type=str, default='test', help="path of output directory is --save_dir/--save_prefix") parser.add_argument("--resume", type=str, default=None, # It is better to use a different output dir. help="Path to a checkpoint to load model weights and training state. It overwrites args") @@ -291,6 +297,7 @@ def add_args(parser): parser.add_argument("--grad_accum", type=int, default=1) # Compute resources + parser.add_argument("--fp16", type=bool, default=False) parser.add_argument("--num_workers", type=int, default=0) parser.add_argument("--gpu_count", type=int, default=1, # `--gpus` is reserved for internal use by PTL help="Number of gpus. 
This respects `CUDA_VISIBLE_DEVICES`") @@ -365,7 +372,7 @@ def main(args): accumulate_grad_batches=args.grad_accum, resume_from_checkpoint=args.resume, gradient_clip_val=args.grad_clip, - precision=16, amp_level='O2', + precision=16 if args.fp16 else 32, amp_level='O2', num_sanity_val_steps=2, callbacks=[LearningRateLogger()], ) From 5f641c05cec9a73406a5911e99a80e80bfb583b1 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 23 Jul 2020 02:54:28 +0000 Subject: [PATCH 056/112] wip --- scripts/cheatsheet.txt | 2 ++ scripts/pretrain.py | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/cheatsheet.txt b/scripts/cheatsheet.txt index 6dde8ce..1e77b07 100644 --- a/scripts/cheatsheet.txt +++ b/scripts/cheatsheet.txt @@ -82,3 +82,5 @@ import torch_xla.debug.metrics as met; print(met.metrics_report()) curl -X POST http://10.125.212.42:8475/requestversion/pytorch-dev20200722 /usr/share/torch-xla-nightly/pytorch/xla/scripts/debug_run.py --outfile debug.tar.gz -- python -u scripts/test_tpu.py + +/usr/share/torch-xla-nightly/pytorch/xla/scripts/debug_run.py --outfile debug.tar.gz -- python -u scripts/pretrain.py --input_dir data/ --save_prefix test_xla_2 --gpu_count 0 --tpu_core_count 1 --val_batches 4 --val_every 130 --num_workers 0 --log_rate 1 --model allenai/longformer-base-4096 diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 2202a39..ab1120a 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -281,6 +281,7 @@ def add_args(parser): help="Path to a checkpoint to load model weights and training state. It overwrites args") parser.add_argument("--resume_model_only", type=str, default=None, help="Path to a checkpoint to load model weights but not training state") + parser.add_argument("--log_rate", type=int, default=16) # Training hyperparams parser.add_argument("--lr", type=float, default=1e-5) @@ -365,8 +366,8 @@ def main(args): max_epochs=10000, min_epochs=0, max_steps=args.train_steps, # run for many epochs, but stop after max_steps val_check_interval=args.val_every, limit_val_batches=args.val_batches, early_stop_callback=None, - row_log_interval=16, - progress_bar_refresh_rate=16, + row_log_interval=args.log_rate, + progress_bar_refresh_rate=args.log_rate, logger=logger, checkpoint_callback=checkpoint_callback, accumulate_grad_batches=args.grad_accum, From e3ddeca7507e91f5659f1d0f480267e08cc4484c Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 22 Jul 2020 22:12:45 -0700 Subject: [PATCH 057/112] wip --- scripts/test_tpu.py | 37 ++++++++++++------------------------- 1 file changed, 12 insertions(+), 25 deletions(-) diff --git a/scripts/test_tpu.py b/scripts/test_tpu.py index 618ee6f..0df4091 100644 --- a/scripts/test_tpu.py +++ b/scripts/test_tpu.py @@ -1,23 +1,25 @@ -import os import torch from torch.utils.data import DataLoader, Dataset from transformers import AutoModel import pytorch_lightning as pl + class CoolDataset(Dataset): - def __len__(self): - return 128 * 128 - def __getitem__(self, idx): - return torch.tensor([1, 2, 3, 4,] * 128), torch.tensor([1, 1, 1, 1] * 128) + def __len__(self): + return 128 * 128 + + def __getitem__(self, idx): + return torch.tensor([1, 2, 3, 4] * 128 * 8), torch.tensor([1, 1, 1, 1] * 128 * 8) + class CoolSystem(pl.LightningModule): def __init__(self): super().__init__() - # self.model = AutoModel.from_pretrained('allenai/longformer-base-4096') - self.model = AutoModel.from_pretrained('bert-base-uncased') + self.model = AutoModel.from_pretrained('allenai/longformer-base-4096') + # self.model = 
AutoModel.from_pretrained('roberta-base') def forward(self, x, y): return self.model(x, attention_mask=None) @@ -25,20 +27,8 @@ def forward(self, x, y): def training_step(self, batch, batch_idx): x, y = batch y_hat = self(x, y) - loss = y_hat[0].sum() # F.cross_entropy(y_hat, y) - tensorboard_logs = {'train_loss': loss} - return {'loss': loss, 'log': tensorboard_logs} - - def validation_step(self, batch, batch_idx): - x, y = batch - y_hat = self(x, y) - val_loss = y_hat[0].sum() # F.cross_entropy(y_hat, y) - return {'val_loss': val_loss} - - def validation_epoch_end(self, outputs): - avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean() - tensorboard_logs = {'val_loss': avg_loss} - return {'val_loss': avg_loss, 'log': tensorboard_logs} + loss = y_hat[0].sum() + return {'loss': loss} def configure_optimizers(self): return torch.optim.Adam(self.parameters(), lr=0.001) @@ -47,11 +37,8 @@ def train_dataloader(self): loader = DataLoader(CoolDataset(), batch_size=1, num_workers=0) return loader - def val_dataloader(self): - loader = DataLoader(CoolDataset(), batch_size=1, num_workers=0) - return loader if __name__ == '__main__': model = CoolSystem() - trainer = pl.Trainer(num_tpu_cores=1, progress_bar_refresh_rate=1, max_epochs=10, num_sanity_val_steps=0) + trainer = pl.Trainer(progress_bar_refresh_rate=1, max_epochs=10, num_sanity_val_steps=0, gpus=0, num_tpu_cores=1) trainer.fit(model) From 00ce1e9635b2e745b81bc7639949d654b6c10a39 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 23 Jul 2020 07:30:13 +0000 Subject: [PATCH 058/112] wip --- scripts/test_tpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/test_tpu.py b/scripts/test_tpu.py index 0df4091..8f50eba 100644 --- a/scripts/test_tpu.py +++ b/scripts/test_tpu.py @@ -40,5 +40,5 @@ def train_dataloader(self): if __name__ == '__main__': model = CoolSystem() - trainer = pl.Trainer(progress_bar_refresh_rate=1, max_epochs=10, num_sanity_val_steps=0, gpus=0, num_tpu_cores=1) + trainer = pl.Trainer(num_tpu_cores=1, progress_bar_refresh_rate=1, max_epochs=10, num_sanity_val_steps=0, gpus=0) trainer.fit(model) From 56b9c6aa544d86d6652a0ecd05f07d183043d449 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 23 Jul 2020 21:08:35 +0000 Subject: [PATCH 059/112] wip --- scripts/cheatsheet.txt | 3 +-- scripts/pretrain.py | 14 ++++++++++---- scripts/test_tpu.py | 2 +- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/scripts/cheatsheet.txt b/scripts/cheatsheet.txt index 1e77b07..e9f3fba 100644 --- a/scripts/cheatsheet.txt +++ b/scripts/cheatsheet.txt @@ -73,12 +73,11 @@ python -m scripts.triviaqa_utils.evaluation_utils \ # TPU +export TPU_IP_ADDRESS=10.125.212.42 export XRT_TPU_CONFIG="tpu_worker;0;$TPU_IP_ADDRESS:8470" - source /anaconda3/bin/activate torch-xla-nightly import torch_xla.debug.metrics as met; print(met.metrics_report()) - curl -X POST http://10.125.212.42:8475/requestversion/pytorch-dev20200722 /usr/share/torch-xla-nightly/pytorch/xla/scripts/debug_run.py --outfile debug.tar.gz -- python -u scripts/test_tpu.py diff --git a/scripts/pretrain.py b/scripts/pretrain.py index ab1120a..9706b19 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -30,8 +30,11 @@ # - tensorboard # - getrank # - barrier -# - val all_reduce -# - checkpointing +# - checkpointing (broken) +# - gradient accumulation +# - set_epoch bug +# - gradient clipping +# TODO: use AutoModelForMaskedLM and remove masked_lm_labels # TODO: try on a TPU-pod # TODO: run on beaker on ai2-server1/2 @@ -169,7 +172,7 @@ def 
training_step(self, batch, batch_nb): elapsed_time = time.time() - self.start_time tensorboard_logs['second_per_batch'] = elapsed_time self.start_time = time.time() - if not XLA_AVAILABLE: + if not self.use_tpu: tensorboard_logs['memory'] = torch.cuda.memory_allocated(loss.device) / 1024 ** 3 return {'loss': loss, 'log': tensorboard_logs} @@ -190,6 +193,9 @@ def validation_epoch_end(self, outputs): # https://github.com/PyTorchLightning/pytorch-lightning/blob/0.8.5/pytorch_lightning/metrics/converters.py#L251 torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM) avg_loss /= torch.distributed.get_world_size() + elif self.use_tpu: + avg_loss = xm.all_reduce(xm.REDUCE_SUM, avg_loss) / xm.xrt_world_size() + logs = {'val_mlm_loss': avg_loss} return {'log': logs, 'progress_bar': logs, "val_loss": avg_loss} @@ -251,7 +257,7 @@ def val_dataloader(self): def grad_norm(self, norm_type): # Override PTL `grad_norm` function to only return `total_grad_norm` instead norms of individual params - if XLA_AVAILABLE: + if self.use_tpu: return {} # computing grad_norm one parameter at a time takes forever on TPU # TODO: grad_norm reporting needs to take fp16 loss scale into account diff --git a/scripts/test_tpu.py b/scripts/test_tpu.py index 8f50eba..e692890 100644 --- a/scripts/test_tpu.py +++ b/scripts/test_tpu.py @@ -40,5 +40,5 @@ def train_dataloader(self): if __name__ == '__main__': model = CoolSystem() - trainer = pl.Trainer(num_tpu_cores=1, progress_bar_refresh_rate=1, max_epochs=10, num_sanity_val_steps=0, gpus=0) + trainer = pl.Trainer(num_tpu_cores=8, progress_bar_refresh_rate=1, max_epochs=10, num_sanity_val_steps=0, gpus=0) trainer.fit(model) From 8fca18703925d062c2557ebb73618e31fd5e0fa8 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 25 Jul 2020 03:47:06 +0000 Subject: [PATCH 060/112] wip --- scripts/pretrain.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 9706b19..6b88e94 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -79,6 +79,7 @@ def raw_text_to_mmap(args): os.mkdir(f'{args.input_dir}/cache/') # TODO: process each shared in a separate worker and save their output to files + # TODO: update the data generation to avoid the need for regeneration if the seqlen changes chunks_list = [] for fname in tqdm(all_files): @@ -353,7 +354,7 @@ def main(args): # model saved to filepath/prefix_.... 
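Both distributed branches of _get_loader only need a global rank and world size to build the DistributedSampler; on GPUs they come from torch.distributed, on TPUs from torch_xla. A minimal sketch of that resolution under the same optional-import guard used in the script (the helper name world_size_and_rank is illustrative):

import torch.distributed as dist

try:
    import torch_xla.core.xla_model as xm
    XLA_AVAILABLE = True
except ImportError:
    XLA_AVAILABLE = False

def world_size_and_rank():
    if XLA_AVAILABLE:
        return xm.xrt_world_size(), xm.get_ordinal()
    if dist.is_available() and dist.is_initialized():
        return dist.get_world_size(), dist.get_rank()
    return 1, 0  # single-process fallback

print(world_size_and_rank())
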
filepath=os.path.join(args.save_dir, args.save_prefix, 'checkpoint'), prefix='', - save_top_k=3, + save_top_k=1, save_last=True, verbose=True, monitor='val_loss', From 9dd76b74ca807bd89027a4d9267fa93c79373dea Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 25 Jul 2020 03:58:56 +0000 Subject: [PATCH 061/112] wip --- scripts/pretrain.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 6b88e94..7cfe0b2 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -28,9 +28,6 @@ # TODO: try on a single TPU # - tie weights # - tensorboard -# - getrank -# - barrier -# - checkpointing (broken) # - gradient accumulation # - set_epoch bug # - gradient clipping From d40983a44c8ff86342b581d659ecc428138f1dda Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 25 Jul 2020 08:22:25 -0700 Subject: [PATCH 062/112] wip --- scripts/pretrain.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 7cfe0b2..888a731 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -27,12 +27,12 @@ # DONE: testing resume from checkpoint # TODO: try on a single TPU # - tie weights -# - tensorboard # - gradient accumulation # - set_epoch bug # - gradient clipping -# TODO: use AutoModelForMaskedLM and remove masked_lm_labels +# - tensorboard: https://github.com/PyTorchLightning/pytorch-lightning/issues/2698 # TODO: try on a TPU-pod +# TODO: use AutoModelForMaskedLM and remove masked_lm_labels # TODO: run on beaker on ai2-server1/2 From f0f6a3033b7ff190999790e6a45664615ea8cc05 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 25 Jul 2020 15:56:03 +0000 Subject: [PATCH 063/112] wip --- scripts/cheatsheet.txt | 10 ++++++---- scripts/pretrain.py | 8 +++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/scripts/cheatsheet.txt b/scripts/cheatsheet.txt index e9f3fba..c0ab4e5 100644 --- a/scripts/cheatsheet.txt +++ b/scripts/cheatsheet.txt @@ -73,13 +73,15 @@ python -m scripts.triviaqa_utils.evaluation_utils \ # TPU -export TPU_IP_ADDRESS=10.125.212.42 -export XRT_TPU_CONFIG="tpu_worker;0;$TPU_IP_ADDRESS:8470" -source /anaconda3/bin/activate torch-xla-nightly - import torch_xla.debug.metrics as met; print(met.metrics_report()) curl -X POST http://10.125.212.42:8475/requestversion/pytorch-dev20200722 /usr/share/torch-xla-nightly/pytorch/xla/scripts/debug_run.py --outfile debug.tar.gz -- python -u scripts/test_tpu.py /usr/share/torch-xla-nightly/pytorch/xla/scripts/debug_run.py --outfile debug.tar.gz -- python -u scripts/pretrain.py --input_dir data/ --save_prefix test_xla_2 --gpu_count 0 --tpu_core_count 1 --val_batches 4 --val_every 130 --num_workers 0 --log_rate 1 --model allenai/longformer-base-4096 + +python scripts/pretrain.py --input_dir data/ --save_prefix test_grad_accum --gpu_count 0 --tpu_core_count 8 --val_batches 30 --val_every 30 --num_workers 0 --log_rate 1 + +export TPU_IP_ADDRESS=10.125.212.42 +export XRT_TPU_CONFIG="tpu_worker;0;$TPU_IP_ADDRESS:8470" +source /anaconda3/bin/activate torch-xla-nightly diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 7cfe0b2..dd5d486 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -8,7 +8,7 @@ from tqdm import tqdm import time import torch -from transformers import AutoTokenizer, AutoModelWithLMHead +from transformers import AutoTokenizer, AutoModelForMaskedLM from transformers import DataCollatorForLanguageModeling from transformers.optimization import AdamW, get_linear_schedule_with_warmup @@ -28,10 +28,8 @@ # 
TODO: try on a single TPU # - tie weights # - tensorboard -# - gradient accumulation # - set_epoch bug # - gradient clipping -# TODO: use AutoModelForMaskedLM and remove masked_lm_labels # TODO: try on a TPU-pod # TODO: run on beaker on ai2-server1/2 @@ -134,7 +132,7 @@ def __init__(self, hparams): self.args = hparams self.hparams = self.args - self.model = AutoModelWithLMHead.from_pretrained(args.model) + self.model = AutoModelForMaskedLM.from_pretrained(args.model) self.config = self.model.config tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) self.pad_token_id = tokenizer.pad_token_id @@ -153,7 +151,7 @@ def forward(self, input_ids=None, labels=None): attention_mask = (input_ids != self.pad_token_id).int() # output is loss, prediction_scores, hidden_states - output = self.model(input_ids=input_ids, attention_mask=attention_mask, masked_lm_labels=labels) + output = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels) return output[0] # loss def training_step(self, batch, batch_nb): From 9eb6fdf5e1c92c1d8b6a54843de88817a543acdb Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 25 Jul 2020 19:58:16 +0000 Subject: [PATCH 064/112] wip --- scripts/pretrain.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 612043d..1181575 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -147,6 +147,17 @@ def __init__(self, hparams): ) self.start_time = 0 + def to(self, *args, **kwargs): + param_count_before_to = len(list(self.parameters())) + super().to(*args, **kwargs) + if self.trainer.use_tpu: + # need to re-tie the weights after moving to XLA! + self.model.tie_weights() + if 'roberta' in self.args.model: + self.model.lm_head.bias = self.model.lm_head.decoder.bias + param_count_after_to = len(list(self.parameters())) + assert param_count_before_to == param_count_after_to + def forward(self, input_ids=None, labels=None): # get the padding mask - 1 for NOT masked, 0 for MASKED/PAD attention_mask = (input_ids != self.pad_token_id).int() From 14b60745f5952cf32acd9474377bbf13e0e45bc2 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 25 Jul 2020 19:58:44 +0000 Subject: [PATCH 065/112] wip --- scripts/pretrain.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 1181575..410c46f 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -26,7 +26,6 @@ # DONE: testing ddp multiple machines # DONE: testing resume from checkpoint # TODO: try on a single TPU -# - tie weights # - set_epoch bug # - gradient clipping # - tensorboard: https://github.com/PyTorchLightning/pytorch-lightning/issues/2698 From 5b97bd6825972c3d536fd664ce373fb20a118de0 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 25 Jul 2020 20:07:24 +0000 Subject: [PATCH 066/112] wip --- scripts/pretrain.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 410c46f..46a4deb 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -30,7 +30,6 @@ # - gradient clipping # - tensorboard: https://github.com/PyTorchLightning/pytorch-lightning/issues/2698 # TODO: try on a TPU-pod -# TODO: use AutoModelForMaskedLM and remove masked_lm_labels # TODO: run on beaker on ai2-server1/2 From 71d7a9dd4534a9eef824a34022ccdd66dd7e375d Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 25 Jul 2020 20:41:20 +0000 Subject: [PATCH 067/112] wip --- scripts/pretrain.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/scripts/pretrain.py 
b/scripts/pretrain.py index 46a4deb..e78ee84 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -25,10 +25,7 @@ # DONE: testing ddp single machine # DONE: testing ddp multiple machines # DONE: testing resume from checkpoint -# TODO: try on a single TPU -# - set_epoch bug -# - gradient clipping -# - tensorboard: https://github.com/PyTorchLightning/pytorch-lightning/issues/2698 +# TODO: check gradient clipping on a single TPU # TODO: try on a TPU-pod # TODO: run on beaker on ai2-server1/2 @@ -294,6 +291,7 @@ def add_args(parser): parser.add_argument("--resume_model_only", type=str, default=None, help="Path to a checkpoint to load model weights but not training state") parser.add_argument("--log_rate", type=int, default=16) + parser.add_argument("--disable_checkpointing", type=bool, default=False) # Training hyperparams parser.add_argument("--lr", type=float, default=1e-5) @@ -381,7 +379,7 @@ def main(args): row_log_interval=args.log_rate, progress_bar_refresh_rate=args.log_rate, logger=logger, - checkpoint_callback=checkpoint_callback, + checkpoint_callback=checkpoint_callback if not args.disable_checkpointing else None, accumulate_grad_batches=args.grad_accum, resume_from_checkpoint=args.resume, gradient_clip_val=args.grad_clip, From 97a126d37520ef2b37e5c5fa82dda9a6cf003182 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 25 Jul 2020 21:24:58 +0000 Subject: [PATCH 068/112] wip --- scripts/pretrain.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index e78ee84..8076273 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -25,7 +25,7 @@ # DONE: testing ddp single machine # DONE: testing ddp multiple machines # DONE: testing resume from checkpoint -# TODO: check gradient clipping on a single TPU +# TODO: enable gradient norm logging on a single TPU # TODO: try on a TPU-pod # TODO: run on beaker on ai2-server1/2 @@ -261,7 +261,7 @@ def grad_norm(self, norm_type): # Override PTL `grad_norm` function to only return `total_grad_norm` instead norms of individual params if self.use_tpu: - return {} # computing grad_norm one parameter at a time takes forever on TPU + return {} # TODO: computing grad_norm one parameter at a time takes forever on TPU # TODO: grad_norm reporting needs to take fp16 loss scale into account all_norms = [float(p.grad.data.norm(float(norm_type))) for p in self.parameters() if p.grad is not None] From c873da2e7ddd8c6a9e068ffede73bb46405661ef Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 25 Jul 2020 15:32:23 -0700 Subject: [PATCH 069/112] wip --- scripts/pretrain.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 8076273..3572fa4 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -136,7 +136,7 @@ def __init__(self, hparams): logger.info(f'Creating dataset cache from dir {self.args.input_dir}. 
This could be slow the first time.') MMapTextDataset.raw_text_to_mmap(args) - # TODO: add support for other objective functions + # TODO: add support for other objective functions (whole word masking, BART objectives) self.data_collator = DataCollatorForLanguageModeling( tokenizer=tokenizer, mlm=True, mlm_probability=self.args.mlm_prob ) From d602869b1b9a5e401e92c8924b43b0cf09fdff17 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Mon, 27 Jul 2020 23:10:56 -0700 Subject: [PATCH 070/112] faster gradnorm --- scripts/pretrain.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 3572fa4..c684a55 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -259,13 +259,16 @@ def val_dataloader(self): def grad_norm(self, norm_type): # Override PTL `grad_norm` function to only return `total_grad_norm` instead norms of individual params - - if self.use_tpu: - return {} # TODO: computing grad_norm one parameter at a time takes forever on TPU - # TODO: grad_norm reporting needs to take fp16 loss scale into account - all_norms = [float(p.grad.data.norm(float(norm_type))) for p in self.parameters() if p.grad is not None] - return {'total_grad_norm': float(torch.tensor(all_norms).norm(norm_type))} + parameters = [p for p in self.parameters() if p.grad is not None] + device = parameters[0].device + total_norm = torch.zeros([], device=device if parameters else None) + norm_type = float(norm_type) + for p in parameters: + param_norm = p.grad.data.pow(norm_type).sum() + total_norm.add_(param_norm) + total_norm = (total_norm ** (1.0 / norm_type)) + return {'total_grad_norm': total_norm} @staticmethod def add_args(parser): @@ -304,7 +307,7 @@ def add_args(parser): parser.add_argument("--grad_clip", type=float, default=0) # TODO: test this with fp16. Likely not working # RoBERTa's tokens_per_step = 2^18 = 512(seqlen) x 1(gpu_count) x 32(batch_size) x 16(grad_accum) - parser.add_argument("--batch_size", type=int, default=8) + parser.add_argument("--batch_size", type=int, default=32) parser.add_argument("--grad_accum", type=int, default=1) # Compute resources From ffd06dd5606d801ca1b7e37c4fc75054f1943739 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Mon, 27 Jul 2020 23:54:27 -0700 Subject: [PATCH 071/112] allow changing seqlen at runtime --- scripts/pretrain.py | 128 ++++++++++++++++++++++++++------------------ 1 file changed, 75 insertions(+), 53 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index c684a55..253cfc5 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -39,8 +39,9 @@ class MMapTextDataset(Dataset): - def __init__(self, mmap_filename, chunk_size): - self.num_instances = np.memmap(mmap_filename, mode='r', dtype=np.uint16).shape[0] // chunk_size + def __init__(self, mmap_filename, chunk_size, bos_token_id, eos_token_id): + # `chunk_size - 2` to reserve space for and + self.num_instances = np.memmap(mmap_filename, mode='r', dtype=np.uint16).shape[0] // (chunk_size - 2) # defer loading the token_ids memmap until after the first __getitem__ call. # when spawning new processes for ddp, there is a hard limit in python < 3.8 that # pickle files need to be < 4GB. 
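With AutoModelForMaskedLM, the masking itself lives entirely in DataCollatorForLanguageModeling: the collator copies the batch into labels, corrupts roughly mlm_prob of the positions in input_ids, and the model returns the loss as its first output when labels is passed. A small end-to-end sketch along those lines; roberta-base (the script's default) and the sample sentences are placeholders:

import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling

tokenizer = AutoTokenizer.from_pretrained('roberta-base')
model = AutoModelForMaskedLM.from_pretrained('roberta-base')
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

examples = [torch.tensor(tokenizer.encode(t)) for t in ['A first tiny document.', 'And a second one.']]
batch = collator(examples)                                           # dict with 'input_ids' and 'labels'
attention_mask = (batch['input_ids'] != tokenizer.pad_token_id).int()
loss = model(input_ids=batch['input_ids'], attention_mask=attention_mask, labels=batch['labels'])[0]
print(float(loss))
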
By waiting until after the first __getitem__ we @@ -48,18 +49,30 @@ def __init__(self, mmap_filename, chunk_size): self.token_ids = None self._mmap_filename = mmap_filename self._chunk_size = chunk_size + self._bos_token_id = bos_token_id + self._eos_token_id = eos_token_id def __len__(self): return self.num_instances def __getitem__(self, i): if self.token_ids is None: - self.token_ids = np.memmap(self._mmap_filename, mode='r', dtype=np.uint16, - shape=(self.num_instances, self._chunk_size)) - return torch.tensor(self.token_ids[i, :].astype(np.int32), dtype=torch.long) + self.token_ids = np.memmap(self._mmap_filename, mode='r', dtype=np.uint16) + from_index = i * (self._chunk_size - 2) + to_index = (i + 1) * (self._chunk_size - 2) + data = np.concatenate(([self._bos_token_id], self.token_ids[from_index:to_index], [self._eos_token_id])) + return torch.tensor(data, dtype=torch.long) @staticmethod def raw_text_to_mmap(args): + """This is the main preprocessing function. It processes all the text files in `args.input_dir` and + outputs two np.memmap files, one for training and one for validation with ratio `args.train_dev_split`. + Processing each input file involves tokenizing it, sharding it into shards of size `args.shard_size`, + then writing each shard as an np.memmap file. The stream of tokens in the memmap file represents documents + separated with `tokenizer.sep_token`. In `__getitem__`, the `tokenizer.bos_token` and `tokenizer.eos_token` + are added. The reason for not adding them at preprocessing time is to allow different sequence lengths + later on. Notice that this is the "FULL-SENTENCES" setting in the RoBERTa paper, Table2. + """ tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, use_fast=True) assert len(tokenizer) < 65535 # will use uint16 to store token ids all_files = glob.glob(f'{args.input_dir}/*.txt') @@ -68,56 +81,62 @@ def raw_text_to_mmap(args): logger.info("Cache already exists. Remove the cache directory to regenerate") return os.mkdir(f'{args.input_dir}/cache/') - - # TODO: process each shared in a separate worker and save their output to files - # TODO: update the data generation to avoid the need for regeneration if the seqlen changes - - chunks_list = [] - for fname in tqdm(all_files): - with open(fname, 'r') as fin: - current_chunk = [tokenizer.bos_token] + os.mkdir(f'{args.input_dir}/shards/') + + # TODO: support continue after a crash + + for full_fname in tqdm(all_files): # TODO: process each input file in a separate worker + fname = full_fname.split('/')[-1] + with open(full_fname, 'r') as fin: + + def _write_shard(data, idx): + if len(data) == 0: + return + shared_filename = f'{args.input_dir}/shards/{fname}-{idx}.bin' + logging.info(f'Writing {len(data)} tokens to shared {shared_filename}') + fp = np.memmap(shared_filename, dtype=np.uint16, mode='w+', shape=len(data)) + fp[:] = data[:] + del fp # flush and close file + token_list = [] + shard_num = 0 for line in tqdm(fin): - if line.strip() == '': # drop empty lines + line = line.strip() + if line == '': # drop empty lines continue - tokens = tokenizer.tokenize(line) # each line is one document - # generate chunks of length args.seqlen. The last chunk will be padded. 
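The indexing in the new __getitem__ is easy to get off by one: every item reads chunk_size - 2 raw tokens from the flat memmap and wraps them with the bos/eos ids at load time, which is what lets the sequence length change without regenerating the cache. A standalone sketch of that arithmetic with made-up values (token_ids, bos_id and eos_id are stand-ins):

import numpy as np
import torch

token_ids = np.arange(100, dtype=np.uint16)      # stand-in for the corpus memmap
chunk_size, bos_id, eos_id = 12, 0, 2            # illustrative values

def get_chunk(i):
    body = chunk_size - 2                        # two slots reserved for bos/eos
    data = np.concatenate(([bos_id], token_ids[i * body:(i + 1) * body], [eos_id]))
    return torch.tensor(data.astype(np.int32), dtype=torch.long)

print(get_chunk(0))                              # length 12: bos, tokens 0..9, eos
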
- # padding last chunk is not great for longformer because many chunks will be mostly padding - - for token in tokens: - if len(current_chunk) == args.seqlen - 1: # chunk is full - current_chunk.append(tokenizer.eos_token) - chunks_list.append(current_chunk) - current_chunk = [tokenizer.bos_token] - current_chunk.append(token) - if args.padded_chunks: - # fill the rest of the seqlen with pad - current_chunk.extend([tokenizer.pad_token] * (args.seqlen - len(current_chunk))) - current_chunk[args.seqlen - 1] = tokenizer.eos_token - chunks_list.append(current_chunk) - current_chunk = [tokenizer.bos_token] + tokens = tokenizer.encode(line, add_special_tokens=False) # Special tokens are in `__getitem__` + token_list.extend(tokens) + if len(token_list) > args.shard_size: + _write_shard(token_list, shard_num) + token_list = [] + shard_num += 1 else: - # one long doc with sep inbetween - if len(current_chunk) < args.seqlen - 1: - current_chunk.append(tokenizer.sep_token) - random.shuffle(chunks_list) - val_count = int(args.train_dev_split * len(chunks_list)) - val_chunks = chunks_list[:val_count] - train_chunks = chunks_list[val_count:] - - def _tokenized_text_to_mmap(output_fname, chunks_list): - num_chunks = len(chunks_list) - all_token_ids = np.empty((num_chunks, args.seqlen), dtype=np.uint16) - for k, chunk in enumerate(tqdm(chunks_list)): - token_ids = tokenizer.convert_tokens_to_ids(chunk) - assert len(token_ids) == args.seqlen - all_token_ids[k, :] = [int(t) for t in token_ids] - fp = np.memmap(output_fname, dtype=np.uint16, mode='w+', shape=(num_chunks, args.seqlen)) - fp[:, :] = all_token_ids[:, :] - fp.flush() + token_list.append(tokenizer.sep_token_id) + _write_shard(token_list, shard_num) + + all_shards = glob.glob(f'{args.input_dir}/shards/*.bin') + random.shuffle(all_shards) # shuffling based on shards not individual lines + val_shards_count = int(args.train_dev_split * len(all_shards)) + val_shards = all_shards[:val_shards_count] + train_shards = all_shards[val_shards_count:] + + def _combine_shards(output_fname, shards_list): + total_size = 0 + for filename in shards_list: + total_size += np.memmap(filename, mode='r', dtype=np.uint16).shape[0] + 1 + total_size -= 1 + logging.info(f'Writing {total_size} tokens to {output_fname}') + all_token_ids = np.empty(total_size, dtype=np.uint16) + last_token_index = 0 + for filename in tqdm(shards_list): + shared = np.memmap(filename, mode='r', dtype=np.uint16) + all_token_ids[last_token_index:last_token_index+len(shared)] = shared[:] + last_token_index += len(shared) + fp = np.memmap(output_fname, dtype=np.uint16, mode='w+', shape=total_size) + fp[:] = all_token_ids[:] del fp - _tokenized_text_to_mmap(f'{args.input_dir}/cache/train.bin', train_chunks) - _tokenized_text_to_mmap(f'{args.input_dir}/cache/val.bin', val_chunks) + _combine_shards(f'{args.input_dir}/cache/val.bin', val_shards) + _combine_shards(f'{args.input_dir}/cache/train.bin', train_shards) class Pretrainer(ptl.LightningModule): @@ -132,6 +151,8 @@ def __init__(self, hparams): self.config = self.model.config tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) self.pad_token_id = tokenizer.pad_token_id + self.eos_token_id = tokenizer.eos_token_id + self.bos_token_id = tokenizer.bos_token_id logger.info(f'Creating dataset cache from dir {self.args.input_dir}. 
This could be slow the first time.') MMapTextDataset.raw_text_to_mmap(args) @@ -222,7 +243,8 @@ def configure_optimizers(self): return [optimizer], [{"scheduler": scheduler, "interval": "step"}] def _get_loader(self, fname, is_train): - dataset = MMapTextDataset(fname, chunk_size=self.args.seqlen) + dataset = MMapTextDataset(fname, chunk_size=self.args.seqlen, + bos_token_id=self.bos_token_id, eos_token_id=self.eos_token_id) # TODO: consider `replace_sampler_ddp=True` and removing the following if statement if self.trainer.use_ddp: @@ -277,7 +299,7 @@ def add_args(parser): # Dataset. Some of these params are only useful when generating the dataset cache parser.add_argument("--input_dir", type=str, default='/net/nfs.corp/s2-research/beltagy/longformer/data/') parser.add_argument("--train_dev_split", type=float, default=0.05) - parser.add_argument("--padded_chunks", type=bool, default=False) + parser.add_argument("--shard_size", type=int, default=2 * 1000 * 1000) parser.add_argument("--seqlen", type=int, default=512) parser.add_argument("--mlm_prob", type=float, default=0.15) @@ -293,7 +315,7 @@ def add_args(parser): help="Path to a checkpoint to load model weights and training state. It overwrites args") parser.add_argument("--resume_model_only", type=str, default=None, help="Path to a checkpoint to load model weights but not training state") - parser.add_argument("--log_rate", type=int, default=16) + parser.add_argument("--log_rate", type=int, default=10) parser.add_argument("--disable_checkpointing", type=bool, default=False) # Training hyperparams From 129a3f954a754a166f5c05cd35eb77860ab03e23 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 30 Jul 2020 14:38:32 -0700 Subject: [PATCH 072/112] log and resume data preprocessing --- scripts/pretrain.py | 57 +++++++++++++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 18 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 253cfc5..f6955cc 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -25,7 +25,6 @@ # DONE: testing ddp single machine # DONE: testing ddp multiple machines # DONE: testing resume from checkpoint -# TODO: enable gradient norm logging on a single TPU # TODO: try on a TPU-pod # TODO: run on beaker on ai2-server1/2 @@ -77,28 +76,42 @@ def raw_text_to_mmap(args): assert len(tokenizer) < 65535 # will use uint16 to store token ids all_files = glob.glob(f'{args.input_dir}/*.txt') - if os.path.exists(f'{args.input_dir}/cache/'): + if os.path.exists(f'{args.input_dir}/cache/train.bin') and os.path.exists(f'{args.input_dir}/cache/val.bin'): logger.info("Cache already exists. Remove the cache directory to regenerate") return - os.mkdir(f'{args.input_dir}/cache/') - os.mkdir(f'{args.input_dir}/shards/') - - # TODO: support continue after a crash + try: + os.mkdir(f'{args.input_dir}/cache/') + except FileExistsError: + pass + try: + os.mkdir(f'{args.input_dir}/shards/') + except FileExistsError: + pass + try: + os.mkdir(f'{args.input_dir}/logs/') # log progrss to be able to resume + except FileExistsError: + pass for full_fname in tqdm(all_files): # TODO: process each input file in a separate worker fname = full_fname.split('/')[-1] + log_filename = f'{args.input_dir}/logs/{fname}.log' + if os.path.isfile(log_filename): + logging.info(f'Skipping {full_fname} ...') + continue # log file already exists. Skip current file. 
+ logging.info(f'Processing {full_fname} ...') with open(full_fname, 'r') as fin: + token_list = [] + shard_count = 0 + tokens_count = 0 - def _write_shard(data, idx): - if len(data) == 0: + def _write_shard(): + if len(token_list) == 0: return - shared_filename = f'{args.input_dir}/shards/{fname}-{idx}.bin' - logging.info(f'Writing {len(data)} tokens to shared {shared_filename}') - fp = np.memmap(shared_filename, dtype=np.uint16, mode='w+', shape=len(data)) - fp[:] = data[:] + shared_filename = f'{args.input_dir}/shards/{fname}-{shard_count}.bin' + logging.info(f'Writing {len(token_list)} tokens to shared {shared_filename}') + fp = np.memmap(shared_filename, dtype=np.uint16, mode='w+', shape=len(token_list)) + fp[:] = token_list[:] del fp # flush and close file - token_list = [] - shard_num = 0 for line in tqdm(fin): line = line.strip() if line == '': # drop empty lines @@ -106,12 +119,16 @@ def _write_shard(data, idx): tokens = tokenizer.encode(line, add_special_tokens=False) # Special tokens are in `__getitem__` token_list.extend(tokens) if len(token_list) > args.shard_size: - _write_shard(token_list, shard_num) + _write_shard() + tokens_count += len(token_list) token_list = [] - shard_num += 1 + shard_count += 1 else: token_list.append(tokenizer.sep_token_id) - _write_shard(token_list, shard_num) + _write_shard() + tokens_count += len(token_list) + with open(log_filename, 'w') as f: + f.write(f'Generated {tokens_count} tokens in {shard_count + 1} shards') all_shards = glob.glob(f'{args.input_dir}/shards/*.bin') random.shuffle(all_shards) # shuffling based on shards not individual lines @@ -119,11 +136,15 @@ def _write_shard(data, idx): val_shards = all_shards[:val_shards_count] train_shards = all_shards[val_shards_count:] + # TODO: if _combining_shards is very slow for large files, it can be skipped then update + # the dataset to read from multiple shards directly def _combine_shards(output_fname, shards_list): total_size = 0 for filename in shards_list: + # The +1 accounts for additional SEP tokens between shards total_size += np.memmap(filename, mode='r', dtype=np.uint16).shape[0] + 1 - total_size -= 1 + print(total_size, filename) + total_size -= 1 # account for an unnecessary SEP token at the every end logging.info(f'Writing {total_size} tokens to {output_fname}') all_token_ids = np.empty(total_size, dtype=np.uint16) last_token_index = 0 From 1c42f964cf487f1df0737b7ffc17c9131b40b7f9 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 30 Jul 2020 16:32:30 -0700 Subject: [PATCH 073/112] multiprocessed preprocessing --- scripts/pretrain.py | 151 +++++++++++++++++++++++++------------------- 1 file changed, 85 insertions(+), 66 deletions(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index f6955cc..32d540c 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -62,6 +62,67 @@ def __getitem__(self, i): data = np.concatenate(([self._bos_token_id], self.token_ids[from_index:to_index], [self._eos_token_id])) return torch.tensor(data, dtype=torch.long) + # ========================= preprocessing code ========================= # + @staticmethod + def _process_file(full_fname): + "Step 1: tokenize an input text file then save token ids into `np.memmap` shards of size `args.shard_size`" + fname = full_fname.split('/')[-1] + log_filename = f'{args.input_dir}/logs-{args.shard_size}/{fname}.log' + if os.path.isfile(log_filename): + logging.info(f'Skipping {full_fname} ...') + return # log file already exists. Skip current file. 
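Each shard is nothing more than a flat np.memmap of uint16 token ids, flushed to disk by deleting the handle. A minimal write/read round trip of that format (the file name and token ids are illustrative):

import numpy as np

tokens = [101, 2023, 2003, 1037, 3231, 102]              # made-up token ids
fp = np.memmap('shard-0.bin', dtype=np.uint16, mode='w+', shape=len(tokens))
fp[:] = tokens[:]
del fp                                                   # flush and close the file

shard = np.memmap('shard-0.bin', mode='r', dtype=np.uint16)
print(shard.shape[0], shard[:3])                         # 6 [ 101 2023 2003]
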
+ + logging.info(f'Processing {full_fname} ...') + with open(full_fname, 'r') as fin: + token_list = [] + shard_count = 0 + tokens_count = 0 + + def _write_shard(): + if len(token_list) == 0: + return + if token_list[-1] != MMapTextDataset.tokenizer.sep_token_id: # handle a rare case + token_list.append(MMapTextDataset.tokenizer.sep_token_id) + shared_filename = f'{args.input_dir}/shards-{args.shard_size}/{fname}-{shard_count}.bin' + logging.info(f'Writing {len(token_list)} tokens to shared {shared_filename}') + fp = np.memmap(shared_filename, dtype=np.uint16, mode='w+', shape=len(token_list)) + fp[:] = token_list[:] + del fp # flush and close file + for line in tqdm(fin): + line = line.strip() + if line == '': # drop empty lines + continue + tokens = MMapTextDataset.tokenizer.encode(line, add_special_tokens=False) # `__getitem__` adds special tokens + token_list.extend(tokens) + if len(token_list) > args.shard_size: + _write_shard() + tokens_count += len(token_list) + token_list = [] + shard_count += 1 + else: + token_list.append(MMapTextDataset.tokenizer.sep_token_id) + _write_shard() + tokens_count += len(token_list) + with open(log_filename, 'w') as f: + f.write(f'Generated {tokens_count} tokens in {shard_count + 1} shards') + + @staticmethod + def _combine_shards(output_fname, shards_list): + "Step 2: combining memmap shards into one `train.bin` or `val.bin` file" + total_size = 0 + for filename in shards_list: + total_size += np.memmap(filename, mode='r', dtype=np.uint16).shape[0] + logging.info(f'Writing {total_size} tokens to {output_fname}') + all_token_ids = np.empty(total_size, dtype=np.uint16) + last_token_index = 0 + for filename in tqdm(shards_list): + shared = np.memmap(filename, mode='r', dtype=np.uint16) + all_token_ids[last_token_index:last_token_index+len(shared)] = shared[:] + last_token_index += len(shared) + fp = np.memmap(output_fname, dtype=np.uint16, mode='w+', shape=total_size) + fp[:] = all_token_ids[:] + del fp + @staticmethod def raw_text_to_mmap(args): """This is the main preprocessing function. It processes all the text files in `args.input_dir` and @@ -72,8 +133,8 @@ def raw_text_to_mmap(args): are added. The reason for not adding them at preprocessing time is to allow different sequence lengths later on. Notice that this is the "FULL-SENTENCES" setting in the RoBERTa paper, Table2. """ - tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, use_fast=True) - assert len(tokenizer) < 65535 # will use uint16 to store token ids + MMapTextDataset.tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, use_fast=True) + assert len(MMapTextDataset.tokenizer) < 65535 # will use uint16 to store token ids all_files = glob.glob(f'{args.input_dir}/*.txt') if os.path.exists(f'{args.input_dir}/cache/train.bin') and os.path.exists(f'{args.input_dir}/cache/val.bin'): @@ -84,80 +145,35 @@ def raw_text_to_mmap(args): except FileExistsError: pass try: - os.mkdir(f'{args.input_dir}/shards/') + os.mkdir(f'{args.input_dir}/shards-{args.shard_size}/') except FileExistsError: pass try: - os.mkdir(f'{args.input_dir}/logs/') # log progrss to be able to resume + os.mkdir(f'{args.input_dir}/logs-{args.shard_size}/') # log progrss to be able to resume except FileExistsError: pass - for full_fname in tqdm(all_files): # TODO: process each input file in a separate worker - fname = full_fname.split('/')[-1] - log_filename = f'{args.input_dir}/logs/{fname}.log' - if os.path.isfile(log_filename): - logging.info(f'Skipping {full_fname} ...') - continue # log file already exists. 
Skip current file. - logging.info(f'Processing {full_fname} ...') - with open(full_fname, 'r') as fin: - token_list = [] - shard_count = 0 - tokens_count = 0 - - def _write_shard(): - if len(token_list) == 0: - return - shared_filename = f'{args.input_dir}/shards/{fname}-{shard_count}.bin' - logging.info(f'Writing {len(token_list)} tokens to shared {shared_filename}') - fp = np.memmap(shared_filename, dtype=np.uint16, mode='w+', shape=len(token_list)) - fp[:] = token_list[:] - del fp # flush and close file - for line in tqdm(fin): - line = line.strip() - if line == '': # drop empty lines - continue - tokens = tokenizer.encode(line, add_special_tokens=False) # Special tokens are in `__getitem__` - token_list.extend(tokens) - if len(token_list) > args.shard_size: - _write_shard() - tokens_count += len(token_list) - token_list = [] - shard_count += 1 - else: - token_list.append(tokenizer.sep_token_id) - _write_shard() - tokens_count += len(token_list) - with open(log_filename, 'w') as f: - f.write(f'Generated {tokens_count} tokens in {shard_count + 1} shards') - - all_shards = glob.glob(f'{args.input_dir}/shards/*.bin') + # STEP1: tokenizing and saving to shards + if args.num_preprocessing_workers > 1: + from multiprocessing.pool import Pool + with Pool(args.num_preprocessing_workers) as p: + list(tqdm(p.imap(MMapTextDataset._process_file, all_files), total=len(all_files))) + else: + [MMapTextDataset._process_file(f) for f in tqdm(all_files)] + + # STEP2: shuffling shards and combining them into train.bin and val.bin files + all_shards = glob.glob(f'{args.input_dir}/shards-{args.shard_size}/*.bin') random.shuffle(all_shards) # shuffling based on shards not individual lines val_shards_count = int(args.train_dev_split * len(all_shards)) val_shards = all_shards[:val_shards_count] train_shards = all_shards[val_shards_count:] + # TODO: if MMapTextDataset._combining_shards is very slow for large files, it can be skipped but we nned to + # update the dataset to read from multiple shards directly + MMapTextDataset._combine_shards(f'{args.input_dir}/cache/val.bin', val_shards) + MMapTextDataset._combine_shards(f'{args.input_dir}/cache/train.bin', train_shards) - # TODO: if _combining_shards is very slow for large files, it can be skipped then update - # the dataset to read from multiple shards directly - def _combine_shards(output_fname, shards_list): - total_size = 0 - for filename in shards_list: - # The +1 accounts for additional SEP tokens between shards - total_size += np.memmap(filename, mode='r', dtype=np.uint16).shape[0] + 1 - print(total_size, filename) - total_size -= 1 # account for an unnecessary SEP token at the every end - logging.info(f'Writing {total_size} tokens to {output_fname}') - all_token_ids = np.empty(total_size, dtype=np.uint16) - last_token_index = 0 - for filename in tqdm(shards_list): - shared = np.memmap(filename, mode='r', dtype=np.uint16) - all_token_ids[last_token_index:last_token_index+len(shared)] = shared[:] - last_token_index += len(shared) - fp = np.memmap(output_fname, dtype=np.uint16, mode='w+', shape=total_size) - fp[:] = all_token_ids[:] - del fp - - _combine_shards(f'{args.input_dir}/cache/val.bin', val_shards) - _combine_shards(f'{args.input_dir}/cache/train.bin', train_shards) + del MMapTextDataset.tokenizer + # ========================= end preprocessing code ========================= # class Pretrainer(ptl.LightningModule): @@ -319,8 +335,11 @@ def add_args(parser): # Dataset. 
Some of these params are only useful when generating the dataset cache parser.add_argument("--input_dir", type=str, default='/net/nfs.corp/s2-research/beltagy/longformer/data/') + # Used only at the preprocessing phase parser.add_argument("--train_dev_split", type=float, default=0.05) - parser.add_argument("--shard_size", type=int, default=2 * 1000 * 1000) + parser.add_argument("--shard_size", type=int, default=1024 ** 3 // 4) # 250MB + parser.add_argument("--num_preprocessing_workers", type=int, default=1) + # Used only at the training phase parser.add_argument("--seqlen", type=int, default=512) parser.add_argument("--mlm_prob", type=float, default=0.15) From c20264e6f802ea6fea66dabe5b6aaa3e08f60721 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Mon, 3 Aug 2020 13:23:25 -0700 Subject: [PATCH 074/112] wip --- scripts/pretrain.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/pretrain.py b/scripts/pretrain.py index 32d540c..b5c581a 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -233,7 +233,7 @@ def training_step(self, batch, batch_nb): elapsed_time = time.time() - self.start_time tensorboard_logs['second_per_batch'] = elapsed_time self.start_time = time.time() - if not self.use_tpu: + if self.on_gpu: tensorboard_logs['memory'] = torch.cuda.memory_allocated(loss.device) / 1024 ** 3 return {'loss': loss, 'log': tensorboard_logs} From ff96351b17c8485f6b558bdff6c2ec0f34d4a364 Mon Sep 17 00:00:00 2001 From: Slater Date: Mon, 3 Aug 2020 15:23:14 -0700 Subject: [PATCH 075/112] Save this directory as a dataset and use it directly on a plain base image. --- experiment.yml | 18 +++++++++++++++ longformer_on_beaker.sh | 51 +++++++++++++++++++++++++++++++++++++++++ scripts/pretrain.py | 2 +- 3 files changed, 70 insertions(+), 1 deletion(-) create mode 100644 experiment.yml create mode 100755 longformer_on_beaker.sh diff --git a/experiment.yml b/experiment.yml new file mode 100644 index 0000000..156faf5 --- /dev/null +++ b/experiment.yml @@ -0,0 +1,18 @@ +tasks: + - cluster: {{.Env.CLUSTER}} + spec: + # This is a python3.7/nvidia base image with basic libraries + image: im_j69gti4atcw9 + resultPath: {{.Env.RESULT_PATH}} + args: + - /bin/bash + - -c + - "cd /longformer_on_beaker && pip install . && {{.Env.ARGS}}" + datasetMounts: + - datasetId: {{.Env.INPUT_DATASET_ID}} + containerPath: /data + - datasetId: {{.Env.SCRIPTS}} + containerPath: /longformer_on_beaker + requirements: + gpuCount: {{.Env.GPU_COUNT}} + cpu: {{.Env.CPU_COUNT}} diff --git a/longformer_on_beaker.sh b/longformer_on_beaker.sh new file mode 100755 index 0000000..6e873a1 --- /dev/null +++ b/longformer_on_beaker.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +export SCRIPTS=$(beaker dataset create -q .) 
+export INPUT_DATASET_ID="ds_6r0phxc5fiap" +export RESULT_SAVE_DIR="/runs" +export RESULT_SAVE_PREFIX="test" +export ARGS="" +export GPU_COUNT=1 +export CPU_COUNT=6 +copy=("$@") +for i in "${!copy[@]}" +do + if [[ "${copy[$i]}" = "--save_dir" ]] + then + export RESULT_SAVE_DIR="${copy[$i+1]}" + fi + + if [[ "${copy[$i]}" = "--input_dir" ]] + then + export INPUT_DATASET_ID=$(beaker dataset create -q ${copy[$i+1]}) + copy[$i+1]="/data" + fi + + if [[ "${copy[$i]}" = "--save_prefix" ]] + then + export RESULT_SAVE_PREFIX="${copy[$i+1]}" + fi + + if [[ "${copy[$i]}" = "--num_workers" ]] + then + export CPU_COUNT="${copy[$i+1]}" + fi + + if [[ "${copy[$i]}" = "--gpu_count" ]] + then + export GPU_COUNT="${copy[$i+1]}" + fi + ARGS="$ARGS ${copy[$i]}" +done + +# If an input dataset was not specified, use the default +if [[ "ds_6r0phxc5fiap" = $INPUT_DATASET_ID ]] +then + ARGS="$ARGS --input_dir /data" +fi + +echo $ARGS + +export RESULT_PATH=$RESULT_SAVE_DIR/$RESULT_SAVE_PREFIX + +beaker experiment create -f experiment.yml diff --git a/scripts/pretrain.py b/scripts/pretrain.py index b5c581a..8de5bbd 100644 --- a/scripts/pretrain.py +++ b/scripts/pretrain.py @@ -348,7 +348,7 @@ def add_args(parser): parser.add_argument("--model", type=str, default='roberta-base') # Checkpointing and logging - parser.add_argument("--save_dir", type=str, default='runs/') + parser.add_argument("--save_dir", type=str, default='/runs/') parser.add_argument("--save_prefix", type=str, default='test', help="path of output directory is --save_dir/--save_prefix") parser.add_argument("--resume", type=str, default=None, # It is better to use a different output dir. From 0557e24c8cd654c01be68098126fec66eff89956 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 6 Aug 2020 12:03:48 -0700 Subject: [PATCH 076/112] bug fix --- longformer/longformer_encoder_decoder.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/longformer/longformer_encoder_decoder.py b/longformer/longformer_encoder_decoder.py index 67ab3e4..f29fb49 100644 --- a/longformer/longformer_encoder_decoder.py +++ b/longformer/longformer_encoder_decoder.py @@ -62,10 +62,8 @@ def forward( assert list(query.size()) == [tgt_len, bsz, embed_dim] assert attn_mask is None - # LongformerSelfAttention expects this shape - query = query.view(bsz, tgt_len, embed_dim) outputs = self.longformer_self_attn( - query, + query.transpose(0, 1), # LongformerSelfAttention expects (bsz, seqlen, embd_dim) attention_mask=key_padding_mask.unsqueeze(dim=1).unsqueeze(dim=1) * -1, head_mask=None, encoder_hidden_states=None, From 6ae5051bf704edc9548334d6bb88f321d57a06af Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 7 Aug 2020 11:14:12 -0700 Subject: [PATCH 077/112] fix a bug with the mapping from longformerselfattention to bartselfattention --- longformer/longformer_encoder_decoder.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/longformer/longformer_encoder_decoder.py b/longformer/longformer_encoder_decoder.py index f29fb49..df38224 100644 --- a/longformer/longformer_encoder_decoder.py +++ b/longformer/longformer_encoder_decoder.py @@ -71,8 +71,6 @@ def forward( output_attentions=output_attentions, ) - attn_output = outputs[0] - attn_output = attn_output.contiguous().view(tgt_len, bsz, embed_dim) - attn_output = self.output(attn_output) + attn_output = self.output(outputs[0].transpose(0, 1)) return (attn_output,) + outputs[1:] if len(outputs) == 2 else (attn_output, None) From a1de977980d1cd658d3d8f740a3f108bbf84ea47 Mon Sep 17 00:00:00 2001 
From: Iz Beltagy Date: Fri, 7 Aug 2020 11:14:28 -0700 Subject: [PATCH 078/112] mem_profiler --- scripts/mem_profiler.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/mem_profiler.py b/scripts/mem_profiler.py index 4edc6b0..5d8e2f7 100644 --- a/scripts/mem_profiler.py +++ b/scripts/mem_profiler.py @@ -9,7 +9,7 @@ from pytorch_lightning import Trainer import pytorch_lightning as pl -seqlen = 1024 * 8 +seqlen = 1024 * 2 global_size = seqlen // 100 attention_window = 256 # one sided @@ -31,16 +31,16 @@ def __init__(self, hparams=None): super().__init__() self.hparams = hparams - # config = LongformerEncoderDecoderConfig.from_pretrained('bart-long-4096') - config = LongformerConfig.from_pretrained('roberta-large') + config = LongformerEncoderDecoderConfig.from_pretrained('bart-long-4096') + # config = LongformerConfig.from_pretrained('roberta-large') config.max_position_embeddings = seqlen + 2 config.gradient_checkpointing = True - # config.attention_mode = 'sliding_chunks' - config.attention_mode = 'n2' + config.attention_mode = 'sliding_chunks' + # config.attention_mode = 'n2' config.attention_window = [attention_window] * config.num_hidden_layers config.attention_dilation = [1] * config.num_hidden_layers - # self.model = LongformerEncoderDecoderForConditionalGeneration(config) - self.model = LongformerForMaskedLM(config) + self.model = LongformerEncoderDecoderForConditionalGeneration(config) + # self.model = LongformerForMaskedLM(config) def forward(self, x, y): print(seqlen, global_size, attention_window, torch.cuda.max_memory_allocated(x.device) / 1024 ** 3) @@ -60,7 +60,7 @@ def configure_optimizers(self): return torch.optim.Adam(self.parameters(), lr=0.001) def train_dataloader(self): - return DataLoader(CoolDataset(), batch_size=1, num_workers=0) + return DataLoader(CoolDataset(), batch_size=2, num_workers=0) if __name__ == '__main__': From 1bf6c7c66ebf15a7490e8050df6587f43ea2d047 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 12 Aug 2020 11:10:42 -0700 Subject: [PATCH 079/112] extend encoder only --- ...onvert_bart_to_longformerencoderdecoder.py | 38 ++++++++++--------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/scripts/convert_bart_to_longformerencoderdecoder.py b/scripts/convert_bart_to_longformerencoderdecoder.py index e469819..fc94996 100644 --- a/scripts/convert_bart_to_longformerencoderdecoder.py +++ b/scripts/convert_bart_to_longformerencoderdecoder.py @@ -5,6 +5,7 @@ from transformers import BartTokenizer from transformers import BartForConditionalGeneration +from transformers.modeling_bart import shift_tokens_right from longformer.longformer_encoder_decoder import LongformerSelfAttentionForBart, LongformerEncoderDecoderConfig from longformer.longformer_encoder_decoder import LongformerEncoderDecoderForConditionalGeneration @@ -14,10 +15,10 @@ def create_long_model( save_model_to, - base_model='facebook/bart-large', - tokenizer_name_or_path='facebook/bart-large', - attention_window=512, - max_pos=4096 + base_model, + tokenizer_name_or_path, + attention_window, + max_pos ): model = BartForConditionalGeneration.from_pretrained(base_model) tokenizer = BartTokenizer.from_pretrained(tokenizer_name_or_path, model_max_length=max_pos) @@ -35,7 +36,9 @@ def create_long_model( current_max_pos, embed_size = model.model.encoder.embed_positions.weight.shape assert current_max_pos == config.max_position_embeddings + 2 - config.max_position_embeddings = max_pos + config.max_encoder_position_embeddings = max_pos + 
config.max_decoder_position_embeddings = config.max_position_embeddings + del config.max_position_embeddings max_pos += 2 # NOTE: BART has positions 0,1 reserved, so embedding size is max position + 2 assert max_pos >= current_max_pos @@ -50,14 +53,14 @@ def create_long_model( model.model.encoder.embed_positions.weight.data = new_encoder_pos_embed # allocate a larger position embedding matrix for the decoder - new_decoder_pos_embed = model.model.decoder.embed_positions.weight.new_empty(max_pos, embed_size) - # copy position embeddings over and over to initialize the new position embeddings - k = 2 - step = current_max_pos - 2 - while k < max_pos - 1: - new_decoder_pos_embed[k:(k + step)] = model.model.decoder.embed_positions.weight[2:] - k += step - model.model.decoder.embed_positions.weight.data = new_decoder_pos_embed + # new_decoder_pos_embed = model.model.decoder.embed_positions.weight.new_empty(max_pos, embed_size) + # # copy position embeddings over and over to initialize the new position embeddings + # k = 2 + # step = current_max_pos - 2 + # while k < max_pos - 1: + # new_decoder_pos_embed[k:(k + step)] = model.model.decoder.embed_positions.weight[2:] + # k += step + # model.model.decoder.embed_positions.weight.data = new_decoder_pos_embed # replace the `modeling_bart.SelfAttention` object with `LongformerSelfAttention` config.attention_window = [attention_window] * config.num_hidden_layers @@ -107,12 +110,12 @@ def main(): '--attention_window', type=int, default=512, - help='attention window size for longformer self attention' + help='attention window size for longformer self attention (one sided)' ) parser.add_argument( '--max_pos', type=int, - default=4096, + default=4096 * 4, help='maximum encoder positions' ) @@ -137,11 +140,12 @@ def main(): data = tokenizer([TXT], return_tensors='pt', padding='max_length', max_length=2048) input_ids = data['input_ids'] attention_mask = data['attention_mask'] - logits = model(input_ids, attention_mask=attention_mask)[0] + decoder_input_ids = shift_tokens_right(input_ids[:, :5], tokenizer.pad_token_id) + logits = model(input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids, use_cache=False)[0] masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() probs = logits[0, masked_index].softmax(dim=0) values, predictions = probs.topk(5) - print(tokenizer.decode(predictions).split()) + print(tokenizer.convert_ids_to_tokens(predictions)) if __name__ == "__main__": From 5b31f5e7396210c6c73d1be268f1310e09d85d6e Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 12 Aug 2020 14:29:17 -0700 Subject: [PATCH 080/112] upgrade triviaqa script to PLv0.8.5 --- scripts/triviaqa.py | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/scripts/triviaqa.py b/scripts/triviaqa.py index 281c297..6924c81 100644 --- a/scripts/triviaqa.py +++ b/scripts/triviaqa.py @@ -528,21 +528,14 @@ def test_end(self, outputs): return {'count': len(qid_to_answer_text)} - def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_i, second_order_closure=None): - optimizer.step() - optimizer.zero_grad() - self.scheduler.step(self.global_step) - def configure_optimizers(self): def lr_lambda(current_step): if current_step < self.args.warmup: return float(current_step) / float(max(1, self.args.warmup)) return max(0.0, float(self.args.steps - current_step) / float(max(1, self.args.steps - self.args.warmup))) optimizer = torch.optim.Adam(self.parameters(), lr=self.args.lr) - self.scheduler = 
LambdaLR(optimizer, lr_lambda, last_epoch=-1) # scheduler is not saved in the checkpoint, but global_step is, which is enough to restart - self.scheduler.step(self.global_step) - - return optimizer + scheduler = LambdaLR(optimizer, lr_lambda, last_epoch=-1) + return [optimizer], [{"scheduler": scheduler, "interval": "step"}] @pl.data_loader def train_dataloader(self): @@ -610,8 +603,8 @@ def add_model_specific_args(parser, root_dir): parser.add_argument("--train_dataset", type=str, required=False, help="Path to the training squad-format") parser.add_argument("--dev_dataset", type=str, required=True, help="Path to the dev squad-format") parser.add_argument("--batch_size", type=int, default=8, help="Batch size") - parser.add_argument("--gpus", type=str, default='0', - help="Comma separated list of gpus. Default is gpu 0. To use CPU, use --gpus "" ") + parser.add_argument("--gpus", type=int, default=1, + help="Number of gpus. 0 for CPU") parser.add_argument("--warmup", type=int, default=200, help="Number of warmup steps") parser.add_argument("--lr", type=float, default=0.0001, help="Maximum learning rate") parser.add_argument("--val_every", type=float, default=0.2, help="Number of training steps between validations") @@ -672,15 +665,14 @@ def main(args): prefix='' ) - args.gpus = [int(x) for x in args.gpus.split(',')] if args.gpus is not "" else None # use CPU if no gpu provided print(args) train_set_size = 110648 # hardcode dataset size. Needed to compute number of steps for the lr scheduler - num_devices = 1 or len(args.gpus) - args.steps = args.epochs * train_set_size / (args.batch_size * num_devices) - print(f'>>>>>>> #steps: {args.steps}, #epochs: {args.epochs}, batch_size: {args.batch_size * num_devices} <<<<<<<') + args.steps = args.epochs * train_set_size / (args.batch_size * args.gpus) + print(f'>>>>>>> #steps: {args.steps}, #epochs: {args.epochs}, batch_size: {args.batch_size * args.gpus} <<<<<<<') - trainer = pl.Trainer(gpus=args.gpus, distributed_backend='ddp' if args.gpus and (len(args.gpus) > 1) else None, - track_grad_norm=-1, max_nb_epochs=args.epochs, early_stop_callback=None, + trainer = pl.Trainer(gpus=args.gpus, distributed_backend='ddp' if args.gpus and args.gpus > 1 else None, + track_grad_norm=-1, max_epochs=args.epochs, early_stop_callback=None, + replace_sampler_ddp=False, accumulate_grad_batches=args.batch_size, val_check_interval=args.val_every, val_percent_check=args.val_percent_check, From 405739e735d30048780f06f4cb627a380331eb3a Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 12 Aug 2020 22:47:03 -0700 Subject: [PATCH 081/112] add roberta baseline --- scripts/triviaqa.py | 65 ++++++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 28 deletions(-) diff --git a/scripts/triviaqa.py b/scripts/triviaqa.py index 6924c81..da5ea61 100644 --- a/scripts/triviaqa.py +++ b/scripts/triviaqa.py @@ -9,7 +9,7 @@ from torch.optim.lr_scheduler import LambdaLR from torch.utils.data import DataLoader, Dataset -from transformers import RobertaTokenizer +from transformers import RobertaTokenizer, AutoModel from scripts.triviaqa_utils import evaluation_utils import pytorch_lightning as pl @@ -263,10 +263,13 @@ def __init__(self, args): self.train_dataloader_object = self.val_dataloader_object = self.test_dataloader_object = None def load_model(self): - model = Longformer.from_pretrained(self.args.model_path) - for layer in model.encoder.layer: - layer.attention.self.attention_mode = self.args.attention_mode - self.args.attention_window = 
layer.attention.self.attention_window + if 'longformer' in self.args.model_path: + model = Longformer.from_pretrained(self.args.model_path) + for layer in model.encoder.layer: + layer.attention.self.attention_mode = self.args.attention_mode + self.args.attention_window = layer.attention.self.attention_window + else: + model = AutoModel.from_pretrained(self.args.model_path) print("Loaded model with config:") print(model.config) @@ -277,29 +280,34 @@ def load_model(self): return model def forward(self, input_ids, attention_mask, segment_ids, start_positions, end_positions): - question_end_index = self._get_question_end_index(input_ids) - # Each batch is one document, and each row of the batch is a chunck of the document. - # Make sure all rows have the same question length. - assert (question_end_index[0].float() == question_end_index.float().mean()).item() - - # local attention everywhere - attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) - # global attention for the question tokens - attention_mask[:, :question_end_index.item()] = 2 - - # sliding_chunks implemenation of selfattention requires that seqlen is multiple of window size - input_ids, attention_mask = pad_to_window_size( - input_ids, attention_mask, self.args.attention_window, self.tokenizer.pad_token_id) - - sequence_output = self.model( - input_ids, - attention_mask=attention_mask)[0] - - # The pretrained TriviaQA model wasn't trained with padding, so remove padding tokens - # before computing loss and decoding. - padding_len = input_ids[0].eq(self.tokenizer.pad_token_id).sum() - if padding_len > 0: - sequence_output = sequence_output[:, :-padding_len] + if 'longformer' in self.args.model_path: + question_end_index = self._get_question_end_index(input_ids) + # Each batch is one document, and each row of the batch is a chunck of the document. + # Make sure all rows have the same question length. + assert (question_end_index[0].float() == question_end_index.float().mean()).item() + + # local attention everywhere + attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) + # global attention for the question tokens + attention_mask[:, :question_end_index.item()] = 2 + + # sliding_chunks implemenation of selfattention requires that seqlen is multiple of window size + input_ids, attention_mask = pad_to_window_size( + input_ids, attention_mask, self.args.attention_window, self.tokenizer.pad_token_id) + + sequence_output = self.model( + input_ids, + attention_mask=attention_mask)[0] + + # The pretrained TriviaQA model wasn't trained with padding, so remove padding tokens + # before computing loss and decoding. + padding_len = input_ids[0].eq(self.tokenizer.pad_token_id).sum() + if padding_len > 0: + sequence_output = sequence_output[:, :-padding_len] + else: + sequence_output = self.model( + input_ids, + attention_mask=attention_mask)[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) @@ -637,6 +645,7 @@ def add_model_specific_args(parser, root_dir): parser.add_argument("--attention_mode", type=str, choices=['tvm', 'sliding_chunks'], default='sliding_chunks', help='Which implementation of selfattention to use') parser.add_argument("--fp32", action='store_true', help="default is fp16. 
Use --fp32 to switch to fp32") + # parser.add_argument("--seq2seq", action='store_true', help="Use an answer generation model") return parser From c132d4e3384e1de6c83c09d1ea63260fa30f7604 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Mon, 17 Aug 2020 06:36:05 -0700 Subject: [PATCH 082/112] triviaqa seq2seq + fix bart-base bug --- scripts/triviaqa.py | 85 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 67 insertions(+), 18 deletions(-) diff --git a/scripts/triviaqa.py b/scripts/triviaqa.py index da5ea61..362dfa5 100644 --- a/scripts/triviaqa.py +++ b/scripts/triviaqa.py @@ -9,7 +9,7 @@ from torch.optim.lr_scheduler import LambdaLR from torch.utils.data import DataLoader, Dataset -from transformers import RobertaTokenizer, AutoModel +from transformers import RobertaTokenizer, AutoModel, AutoConfig from scripts.triviaqa_utils import evaluation_utils import pytorch_lightning as pl @@ -110,11 +110,13 @@ def is_whitespace(c): try: start_position = char_to_word_offset[answer_offset] end_position = char_to_word_offset[answer_offset + answer_length - 1] - except: + token_ids = self.tokenizer.encode(orig_answer_text) + except RuntimeError: print(f'Reading example {idx} failed') start_position = 0 end_position = 0 - answer_spans.append({'start': start_position, 'end': end_position}) + answer_spans.append({'start': start_position, 'end': end_position, + 'text': orig_answer_text, 'token_ids': token_ids}) # ===== Given an example, convert it into tensors ============= query_tokens = self.tokenizer.tokenize(question_text) @@ -146,6 +148,7 @@ def is_whitespace(c): segment_ids_list = [] start_positions_list = [] end_positions_list = [] + answer_token_ids_list = [] for slice_start in range(0, len(all_doc_tokens), max_tokens_per_doc_slice - self.doc_stride): slice_end = min(slice_start + max_tokens_per_doc_slice, len(all_doc_tokens)) @@ -172,6 +175,7 @@ def is_whitespace(c): doc_offset = len(query_tokens) + 2 - slice_start start_positions = [] end_positions = [] + answer_token_ids = [] for answer_span in answer_spans: start_position = answer_span['start'] end_position = answer_span['end'] @@ -183,6 +187,7 @@ def is_whitespace(c): continue start_positions.append(tok_start_position_in_doc + doc_offset) end_positions.append(tok_end_position_in_doc + doc_offset) + answer_token_ids.append(answer_span['token_ids']) assert len(start_positions) == len(end_positions) if self.ignore_seq_with_no_answers and len(start_positions) == 0: continue @@ -190,32 +195,58 @@ def is_whitespace(c): # answers from start_positions and end_positions if > self.max_num_answers start_positions = start_positions[:self.max_num_answers] end_positions = end_positions[:self.max_num_answers] + answer_token_ids = answer_token_ids[:self.max_num_answers] # -1 padding up to self.max_num_answers padding_len = self.max_num_answers - len(start_positions) start_positions.extend([-1] * padding_len) end_positions.extend([-1] * padding_len) + answer_token_ids.extend([[]] * padding_len) # replace duplicate start/end positions with `-1` because duplicates can result into -ve loss values found_start_positions = set() found_end_positions = set() - for i, (start_position, end_position) in enumerate(zip(start_positions, end_positions)): + found_answer_token_ids = set() + for i, (start_position, end_position, answer_tokens) in enumerate( + zip(start_positions, end_positions, answer_token_ids) + ): if start_position in found_start_positions: start_positions[i] = -1 if end_position in found_end_positions: end_positions[i] = -1 + 
answer_tokens_as_str = ','.join([str(x) for x in answer_tokens]) + if answer_tokens_as_str in found_answer_token_ids: + answer_token_ids[i] = [] found_start_positions.add(start_position) found_end_positions.add(end_position) + found_answer_token_ids.add(answer_tokens_as_str) input_ids_list.append(input_ids) input_mask_list.append(input_mask) segment_ids_list.append(segment_ids) start_positions_list.append(start_positions) end_positions_list.append(end_positions) + answer_token_ids_list.append(answer_token_ids) + + # pad answers in answer_token_ids_list to the longest answer + max_answer_len = max([len(item) for sublist in answer_token_ids_list for item in sublist]) # flat list + if max_answer_len == 0: + max_answer_len = 2 + for answers_of_one_slice in answer_token_ids_list: + for answer_tokens in answers_of_one_slice: + if len(answer_tokens) == 0: + # TODO: or ? + padding_len = max_answer_len - len(answer_tokens) - 2 + answer_tokens.extend([self.tokenizer.bos_token_id, self.tokenizer.eos_token_id] + + ([self.tokenizer.pad_token_id] * padding_len)) + else: + padding_len = max_answer_len - len(answer_tokens) + answer_tokens.extend([self.tokenizer.pad_token_id] * padding_len) tensors_list.append((torch.tensor(input_ids_list), torch.tensor(input_mask_list), torch.tensor(segment_ids_list), torch.tensor(start_positions_list), torch.tensor(end_positions_list), + torch.tensor(answer_token_ids_list), self._get_qid(qa['id']), qa["aliases"])) # for eval return tensors_list @@ -268,6 +299,20 @@ def load_model(self): for layer in model.encoder.layer: layer.attention.self.attention_mode = self.args.attention_mode self.args.attention_window = layer.attention.self.attention_window + elif self.args.model_path in ['bart.large', 'bart.base']: + model = torch.hub.load('pytorch/fairseq', self.args.model_path) + model.config = model.args + model.config.hidden_size = model.config.decoder_output_dim + elif 'bart' in self.args.model_path and 'base' in self.args.model_path: + config = AutoConfig.from_pretrained(self.args.model_path) + config.encoder_attention_heads = 12 + config.decoder_attention_heads = 12 + config.attention_dropout = 0.1 + model = AutoModel.from_pretrained(self.args.model_path, config=config) + elif 'bart' in self.args.model_path and 'large' in self.args.model_path: + config = AutoConfig.from_pretrained(self.args.model_path) + config.attention_dropout = 0.1 + model = AutoModel.from_pretrained(self.args.model_path, config=config) else: model = AutoModel.from_pretrained(self.args.model_path) @@ -279,7 +324,7 @@ def load_model(self): model.train() return model - def forward(self, input_ids, attention_mask, segment_ids, start_positions, end_positions): + def forward(self, input_ids, attention_mask, segment_ids, start_positions, end_positions, answer_token_ids): if 'longformer' in self.args.model_path: question_end_index = self._get_question_end_index(input_ids) # Each batch is one document, and each row of the batch is a chunck of the document. 
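In the Longformer branch of `forward` shown above, the question tokens are the ones given global attention: the mask uses 1 for ordinary local attention and 2 for global attention, and the inputs are then padded so the sequence length works with the `sliding_chunks` kernels. A condensed sketch of just that mask construction, with illustrative sizes:

import torch
from longformer.sliding_chunks import pad_to_window_size

attention_window = 256     # one-sided window size
pad_token_id = 1           # RoBERTa's <pad> id
question_len = 12          # index right after the last question token (illustrative)

input_ids = torch.full((1, 1000), 42, dtype=torch.long)   # one dummy document chunk
attention_mask = torch.ones_like(input_ids)               # 1 = local attention everywhere
attention_mask[:, :question_len] = 2                      # 2 = global attention on the question

# sliding_chunks expects the sequence length to be a multiple of the two-sided window,
# so pad both tensors; padded positions get pad_token_id and mask value 0 (no attention)
input_ids, attention_mask = pad_to_window_size(
    input_ids, attention_mask, attention_window, pad_token_id)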
@@ -304,6 +349,8 @@ def forward(self, input_ids, attention_mask, segment_ids, start_positions, end_p padding_len = input_ids[0].eq(self.tokenizer.pad_token_id).sum() if padding_len > 0: sequence_output = sequence_output[:, :-padding_len] + elif self.args.model_path in ['bart.large', 'bart.base']: + sequence_output = self.model.extract_features(input_ids) else: sequence_output = self.model( input_ids, @@ -376,8 +423,8 @@ def or_softmax_cross_entropy_loss_one_doc(self, logits, target, ignore_index=-1, return loss[~torch.isinf(loss)].sum() def training_step(self, batch, batch_nb): - input_ids, input_mask, segment_ids, subword_starts, subword_ends, qids, aliases = batch - output = self.forward(input_ids, input_mask, segment_ids, subword_starts, subword_ends) + input_ids, input_mask, segment_ids, subword_starts, subword_ends, answer_token_ids, qids, aliases = batch + output = self.forward(input_ids, input_mask, segment_ids, subword_starts, subword_ends, answer_token_ids) loss = output[0] lr = loss.new_zeros(1) + self.trainer.optimizers[0].param_groups[0]['lr'] tensorboard_logs = {'train_loss': loss, 'lr': lr, @@ -386,8 +433,8 @@ def training_step(self, batch, batch_nb): return {'loss': loss, 'log': tensorboard_logs} def validation_step(self, batch, batch_nb): - input_ids, input_mask, segment_ids, subword_starts, subword_ends, qids, aliases = batch - output = self.forward(input_ids, input_mask, segment_ids, subword_starts, subword_ends) + input_ids, input_mask, segment_ids, subword_starts, subword_ends, answer_token_ids, qids, aliases = batch + output = self.forward(input_ids, input_mask, segment_ids, subword_starts, subword_ends, answer_token_ids) loss, start_logits, end_logits = output[:3] answers = self.decode(input_ids, start_logits, end_logits) @@ -461,8 +508,8 @@ def decode(self, input_ids, start_logits, end_logits): answers.append({'text': text, 'score': score}) return answers - def sync_list_across_gpus(self, l, device, dtype): - l_tensor = torch.tensor(l, device=device, dtype=dtype) + def sync_list_across_gpus(self, list_to_sync, device, dtype): + l_tensor = torch.tensor(list_to_sync, device=device, dtype=dtype) gather_l_tensor = [torch.ones_like(l_tensor) for _ in range(self.trainer.world_size)] torch.distributed.all_gather(gather_l_tensor, l_tensor) return torch.cat(gather_l_tensor).tolist() @@ -507,8 +554,8 @@ def validation_end(self, outputs): return {'avg_val_loss': avg_loss, 'log': logs, 'progress_bar': logs} def test_step(self, batch, batch_nb): - input_ids, input_mask, segment_ids, subword_starts, subword_ends, qids, aliases = batch - output = self.forward(input_ids, input_mask, segment_ids, subword_starts, subword_ends) + input_ids, input_mask, segment_ids, subword_starts, subword_ends, answer_token_ids, qids, aliases = batch + output = self.forward(input_ids, input_mask, segment_ids, subword_starts, subword_ends, answer_token_ids) loss, start_logits, end_logits = output[:3] answers = self.decode(input_ids, start_logits, end_logits) @@ -555,7 +602,7 @@ def train_dataloader(self): max_num_answers=self.args.max_num_answers, max_question_len=self.args.max_question_len, ignore_seq_with_no_answers=self.args.ignore_seq_with_no_answers) - sampler = torch.utils.data.distributed.DistributedSampler(dataset) if self.trainer.use_ddp else None + sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=True) if self.trainer.use_ddp else None dl = DataLoader(dataset, batch_size=1, shuffle=(sampler is None), num_workers=self.args.num_workers, sampler=sampler, 
collate_fn=TriviaQADataset.collate_one_doc_and_lists) @@ -572,8 +619,8 @@ def val_dataloader(self): max_num_answers=self.args.max_num_answers, max_question_len=self.args.max_question_len, ignore_seq_with_no_answers=False) # evaluation data should keep all examples - sampler = torch.utils.data.distributed.DistributedSampler(dataset) if self.trainer.use_ddp else None - dl = DataLoader(dataset, batch_size=1, shuffle=(sampler is None), + sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=False) if self.trainer.use_ddp else None + dl = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=self.args.num_workers, sampler=sampler, collate_fn=TriviaQADataset.collate_one_doc_and_lists) self.val_dataloader_object = dl @@ -637,7 +684,8 @@ def add_model_specific_args(parser, root_dir): help="Number of answer candidates. Used at decoding time") parser.add_argument("--max_answer_length", type=int, default=30, help="maximum num of wordpieces/answer. Used at decoding time") - parser.add_argument("--regular_softmax_loss", action='store_true', help="IF true, use regular softmax. Default is using ORed softmax loss") + parser.add_argument("--regular_softmax_loss", action='store_true', + help="IF true, use regular softmax. Default is using ORed softmax loss") parser.add_argument("--test", action='store_true', help="Test only, no training") parser.add_argument("--model_path", type=str, required=True, help="Path to the checkpoint directory") @@ -645,7 +693,7 @@ def add_model_specific_args(parser, root_dir): parser.add_argument("--attention_mode", type=str, choices=['tvm', 'sliding_chunks'], default='sliding_chunks', help='Which implementation of selfattention to use') parser.add_argument("--fp32", action='store_true', help="default is fp16. Use --fp32 to switch to fp32") - # parser.add_argument("--seq2seq", action='store_true', help="Use an answer generation model") + parser.add_argument("--seq2seq", action='store_true', help="Use an answer generation model") return parser @@ -684,6 +732,7 @@ def main(args): replace_sampler_ddp=False, accumulate_grad_batches=args.batch_size, val_check_interval=args.val_every, + # check_val_every_n_epoch=2, val_percent_check=args.val_percent_check, test_percent_check=args.val_percent_check, logger=logger if not args.disable_checkpointing else False, From d1349e96ded239b0c872e579608a47901a2d38a2 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sun, 23 Aug 2020 18:05:44 +0000 Subject: [PATCH 083/112] beaker --- longformer_on_beaker.sh | 48 +++++------------------------------------ requirements.txt | 3 +-- scripts/triviaqa.py | 3 ++- 3 files changed, 8 insertions(+), 46 deletions(-) diff --git a/longformer_on_beaker.sh b/longformer_on_beaker.sh index 6e873a1..425dcef 100755 --- a/longformer_on_beaker.sh +++ b/longformer_on_beaker.sh @@ -1,51 +1,13 @@ #!/bin/bash export SCRIPTS=$(beaker dataset create -q .) 
-export INPUT_DATASET_ID="ds_6r0phxc5fiap" +export INPUT_DATASET_ID="ds_drt127wv4aun" export RESULT_SAVE_DIR="/runs" export RESULT_SAVE_PREFIX="test" -export ARGS="" -export GPU_COUNT=1 -export CPU_COUNT=6 -copy=("$@") -for i in "${!copy[@]}" -do - if [[ "${copy[$i]}" = "--save_dir" ]] - then - export RESULT_SAVE_DIR="${copy[$i+1]}" - fi - - if [[ "${copy[$i]}" = "--input_dir" ]] - then - export INPUT_DATASET_ID=$(beaker dataset create -q ${copy[$i+1]}) - copy[$i+1]="/data" - fi - - if [[ "${copy[$i]}" = "--save_prefix" ]] - then - export RESULT_SAVE_PREFIX="${copy[$i+1]}" - fi - - if [[ "${copy[$i]}" = "--num_workers" ]] - then - export CPU_COUNT="${copy[$i+1]}" - fi - - if [[ "${copy[$i]}" = "--gpu_count" ]] - then - export GPU_COUNT="${copy[$i+1]}" - fi - ARGS="$ARGS ${copy[$i]}" -done - -# If an input dataset was not specified, use the default -if [[ "ds_6r0phxc5fiap" = $INPUT_DATASET_ID ]] -then - ARGS="$ARGS --input_dir /data" -fi - -echo $ARGS - +export ARGS="$@" +export GPU_COUNT=8 +export CPU_COUNT=32 +export CLUSTER="ai2/on-prem-ai2-server2" export RESULT_PATH=$RESULT_SAVE_DIR/$RESULT_SAVE_PREFIX beaker experiment create -f experiment.yml diff --git a/requirements.txt b/requirements.txt index 54829f1..d91e5de 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,5 @@ transformers @ git+http://github.com/ibeltagy/transformers.git@longformer_encoder_decoder#egg=transformers pytorch-lightning @ git+http://github.com/ibeltagy/pytorch-lightning.git@v0.8.5_fixes#egg=pytorch-lightning -torch>=1.2.0 -transformers==3.0.2 +torch==1.6.0 tensorboardX test-tube==0.7.5 diff --git a/scripts/triviaqa.py b/scripts/triviaqa.py index 362dfa5..e5e488d 100644 --- a/scripts/triviaqa.py +++ b/scripts/triviaqa.py @@ -312,6 +312,7 @@ def load_model(self): elif 'bart' in self.args.model_path and 'large' in self.args.model_path: config = AutoConfig.from_pretrained(self.args.model_path) config.attention_dropout = 0.1 + config.gradient_checkpointing = True model = AutoModel.from_pretrained(self.args.model_path, config=config) else: model = AutoModel.from_pretrained(self.args.model_path) @@ -647,7 +648,7 @@ def configure_ddp(self, model, device_ids): model = LightningDistributedDataParallel( model, device_ids=device_ids, - find_unused_parameters=True + find_unused_parameters=False ) return model From 5a2b9da317c3407293b5069417a3556a5b15e28a Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Tue, 25 Aug 2020 10:40:24 -0700 Subject: [PATCH 084/112] sliding_chunks_no_overlap (#100) --- longformer/longformer.py | 27 ++++++++++++++++------ longformer/sliding_chunks.py | 43 ++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 7 deletions(-) diff --git a/longformer/longformer.py b/longformer/longformer.py index 953bd2c..e239c6e 100644 --- a/longformer/longformer.py +++ b/longformer/longformer.py @@ -5,6 +5,7 @@ import torch.nn.functional as F from longformer.diagonaled_mm_tvm import diagonaled_mm as diagonaled_mm_tvm, mask_invalid_locations from longformer.sliding_chunks import sliding_chunks_matmul_qk, sliding_chunks_matmul_pv +from longformer.sliding_chunks import sliding_chunks_no_overlap_matmul_qk, sliding_chunks_no_overlap_matmul_pv from transformers.modeling_roberta import RobertaConfig, RobertaModel, RobertaForMaskedLM @@ -48,7 +49,7 @@ def __init__(self, attention_window: List[int] = None, attention_dilation: List[ self.attention_dilation = attention_dilation self.autoregressive = autoregressive self.attention_mode = attention_mode - assert self.attention_mode in ['tvm', 
'sliding_chunks', 'n2'] + assert self.attention_mode in ['tvm', 'sliding_chunks', 'n2', 'sliding_chunks_no_overlap'] class LongformerSelfAttention(nn.Module): @@ -80,8 +81,8 @@ def __init__(self, config, layer_id): self.autoregressive = config.autoregressive assert self.attention_window > 0 assert self.attention_dilation > 0 - assert self.attention_mode in ['tvm', 'sliding_chunks'] - if self.attention_mode == 'sliding_chunks': + assert self.attention_mode in ['tvm', 'sliding_chunks', 'sliding_chunks_no_overlap'] + if self.attention_mode in ['sliding_chunks', 'sliding_chunks_no_overlap']: assert not self.autoregressive # not supported assert self.attention_dilation == 1 # dilation is not supported @@ -147,8 +148,12 @@ def forward( q = q.float().contiguous() k = k.float().contiguous() attn_weights = diagonaled_mm_tvm(q, k, self.attention_window, self.attention_dilation, False, 0, False) - else: # "sliding_chunks" + elif self.attention_mode == "sliding_chunks": attn_weights = sliding_chunks_matmul_qk(q, k, self.attention_window, padding_value=0) + elif self.attention_mode == "sliding_chunks_no_overlap": + attn_weights = sliding_chunks_no_overlap_matmul_qk(q, k, self.attention_window, padding_value=0) + else: + raise False mask_invalid_locations(attn_weights, self.attention_window, self.attention_dilation, False) if remove_from_windowed_attention_mask is not None: # This implementation is fast and takes very little memory because num_heads x hidden_size = 1 @@ -162,10 +167,14 @@ def forward( # diagonal mask with zeros everywhere and -inf inplace of padding if self.attention_mode == 'tvm': d_mask = diagonaled_mm_tvm(ones, float_mask, self.attention_window, self.attention_dilation, False, 0, False) - else: + elif self.attention_mode == "sliding_chunks": d_mask = sliding_chunks_matmul_qk(ones, float_mask, self.attention_window, padding_value=0) + elif self.attention_mode == "sliding_chunks_no_overlap": + d_mask = sliding_chunks_no_overlap_matmul_qk(ones, float_mask, self.attention_window, padding_value=0) + attn_weights += d_mask - assert list(attn_weights.size()) == [bsz, seq_len, self.num_heads, self.attention_window * 2 + 1] + assert list(attn_weights.size())[:3] == [bsz, seq_len, self.num_heads] + assert attn_weights.size(dim=3) in [self.attention_window * 2 + 1, self.attention_window * 3] # the extra attention if extra_attention_mask is not None: @@ -199,8 +208,12 @@ def forward( if self.attention_mode == 'tvm': v = v.float().contiguous() attn += diagonaled_mm_tvm(attn_probs, v, self.attention_window, self.attention_dilation, True, 0, False) - else: # "sliding_chunks" + elif self.attention_mode == "sliding_chunks": attn += sliding_chunks_matmul_pv(attn_probs, v, self.attention_window) + elif self.attention_mode == "sliding_chunks_no_overlap": + attn += sliding_chunks_no_overlap_matmul_pv(attn_probs, v, self.attention_window) + else: + raise False attn = attn.type_as(hidden_states) assert list(attn.size()) == [bsz, seq_len, self.num_heads, self.head_dim] diff --git a/longformer/sliding_chunks.py b/longformer/sliding_chunks.py index d39fe9b..4eed8d8 100644 --- a/longformer/sliding_chunks.py +++ b/longformer/sliding_chunks.py @@ -131,3 +131,46 @@ def pad_to_window_size(input_ids: torch.Tensor, attention_mask: torch.Tensor, input_ids = F.pad(input_ids, (0, padding_len), value=pad_token_id) attention_mask = F.pad(attention_mask, (0, padding_len), value=False) # no attention on the padding tokens return input_ids, attention_mask + + +# ========= "sliding_chunks_no_overlap": alternative 
implemenation of the sliding window attention ========= +# This implementation uses non-overlapping chunks (or blocks) of size `w` with number of local attention = 3xw +# To make this implemenation comparable to "sliding_chunks" set w such that +# w_of_sliding_chunks_no_overlap = w_of_sliding_chunks * 2 / 3 +# For example, +# w_of_sliding_chunks = 256 (this is one sided. Total attention size = 512) +# w_of_sliding_chunks_no_overlap = 170 (Total attention size = 510) +# Performance: +# - Speed: 30% faster than "sliding_chunks" +# - Memory: 95% of the memory usage of "sliding_chunks" +# The windows are asymmetric where number of attention on each side of a token ranges between w to 2w +# while "sliding_chunks" has a symmetric window around each token. +# This implementation is roughly similar to the implementation described in the BigBird paper https://arxiv.org/abs/2007.14062 + +def sliding_chunks_no_overlap_matmul_qk(q: torch.Tensor, k: torch.Tensor, w: int, padding_value: float): + bsz, seqlen, num_heads, head_dim = q.size() + assert seqlen % w == 0 + assert q.size() == k.size() + # chunk seqlen into non-overlapping chunks of size w + chunk_q = q.view(bsz, seqlen // w, w, num_heads, head_dim) + chunk_k = k.view(bsz, seqlen // w, w, num_heads, head_dim) + chunk_k_expanded = torch.stack(( + F.pad(chunk_k[:, :-1], (0, 0, 0, 0, 0, 0, 1, 0), value=0.0), + chunk_k, + F.pad(chunk_k[:, 1:], (0, 0, 0, 0, 0, 0, 0, 1), value=0.0), + ), dim=-1) + diagonal_attn = torch.einsum('bcxhd,bcyhde->bcxhey', (chunk_q, chunk_k_expanded)) # multiply + return diagonal_attn.reshape(bsz, seqlen, num_heads, 3 * w) + + +def sliding_chunks_no_overlap_matmul_pv(prob: torch.Tensor, v: torch.Tensor, w: int): + bsz, seqlen, num_heads, head_dim = v.size() + chunk_prob = prob.view(bsz, seqlen // w, w, num_heads, 3, w) + chunk_v = v.view(bsz, seqlen // w, w, num_heads, head_dim) + chunk_v_extended = torch.stack(( + F.pad(chunk_v[:, :-1], (0, 0, 0, 0, 0, 0, 1, 0), value=0.0), + chunk_v, + F.pad(chunk_v[:, 1:], (0, 0, 0, 0, 0, 0, 0, 1), value=0.0), + ), dim=-1) + context = torch.einsum('bcwhpd,bcdhep->bcwhe', (chunk_prob, chunk_v_extended)) + return context.reshape(bsz, seqlen, num_heads, head_dim) From b15607b5ec9f3decbb93bafa3f310d3f3cdd8c53 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 26 Aug 2020 16:17:19 -0700 Subject: [PATCH 085/112] seq2seq --- longformer_on_beaker.sh | 2 +- scripts/triviaqa.py | 62 ++++++++++++++++++++++++++++------------- 2 files changed, 44 insertions(+), 20 deletions(-) diff --git a/longformer_on_beaker.sh b/longformer_on_beaker.sh index 425dcef..bedf8d9 100755 --- a/longformer_on_beaker.sh +++ b/longformer_on_beaker.sh @@ -7,7 +7,7 @@ export RESULT_SAVE_PREFIX="test" export ARGS="$@" export GPU_COUNT=8 export CPU_COUNT=32 -export CLUSTER="ai2/on-prem-ai2-server2" +export CLUSTER="ai2/on-prem-ai2-server3" export RESULT_PATH=$RESULT_SAVE_DIR/$RESULT_SAVE_PREFIX beaker experiment create -f experiment.yml diff --git a/scripts/triviaqa.py b/scripts/triviaqa.py index e5e488d..9dbb6bc 100644 --- a/scripts/triviaqa.py +++ b/scripts/triviaqa.py @@ -9,7 +9,7 @@ from torch.optim.lr_scheduler import LambdaLR from torch.utils.data import DataLoader, Dataset -from transformers import RobertaTokenizer, AutoModel, AutoConfig +from transformers import RobertaTokenizer, AutoModel, AutoConfig, AutoModelWithLMHead from scripts.triviaqa_utils import evaluation_utils import pytorch_lightning as pl @@ -308,12 +308,18 @@ def load_model(self): config.encoder_attention_heads = 12 config.decoder_attention_heads = 12 
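As an aside on PATCH 084 above: the `sliding_chunks_no_overlap` kernels split the sequence into non-overlapping blocks of size `w`, and every query scores against 3*w keys (its own block plus the blocks before and after it). A minimal shape-check sketch of how the two functions compose, assuming random inputs and ignoring the masking of invalid edge positions that `LongformerSelfAttention` adds on top:

import torch
import torch.nn.functional as F
from longformer.sliding_chunks import (
    sliding_chunks_no_overlap_matmul_qk, sliding_chunks_no_overlap_matmul_pv)

bsz, seqlen, num_heads, head_dim, w = 2, 1020, 12, 64, 170   # seqlen must be a multiple of w
q = torch.randn(bsz, seqlen, num_heads, head_dim)
k = torch.randn(bsz, seqlen, num_heads, head_dim)
v = torch.randn(bsz, seqlen, num_heads, head_dim)

# attention scores: one row of 3*w scores per token (previous block, own block, next block)
attn_weights = sliding_chunks_no_overlap_matmul_qk(q / (head_dim ** 0.5), k, w, padding_value=0)
assert attn_weights.shape == (bsz, seqlen, num_heads, 3 * w)

# weighted sum of values with the same block layout
attn_probs = F.softmax(attn_weights, dim=-1)
context = sliding_chunks_no_overlap_matmul_pv(attn_probs, v, w)
assert context.shape == (bsz, seqlen, num_heads, head_dim)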
config.attention_dropout = 0.1 - model = AutoModel.from_pretrained(self.args.model_path, config=config) + if self.args.seq2seq: + model = AutoModelWithLMHead.from_pretrained(self.args.model_path, config=config) + else: + model = AutoModel.from_pretrained(self.args.model_path, config=config) elif 'bart' in self.args.model_path and 'large' in self.args.model_path: config = AutoConfig.from_pretrained(self.args.model_path) config.attention_dropout = 0.1 config.gradient_checkpointing = True - model = AutoModel.from_pretrained(self.args.model_path, config=config) + if self.args.seq2seq: + model = AutoModelWithLMHead.from_pretrained(self.args.model_path, config=config) + else: + model = AutoModel.from_pretrained(self.args.model_path, config=config) else: model = AutoModel.from_pretrained(self.args.model_path) @@ -353,9 +359,20 @@ def forward(self, input_ids, attention_mask, segment_ids, start_positions, end_p elif self.args.model_path in ['bart.large', 'bart.base']: sequence_output = self.model.extract_features(input_ids) else: - sequence_output = self.model( - input_ids, - attention_mask=attention_mask)[0] + if self.args.seq2seq: + decoder_input_ids = answer_token_ids[:, 0, :-1].clone() + decoder_input_ids[decoder_input_ids == self.tokenizer.eos_token_id] = self.tokenizer.pad_token_id + decoder_attention_mask = (decoder_input_ids != self.tokenizer.pad_token_id) + labels = answer_token_ids[:, 0, 1:].contiguous() + loss = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + labels=labels)[0] + return [loss] + else: + sequence_output = self.model(input_ids, attention_mask=attention_mask)[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) @@ -436,6 +453,8 @@ def training_step(self, batch, batch_nb): def validation_step(self, batch, batch_nb): input_ids, input_mask, segment_ids, subword_starts, subword_ends, answer_token_ids, qids, aliases = batch output = self.forward(input_ids, input_mask, segment_ids, subword_starts, subword_ends, answer_token_ids) + if self.args.seq2seq: + return {'vloss': output[0]} loss, start_logits, end_logits = output[:3] answers = self.decode(input_ids, start_logits, end_logits) @@ -517,23 +536,28 @@ def sync_list_across_gpus(self, list_to_sync, device, dtype): def validation_end(self, outputs): avg_loss = torch.stack([x['vloss'] for x in outputs]).mean() - avg_em = torch.stack([x['vem'] for x in outputs]).mean() - string_qids = [item for sublist in outputs for item in sublist['qids']] - int_qids = [self.val_dataloader_object.dataset.val_qid_string_to_int_map[qid] for qid in string_qids] - answer_scores = [item for sublist in outputs for item in sublist['answer_scores']] - f1_scores = [item for sublist in outputs for item in sublist['f1']] - em_scores = [item for sublist in outputs for item in sublist['em']] - print(f'before sync --> sizes: {len(int_qids)}, {len(answer_scores)}, {len(f1_scores)}, {len(em_scores)}') + if not self.args.seq2seq: + avg_em = torch.stack([x['vem'] for x in outputs]).mean() + string_qids = [item for sublist in outputs for item in sublist['qids']] + int_qids = [self.val_dataloader_object.dataset.val_qid_string_to_int_map[qid] for qid in string_qids] + answer_scores = [item for sublist in outputs for item in sublist['answer_scores']] + f1_scores = [item for sublist in outputs for item in sublist['f1']] + em_scores = [item for sublist in outputs for item in sublist['em']] + print(f'before sync --> sizes: 
{len(int_qids)}, {len(answer_scores)}, {len(f1_scores)}, {len(em_scores)}') if self.trainer.use_ddp: torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM) avg_loss /= self.trainer.world_size - torch.distributed.all_reduce(avg_em, op=torch.distributed.ReduceOp.SUM) - avg_em /= self.trainer.world_size + if not self.args.seq2seq: + torch.distributed.all_reduce(avg_em, op=torch.distributed.ReduceOp.SUM) + avg_em /= self.trainer.world_size + + int_qids = self.sync_list_across_gpus(int_qids, avg_loss.device, torch.int) + answer_scores = self.sync_list_across_gpus(answer_scores, avg_loss.device, torch.float) + f1_scores = self.sync_list_across_gpus(f1_scores, avg_loss.device, torch.float) + em_scores = self.sync_list_across_gpus(em_scores, avg_loss.device, torch.int) + if self.args.seq2seq: + return {'avg_val_loss': avg_loss, 'log': {'val_loss': avg_loss}, 'progress_bar': {'val_loss': avg_loss}} - int_qids = self.sync_list_across_gpus(int_qids, avg_loss.device, torch.int) - answer_scores = self.sync_list_across_gpus(answer_scores, avg_loss.device, torch.float) - f1_scores = self.sync_list_across_gpus(f1_scores, avg_loss.device, torch.float) - em_scores = self.sync_list_across_gpus(em_scores, avg_loss.device, torch.int) print(f'after sync --> sizes: {len(int_qids)}, {len(answer_scores)}, {len(f1_scores)}, {len(em_scores)}') # Because of having multiple documents per questions, some questions might have multiple corresponding answers From 82741c3c9ec4107a42d5981eac2ca98cc605a136 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 27 Aug 2020 21:06:12 +0000 Subject: [PATCH 086/112] wip --- scripts/triviaqa.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/triviaqa.py b/scripts/triviaqa.py index 9dbb6bc..8b6c9a7 100644 --- a/scripts/triviaqa.py +++ b/scripts/triviaqa.py @@ -290,7 +290,8 @@ def __init__(self, args): self.tokenizer.model_max_length = self.args.max_seq_len self.model = self.load_model() self.num_labels = 2 - self.qa_outputs = torch.nn.Linear(self.model.config.hidden_size, self.num_labels) + if not self.args.seq2seq: + self.qa_outputs = torch.nn.Linear(self.model.config.hidden_size, self.num_labels) self.train_dataloader_object = self.val_dataloader_object = self.test_dataloader_object = None def load_model(self): @@ -364,6 +365,7 @@ def forward(self, input_ids, attention_mask, segment_ids, start_positions, end_p decoder_input_ids[decoder_input_ids == self.tokenizer.eos_token_id] = self.tokenizer.pad_token_id decoder_attention_mask = (decoder_input_ids != self.tokenizer.pad_token_id) labels = answer_token_ids[:, 0, 1:].contiguous() + labels[answer_token_ids[:, 0, 1:] == self.tokenizer.pad_token_id] = -100 loss = self.model( input_ids, attention_mask=attention_mask, From 5c3a22ac08b5da4c5ce933d18d770061bdfa584b Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 27 Aug 2020 14:26:35 -0700 Subject: [PATCH 087/112] wip --- scripts/triviaqa.py | 62 ++++++++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 23 deletions(-) diff --git a/scripts/triviaqa.py b/scripts/triviaqa.py index 8b6c9a7..8a55462 100644 --- a/scripts/triviaqa.py +++ b/scripts/triviaqa.py @@ -366,13 +366,15 @@ def forward(self, input_ids, attention_mask, segment_ids, start_positions, end_p decoder_attention_mask = (decoder_input_ids != self.tokenizer.pad_token_id) labels = answer_token_ids[:, 0, 1:].contiguous() labels[answer_token_ids[:, 0, 1:] == self.tokenizer.pad_token_id] = -100 - loss = self.model( + outputs = self.model( input_ids, 
attention_mask=attention_mask, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, - labels=labels)[0] - return [loss] + labels=labels) + loss = outputs[0] + logit_scores = outputs[1].softmax(dim=2)[:, :, 0].sum(dim=1) + return [loss, logit_scores] else: sequence_output = self.model(input_ids, attention_mask=attention_mask)[0] @@ -456,7 +458,23 @@ def validation_step(self, batch, batch_nb): input_ids, input_mask, segment_ids, subword_starts, subword_ends, answer_token_ids, qids, aliases = batch output = self.forward(input_ids, input_mask, segment_ids, subword_starts, subword_ends, answer_token_ids) if self.args.seq2seq: - return {'vloss': output[0]} + logit_scores = output[1] + best_answer_score = logit_scores.max() + best_answer_index = logit_scores.argmax().item() + generated_ids = self.model.generate(input_ids=input_ids[best_answer_index:best_answer_index + 1], + attention_mask=input_mask[best_answer_index:best_answer_index + 1], + use_cache=True,) + generated_answer_ids = generated_ids[0] + generated_answer_ids[-1] = self.tokenizer.eos_token_id + index_of_eos_token = (generated_answer_ids == self.tokenizer.eos_token_id).nonzero()[0, 0].item() + generated_answer_ids = generated_answer_ids[1:index_of_eos_token] + answer_text = self.tokenizer.decode(generated_answer_ids) + f1_score = evaluation_utils.metric_max_over_ground_truths(evaluation_utils.f1_score, answer_text, aliases) + em_score = evaluation_utils.metric_max_over_ground_truths(evaluation_utils.exact_match_score, answer_text, aliases) + return {'vloss': output[0], 'vem': generated_answer_ids.new_zeros([1]).float(), + 'qids': [qids], 'answer_scores': [best_answer_score], + 'f1': [f1_score], 'em': [em_score]} + loss, start_logits, end_logits = output[:3] answers = self.decode(input_ids, start_logits, end_logits) @@ -538,28 +556,23 @@ def sync_list_across_gpus(self, list_to_sync, device, dtype): def validation_end(self, outputs): avg_loss = torch.stack([x['vloss'] for x in outputs]).mean() - if not self.args.seq2seq: - avg_em = torch.stack([x['vem'] for x in outputs]).mean() - string_qids = [item for sublist in outputs for item in sublist['qids']] - int_qids = [self.val_dataloader_object.dataset.val_qid_string_to_int_map[qid] for qid in string_qids] - answer_scores = [item for sublist in outputs for item in sublist['answer_scores']] - f1_scores = [item for sublist in outputs for item in sublist['f1']] - em_scores = [item for sublist in outputs for item in sublist['em']] - print(f'before sync --> sizes: {len(int_qids)}, {len(answer_scores)}, {len(f1_scores)}, {len(em_scores)}') + avg_em = torch.stack([x['vem'] for x in outputs]).mean() + string_qids = [item for sublist in outputs for item in sublist['qids']] + int_qids = [self.val_dataloader_object.dataset.val_qid_string_to_int_map[qid] for qid in string_qids] + answer_scores = [item for sublist in outputs for item in sublist['answer_scores']] + f1_scores = [item for sublist in outputs for item in sublist['f1']] + em_scores = [item for sublist in outputs for item in sublist['em']] + print(f'before sync --> sizes: {len(int_qids)}, {len(answer_scores)}, {len(f1_scores)}, {len(em_scores)}') if self.trainer.use_ddp: torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM) avg_loss /= self.trainer.world_size - if not self.args.seq2seq: - torch.distributed.all_reduce(avg_em, op=torch.distributed.ReduceOp.SUM) - avg_em /= self.trainer.world_size - - int_qids = self.sync_list_across_gpus(int_qids, avg_loss.device, torch.int) - answer_scores = 
self.sync_list_across_gpus(answer_scores, avg_loss.device, torch.float) - f1_scores = self.sync_list_across_gpus(f1_scores, avg_loss.device, torch.float) - em_scores = self.sync_list_across_gpus(em_scores, avg_loss.device, torch.int) - if self.args.seq2seq: - return {'avg_val_loss': avg_loss, 'log': {'val_loss': avg_loss}, 'progress_bar': {'val_loss': avg_loss}} + torch.distributed.all_reduce(avg_em, op=torch.distributed.ReduceOp.SUM) + avg_em /= self.trainer.world_size + int_qids = self.sync_list_across_gpus(int_qids, avg_loss.device, torch.int) + answer_scores = self.sync_list_across_gpus(answer_scores, avg_loss.device, torch.float) + f1_scores = self.sync_list_across_gpus(f1_scores, avg_loss.device, torch.float) + em_scores = self.sync_list_across_gpus(em_scores, avg_loss.device, torch.int) print(f'after sync --> sizes: {len(int_qids)}, {len(answer_scores)}, {len(f1_scores)}, {len(em_scores)}') # Because of having multiple documents per questions, some questions might have multiple corresponding answers @@ -583,6 +596,9 @@ def validation_end(self, outputs): def test_step(self, batch, batch_nb): input_ids, input_mask, segment_ids, subword_starts, subword_ends, answer_token_ids, qids, aliases = batch output = self.forward(input_ids, input_mask, segment_ids, subword_starts, subword_ends, answer_token_ids) + if self.args.seq2seq: + raise NotImplemented + loss, start_logits, end_logits = output[:3] answers = self.decode(input_ids, start_logits, end_logits) @@ -689,7 +705,7 @@ def add_model_specific_args(parser, root_dir): help="Number of gpus. 0 for CPU") parser.add_argument("--warmup", type=int, default=200, help="Number of warmup steps") parser.add_argument("--lr", type=float, default=0.0001, help="Maximum learning rate") - parser.add_argument("--val_every", type=float, default=0.2, help="Number of training steps between validations") + parser.add_argument("--val_every", type=float, default=0.5, help="Number of training steps between validations") parser.add_argument("--val_percent_check", default=1.00, type=float, help='Percent of validation data used') parser.add_argument("--num_workers", type=int, default=4, help="Number of data loader workers") parser.add_argument("--seed", type=int, default=1234, help="Seed") From 75aeb4764f661299e5aed05c151132b9ff167db3 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 28 Aug 2020 04:19:06 +0000 Subject: [PATCH 088/112] wip --- scripts/triviaqa.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/scripts/triviaqa.py b/scripts/triviaqa.py index 8a55462..bba1e5f 100644 --- a/scripts/triviaqa.py +++ b/scripts/triviaqa.py @@ -737,6 +737,8 @@ def add_model_specific_args(parser, root_dir): default='sliding_chunks', help='Which implementation of selfattention to use') parser.add_argument("--fp32", action='store_true', help="default is fp16. Use --fp32 to switch to fp32") parser.add_argument("--seq2seq", action='store_true', help="Use an answer generation model") + parser.add_argument("--resume_ckpt", type=str, help="Path of a checkpoint to resume from") + return parser @@ -760,14 +762,16 @@ def main(args): filepath=os.path.join(args.save_dir, args.save_prefix, "checkpoints"), save_top_k=5, verbose=True, - monitor='avg_val_f1', - mode='max', + monitor='avg_val_loss', + # save_last=True, + mode='min', + period=-1, prefix='' ) print(args) train_set_size = 110648 # hardcode dataset size. 
Needed to compute number of steps for the lr scheduler - args.steps = args.epochs * train_set_size / (args.batch_size * args.gpus) + args.steps = args.epochs * train_set_size / (args.batch_size * max(args.gpus, 1)) print(f'>>>>>>> #steps: {args.steps}, #epochs: {args.epochs}, batch_size: {args.batch_size * args.gpus} <<<<<<<') trainer = pl.Trainer(gpus=args.gpus, distributed_backend='ddp' if args.gpus and args.gpus > 1 else None, @@ -775,6 +779,7 @@ def main(args): replace_sampler_ddp=False, accumulate_grad_batches=args.batch_size, val_check_interval=args.val_every, + num_sanity_val_steps=2, # check_val_every_n_epoch=2, val_percent_check=args.val_percent_check, test_percent_check=args.val_percent_check, @@ -782,6 +787,7 @@ def main(args): checkpoint_callback=checkpoint_callback if not args.disable_checkpointing else False, show_progress_bar=not args.no_progress_bar, use_amp=not args.fp32, amp_level='O2', + resume_from_checkpoint=args.resume_ckpt, ) if not args.test: trainer.fit(model) From 391d8de413b094ae3a0b1a5a3935e91cb3c89bed Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Tue, 1 Sep 2020 16:16:08 -0700 Subject: [PATCH 089/112] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5631783..89816c9 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ **\*\*\*\*\* Work In Progress: LongformerEncoderDecoder \*\*\*\*\*** -A `LongformerEncoderDecoder` model is now available. It is geared towards summarization where the input is long but the output is relatively shorter. The following code snippet loads a `LongformerEncoderDecoder` checkpointing started from `BartLarge`. With gradient checkpointing, fp16, and 48GB gpu, the input length be up to 12K. +A `LongformerEncoderDecoder` model is now available. It is geared towards summarization where the input is long but the output is relatively shorter. The following code snippet loads a `LongformerEncoderDecoder` checkpointing started from `BartLarge`. With gradient checkpointing, fp16, and 48GB gpu, the input length be up to 16K. 
``` pip install git+https://github.com/allenai/longformer.git@encoderdecoder From bf9e58a447f3fe8c19513d20b043201fcd24acbc Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 2 Sep 2020 16:50:35 -0700 Subject: [PATCH 090/112] summarization --- scripts/summarization.py | 247 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 247 insertions(+) create mode 100644 scripts/summarization.py diff --git a/scripts/summarization.py b/scripts/summarization.py new file mode 100644 index 0000000..37d4a14 --- /dev/null +++ b/scripts/summarization.py @@ -0,0 +1,247 @@ +import os +import argparse +import random +import numpy as np + +import torch +from torch.utils.data import DataLoader, Dataset +from transformers import AutoTokenizer, AutoModelForSeq2SeqLM +from transformers.optimization import get_linear_schedule_with_warmup +import nlp + +import pytorch_lightning as pl +from pytorch_lightning.logging import TestTubeLogger +from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel + + +class SummarizationDataset(Dataset): + def __init__(self, hf_dataset, tokenizer, max_output_len): + self.hf_dataset = hf_dataset + self.tokenizer = tokenizer + self.max_output_len = max_output_len + + def __len__(self): + return len(self.hf_dataset) + + def __getitem__(self, idx): + entry = self.hf_dataset[idx] + input_ids = self.tokenizer.encode(entry['article'], truncation=True) + output_ids = self.tokenizer.encode(entry['abstract'], truncation=True, max_length=self.max_output_len) + return torch.tensor(input_ids), torch.tensor(output_ids) + + @staticmethod + def collate_fn(batch): + pad_token_id = 1 # AutoTokenizer.from_pretrained('facebook/bart-base').pad_token_id + input_ids, output_ids = list(zip(*batch)) + input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=pad_token_id) + output_ids = torch.nn.utils.rnn.pad_sequence(output_ids, batch_first=True, padding_value=pad_token_id) + return input_ids, output_ids + + +class Summarizer(pl.LightningModule): + + def __init__(self, args): + super().__init__() + self.args = args + self.hparams = args + self.tokenizer = AutoTokenizer.from_pretrained(self.args.tokenizer, use_fast=True) + self.model = AutoModelForSeq2SeqLM.from_pretrained(self.args.model_path) + self.train_dataloader_object = self.val_dataloader_object = self.test_dataloader_object = None + self.rouge = None + + def forward(self, input_ids, output_ids): + attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) + attention_mask[input_ids == self.tokenizer.pad_token_id] = 0 + decoder_input_ids = output_ids[:, :-1] + decoder_attention_mask = (decoder_input_ids != self.tokenizer.pad_token_id) + labels = output_ids[:, 1:].clone() + labels[labels == self.tokenizer.pad_token_id] = -100 + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + labels=labels) + return outputs + + def training_step(self, batch, batch_nb): + output = self.forward(*batch) + loss = output[0] + lr = loss.new_zeros(1) + self.trainer.optimizers[0].param_groups[0]['lr'] + tensorboard_logs = {'train_loss': loss, 'lr': lr, + 'input_size': batch[0].numel(), + 'output_size': batch[1].numel(), + 'mem': torch.cuda.memory_allocated(loss.device) / 1024 ** 3} + return {'loss': loss, 'log': tensorboard_logs} + + def validation_step(self, batch, batch_nb): + outputs = self.forward(*batch) + vloss = 
outputs[0] + input_ids, output_ids = batch + attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) + attention_mask[input_ids == self.tokenizer.pad_token_id] = 0 + generated_ids = self.model.generate(input_ids=input_ids, + attention_mask=attention_mask, + use_cache=True, + max_length=self.args.max_output_len) + generated_str = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + gold_str = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True) + if self.rouge is None: + self.rouge = nlp.load_metric("rouge") + rouge_scores = self.rouge.compute(predictions=generated_str, references=gold_str, rouge_types=['rouge2', 'rouge1', 'rougeL']) + return {'vloss': vloss, + 'rouge1': vloss.new_zeros(1) + rouge_scores['rouge1'].mid.fmeasure, + 'rouge2': vloss.new_zeros(1) + rouge_scores['rouge2'].mid.fmeasure, + 'rougeL': vloss.new_zeros(1) + rouge_scores['rougeL'].mid.fmeasure} + + def validation_epoch_end(self, outputs): + names = ['vloss', 'rouge1', 'rouge2', 'rougeL'] + metrics = [] + for name in names: + metric = torch.stack([x[name] for x in outputs]).mean() + if self.trainer.use_ddp: + torch.distributed.all_reduce(metric, op=torch.distributed.ReduceOp.SUM) + metric /= self.trainer.world_size + metrics.append(metric) + logs = dict(zip(*[names, metrics])) + return {'avg_val_loss': logs['vloss'], 'log': logs, 'progress_bar': logs} + + def test_step(self, batch, batch_nb): + raise NotImplementedError + + def configure_optimizers(self): + optimizer = torch.optim.Adam(self.model.parameters(), lr=self.args.lr) + if self.args.debug: + return optimizer # const LR + num_gpus = torch.cuda.device_count() + num_steps = self.args.dataset_size * self.args.epochs / num_gpus / self.args.grad_accum + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=self.args.warmup, num_training_steps=num_steps + ) + return [optimizer], [{"scheduler": scheduler, "interval": "step"}] + + def _get_dataloader(self, current_dataloader, hf_dataset, is_train): + if current_dataloader is not None: + return current_dataloader + dataset = SummarizationDataset(hf_dataset=hf_dataset, tokenizer=self.tokenizer, max_output_len=self.args.max_output_len) + sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=is_train) if self.trainer.use_ddp else None + return DataLoader(dataset, batch_size=self.args.batch_size, shuffle=(sampler is None), + num_workers=self.args.num_workers, sampler=sampler, + collate_fn=SummarizationDataset.collate_fn) + + @pl.data_loader + def train_dataloader(self): + if self.hf_datasets is None: + self.hf_datasets = nlp.load_dataset('scientific_papers', 'arxiv') + self.train_dataloader_object = self._get_dataloader(self.train_dataloader_object, self.hf_datasets['train'], is_train=True) + return self.train_dataloader_object + + @pl.data_loader + def val_dataloader(self): + if self.hf_datasets is None: + self.hf_datasets = nlp.load_dataset('scientific_papers', 'arxiv') + dataset_split = 'validation' if not self.args.debug else 'train' + self.val_dataloader_object = self._get_dataloader(self.val_dataloader_object, self.hf_datasets[dataset_split], is_train=False) + return self.val_dataloader_object + + @pl.data_loader + def test_dataloader(self): + if self.hf_datasets is None: + self.hf_datasets = nlp.load_dataset('scientific_papers', 'arxiv') + self.test_dataloader_object = self._get_dataloader(self.test_dataloader_object, self.hf_datasets['test'], is_train=False) + return self.test_dataloader_object + + def 
configure_ddp(self, model, device_ids): + model = LightningDistributedDataParallel( + model, + device_ids=device_ids, + find_unused_parameters=False + ) + return model + + @staticmethod + def add_model_specific_args(parser, root_dir): + parser.add_argument("--save_dir", type=str, default='summarization') + parser.add_argument("--save_prefix", type=str, default='test') + parser.add_argument("--batch_size", type=int, default=16, help="Batch size") + parser.add_argument("--grad_accum", type=int, default=1, help="number of gradient accumulation steps") + parser.add_argument("--gpus", type=int, default=-1, + help="Number of gpus. 0 for CPU") + parser.add_argument("--warmup", type=int, default=1000, help="Number of warmup steps") + parser.add_argument("--lr", type=float, default=0.00003, help="Maximum learning rate") + parser.add_argument("--val_every", type=float, default=1.0, help="Number of training steps between validations") + parser.add_argument("--val_percent_check", default=1.00, type=float, help='Percent of validation data used') + parser.add_argument("--num_workers", type=int, default=0, help="Number of data loader workers") + parser.add_argument("--seed", type=int, default=1234, help="Seed") + parser.add_argument("--epochs", type=int, default=5, help="Number of epochs") + parser.add_argument("--disable_checkpointing", action='store_true', help="No logging or checkpointing") + parser.add_argument("--max_output_len", type=int, default=256, + help="maximum num of wordpieces/summary. Used for training and testing") + parser.add_argument("--test", action='store_true', help="Test only, no training") + parser.add_argument("--model_path", type=str, default='facebook/bart-base', + help="Path to the checkpoint directory or model name") + parser.add_argument("--tokenizer", type=str, default='facebook/bart-base') + parser.add_argument("--no_progress_bar", action='store_true', help="no progress bar. Good for printing") + parser.add_argument("--fp32", action='store_true', help="default is fp16. Use --fp32 to switch to fp32") + parser.add_argument("--debug", action='store_true', help="debug run") + parser.add_argument("--resume_ckpt", type=str, help="Path of a checkpoint to resume from") + + return parser + + +def main(args): + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(args.seed) + + model = Summarizer(args) + + logger = TestTubeLogger( + save_dir=args.save_dir, + name=args.save_prefix, + version=0 # always use version=0 + ) + + checkpoint_callback = ModelCheckpoint( + filepath=os.path.join(args.save_dir, args.save_prefix, "checkpoints"), + save_top_k=5, + verbose=True, + monitor='avg_val_loss', + mode='min', + period=-1, + prefix='' + ) + + print(args) + model.hf_datasets = nlp.load_dataset('scientific_papers', 'arxiv') + args.dataset_size = 203037 # hardcode dataset size. 
Needed to compute number of steps for the lr scheduler + + trainer = pl.Trainer(gpus=args.gpus, distributed_backend='ddp' if args.gpus and args.gpus > 1 else None, + track_grad_norm=-1, + max_epochs=args.epochs if not args.debug else 100, + replace_sampler_ddp=False, + accumulate_grad_batches=args.grad_accum, + val_check_interval=args.val_every, + num_sanity_val_steps=2, + check_val_every_n_epoch=1 if not args.debug else 5, + val_percent_check=args.val_percent_check, + test_percent_check=args.val_percent_check, + logger=logger, + checkpoint_callback=checkpoint_callback if not args.disable_checkpointing else False, + show_progress_bar=not args.no_progress_bar, + use_amp=not args.fp32, amp_level='O2', + resume_from_checkpoint=args.resume_ckpt, + ) + if not args.test: + trainer.fit(model) + trainer.test(model) + + +if __name__ == "__main__": + main_arg_parser = argparse.ArgumentParser(description="summarization") + parser = Summarizer.add_model_specific_args(main_arg_parser, os.getcwd()) + args = parser.parse_args() + main(args) From 01581253b921045077a330650412d9925332d0a8 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Wed, 2 Sep 2020 17:07:07 -0700 Subject: [PATCH 091/112] fix loading data --- scripts/summarization.py | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/scripts/summarization.py b/scripts/summarization.py index 37d4a14..6bcf956 100644 --- a/scripts/summarization.py +++ b/scripts/summarization.py @@ -121,10 +121,10 @@ def configure_optimizers(self): ) return [optimizer], [{"scheduler": scheduler, "interval": "step"}] - def _get_dataloader(self, current_dataloader, hf_dataset, is_train): + def _get_dataloader(self, current_dataloader, split_name, is_train): if current_dataloader is not None: return current_dataloader - dataset = SummarizationDataset(hf_dataset=hf_dataset, tokenizer=self.tokenizer, max_output_len=self.args.max_output_len) + dataset = SummarizationDataset(hf_dataset=self.hf_datasets[split_name], tokenizer=self.tokenizer, max_output_len=self.args.max_output_len) sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=is_train) if self.trainer.use_ddp else None return DataLoader(dataset, batch_size=self.args.batch_size, shuffle=(sampler is None), num_workers=self.args.num_workers, sampler=sampler, @@ -132,24 +132,18 @@ def _get_dataloader(self, current_dataloader, hf_dataset, is_train): @pl.data_loader def train_dataloader(self): - if self.hf_datasets is None: - self.hf_datasets = nlp.load_dataset('scientific_papers', 'arxiv') - self.train_dataloader_object = self._get_dataloader(self.train_dataloader_object, self.hf_datasets['train'], is_train=True) + self.train_dataloader_object = self._get_dataloader(self.train_dataloader_object, 'train', is_train=True) return self.train_dataloader_object @pl.data_loader def val_dataloader(self): - if self.hf_datasets is None: - self.hf_datasets = nlp.load_dataset('scientific_papers', 'arxiv') - dataset_split = 'validation' if not self.args.debug else 'train' - self.val_dataloader_object = self._get_dataloader(self.val_dataloader_object, self.hf_datasets[dataset_split], is_train=False) + split_name = 'validation' if not self.args.debug else 'train' + self.val_dataloader_object = self._get_dataloader(self.val_dataloader_object, split_name, is_train=False) return self.val_dataloader_object @pl.data_loader def test_dataloader(self): - if self.hf_datasets is None: - self.hf_datasets = nlp.load_dataset('scientific_papers', 'arxiv') - self.test_dataloader_object = 
self._get_dataloader(self.test_dataloader_object, self.hf_datasets['test'], is_train=False) + self.test_dataloader_object = self._get_dataloader(self.test_dataloader_object, 'test', is_train=False) return self.test_dataloader_object def configure_ddp(self, model, device_ids): @@ -198,6 +192,7 @@ def main(args): torch.cuda.manual_seed_all(args.seed) model = Summarizer(args) + model.hf_datasets = nlp.load_dataset('scientific_papers', 'arxiv') logger = TestTubeLogger( save_dir=args.save_dir, @@ -216,10 +211,10 @@ def main(args): ) print(args) - model.hf_datasets = nlp.load_dataset('scientific_papers', 'arxiv') + args.dataset_size = 203037 # hardcode dataset size. Needed to compute number of steps for the lr scheduler - trainer = pl.Trainer(gpus=args.gpus, distributed_backend='ddp' if args.gpus and args.gpus > 1 else None, + trainer = pl.Trainer(gpus=args.gpus, distributed_backend='ddp', track_grad_norm=-1, max_epochs=args.epochs if not args.debug else 100, replace_sampler_ddp=False, From 9fdef528053364fcd6316b7bfbf7fcb8654d84a1 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 3 Sep 2020 07:35:23 -0700 Subject: [PATCH 092/112] wip --- scripts/summarization.py | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/scripts/summarization.py b/scripts/summarization.py index 6bcf956..f9e3b30 100644 --- a/scripts/summarization.py +++ b/scripts/summarization.py @@ -14,6 +14,8 @@ from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel +from rouge_score import rouge_scorer + class SummarizationDataset(Dataset): def __init__(self, hf_dataset, tokenizer, max_output_len): @@ -26,7 +28,7 @@ def __len__(self): def __getitem__(self, idx): entry = self.hf_dataset[idx] - input_ids = self.tokenizer.encode(entry['article'], truncation=True) + input_ids = self.tokenizer.encode(entry['article'], truncation=True, max_length=self.max_input_len) output_ids = self.tokenizer.encode(entry['abstract'], truncation=True, max_length=self.max_output_len) return torch.tensor(input_ids), torch.tensor(output_ids) @@ -48,7 +50,6 @@ def __init__(self, args): self.tokenizer = AutoTokenizer.from_pretrained(self.args.tokenizer, use_fast=True) self.model = AutoModelForSeq2SeqLM.from_pretrained(self.args.model_path) self.train_dataloader_object = self.val_dataloader_object = self.test_dataloader_object = None - self.rouge = None def forward(self, input_ids, output_ids): attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) @@ -87,13 +88,21 @@ def validation_step(self, batch, batch_nb): max_length=self.args.max_output_len) generated_str = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True) gold_str = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True) - if self.rouge is None: - self.rouge = nlp.load_metric("rouge") - rouge_scores = self.rouge.compute(predictions=generated_str, references=gold_str, rouge_types=['rouge2', 'rouge1', 'rougeL']) + scorer = rouge_scorer.RougeScorer(rouge_types=['rouge1', 'rouge2', 'rougeL'], use_stemmer=False) + rouge1 = rouge2 = rougel = 0.0 + for ref, pred in zip(gold_str, generated_str): + score = scorer.score(ref, pred) + rouge1 += score['rouge1'].fmeasure + rouge2 += score['rouge2'].fmeasure + rougel += score['rougeL'].fmeasure + rouge1 /= len(generated_str) + rouge2 /= len(generated_str) + rougel /= len(generated_str) + return {'vloss': vloss, - 'rouge1': vloss.new_zeros(1) + 
rouge_scores['rouge1'].mid.fmeasure, - 'rouge2': vloss.new_zeros(1) + rouge_scores['rouge2'].mid.fmeasure, - 'rougeL': vloss.new_zeros(1) + rouge_scores['rougeL'].mid.fmeasure} + 'rouge1': vloss.new_zeros(1) + rouge1, + 'rouge2': vloss.new_zeros(1) + rouge2, + 'rougeL': vloss.new_zeros(1) + rougel, } def validation_epoch_end(self, outputs): names = ['vloss', 'rouge1', 'rouge2', 'rougeL'] @@ -108,7 +117,11 @@ def validation_epoch_end(self, outputs): return {'avg_val_loss': logs['vloss'], 'log': logs, 'progress_bar': logs} def test_step(self, batch, batch_nb): - raise NotImplementedError + return self.validation_step(batch, batch_nb) + + def test_epoch_end(self, outputs): + result = self.validation_epoch_end(outputs) + print(result) def configure_optimizers(self): optimizer = torch.optim.Adam(self.model.parameters(), lr=self.args.lr) @@ -172,6 +185,8 @@ def add_model_specific_args(parser, root_dir): parser.add_argument("--disable_checkpointing", action='store_true', help="No logging or checkpointing") parser.add_argument("--max_output_len", type=int, default=256, help="maximum num of wordpieces/summary. Used for training and testing") + parser.add_argument("--max_input_len", type=int, default=512, + help="maximum num of wordpieces/summary. Used for training and testing") parser.add_argument("--test", action='store_true', help="Test only, no training") parser.add_argument("--model_path", type=str, default='facebook/bart-base', help="Path to the checkpoint directory or model name") From cbb407d6ae13202dffd9c436303f3f5ec4d6e920 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 3 Sep 2020 07:41:40 -0700 Subject: [PATCH 093/112] wip --- scripts/summarization.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/summarization.py b/scripts/summarization.py index f9e3b30..a6df4f3 100644 --- a/scripts/summarization.py +++ b/scripts/summarization.py @@ -18,9 +18,10 @@ class SummarizationDataset(Dataset): - def __init__(self, hf_dataset, tokenizer, max_output_len): + def __init__(self, hf_dataset, tokenizer, max_input_len, max_output_len): self.hf_dataset = hf_dataset self.tokenizer = tokenizer + self.max_input_len = max_input_len self.max_output_len = max_output_len def __len__(self): @@ -137,7 +138,8 @@ def configure_optimizers(self): def _get_dataloader(self, current_dataloader, split_name, is_train): if current_dataloader is not None: return current_dataloader - dataset = SummarizationDataset(hf_dataset=self.hf_datasets[split_name], tokenizer=self.tokenizer, max_output_len=self.args.max_output_len) + dataset = SummarizationDataset(hf_dataset=self.hf_datasets[split_name], tokenizer=self.tokenizer, + max_input_len=self.args.max_input_len, max_output_len=self.args.max_output_len) sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=is_train) if self.trainer.use_ddp else None return DataLoader(dataset, batch_size=self.args.batch_size, shuffle=(sampler is None), num_workers=self.args.num_workers, sampler=sampler, From eb34cc0b880617e8fd952e3ecfb1bcf906e7d9b8 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 3 Sep 2020 16:55:51 -0700 Subject: [PATCH 094/112] grad_ckpt + reqs + long --- requirements.txt | 2 ++ scripts/summarization.py | 19 ++++++++++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index d91e5de..a98ef2a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,5 @@ pytorch-lightning @ git+http://github.com/ibeltagy/pytorch-lightning.git@v0.8.5_ torch==1.6.0 
tensorboardX test-tube==0.7.5 +nlp +rouge_score diff --git a/scripts/summarization.py b/scripts/summarization.py index a6df4f3..a57bab4 100644 --- a/scripts/summarization.py +++ b/scripts/summarization.py @@ -8,13 +8,15 @@ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM from transformers.optimization import get_linear_schedule_with_warmup import nlp +from rouge_score import rouge_scorer import pytorch_lightning as pl from pytorch_lightning.logging import TestTubeLogger from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel -from rouge_score import rouge_scorer +from longformer import LongformerEncoderDecoderForConditionalGeneration +from longformer.sliding_chunks import pad_to_window_size class SummarizationDataset(Dataset): @@ -49,16 +51,26 @@ def __init__(self, args): self.args = args self.hparams = args self.tokenizer = AutoTokenizer.from_pretrained(self.args.tokenizer, use_fast=True) - self.model = AutoModelForSeq2SeqLM.from_pretrained(self.args.model_path) + if 'long' in self.args.model_path: + # TODO: remember to set attention_dropout = 0.1 + self.model = LongformerEncoderDecoderForConditionalGeneration.from_pretrained( + self.args.model_path, gradient_checkpointing=self.args.grad_ckpt,) + else: + self.model = AutoModelForSeq2SeqLM.from_pretrained(self.args.model_path) self.train_dataloader_object = self.val_dataloader_object = self.test_dataloader_object = None def forward(self, input_ids, output_ids): attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) attention_mask[input_ids == self.tokenizer.pad_token_id] = 0 + if isinstance(self.model, LongformerEncoderDecoderForConditionalGeneration): + attention_mask[:, 0] = 2 # global attention on one token for all model params to be used, which is important for gradient checkpointing to work + input_ids, attention_mask = pad_to_window_size( # ideally, should be moved inside the LongformerModel + input_ids, attention_mask, self.model.config.attention_window[0], self.tokenizer.pad_token_id) decoder_input_ids = output_ids[:, :-1] decoder_attention_mask = (decoder_input_ids != self.tokenizer.pad_token_id) labels = output_ids[:, 1:].clone() labels[labels == self.tokenizer.pad_token_id] = -100 + outputs = self.model( input_ids, attention_mask=attention_mask, @@ -129,7 +141,7 @@ def configure_optimizers(self): if self.args.debug: return optimizer # const LR num_gpus = torch.cuda.device_count() - num_steps = self.args.dataset_size * self.args.epochs / num_gpus / self.args.grad_accum + num_steps = self.args.dataset_size * self.args.epochs / num_gpus / self.args.grad_accum / self.args.batch_size scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=self.args.warmup, num_training_steps=num_steps ) @@ -197,6 +209,7 @@ def add_model_specific_args(parser, root_dir): parser.add_argument("--fp32", action='store_true', help="default is fp16. 
Use --fp32 to switch to fp32") parser.add_argument("--debug", action='store_true', help="debug run") parser.add_argument("--resume_ckpt", type=str, help="Path of a checkpoint to resume from") + parser.add_argument('--grad_ckpt', action='store_true', help='Enable gradient checkpointing to save memory') return parser From 42481fd427879d11b3c332f91c8e26206003d11c Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 4 Sep 2020 02:44:23 +0000 Subject: [PATCH 095/112] ignore empty answers --- scripts/triviaqa.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/scripts/triviaqa.py b/scripts/triviaqa.py index bba1e5f..967f97a 100644 --- a/scripts/triviaqa.py +++ b/scripts/triviaqa.py @@ -459,16 +459,19 @@ def validation_step(self, batch, batch_nb): output = self.forward(input_ids, input_mask, segment_ids, subword_starts, subword_ends, answer_token_ids) if self.args.seq2seq: logit_scores = output[1] - best_answer_score = logit_scores.max() - best_answer_index = logit_scores.argmax().item() - generated_ids = self.model.generate(input_ids=input_ids[best_answer_index:best_answer_index + 1], - attention_mask=input_mask[best_answer_index:best_answer_index + 1], - use_cache=True,) - generated_answer_ids = generated_ids[0] - generated_answer_ids[-1] = self.tokenizer.eos_token_id - index_of_eos_token = (generated_answer_ids == self.tokenizer.eos_token_id).nonzero()[0, 0].item() - generated_answer_ids = generated_answer_ids[1:index_of_eos_token] - answer_text = self.tokenizer.decode(generated_answer_ids) + answer_score_indices = logit_scores.sort().indices + generated_ids = self.model.generate(input_ids=input_ids, attention_mask=input_mask, use_cache=True,) + answer_text = '' + best_answer_score = 0 + for i in answer_score_indices: + generated_answer_ids = generated_ids[answer_score_indices[i]] + generated_answer_ids[-1] = self.tokenizer.eos_token_id + index_of_eos_token = (generated_answer_ids == self.tokenizer.eos_token_id).nonzero()[0, 0].item() + generated_answer_ids = generated_answer_ids[1:index_of_eos_token] + answer_text = self.tokenizer.decode(generated_answer_ids) + if answer_text != '': + best_answer_score = logit_scores[answer_score_indices[i]] + break f1_score = evaluation_utils.metric_max_over_ground_truths(evaluation_utils.f1_score, answer_text, aliases) em_score = evaluation_utils.metric_max_over_ground_truths(evaluation_utils.exact_match_score, answer_text, aliases) return {'vloss': output[0], 'vem': generated_answer_ids.new_zeros([1]).float(), From c6f23353f151dad213f3dc9a0d992ec5c19ab599 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 3 Sep 2020 20:52:58 -0700 Subject: [PATCH 096/112] attention dropout --- scripts/summarization.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/scripts/summarization.py b/scripts/summarization.py index a57bab4..64d42c1 100644 --- a/scripts/summarization.py +++ b/scripts/summarization.py @@ -5,7 +5,7 @@ import torch from torch.utils.data import DataLoader, Dataset -from transformers import AutoTokenizer, AutoModelForSeq2SeqLM +from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoConfig from transformers.optimization import get_linear_schedule_with_warmup import nlp from rouge_score import rouge_scorer @@ -15,7 +15,7 @@ from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel -from longformer import LongformerEncoderDecoderForConditionalGeneration +from longformer import 
LongformerEncoderDecoderForConditionalGeneration, LongformerEncoderDecoderConfig from longformer.sliding_chunks import pad_to_window_size @@ -51,12 +51,18 @@ def __init__(self, args): self.args = args self.hparams = args self.tokenizer = AutoTokenizer.from_pretrained(self.args.tokenizer, use_fast=True) + if 'long' in self.args.model_path: - # TODO: remember to set attention_dropout = 0.1 + config = LongformerEncoderDecoderConfig.from_pretrained(self.args.model_path) + config.attention_dropout = self.args.attention_dropout self.model = LongformerEncoderDecoderForConditionalGeneration.from_pretrained( - self.args.model_path, gradient_checkpointing=self.args.grad_ckpt,) + self.args.model_path, gradient_checkpointing=self.args.grad_ckpt, + config=config) else: - self.model = AutoModelForSeq2SeqLM.from_pretrained(self.args.model_path) + config = AutoConfig.from_pretrained(self.args.model_path) + config.attention_dropout = self.args.attention_dropout + self.model = AutoModelForSeq2SeqLM.from_pretrained( + self.args.model_path, config=config) self.train_dataloader_object = self.val_dataloader_object = self.test_dataloader_object = None def forward(self, input_ids, output_ids): @@ -210,6 +216,8 @@ def add_model_specific_args(parser, root_dir): parser.add_argument("--debug", action='store_true', help="debug run") parser.add_argument("--resume_ckpt", type=str, help="Path of a checkpoint to resume from") parser.add_argument('--grad_ckpt', action='store_true', help='Enable gradient checkpointing to save memory') + parser.add_argument("--attention_dropout", type=float, default=0.1, + help="attention dropout") return parser From f5a798d083f18b3ec2ce5def37307ab66c6098c1 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Thu, 3 Sep 2020 22:22:59 -0700 Subject: [PATCH 097/112] model.generate takes a lot of memory. 
Set requires_grad=False --- scripts/summarization.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/scripts/summarization.py b/scripts/summarization.py index 64d42c1..dac4b08 100644 --- a/scripts/summarization.py +++ b/scripts/summarization.py @@ -55,9 +55,9 @@ def __init__(self, args): if 'long' in self.args.model_path: config = LongformerEncoderDecoderConfig.from_pretrained(self.args.model_path) config.attention_dropout = self.args.attention_dropout + config.gradient_checkpointing = self.args.grad_ckpt self.model = LongformerEncoderDecoderForConditionalGeneration.from_pretrained( - self.args.model_path, gradient_checkpointing=self.args.grad_ckpt, - config=config) + self.args.model_path, config=config) else: config = AutoConfig.from_pretrained(self.args.model_path) config.attention_dropout = self.args.attention_dropout @@ -96,15 +96,18 @@ def training_step(self, batch, batch_nb): return {'loss': loss, 'log': tensorboard_logs} def validation_step(self, batch, batch_nb): + for p in self.model.parameters(): + p.requires_grad = False + outputs = self.forward(*batch) vloss = outputs[0] input_ids, output_ids = batch attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) attention_mask[input_ids == self.tokenizer.pad_token_id] = 0 - generated_ids = self.model.generate(input_ids=input_ids, - attention_mask=attention_mask, - use_cache=True, - max_length=self.args.max_output_len) + + generated_ids = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, + use_cache=True, max_length=self.args.max_output_len, + num_beams=1) generated_str = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True) gold_str = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True) scorer = rouge_scorer.RougeScorer(rouge_types=['rouge1', 'rouge2', 'rougeL'], use_stemmer=False) @@ -124,6 +127,9 @@ def validation_step(self, batch, batch_nb): 'rougeL': vloss.new_zeros(1) + rougel, } def validation_epoch_end(self, outputs): + for p in self.model.parameters(): + p.requires_grad = True + names = ['vloss', 'rouge1', 'rouge2', 'rougeL'] metrics = [] for name in names: From 274d017a1687e6c90bd6a324e53d1c28ef8d7f44 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Fri, 4 Sep 2020 23:16:23 -0700 Subject: [PATCH 098/112] wip --- scripts/summarization.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/summarization.py b/scripts/summarization.py index dac4b08..7d338e6 100644 --- a/scripts/summarization.py +++ b/scripts/summarization.py @@ -139,6 +139,7 @@ def validation_epoch_end(self, outputs): metric /= self.trainer.world_size metrics.append(metric) logs = dict(zip(*[names, metrics])) + print(logs) return {'avg_val_loss': logs['vloss'], 'log': logs, 'progress_bar': logs} def test_step(self, batch, batch_nb): @@ -176,8 +177,7 @@ def train_dataloader(self): @pl.data_loader def val_dataloader(self): - split_name = 'validation' if not self.args.debug else 'train' - self.val_dataloader_object = self._get_dataloader(self.val_dataloader_object, split_name, is_train=False) + self.val_dataloader_object = self._get_dataloader(self.val_dataloader_object, 'validation', is_train=False) return self.val_dataloader_object @pl.data_loader @@ -263,9 +263,9 @@ def main(args): max_epochs=args.epochs if not args.debug else 100, replace_sampler_ddp=False, accumulate_grad_batches=args.grad_accum, - val_check_interval=args.val_every, + val_check_interval=args.val_every if not args.debug else 1, 
num_sanity_val_steps=2, - check_val_every_n_epoch=1 if not args.debug else 5, + check_val_every_n_epoch=1 if not args.debug else 1, val_percent_check=args.val_percent_check, test_percent_check=args.val_percent_check, logger=logger, From 5f765b9021ab5abb082443964d54bf9265179022 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 5 Sep 2020 02:08:05 -0700 Subject: [PATCH 099/112] wip --- scripts/summarization.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/scripts/summarization.py b/scripts/summarization.py index 7d338e6..91ecb87 100644 --- a/scripts/summarization.py +++ b/scripts/summarization.py @@ -65,13 +65,17 @@ def __init__(self, args): self.args.model_path, config=config) self.train_dataloader_object = self.val_dataloader_object = self.test_dataloader_object = None - def forward(self, input_ids, output_ids): + def _prepare_input(self, input_ids): attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) attention_mask[input_ids == self.tokenizer.pad_token_id] = 0 if isinstance(self.model, LongformerEncoderDecoderForConditionalGeneration): attention_mask[:, 0] = 2 # global attention on one token for all model params to be used, which is important for gradient checkpointing to work input_ids, attention_mask = pad_to_window_size( # ideally, should be moved inside the LongformerModel input_ids, attention_mask, self.model.config.attention_window[0], self.tokenizer.pad_token_id) + return input_ids, attention_mask + + def forward(self, input_ids, output_ids): + input_ids, attention_mask = self._prepare_input(input_ids) decoder_input_ids = output_ids[:, :-1] decoder_attention_mask = (decoder_input_ids != self.tokenizer.pad_token_id) labels = output_ids[:, 1:].clone() @@ -102,9 +106,7 @@ def validation_step(self, batch, batch_nb): outputs = self.forward(*batch) vloss = outputs[0] input_ids, output_ids = batch - attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) - attention_mask[input_ids == self.tokenizer.pad_token_id] = 0 - + input_ids, attention_mask = self._prepare_input(input_ids) generated_ids = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, use_cache=True, max_length=self.args.max_output_len, num_beams=1) From 5784aee109b7e7bd379b8b33c36466b21dec8f22 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sat, 5 Sep 2020 21:09:47 -0700 Subject: [PATCH 100/112] attention_mode --- longformer/sliding_chunks.py | 2 +- scripts/summarization.py | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/longformer/sliding_chunks.py b/longformer/sliding_chunks.py index 4eed8d8..8ee30a1 100644 --- a/longformer/sliding_chunks.py +++ b/longformer/sliding_chunks.py @@ -125,7 +125,7 @@ def pad_to_window_size(input_ids: torch.Tensor, attention_mask: torch.Tensor, Returns (input_ids, attention_mask) padded to length divisible by 2 * one_sided_window_size ''' - w = 2 * one_sided_window_size + w = int(2 * one_sided_window_size) seqlen = input_ids.size(1) padding_len = (w - seqlen % w) % w input_ids = F.pad(input_ids, (0, padding_len), value=pad_token_id) diff --git a/scripts/summarization.py b/scripts/summarization.py index 91ecb87..454d2bc 100644 --- a/scripts/summarization.py +++ b/scripts/summarization.py @@ -56,6 +56,8 @@ def __init__(self, args): config = LongformerEncoderDecoderConfig.from_pretrained(self.args.model_path) config.attention_dropout = self.args.attention_dropout config.gradient_checkpointing = self.args.grad_ckpt + config.attention_mode = 
self.args.attention_mode + config.attention_window = [self.args.attention_window] * config.encoder_layers self.model = LongformerEncoderDecoderForConditionalGeneration.from_pretrained( self.args.model_path, config=config) else: @@ -70,8 +72,14 @@ def _prepare_input(self, input_ids): attention_mask[input_ids == self.tokenizer.pad_token_id] = 0 if isinstance(self.model, LongformerEncoderDecoderForConditionalGeneration): attention_mask[:, 0] = 2 # global attention on one token for all model params to be used, which is important for gradient checkpointing to work + if self.args.attention_mode == 'sliding_chunks': + half_padding_mod = self.model.config.attention_window[0] + elif self.args.attention_mode == 'sliding_chunks_no_overlap': + half_padding_mod = self.model.config.attention_window[0] / 2 + else: + raise NotImplementedError input_ids, attention_mask = pad_to_window_size( # ideally, should be moved inside the LongformerModel - input_ids, attention_mask, self.model.config.attention_window[0], self.tokenizer.pad_token_id) + input_ids, attention_mask, half_padding_mod, self.tokenizer.pad_token_id) return input_ids, attention_mask def forward(self, input_ids, output_ids): @@ -224,8 +232,9 @@ def add_model_specific_args(parser, root_dir): parser.add_argument("--debug", action='store_true', help="debug run") parser.add_argument("--resume_ckpt", type=str, help="Path of a checkpoint to resume from") parser.add_argument('--grad_ckpt', action='store_true', help='Enable gradient checkpointing to save memory') - parser.add_argument("--attention_dropout", type=float, default=0.1, - help="attention dropout") + parser.add_argument("--attention_dropout", type=float, default=0.1, help="attention dropout") + parser.add_argument("--attention_mode", type=str, default='sliding_chunks', help="Longformer attention mode") + parser.add_argument("--attention_window", type=int, default=512, help="Attention window") return parser From b78384a826635a2bc656295ffbd2beb17e44fb4e Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Mon, 7 Sep 2020 04:15:32 +0000 Subject: [PATCH 101/112] wip --- scripts/summarization.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/scripts/summarization.py b/scripts/summarization.py index dac4b08..3486685 100644 --- a/scripts/summarization.py +++ b/scripts/summarization.py @@ -37,7 +37,14 @@ def __getitem__(self, idx): @staticmethod def collate_fn(batch): - pad_token_id = 1 # AutoTokenizer.from_pretrained('facebook/bart-base').pad_token_id + # A hack to know if this is bart or pegasus. 
DDP doesn't like global variables nor class-level memebr variables + if batch[0][0][-1].item() == 2: + pad_token_id = 1 # AutoTokenizer.from_pretrained('facebook/bart-base').pad_token_id + elif batch[0][0][-1].item() == 1: + pad_token_id = 0 # AutoTokenizer.from_pretrained('google/pegasus-large').pad_token_id + else: + assert False + input_ids, output_ids = list(zip(*batch)) input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=pad_token_id) output_ids = torch.nn.utils.rnn.pad_sequence(output_ids, batch_first=True, padding_value=pad_token_id) @@ -76,7 +83,6 @@ def forward(self, input_ids, output_ids): decoder_attention_mask = (decoder_input_ids != self.tokenizer.pad_token_id) labels = output_ids[:, 1:].clone() labels[labels == self.tokenizer.pad_token_id] = -100 - outputs = self.model( input_ids, attention_mask=attention_mask, @@ -108,8 +114,8 @@ def validation_step(self, batch, batch_nb): generated_ids = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, use_cache=True, max_length=self.args.max_output_len, num_beams=1) - generated_str = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - gold_str = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True) + generated_str = self.tokenizer.batch_decode(generated_ids.tolist(), skip_special_tokens=True) + gold_str = self.tokenizer.batch_decode(output_ids.tolist(), skip_special_tokens=True) scorer = rouge_scorer.RougeScorer(rouge_types=['rouge1', 'rouge2', 'rougeL'], use_stemmer=False) rouge1 = rouge2 = rougel = 0.0 for ref, pred in zip(gold_str, generated_str): From 327b72932f27dd608f6a5619508db27b35ea0c06 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Mon, 7 Sep 2020 06:36:42 +0000 Subject: [PATCH 102/112] pegasus bug --- scripts/summarization.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/summarization.py b/scripts/summarization.py index 35773d0..9885001 100644 --- a/scripts/summarization.py +++ b/scripts/summarization.py @@ -33,6 +33,8 @@ def __getitem__(self, idx): entry = self.hf_dataset[idx] input_ids = self.tokenizer.encode(entry['article'], truncation=True, max_length=self.max_input_len) output_ids = self.tokenizer.encode(entry['abstract'], truncation=True, max_length=self.max_output_len) + if self.tokenizer.bos_token_id is None: # pegasus + output_ids = [self.tokenizer.pad_token_id] + output_ids return torch.tensor(input_ids), torch.tensor(output_ids) @staticmethod From 4944fb851ad5a02d55d676c6f8c0801c71db0944 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Mon, 7 Sep 2020 06:47:07 +0000 Subject: [PATCH 103/112] run on cpu --- scripts/summarization.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/summarization.py b/scripts/summarization.py index 9885001..e477d9d 100644 --- a/scripts/summarization.py +++ b/scripts/summarization.py @@ -112,7 +112,7 @@ def training_step(self, batch, batch_nb): tensorboard_logs = {'train_loss': loss, 'lr': lr, 'input_size': batch[0].numel(), 'output_size': batch[1].numel(), - 'mem': torch.cuda.memory_allocated(loss.device) / 1024 ** 3} + 'mem': torch.cuda.memory_allocated(loss.device) / 1024 ** 3 if torch.cuda.is_available() else 0} return {'loss': loss, 'log': tensorboard_logs} def validation_step(self, batch, batch_nb): @@ -171,7 +171,7 @@ def configure_optimizers(self): optimizer = torch.optim.Adam(self.model.parameters(), lr=self.args.lr) if self.args.debug: return optimizer # const LR - num_gpus = torch.cuda.device_count() + num_gpus = torch.cuda.device_count() if 
torch.cuda.is_available() else 1 num_steps = self.args.dataset_size * self.args.epochs / num_gpus / self.args.grad_accum / self.args.batch_size scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=self.args.warmup, num_training_steps=num_steps @@ -277,7 +277,7 @@ def main(args): args.dataset_size = 203037 # hardcode dataset size. Needed to compute number of steps for the lr scheduler - trainer = pl.Trainer(gpus=args.gpus, distributed_backend='ddp', + trainer = pl.Trainer(gpus=args.gpus, distributed_backend='ddp' if torch.cuda.is_available() else None, track_grad_norm=-1, max_epochs=args.epochs if not args.debug else 100, replace_sampler_ddp=False, From 36252c07bbc9adcd506e88d80bad76095df750ed Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Mon, 7 Sep 2020 00:20:32 -0700 Subject: [PATCH 104/112] readme --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 4693102..b2e6611 100644 --- a/README.md +++ b/README.md @@ -4,15 +4,17 @@ **\*\*\*\*\* Work In Progress: LongformerEncoderDecoder \*\*\*\*\*** -A `LongformerEncoderDecoder` model is now available. It is geared towards summarization where the input is long but the output is relatively shorter. The following code snippet loads a `LongformerEncoderDecoder` checkpointing started from `BartLarge`. With gradient checkpointing, fp16, and 48GB gpu, the input length be up to 16K. +A `LongformerEncoderDecoder` model is now available. It is geared towards summarization where the input is long but the output is relatively shorter. The following code snippet loads a `LongformerEncoderDecoder` checkpointing started from `BART`. With gradient checkpointing, fp16, and 48GB gpu, the input length can be up to 16K tokens. ``` -pip install git+https://github.com/allenai/longformer.git@encoderdecoder +pip install git+https://github.com/allenai/longformer.git@encoderdecoder -# checkpoint: https://ai2-s2-research.s3-us-west-2.amazonaws.com/longformer/longformer-encdec-large-12288.tar.gz +# checkpoint-base: https://ai2-s2-research.s3-us-west-2.amazonaws.com/longformer/longformer-encdec-base-16384.tar.gz +# checkpoint-large: https://ai2-s2-research.s3-us-west-2.amazonaws.com/longformer/longformer-encdec-large-16384.tar.gz from longformer import LongformerEncoderDecoderForConditionalGeneration model = LongformerEncoderDecoderForConditionalGeneration.from_pretrained(downloaded_checkpoint, gradient_checkpointing=True) ``` +Check the script `scripts/summarization.py` for an example of how to use the model. **\*\*\*\*\* New July 23rd, 2020: Speed degradation \*\*\*\*\*** From 281999fdf4a449e2eb4d738e4b56a03b12c89f18 Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Mon, 7 Sep 2020 00:22:35 -0700 Subject: [PATCH 105/112] readme --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b2e6611..aed7e73 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,10 @@ pip install git+https://github.com/allenai/longformer.git@encoderdecoder from longformer import LongformerEncoderDecoderForConditionalGeneration model = LongformerEncoderDecoderForConditionalGeneration.from_pretrained(downloaded_checkpoint, gradient_checkpointing=True) ``` -Check the script `scripts/summarization.py` for an example of how to use the model. + +- Check the script `scripts/summarization.py` for an example of how to use the model. + +- Make sure to use the huggingface/transformers fork specified in `requirements.txt`. 
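+
+For quick reference, a minimal inference sketch in the same spirit as `scripts/summarization.py` (the local checkpoint path, the `facebook/bart-base` tokenizer, and the generation settings below are assumptions; adjust them to your setup):
+
+```
+import torch
+from transformers import AutoTokenizer
+from longformer import LongformerEncoderDecoderForConditionalGeneration
+from longformer.sliding_chunks import pad_to_window_size
+
+model_path = 'longformer-encdec-base-16384'  # assumed: directory extracted from the base checkpoint tarball above
+tokenizer = AutoTokenizer.from_pretrained('facebook/bart-base', use_fast=True)
+model = LongformerEncoderDecoderForConditionalGeneration.from_pretrained(model_path)
+model.eval()
+
+input_ids = torch.tensor([tokenizer.encode('a very long document ...', truncation=True, max_length=16384)])
+attention_mask = torch.ones_like(input_ids)  # 1: local (windowed) attention
+attention_mask[:, 0] = 2                     # 2: global attention on the first token
+# pad the sequence to a multiple of the attention window (sliding_chunks attention mode)
+input_ids, attention_mask = pad_to_window_size(
+    input_ids, attention_mask, model.config.attention_window[0], tokenizer.pad_token_id)
+
+summary_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask,
+                             use_cache=True, num_beams=1, max_length=256)
+print(tokenizer.decode(summary_ids[0].tolist(), skip_special_tokens=True))
+```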
**\*\*\*\*\* New July 23rd, 2020: Speed degradation \*\*\*\*\*** From dee3daf1a6feea8dab056ade780c661effa9045f Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Tue, 8 Sep 2020 21:14:15 -0700 Subject: [PATCH 106/112] adafactor and label smoothing --- scripts/summarization.py | 49 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 5 deletions(-) diff --git a/scripts/summarization.py b/scripts/summarization.py index e477d9d..614ab5a 100644 --- a/scripts/summarization.py +++ b/scripts/summarization.py @@ -6,7 +6,7 @@ import torch from torch.utils.data import DataLoader, Dataset from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoConfig -from transformers.optimization import get_linear_schedule_with_warmup +from transformers.optimization import get_linear_schedule_with_warmup, Adafactor import nlp from rouge_score import rouge_scorer @@ -15,10 +15,34 @@ from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel + from longformer import LongformerEncoderDecoderForConditionalGeneration, LongformerEncoderDecoderConfig from longformer.sliding_chunks import pad_to_window_size +def label_smoothed_nll_loss(lprobs, target, epsilon, ignore_index=-100): + """From fairseq""" + if target.dim() == lprobs.dim() - 1: + target = target.unsqueeze(-1) + nll_loss = -lprobs.gather(dim=-1, index=target) + smooth_loss = -lprobs.sum(dim=-1, keepdim=True) + if ignore_index is not None: + pad_mask = target.eq(ignore_index) + nll_loss.masked_fill_(pad_mask, 0.0) + smooth_loss.masked_fill_(pad_mask, 0.0) + count = (~pad_mask).sum() + else: + nll_loss = nll_loss.squeeze(-1) + smooth_loss = smooth_loss.squeeze(-1) + count = nll_loss.numel() + + nll_loss = nll_loss.sum() / count + smooth_loss = smooth_loss.sum() / count + eps_i = epsilon / lprobs.size(-1) + loss = (1.0 - epsilon) * nll_loss + eps_i * smooth_loss + return loss, nll_loss + + class SummarizationDataset(Dataset): def __init__(self, hf_dataset, tokenizer, max_input_len, max_output_len): self.hf_dataset = hf_dataset @@ -96,14 +120,24 @@ def forward(self, input_ids, output_ids): decoder_input_ids = output_ids[:, :-1] decoder_attention_mask = (decoder_input_ids != self.tokenizer.pad_token_id) labels = output_ids[:, 1:].clone() - labels[labels == self.tokenizer.pad_token_id] = -100 outputs = self.model( input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, - labels=labels) - return outputs + use_cache=False,) + lm_logits = outputs[0] + if self.args.label_smoothing == 0: + # Same behavior as modeling_bart.py, besides ignoring pad_token_id + ce_loss_fct = torch.nn.CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id) + assert lm_logits.shape[-1] == self.model.config.vocab_size + loss = ce_loss_fct(lm_logits.view(-1, lm_logits.shape[-1]), labels.view(-1)) + else: + lprobs = torch.nn.functional.log_softmax(lm_logits, dim=-1) + loss, nll_loss = label_smoothed_nll_loss( + lprobs, labels, self.args.label_smoothing, ignore_index=self.tokenizer.pad_token_id + ) + return [loss] def training_step(self, batch, batch_nb): output = self.forward(*batch) @@ -168,7 +202,10 @@ def test_epoch_end(self, outputs): print(result) def configure_optimizers(self): - optimizer = torch.optim.Adam(self.model.parameters(), lr=self.args.lr) + if self.args.adafactor: + optimizer = Adafactor(self.model.parameters(), lr=self.args.lr, scale_parameter=False, relative_step=False) + else: + optimizer = 
torch.optim.Adam(self.model.parameters(), lr=self.args.lr) if self.args.debug: return optimizer # const LR num_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 1 @@ -243,6 +280,8 @@ def add_model_specific_args(parser, root_dir): parser.add_argument("--attention_dropout", type=float, default=0.1, help="attention dropout") parser.add_argument("--attention_mode", type=str, default='sliding_chunks', help="Longformer attention mode") parser.add_argument("--attention_window", type=int, default=512, help="Attention window") + parser.add_argument("--label_smoothing", type=float, default=0.0, required=False) + parser.add_argument("--adafactor", action='store_true', help="Use adafactor optimizer") return parser From 498ca0408049c7b61724d135538f6680c8f9be1a Mon Sep 17 00:00:00 2001 From: Iz Beltagy Date: Sun, 13 Sep 2020 04:54:20 +0000 Subject: [PATCH 107/112] add rougeLsum --- scripts/summarization.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/scripts/summarization.py b/scripts/summarization.py index e477d9d..26264b6 100644 --- a/scripts/summarization.py +++ b/scripts/summarization.py @@ -128,27 +128,30 @@ def validation_step(self, batch, batch_nb): num_beams=1) generated_str = self.tokenizer.batch_decode(generated_ids.tolist(), skip_special_tokens=True) gold_str = self.tokenizer.batch_decode(output_ids.tolist(), skip_special_tokens=True) - scorer = rouge_scorer.RougeScorer(rouge_types=['rouge1', 'rouge2', 'rougeL'], use_stemmer=False) - rouge1 = rouge2 = rougel = 0.0 + scorer = rouge_scorer.RougeScorer(rouge_types=['rouge1', 'rouge2', 'rougeL', 'rougeLsum'], use_stemmer=False) + rouge1 = rouge2 = rougel = rougelsum = 0.0 for ref, pred in zip(gold_str, generated_str): score = scorer.score(ref, pred) rouge1 += score['rouge1'].fmeasure rouge2 += score['rouge2'].fmeasure rougel += score['rougeL'].fmeasure + rougelsum += score['rougeLsum'].fmeasure rouge1 /= len(generated_str) rouge2 /= len(generated_str) rougel /= len(generated_str) + rougelsum /= len(generated_str) return {'vloss': vloss, 'rouge1': vloss.new_zeros(1) + rouge1, 'rouge2': vloss.new_zeros(1) + rouge2, - 'rougeL': vloss.new_zeros(1) + rougel, } + 'rougeL': vloss.new_zeros(1) + rougel, + 'rougeLsum': vloss.new_zeros(1) + rougelsum, } def validation_epoch_end(self, outputs): for p in self.model.parameters(): p.requires_grad = True - names = ['vloss', 'rouge1', 'rouge2', 'rougeL'] + names = ['vloss', 'rouge1', 'rouge2', 'rougeL', 'rougeLsum'] metrics = [] for name in names: metric = torch.stack([x[name] for x in outputs]).mean() @@ -280,10 +283,11 @@ def main(args): trainer = pl.Trainer(gpus=args.gpus, distributed_backend='ddp' if torch.cuda.is_available() else None, track_grad_norm=-1, max_epochs=args.epochs if not args.debug else 100, + max_steps=None if not args.debug else 1, replace_sampler_ddp=False, accumulate_grad_batches=args.grad_accum, val_check_interval=args.val_every if not args.debug else 1, - num_sanity_val_steps=2, + num_sanity_val_steps=2 if not args.debug else 0, check_val_every_n_epoch=1 if not args.debug else 1, val_percent_check=args.val_percent_check, test_percent_check=args.val_percent_check, From 0f3875fa875ca3220cb01801ad6c1ef9be86e6a5 Mon Sep 17 00:00:00 2001 From: Akshita Bhagia Date: Thu, 12 Nov 2020 02:19:40 -0800 Subject: [PATCH 108/112] wip code for LongT5 --- longformer/longformer_t5_encoder_decoder.py | 395 ++++++++++++++++++ requirements.txt | 2 +- .../convert_t5_to_longformerencoderdecoder.py | 156 +++++++ 3 files changed, 552 insertions(+), 1 
deletion(-) create mode 100644 longformer/longformer_t5_encoder_decoder.py create mode 100644 scripts/convert_t5_to_longformerencoderdecoder.py diff --git a/longformer/longformer_t5_encoder_decoder.py b/longformer/longformer_t5_encoder_decoder.py new file mode 100644 index 0000000..b02cb36 --- /dev/null +++ b/longformer/longformer_t5_encoder_decoder.py @@ -0,0 +1,395 @@ +import math +from typing import List, Optional, Tuple, Dict +from torch import nn, Tensor +from longformer.longformer import LongformerSelfAttention +from longformer.sliding_chunks import * +from transformers.modeling_t5 import T5Config, T5ForConditionalGeneration + + +class LongformerEncoderDecoderForConditionalGenerationT5(T5ForConditionalGeneration): + def __init__(self, config): + super().__init__(config) + if config.attention_mode == 'n2': + pass # do nothing, use BertSelfAttention instead + else: + for i, layer in enumerate(self.encoder.block): + layer.layer[0].SelfAttention = LongformerSelfAttentionForT5(config, layer_id=i) + + +class LongformerEncoderDecoderConfigT5(T5Config): + def __init__(self, attention_window: List[int] = None, attention_dilation: List[int] = None, + autoregressive: bool = False, attention_mode: str = 'sliding_chunks', + gradient_checkpointing: bool = False, **kwargs): + """ + Args: + attention_window: list of attention window sizes of length = number of layers. + window size = number of attention locations on each side. + For an effective window size of 512, use `attention_window=[256]*num_layers` + which is 256 on each side. + attention_dilation: list of attention dilation of length = number of layers. + attention dilation of `1` means no dilation. + autoregressive: do autoregressive attention or have attention on both sides + attention_mode: 'n2' for regular n^2 self-attention, 'tvm' for TVM implementation of Longformer + selfattention, 'sliding_chunks' for another implementation of Longformer selfattention + """ + super().__init__(**kwargs) + self.attention_window = attention_window + self.attention_dilation = attention_dilation + self.autoregressive = autoregressive + self.attention_mode = attention_mode + self.gradient_checkpointing = gradient_checkpointing + # self.attention_probs_dropout_prob = self.dropout_rate + assert self.attention_mode in ['tvm', 'sliding_chunks', 'n2'] + +class LongformerSelfAttentionT5Basic(nn.Module): + def __init__(self, config, layer_id, has_relative_attention_bias=False): + super(LongformerSelfAttentionT5Basic, self).__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads)) + self.num_heads = config.num_attention_heads + self.head_dim = int(config.hidden_size / config.num_attention_heads) + self.embed_dim = config.hidden_size + + self.query = nn.Linear(config.hidden_size, self.embed_dim) + self.key = nn.Linear(config.hidden_size, self.embed_dim) + self.value = nn.Linear(config.hidden_size, self.embed_dim) + + # this is for the T5 setting + self.is_decoder = config.is_decoder + self.relative_attention_num_buckets = config.relative_attention_num_buckets + self.has_relative_attention_bias = has_relative_attention_bias + + if self.has_relative_attention_bias: + self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.num_heads) + + self.query_global = nn.Linear(config.hidden_size, self.embed_dim) + self.key_global = nn.Linear(config.hidden_size, self.embed_dim) + 
self.value_global = nn.Linear(config.hidden_size, self.embed_dim) + + self.dropout = config.attention_probs_dropout_prob + + self.layer_id = layer_id + self.attention_window = config.attention_window[self.layer_id] + self.attention_dilation = config.attention_dilation[self.layer_id] + self.attention_mode = config.attention_mode + self.autoregressive = config.autoregressive + assert self.attention_window > 0 + assert self.attention_dilation > 0 + assert self.attention_mode in ['tvm', 'sliding_chunks', 'sliding_chunks_no_overlap'] + if self.attention_mode in ['sliding_chunks', 'sliding_chunks_no_overlap']: + assert not self.autoregressive # not supported + assert self.attention_dilation == 1 # dilation is not supported + + @staticmethod + def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128): + """ + Adapted from Mesh Tensorflow: + https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 + Translate relative position to a bucket number for relative attention. + The relative position is defined as memory_position - query_position, i.e. + the distance in tokens from the attending position to the attended-to + position. If bidirectional=False, then positive relative positions are + invalid. + We use smaller buckets for small absolute relative_position and larger buckets + for larger absolute relative_positions. All relative positions >=max_distance + map to the same bucket. All relative positions <=-max_distance map to the + same bucket. This should allow for more graceful generalization to longer + sequences than the model has been trained on. + Args: + relative_position: an int32 Tensor + bidirectional: a boolean - whether the attention is bidirectional + num_buckets: an integer + max_distance: an integer + Returns: + a Tensor with the same shape as relative_position, containing int32 + values in the range [0, num_buckets) + """ + ret = 0 + n = -relative_position + if bidirectional: + num_buckets //= 2 + ret += (n < 0).to(torch.long) * num_buckets # mtf.to_int32(mtf.less(n, 0)) * num_buckets + n = torch.abs(n) + else: + n = torch.max(n, torch.zeros_like(n)) + # now n is in the range [0, inf) + + # half of the buckets are for exact increments in positions + max_exact = num_buckets // 2 + is_small = n < max_exact + + # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance + val_if_large = max_exact + ( + torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact) + ).to(torch.long) + val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1)) + + ret += torch.where(is_small, n, val_if_large) + return ret + + def compute_bias(self, qlen, klen): + """ Compute binned relative position bias """ + context_position = torch.arange(qlen, dtype=torch.long)[:, None] + memory_position = torch.arange(klen, dtype=torch.long)[None, :] + relative_position = memory_position - context_position # shape (qlen, klen) + rp_bucket = self._relative_position_bucket( + relative_position, # shape (qlen, klen) + bidirectional=not self.is_decoder, + num_buckets=self.relative_attention_num_buckets, + ) + rp_bucket = rp_bucket.to(self.relative_attention_bias.weight.device) + values = self.relative_attention_bias(rp_bucket) # shape (qlen, klen, num_heads) + # values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, qlen, klen) + # Changing the shape to below because that's what 
LongformerSelfAttention's attn_weights need. + values = values.permute([0, 2, 1]).unsqueeze(0) # shape (1, qlen, num_heads, klen) + return values + + def forward( + self, + hidden_states, + attention_mask=None, + position_bias=None, + past_key_value_state=None, + head_mask=None, + output_attentions=False, + ): + ''' + The `attention_mask` is changed in `BertModel.forward` from 0, 1, 2 to + -ve: no attention + 0: local attention + +ve: global attention + ''' + + if attention_mask is not None: + attention_mask = attention_mask.squeeze(dim=2).squeeze(dim=1) + key_padding_mask = attention_mask < 0 + extra_attention_mask = attention_mask > 0 + remove_from_windowed_attention_mask = attention_mask != 0 + + num_extra_indices_per_batch = extra_attention_mask.long().sum(dim=1) + max_num_extra_indices_per_batch = num_extra_indices_per_batch.max() + if max_num_extra_indices_per_batch <= 0: + extra_attention_mask = None + else: + # To support the case of variable number of global attention in the rows of a batch, + # we use the following three selection masks to select global attention embeddings + # in a 3d tensor and pad it to `max_num_extra_indices_per_batch` + # 1) selecting embeddings that correspond to global attention + extra_attention_mask_nonzeros = extra_attention_mask.nonzero(as_tuple=True) + zero_to_max_range = torch.arange(0, max_num_extra_indices_per_batch, + device=num_extra_indices_per_batch.device) + # mask indicating which values are actually going to be padding + selection_padding_mask = zero_to_max_range < num_extra_indices_per_batch.unsqueeze(dim=-1) + # 2) location of the non-padding values in the selected global attention + selection_padding_mask_nonzeros = selection_padding_mask.nonzero(as_tuple=True) + # 3) location of the padding values in the selected global attention + selection_padding_mask_zeros = (selection_padding_mask == 0).nonzero(as_tuple=True) + else: + remove_from_windowed_attention_mask = None + extra_attention_mask = None + key_padding_mask = None + + hidden_states = hidden_states.transpose(0, 1) + seq_len, bsz, embed_dim = hidden_states.size() + assert embed_dim == self.embed_dim + q = self.query(hidden_states) + k = self.key(hidden_states) + v = self.value(hidden_states) + q /= math.sqrt(self.head_dim) + + q = q.view(seq_len, bsz, self.num_heads, self.head_dim).transpose(0, 1) + k = k.view(seq_len, bsz, self.num_heads, self.head_dim).transpose(0, 1) + # attn_weights = (bsz, seq_len, num_heads, window*2+1) + if self.attention_mode == 'tvm': + q = q.float().contiguous() + k = k.float().contiguous() + attn_weights = diagonaled_mm_tvm(q, k, self.attention_window, self.attention_dilation, False, 0, False) + elif self.attention_mode == "sliding_chunks": + attn_weights = sliding_chunks_matmul_qk(q, k, self.attention_window, padding_value=0) + elif self.attention_mode == "sliding_chunks_no_overlap": + attn_weights = sliding_chunks_no_overlap_matmul_qk(q, k, self.attention_window, padding_value=0) + else: + raise False + mask_invalid_locations(attn_weights, self.attention_window, self.attention_dilation, False) + if remove_from_windowed_attention_mask is not None: + # This implementation is fast and takes very little memory because num_heads x hidden_size = 1 + # from (bsz x seq_len) to (bsz x seq_len x num_heads x hidden_size) + remove_from_windowed_attention_mask = remove_from_windowed_attention_mask.unsqueeze(dim=-1).unsqueeze(dim=-1) + # cast to float/half then replace 1's with -inf + float_mask = 
remove_from_windowed_attention_mask.type_as(q).masked_fill(remove_from_windowed_attention_mask, -10000.0) + repeat_size = 1 if isinstance(self.attention_dilation, int) else len(self.attention_dilation) + float_mask = float_mask.repeat(1, 1, repeat_size, 1) + ones = float_mask.new_ones(size=float_mask.size()) # tensor of ones + # diagonal mask with zeros everywhere and -inf inplace of padding + if self.attention_mode == 'tvm': + d_mask = diagonaled_mm_tvm(ones, float_mask, self.attention_window, self.attention_dilation, False, 0, False) + elif self.attention_mode == "sliding_chunks": + d_mask = sliding_chunks_matmul_qk(ones, float_mask, self.attention_window, padding_value=0) + elif self.attention_mode == "sliding_chunks_no_overlap": + d_mask = sliding_chunks_no_overlap_matmul_qk(ones, float_mask, self.attention_window, padding_value=0) + + attn_weights += d_mask + assert list(attn_weights.size())[:3] == [bsz, seq_len, self.num_heads] + assert attn_weights.size(dim=3) in [self.attention_window * 2 + 1, self.attention_window * 3] + + # the extra attention + if extra_attention_mask is not None: + selected_k = k.new_zeros(bsz, max_num_extra_indices_per_batch, self.num_heads, self.head_dim) + selected_k[selection_padding_mask_nonzeros] = k[extra_attention_mask_nonzeros] + # (bsz, seq_len, num_heads, max_num_extra_indices_per_batch) + selected_attn_weights = torch.einsum('blhd,bshd->blhs', (q, selected_k)) + selected_attn_weights[selection_padding_mask_zeros[0], :, :, selection_padding_mask_zeros[1]] = -10000 + # concat to attn_weights + # (bsz, seq_len, num_heads, extra attention count + 2*window+1) + attn_weights = torch.cat((selected_attn_weights, attn_weights), dim=-1) + + # TODO: added position_bias for T5 + if position_bias is None: + if not self.has_relative_attention_bias: + raise ValueError("No position_bias provided and no weights to compute position_bias") + + position_bias = self.compute_bias(seq_len, 2*self.attention_window + 1) + + # if key and values are already calculated + # we want only the last query position bias + if past_key_value_state is not None: + position_bias = position_bias[:, :, -1:, :] + + # TODO: what should be the attn_mask added here? 
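The TODO above is essentially a shape question, so a small stand-alone illustration of the tensors involved at this point may help (sizes are made up and this is not part of the patch):

```
# Shape bookkeeping for `attn_weights += position_bias` below, with hypothetical sizes.
import torch
bsz, seq_len, num_heads, w = 2, 8, 4, 2
attn_weights = torch.zeros(bsz, seq_len, num_heads, 2 * w + 1)   # banded local scores
position_bias = torch.zeros(1, seq_len, num_heads, 2 * w + 1)    # compute_bias output, broadcasts over batch
print((attn_weights + position_bias).shape)                      # torch.Size([2, 8, 4, 5])
# A padding/global mask would have to be brought into this same banded layout first
# (the code above already does that once, via `d_mask`); note also that when global-attention
# columns are concatenated, the last dimensions no longer line up for this addition.
```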
+ # if attention_mask is not None: + # position_bias = position_bias + attention_mask # (1, num_heads, seq_len, 2*window+1) + + # ipdb.set_trace() + attn_weights += position_bias + + attn_weights_float = F.softmax(attn_weights, dim=-1, dtype=torch.float32) # use fp32 for numerical stability + if key_padding_mask is not None: + # softmax sometimes inserts NaN if all positions are masked, replace them with 0 + attn_weights_float = torch.masked_fill(attn_weights_float, key_padding_mask.unsqueeze(-1).unsqueeze(-1), 0.0) + attn_weights = attn_weights_float.type_as(attn_weights) + attn_probs = F.dropout(attn_weights_float.type_as(attn_weights), p=self.dropout, training=self.training) + v = v.view(seq_len, bsz, self.num_heads, self.head_dim).transpose(0, 1) + attn = 0 + if extra_attention_mask is not None: + selected_attn_probs = attn_probs.narrow(-1, 0, max_num_extra_indices_per_batch) + selected_v = v.new_zeros(bsz, max_num_extra_indices_per_batch, self.num_heads, self.head_dim) + selected_v[selection_padding_mask_nonzeros] = v[extra_attention_mask_nonzeros] + # use `matmul` because `einsum` crashes sometimes with fp16 + # attn = torch.einsum('blhs,bshd->blhd', (selected_attn_probs, selected_v)) + attn = torch.matmul(selected_attn_probs.transpose(1, 2), selected_v.transpose(1, 2).type_as(selected_attn_probs)).transpose(1, 2) + attn_probs = attn_probs.narrow(-1, max_num_extra_indices_per_batch, attn_probs.size(-1) - max_num_extra_indices_per_batch).contiguous() + + if self.attention_mode == 'tvm': + v = v.float().contiguous() + attn += diagonaled_mm_tvm(attn_probs, v, self.attention_window, self.attention_dilation, True, 0, False) + elif self.attention_mode == "sliding_chunks": + attn += sliding_chunks_matmul_pv(attn_probs, v, self.attention_window) + elif self.attention_mode == "sliding_chunks_no_overlap": + attn += sliding_chunks_no_overlap_matmul_pv(attn_probs, v, self.attention_window) + else: + raise False + + attn = attn.type_as(hidden_states) + assert list(attn.size()) == [bsz, seq_len, self.num_heads, self.head_dim] + attn = attn.transpose(0, 1).reshape(seq_len, bsz, embed_dim).contiguous() + + # For this case, we'll just recompute the attention for these indices + # and overwrite the attn tensor. 
TODO: remove the redundant computation + if extra_attention_mask is not None: + selected_hidden_states = hidden_states.new_zeros(max_num_extra_indices_per_batch, bsz, embed_dim) + selected_hidden_states[selection_padding_mask_nonzeros[::-1]] = hidden_states[extra_attention_mask_nonzeros[::-1]] + + q = self.query_global(selected_hidden_states) + k = self.key_global(hidden_states) + v = self.value_global(hidden_states) + q /= math.sqrt(self.head_dim) + + q = q.contiguous().view(max_num_extra_indices_per_batch, bsz * self.num_heads, self.head_dim).transpose(0, 1) # (bsz*self.num_heads, max_num_extra_indices_per_batch, head_dim) + k = k.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1) # bsz * self.num_heads, seq_len, head_dim) + v = v.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1) # bsz * self.num_heads, seq_len, head_dim) + attn_weights = torch.bmm(q, k.transpose(1, 2)) + assert list(attn_weights.size()) == [bsz * self.num_heads, max_num_extra_indices_per_batch, seq_len] + + attn_weights = attn_weights.view(bsz, self.num_heads, max_num_extra_indices_per_batch, seq_len) + attn_weights[selection_padding_mask_zeros[0], :, selection_padding_mask_zeros[1], :] = -10000.0 + if key_padding_mask is not None: + attn_weights = attn_weights.masked_fill( + key_padding_mask.unsqueeze(1).unsqueeze(2), + -10000.0, + ) + attn_weights = attn_weights.view(bsz * self.num_heads, max_num_extra_indices_per_batch, seq_len) + attn_weights_float = F.softmax(attn_weights, dim=-1, dtype=torch.float32) # use fp32 for numerical stability + attn_probs = F.dropout(attn_weights_float.type_as(attn_weights), p=self.dropout, training=self.training) + selected_attn = torch.bmm(attn_probs, v) + assert list(selected_attn.size()) == [bsz * self.num_heads, max_num_extra_indices_per_batch, self.head_dim] + + selected_attn_4d = selected_attn.view(bsz, self.num_heads, max_num_extra_indices_per_batch, self.head_dim) + nonzero_selected_attn = selected_attn_4d[selection_padding_mask_nonzeros[0], :, selection_padding_mask_nonzeros[1]] + attn[extra_attention_mask_nonzeros[::-1]] = nonzero_selected_attn.view(len(selection_padding_mask_nonzeros[0]), -1).type_as(hidden_states) + + context_layer = attn.transpose(0, 1) + if output_attentions: + if extra_attention_mask is not None: + # With global attention, return global attention probabilities only + # batch_size x num_heads x max_num_global_attention_tokens x sequence_length + # which is the attention weights from tokens with global attention to all tokens + # It doesn't not return local attention + # In case of variable number of global attantion in the rows of a batch, + # attn_weights are padded with -10000.0 attention scores + attn_weights = attn_weights.view(bsz, self.num_heads, max_num_extra_indices_per_batch, seq_len) + else: + # without global attention, return local attention probabilities + # batch_size x num_heads x sequence_length x window_size + # which is the attention weights of every token attending to its neighbours + attn_weights = attn_weights.permute(0, 2, 1, 3) + outputs = (context_layer, attn_weights) if output_attentions else (context_layer,) + return outputs + + +class LongformerSelfAttentionForT5(nn.Module): + def __init__(self, config, layer_id): + super().__init__() + self.embed_dim = config.d_model + self.longformer_self_attn = LongformerSelfAttentionT5Basic(config, layer_id=layer_id, + has_relative_attention_bias=True) #config.has_relative_attention_bias) + self.output = nn.Linear(self.embed_dim, self.embed_dim) 
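Before the wrapper class continues below, a quick illustration of the mask convention that `LongformerSelfAttentionT5Basic.forward` documents. Sizes are hypothetical; in the real code the mask arrives as T5's extended mask and is squeezed back to `(bsz, seq_len)` at the top of `forward`:

```
# Hypothetical example of the -ve / 0 / +ve mask convention used above.
import torch
bsz, seq_len = 2, 10
attention_mask = torch.zeros(bsz, seq_len)   # 0   -> local sliding-window attention
attention_mask[:, 0] = 1                     # +ve -> global attention (e.g. the first token)
attention_mask[0, 7:] = -1                   # -ve -> padding, no attention
# inside forward: key_padding_mask = attention_mask < 0, extra_attention_mask = attention_mask > 0
```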
+ + # def forward( + # self, + # query, + # key: Optional[Tensor], + # key_padding_mask: Optional[Tensor] = None, + # layer_state: Optional[Dict[str, Optional[Tensor]]] = None, + # attn_mask: Optional[Tensor] = None, + # need_weights=False, + # output_attentions=False, + # ) -> Tuple[Tensor, Optional[Tensor]]: + def forward( + self, + query, + mask=None, + kv=None, + position_bias=None, + past_key_value_state=None, + head_mask=None, + query_length=None, + use_cache=False, + output_attentions=False, + ): + + tgt_len, bsz, embed_dim = query.size() + assert embed_dim == self.embed_dim + assert list(query.size()) == [tgt_len, bsz, embed_dim] + + outputs = self.longformer_self_attn( + query, #.transpose(0, 1), # LongformerSelfAttention expects (bsz, seqlen, embd_dim) + #attention_mask=key_padding_mask.unsqueeze(dim=1).unsqueeze(dim=1) * -1, + attention_mask=mask, #.unsqueeze(dim=1).unsqueeze(dim=1)*-1, + output_attentions=output_attentions, + ) + + attn_output = self.output(outputs[0].transpose(0, 1)) + + return (attn_output,) + outputs[1:] if len(outputs) == 2 else (attn_output, None) diff --git a/requirements.txt b/requirements.txt index a98ef2a..3eb9122 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,5 +3,5 @@ pytorch-lightning @ git+http://github.com/ibeltagy/pytorch-lightning.git@v0.8.5_ torch==1.6.0 tensorboardX test-tube==0.7.5 -nlp +nlp==0.3.0 rouge_score diff --git a/scripts/convert_t5_to_longformerencoderdecoder.py b/scripts/convert_t5_to_longformerencoderdecoder.py new file mode 100644 index 0000000..6415cc0 --- /dev/null +++ b/scripts/convert_t5_to_longformerencoderdecoder.py @@ -0,0 +1,156 @@ +import argparse +import logging +import os + +from transformers import T5Tokenizer + +from transformers import T5ForConditionalGeneration +from transformers.modeling_bart import shift_tokens_right +from longformer.longformer_t5_encoder_decoder import LongformerSelfAttentionForT5, LongformerEncoderDecoderConfigT5 +from longformer.longformer_t5_encoder_decoder import LongformerEncoderDecoderForConditionalGenerationT5 + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + + +def create_long_model( + save_model_to, + base_model, + tokenizer_name_or_path, + attention_window, + max_pos +): + model = T5ForConditionalGeneration.from_pretrained(base_model) + tokenizer = T5Tokenizer.from_pretrained(tokenizer_name_or_path, model_max_length=max_pos) + config = LongformerEncoderDecoderConfigT5.from_pretrained(base_model) + model.config = config + + # in T5 attention_probs_dropout_prob is dropout_rate, but LongformerSelfAttention + # expects attention_probs_dropout_prob, so set it here + config.attention_probs_dropout_prob = config.dropout_rate + config.architectures = ['LongformerEncoderDecoderForConditionalGenerationT5', ] + + # extend position embeddings + tokenizer.model_max_length = max_pos + tokenizer.init_kwargs['model_max_length'] = max_pos + # current_max_pos, embed_size = model.model.embed_positions.weight.shape + # assert current_max_pos == config.max_position_embeddings + 2 + + # config.max_encoder_position_embeddings = max_pos + # config.max_decoder_position_embeddings = config.max_position_embeddings + # del config.max_position_embeddings + # # TODO: check what's the deal with T5 here. 
+ # max_pos += 2 # NOTE: BART has positions 0,1 reserved, so embedding size is max position + 2 + # assert max_pos >= current_max_pos + + # # allocate a larger position embedding matrix for the encoder + # new_encoder_pos_embed = model.model.encoder.embed_positions.weight.new_empty(max_pos, embed_size) + # # copy position embeddings over and over to initialize the new position embeddings + # k = 2 + # step = current_max_pos - 2 + # while k < max_pos - 1: + # new_encoder_pos_embed[k:(k + step)] = model.model.encoder.embed_positions.weight[2:] + # k += step + # model.model.encoder.embed_positions.weight.data = new_encoder_pos_embed + + # allocate a larger position embedding matrix for the decoder + # new_decoder_pos_embed = model.model.decoder.embed_positions.weight.new_empty(max_pos, embed_size) + # # copy position embeddings over and over to initialize the new position embeddings + # k = 2 + # step = current_max_pos - 2 + # while k < max_pos - 1: + # new_decoder_pos_embed[k:(k + step)] = model.model.decoder.embed_positions.weight[2:] + # k += step + # model.model.decoder.embed_positions.weight.data = new_decoder_pos_embed + + # replace the `modeling_t5.T5Attention` object with `LongformerSelfAttention` + config.attention_window = [attention_window] * config.num_hidden_layers + config.attention_dilation = [1] * config.num_hidden_layers + + for i, layer in enumerate(model.encoder.block): + self_attn = layer.layer[0].SelfAttention + + longformer_self_attn_for_t5 = LongformerSelfAttentionForT5(config, layer_id=i) + + longformer_self_attn_for_t5.longformer_self_attn.query = self_attn.q + longformer_self_attn_for_t5.longformer_self_attn.key = self_attn.k + longformer_self_attn_for_t5.longformer_self_attn.value = self_attn.v + + longformer_self_attn_for_t5.longformer_self_attn.query_global = self_attn.q + longformer_self_attn_for_t5.longformer_self_attn.key_global = self_attn.k + longformer_self_attn_for_t5.longformer_self_attn.value_global = self_attn.v + + longformer_self_attn_for_t5.output = self_attn.o + + layer.layer[0].SelfAttention = longformer_self_attn_for_t5 + + logger.info(f'saving model to {save_model_to}') + model.save_pretrained(save_model_to) + tokenizer.save_pretrained(save_model_to) + return model, tokenizer + + +def main(): + parser = argparse.ArgumentParser(description="Convert T5 to LongT5. Replaces T5 encoder's T5Attention with LongformerSelfAttention") + parser.add_argument( + '--base_model', + type=str, + default='t5-large', + help='The name or path of the base model you want to convert' + ) + parser.add_argument( + '--tokenizer_name_or_path', + type=str, + default='t5-large', + help='The name or path of the tokenizer' + ) + parser.add_argument( + '--save_model_to', + type=str, + required=True, + help='The path to save the converted model' + ) + parser.add_argument( + '--attention_window', + type=int, + default=512, + help='attention window size for longformer self attention (one sided)' + ) + parser.add_argument( + '--max_pos', + type=int, + default=4096 * 4, + help='maximum encoder positions' + ) + + args = parser.parse_args() + + if not os.path.exists(args.save_model_to): + os.mkdir(args.save_model_to) + + create_long_model( + save_model_to=args.save_model_to, + base_model=args.base_model, + tokenizer_name_or_path=args.tokenizer_name_or_path, + attention_window=args.attention_window, + max_pos=args.max_pos + ) + + tokenizer = T5Tokenizer.from_pretrained(args.save_model_to) + TXT = "My friends are but they eat too many carbs." 
+ model = LongformerEncoderDecoderForConditionalGenerationT5.from_pretrained(args.save_model_to) + model.encoder.config.gradient_checkpointing = True + model.decoder.config.gradient_checkpointing = True + data = tokenizer([TXT], return_tensors='pt', padding='max_length', max_length=2048) + input_ids = data['input_ids'] + attention_mask = data['attention_mask'] + decoder_input_ids = shift_tokens_right(input_ids[:, :5], tokenizer.pad_token_id) + logits = model(input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids, use_cache=False)[0] + masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() + probs = logits[0, masked_index].softmax(dim=0) + values, predictions = probs.topk(5) + print(tokenizer.convert_ids_to_tokens(predictions)) + + +if __name__ == "__main__": + main() From adc92cabc8e8c6fc89a4ccab03e887a3d53e0d97 Mon Sep 17 00:00:00 2001 From: Akshita Bhagia Date: Mon, 23 Nov 2020 15:09:29 -0800 Subject: [PATCH 109/112] naive code for smaller score matrix --- longformer/longformer_t5_encoder_decoder.py | 44 ++++++++++++--------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/longformer/longformer_t5_encoder_decoder.py b/longformer/longformer_t5_encoder_decoder.py index b02cb36..ce46228 100644 --- a/longformer/longformer_t5_encoder_decoder.py +++ b/longformer/longformer_t5_encoder_decoder.py @@ -38,7 +38,7 @@ def __init__(self, attention_window: List[int] = None, attention_dilation: List[ self.autoregressive = autoregressive self.attention_mode = attention_mode self.gradient_checkpointing = gradient_checkpointing - # self.attention_probs_dropout_prob = self.dropout_rate + self.attention_probs_dropout_prob = self.dropout_rate assert self.attention_mode in ['tvm', 'sliding_chunks', 'n2'] class LongformerSelfAttentionT5Basic(nn.Module): @@ -129,6 +129,26 @@ def _relative_position_bucket(relative_position, bidirectional=True, num_buckets ret += torch.where(is_small, n, val_if_large) return ret + @staticmethod + def _smaller_score_matrix(matrix, seq_len, w, bidirectional): + + diag_sums = torch.zeros(seq_len, 2*w+1) + #diag_sums.fill_(float('-inf')) + last = w+1 if bidirectional else 1 + + c = 0 + for k in range(-w, last): + d = torch.diagonal(matrix, offset=k, dim1=-2, dim2=-1) + if d.nelement(): + if k <= 0: + diag_sums[abs(k):seq_len, c] = d + else: + diag_sums[0:seq_len-k, c] = d + c += 1 + + # mask_invalid_locations(diag_sums.unsqueeze(0).unsqueeze(2).float(), w, 1, True) + return diag_sums.long() + def compute_bias(self, qlen, klen): """ Compute binned relative position bias """ context_position = torch.arange(qlen, dtype=torch.long)[:, None] @@ -139,9 +159,10 @@ def compute_bias(self, qlen, klen): bidirectional=not self.is_decoder, num_buckets=self.relative_attention_num_buckets, ) + rp_bucket = self._smaller_score_matrix(rp_bucket, qlen, w=self.attention_window, bidirectional=not self.is_decoder) rp_bucket = rp_bucket.to(self.relative_attention_bias.weight.device) values = self.relative_attention_bias(rp_bucket) # shape (qlen, klen, num_heads) - # values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, qlen, klen) +# values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, qlen, klen) # Changing the shape to below because that's what LongformerSelfAttention's attn_weights need. 
values = values.permute([0, 2, 1]).unsqueeze(0) # shape (1, qlen, num_heads, klen) return values @@ -245,23 +266,20 @@ def forward( # (bsz, seq_len, num_heads, extra attention count + 2*window+1) attn_weights = torch.cat((selected_attn_weights, attn_weights), dim=-1) - # TODO: added position_bias for T5 if position_bias is None: if not self.has_relative_attention_bias: raise ValueError("No position_bias provided and no weights to compute position_bias") - position_bias = self.compute_bias(seq_len, 2*self.attention_window + 1) + position_bias = self.compute_bias(seq_len, seq_len) # if key and values are already calculated # we want only the last query position bias if past_key_value_state is not None: position_bias = position_bias[:, :, -1:, :] - # TODO: what should be the attn_mask added here? - # if attention_mask is not None: - # position_bias = position_bias + attention_mask # (1, num_heads, seq_len, 2*window+1) + if attention_mask is not None: + position_bias = position_bias + attention_mask # (1, num_heads, seq_len, 2*window+1) - # ipdb.set_trace() attn_weights += position_bias attn_weights_float = F.softmax(attn_weights, dim=-1, dtype=torch.float32) # use fp32 for numerical stability @@ -356,16 +374,6 @@ def __init__(self, config, layer_id): has_relative_attention_bias=True) #config.has_relative_attention_bias) self.output = nn.Linear(self.embed_dim, self.embed_dim) - # def forward( - # self, - # query, - # key: Optional[Tensor], - # key_padding_mask: Optional[Tensor] = None, - # layer_state: Optional[Dict[str, Optional[Tensor]]] = None, - # attn_mask: Optional[Tensor] = None, - # need_weights=False, - # output_attentions=False, - # ) -> Tuple[Tensor, Optional[Tensor]]: def forward( self, query, From 02f9c3f50f821c2947dc6d24e58654149a9f35e3 Mon Sep 17 00:00:00 2001 From: Akshita Bhagia Date: Mon, 23 Nov 2020 15:28:35 -0800 Subject: [PATCH 110/112] commenting attn_mask --- longformer/longformer_t5_encoder_decoder.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/longformer/longformer_t5_encoder_decoder.py b/longformer/longformer_t5_encoder_decoder.py index ce46228..df797e3 100644 --- a/longformer/longformer_t5_encoder_decoder.py +++ b/longformer/longformer_t5_encoder_decoder.py @@ -277,8 +277,9 @@ def forward( if past_key_value_state is not None: position_bias = position_bias[:, :, -1:, :] - if attention_mask is not None: - position_bias = position_bias + attention_mask # (1, num_heads, seq_len, 2*window+1) + # attention_mask is not the right size; should it even be added? + # if attention_mask is not None: + # position_bias = position_bias + attention_mask # (1, num_heads, seq_len, 2*window+1) attn_weights += position_bias From de147decc6e05d5e95f4523e4fce02ff95880daa Mon Sep 17 00:00:00 2001 From: Akshita Bhagia Date: Mon, 23 Nov 2020 17:50:06 -0800 Subject: [PATCH 111/112] fixing order and issue with inf --- longformer/longformer_t5_encoder_decoder.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/longformer/longformer_t5_encoder_decoder.py b/longformer/longformer_t5_encoder_decoder.py index df797e3..3ab9caa 100644 --- a/longformer/longformer_t5_encoder_decoder.py +++ b/longformer/longformer_t5_encoder_decoder.py @@ -108,12 +108,17 @@ def _relative_position_bucket(relative_position, bidirectional=True, num_buckets """ ret = 0 n = -relative_position + # Since torch.abs() will not work correctly with converted inf values, explicitly set to a lower value. + # TODO: check this!! 
+ n[n==float('inf')] = max_distance + n[n==float('-inf')] = -max_distance if bidirectional: num_buckets //= 2 ret += (n < 0).to(torch.long) * num_buckets # mtf.to_int32(mtf.less(n, 0)) * num_buckets n = torch.abs(n) else: n = torch.max(n, torch.zeros_like(n)) + n = n.long() # now n is in the range [0, inf) # half of the buckets are for exact increments in positions @@ -134,6 +139,8 @@ def _smaller_score_matrix(matrix, seq_len, w, bidirectional): diag_sums = torch.zeros(seq_len, 2*w+1) #diag_sums.fill_(float('-inf')) + diag_sums[:, 0:w].fill_(float('-inf')) + diag_sums[:, w+1:].fill_(float('inf')) last = w+1 if bidirectional else 1 c = 0 @@ -147,19 +154,19 @@ def _smaller_score_matrix(matrix, seq_len, w, bidirectional): c += 1 # mask_invalid_locations(diag_sums.unsqueeze(0).unsqueeze(2).float(), w, 1, True) - return diag_sums.long() + return diag_sums def compute_bias(self, qlen, klen): """ Compute binned relative position bias """ context_position = torch.arange(qlen, dtype=torch.long)[:, None] memory_position = torch.arange(klen, dtype=torch.long)[None, :] relative_position = memory_position - context_position # shape (qlen, klen) + relative_position = self._smaller_score_matrix(relative_position, qlen, w=self.attention_window, bidirectional=not self.is_decoder) rp_bucket = self._relative_position_bucket( relative_position, # shape (qlen, klen) bidirectional=not self.is_decoder, num_buckets=self.relative_attention_num_buckets, ) - rp_bucket = self._smaller_score_matrix(rp_bucket, qlen, w=self.attention_window, bidirectional=not self.is_decoder) rp_bucket = rp_bucket.to(self.relative_attention_bias.weight.device) values = self.relative_attention_bias(rp_bucket) # shape (qlen, klen, num_heads) # values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, qlen, klen) From 1ba5286d6f4126491263fe27cb15806a56dfab8a Mon Sep 17 00:00:00 2001 From: Akshita Bhagia Date: Mon, 7 Dec 2020 23:16:48 -0800 Subject: [PATCH 112/112] fixing compute_bias --- longformer/longformer_t5_encoder_decoder.py | 82 ++++++------------- .../convert_t5_to_longformerencoderdecoder.py | 1 + 2 files changed, 26 insertions(+), 57 deletions(-) diff --git a/longformer/longformer_t5_encoder_decoder.py b/longformer/longformer_t5_encoder_decoder.py index 3ab9caa..85cf786 100644 --- a/longformer/longformer_t5_encoder_decoder.py +++ b/longformer/longformer_t5_encoder_decoder.py @@ -87,81 +87,49 @@ def _relative_position_bucket(relative_position, bidirectional=True, num_buckets """ Adapted from Mesh Tensorflow: https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 - Translate relative position to a bucket number for relative attention. - The relative position is defined as memory_position - query_position, i.e. - the distance in tokens from the attending position to the attended-to - position. If bidirectional=False, then positive relative positions are - invalid. - We use smaller buckets for small absolute relative_position and larger buckets - for larger absolute relative_positions. All relative positions >=max_distance - map to the same bucket. All relative positions <=-max_distance map to the - same bucket. This should allow for more graceful generalization to longer - sequences than the model has been trained on. + Translate relative position to a bucket number for relative attention. The relative position is defined as + memory_position - query_position, i.e. 
the distance in tokens from the attending position to the attended-to + position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for + small absolute relative_position and larger buckets for larger absolute relative_positions. All relative + positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket. + This should allow for more graceful generalization to longer sequences than the model has been trained on Args: relative_position: an int32 Tensor bidirectional: a boolean - whether the attention is bidirectional num_buckets: an integer max_distance: an integer Returns: - a Tensor with the same shape as relative_position, containing int32 - values in the range [0, num_buckets) + a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets) """ - ret = 0 - n = -relative_position - # Since torch.abs() will not work correctly with converted inf values, explicitly set to a lower value. - # TODO: check this!! - n[n==float('inf')] = max_distance - n[n==float('-inf')] = -max_distance + relative_buckets = 0 if bidirectional: num_buckets //= 2 - ret += (n < 0).to(torch.long) * num_buckets # mtf.to_int32(mtf.less(n, 0)) * num_buckets - n = torch.abs(n) + relative_buckets += (relative_position > 0).to(torch.long) * num_buckets + relative_position = torch.abs(relative_position) else: - n = torch.max(n, torch.zeros_like(n)) - n = n.long() - # now n is in the range [0, inf) + relative_position = -torch.min(relative_position, torch.zeros_like(relative_position)) + # now relative_position is in the range [0, inf) # half of the buckets are for exact increments in positions max_exact = num_buckets // 2 - is_small = n < max_exact + is_small = relative_position < max_exact # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance - val_if_large = max_exact + ( - torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact) + relative_postion_if_large = max_exact + ( + torch.log(relative_position.float() / max_exact) + / math.log(max_distance / max_exact) + * (num_buckets - max_exact) ).to(torch.long) - val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1)) - - ret += torch.where(is_small, n, val_if_large) - return ret + relative_postion_if_large = torch.min( + relative_postion_if_large, torch.full_like(relative_postion_if_large, num_buckets - 1) + ) - @staticmethod - def _smaller_score_matrix(matrix, seq_len, w, bidirectional): - - diag_sums = torch.zeros(seq_len, 2*w+1) - #diag_sums.fill_(float('-inf')) - diag_sums[:, 0:w].fill_(float('-inf')) - diag_sums[:, w+1:].fill_(float('inf')) - last = w+1 if bidirectional else 1 - - c = 0 - for k in range(-w, last): - d = torch.diagonal(matrix, offset=k, dim1=-2, dim2=-1) - if d.nelement(): - if k <= 0: - diag_sums[abs(k):seq_len, c] = d - else: - diag_sums[0:seq_len-k, c] = d - c += 1 - - # mask_invalid_locations(diag_sums.unsqueeze(0).unsqueeze(2).float(), w, 1, True) - return diag_sums + relative_buckets += torch.where(is_small, relative_position, relative_postion_if_large) + return relative_buckets def compute_bias(self, qlen, klen): """ Compute binned relative position bias """ - context_position = torch.arange(qlen, dtype=torch.long)[:, None] - memory_position = torch.arange(klen, dtype=torch.long)[None, :] - relative_position = memory_position - context_position # shape (qlen, klen) - relative_position = 
self._smaller_score_matrix(relative_position, qlen, w=self.attention_window, bidirectional=not self.is_decoder) + relative_position = torch.tensor([[i-self.attention_window for i in range(2*self.attention_window+1)]]) rp_bucket = self._relative_position_bucket( relative_position, # shape (qlen, klen) bidirectional=not self.is_decoder, @@ -189,7 +157,6 @@ def forward( 0: local attention +ve: global attention ''' - if attention_mask is not None: attention_mask = attention_mask.squeeze(dim=2).squeeze(dim=1) key_padding_mask = attention_mask < 0 @@ -284,7 +251,8 @@ def forward( if past_key_value_state is not None: position_bias = position_bias[:, :, -1:, :] - # attention_mask is not the right size; should it even be added? + # TODO: attention_mask should also be the same shape as position_bias. + # Sliding attention window?? # if attention_mask is not None: # position_bias = position_bias + attention_mask # (1, num_heads, seq_len, 2*window+1) diff --git a/scripts/convert_t5_to_longformerencoderdecoder.py b/scripts/convert_t5_to_longformerencoderdecoder.py index 6415cc0..c06930b 100644 --- a/scripts/convert_t5_to_longformerencoderdecoder.py +++ b/scripts/convert_t5_to_longformerencoderdecoder.py @@ -66,6 +66,7 @@ def create_long_model( # replace the `modeling_t5.T5Attention` object with `LongformerSelfAttention` config.attention_window = [attention_window] * config.num_hidden_layers config.attention_dilation = [1] * config.num_hidden_layers + # model.encoder.block = model.encoder.block[:1] for i, layer in enumerate(model.encoder.block): self_attn = layer.layer[0].SelfAttention
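To see what the fixed `compute_bias` in this last patch produces, here is a self-contained shape sketch; the sizes are made up and a random tensor stands in for `_relative_position_bucket`:

```
# Every query position shares the same 2*w+1 relative offsets [-w, ..., w], so one row of
# buckets is enough; after the permute/unsqueeze the bias broadcasts over batch and seq_len.
import torch
from torch import nn

w, num_heads, num_buckets = 4, 8, 32
relative_position = torch.tensor([[i - w for i in range(2 * w + 1)]])   # (1, 2*w+1)
rp_bucket = torch.randint(0, num_buckets, relative_position.shape)      # stand-in for _relative_position_bucket
relative_attention_bias = nn.Embedding(num_buckets, num_heads)
values = relative_attention_bias(rp_bucket)                             # (1, 2*w+1, num_heads)
values = values.permute([0, 2, 1]).unsqueeze(0)                         # (1, 1, num_heads, 2*w+1)
print(values.shape)   # broadcasts against attn_weights of shape (bsz, seq_len, num_heads, 2*w+1)
```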