diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 47bd6d223..f963c7d2f 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -136,6 +136,16 @@ jobs:
             priority: preemptible
             resources:
               gpuCount: 1
+            constraints:
+              cluster:
+                - ai2/general-cirrascale
+                - ai2/general-cirrascale-a100-80g-ib
+                - ai2/allennlp-cirrascale
+                - ai2/aristo-cirrascale
+                - ai2/mosaic-cirrascale
+                - ai2/mosaic-cirrascale-a100
+                - ai2/prior-cirrascale
+                - ai2/s2-cirrascale
             envVars:
               - name: COMMIT_SHA
                 value: ${{ env.COMMIT_SHA }}
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 82a3a6ea9..7051173b4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,4 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
-- Added GPT-based model, tokenizer, data pipeline, and `composer` training script.
+- GPT-based model.
+- Tokenizer and data pre-processing pipeline.
+- `composer` training script.
+- Triton-based FlashAttention.
diff --git a/Makefile b/Makefile
index e496a9224..07393fedb 100644
--- a/Makefile
+++ b/Makefile
@@ -21,16 +21,23 @@ beaker-info :
 	@echo "Gantry image: $(GANTRY_IMAGE)"
 	@echo "Testing image: $(TEST_IMAGE)"
 
+.PHONY : images
+images : gantry-image test-image
+
+.PHONY : base-image
+base-image :
+	docker build -f docker/Dockerfile.base -t $(IMAGE_NAME_BASE)-base .
+
 .PHONY : gantry-image
-gantry-image :
-	docker build -f Dockerfile.gantry -t $(IMAGE_NAME_BASE)-gantry .
+gantry-image : base-image
+	docker build -f docker/Dockerfile.gantry -t $(IMAGE_NAME_BASE)-gantry .
 	beaker image create $(IMAGE_NAME_BASE)-gantry --name $(IMAGE_NAME_BASE)-gantry-tmp --workspace $(BEAKER_WORKSPACE)
 	beaker image delete $(GANTRY_IMAGE) || true
 	beaker image rename $(BEAKER_USER)/$(IMAGE_NAME_BASE)-gantry-tmp $(IMAGE_NAME_BASE)-gantry
 
 .PHONY : test-image
-test-image :
-	docker build -f Dockerfile.test -t $(IMAGE_NAME_BASE)-test .
+test-image : base-image
+	docker build -f docker/Dockerfile.test -t $(IMAGE_NAME_BASE)-test .
 	beaker image create $(IMAGE_NAME_BASE)-test --name $(IMAGE_NAME_BASE)-test-tmp --workspace $(BEAKER_WORKSPACE)
 	beaker image delete $(TEST_IMAGE) || true
 	beaker image rename $(BEAKER_USER)/$(IMAGE_NAME_BASE)-test-tmp $(IMAGE_NAME_BASE)-test
diff --git a/docker/Dockerfile.base b/docker/Dockerfile.base
new file mode 100644
index 000000000..83411da3f
--- /dev/null
+++ b/docker/Dockerfile.base
@@ -0,0 +1,15 @@
+# Defines a CUDA-enabled Docker image suitable for installing all dependencies
+# of this project.
+
+FROM ghcr.io/allenai/pytorch:1.13.1-cuda11.7-python3.10
+
+# We need cuda dev for the old version of triton.
+# NOTE: once we're able to upgrade triton to >=2.0, we can remove this.
+RUN /opt/conda/bin/conda install -c nvidia cuda-libraries-dev
+
+# Install flash attn (and triton dependency) from our pre-built wheel.
+RUN /opt/conda/bin/pip install --no-cache-dir \
+    triton==2.0.0.dev20221202 \
+    https://storage.googleapis.com/ai2-python-wheels/flash_attn/flash_attn-0.2.8%2Bcu117torch1.13.1-cp310-cp310-linux_x86_64.whl
+
+ENV CUDA_HOME=/opt/conda
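Note (not part of the patch): the Gantry and test images below now build `FROM dolma-base` instead of pulling the upstream PyTorch image directly, so that base image has to exist locally before they will build. The new `base-image` Makefile target above builds it, tagging it `$(IMAGE_NAME_BASE)-base` (presumably `dolma-base`), and both `gantry-image` and `test-image` now depend on `base-image`, so `make images` builds everything in the right order.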
diff --git a/Dockerfile.gantry b/docker/Dockerfile.gantry
similarity index 86%
rename from Dockerfile.gantry
rename to docker/Dockerfile.gantry
index c4ec30402..1387ebe66 100644
--- a/Dockerfile.gantry
+++ b/docker/Dockerfile.gantry
@@ -4,7 +4,7 @@
 # To build and push the image to Beaker, run 'make gantry-image'.
 # To test the image after pushing to Beaker, run 'make gantry-test'.
 
-FROM ghcr.io/allenai/pytorch:1.13.1-cuda11.7-python3.10
+FROM dolma-base
 
 WORKDIR /stage
 
diff --git a/Dockerfile.test b/docker/Dockerfile.test
similarity index 87%
rename from Dockerfile.test
rename to docker/Dockerfile.test
index 7cf8adc36..eb301a845 100644
--- a/Dockerfile.test
+++ b/docker/Dockerfile.test
@@ -4,7 +4,7 @@
 #
 # To build and push the image to Beaker, run 'make test-image'.
 
-FROM ghcr.io/allenai/pytorch:1.13.1-cuda11.7-python3.10
+FROM dolma-base
 
 COPY scripts/test_entrypoint.sh /entrypoint.sh
 RUN chmod +x /entrypoint.sh
diff --git a/dolma/config.py b/dolma/config.py
index 13d1206d8..a6bcab722 100644
--- a/dolma/config.py
+++ b/dolma/config.py
@@ -101,12 +101,12 @@ class ModelConfig(BaseConfig):
     mlp_ratio: int = 4
     """
-    The ratio of the inner MLP dimensionality to `d_model`.
+    The ratio of the inner MLP dimensionality to ``d_model``.
     """
 
     alibi: bool = False
     """
-    If `True`, use ALiBi embeddings.
+    If ``True``, use ALiBi embeddings.
     """
 
     alibi_bias_max: float = 8.0
     """
@@ -114,6 +114,11 @@
     Maximum absolute value of ALiBi bias.
     """
 
+    flash_attention: bool = False
+    """
+    If ``True``, use ``FlashAttention``.
+    """
+
     attention_dropout: float = 0.1
     """
     The dropout probability within the attention modules.
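A rough usage sketch of the new config flag (not part of the diff; the field values are illustrative and the rest of `ModelConfig` is not shown here):

```python
from dolma.config import ModelConfig

# Illustrative values only -- not taken from this diff.
config = ModelConfig(
    d_model=1024,
    n_heads=16,             # head dim = 1024 / 16 = 64, one of the sizes FlashAttention supports
    flash_attention=True,   # GPTBlock will then construct FlashAttention instead of TorchAttention
    attention_dropout=0.0,  # FlashAttention currently requires attention_dropout == 0
)
```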
diff --git a/dolma/model.py b/dolma/model.py
index 84003bf2a..823208e2b 100644
--- a/dolma/model.py
+++ b/dolma/model.py
@@ -5,30 +5,40 @@
 """
 
 import math
+from abc import abstractmethod
 from typing import NamedTuple, Optional, cast
 
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from einops import rearrange
 
 from .config import ModelConfig
 
-__all__ = ["SelfAttention", "GPTMLP", "GPTBlock", "DolmaGPT"]
+__all__ = ["TorchAttention", "GPTMLP", "GPTBlock", "DolmaGPT"]
 
 
-class SelfAttention(nn.Module):
+class DolmaAttentionBase(nn.Module):
     def __init__(self, config: ModelConfig):
         super().__init__()
         assert config.d_model % config.n_heads == 0
         self.n_heads = config.n_heads
         self.d_model = config.d_model
+
         # key, query, value projections for all heads, but in a batch
         self.c_attn = nn.Linear(config.d_model, 3 * config.d_model, device=config.init_device)
+        # for param init fn
+        self.c_attn._fused = (0, (self.d_model, 2 * self.d_model))  # type: ignore
+
         # output projection
         self.c_proj = nn.Linear(config.d_model, config.d_model, device=config.init_device)
+        # for param init fn
+        self.c_proj._is_residual = True  # type: ignore
+
         # regularization
         self.attn_dropout = nn.Dropout(config.attention_dropout)
         self.resid_dropout = nn.Dropout(config.residual_dropout)
+
         # optional layer norm for keys and queries.
         self.k_ln: Optional[nn.LayerNorm] = None
         self.q_ln: Optional[nn.LayerNorm] = None
@@ -36,6 +46,19 @@ def __init__(self, config: ModelConfig):
             self.k_ln = nn.LayerNorm(self.d_model, device=config.init_device)
             self.q_ln = nn.LayerNorm(self.d_model, device=config.init_device)
 
+    @abstractmethod
+    def forward(
+        self,
+        x: torch.FloatTensor,
+        attention_bias: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        raise NotImplementedError
+
+
+class TorchAttention(DolmaAttentionBase):
+    def __init__(self, config: ModelConfig):
+        super().__init__(config)
+
     def forward(
         self,
         x: torch.FloatTensor,
@@ -55,8 +78,9 @@ def forward(
 
         # Optionally apply layer norm to keys and queries.
         if self.k_ln is not None and self.q_ln is not None:
-            k = self.k_ln(k)
-            q = self.q_ln(q)
+            dtype = k.dtype
+            k = self.k_ln(k).to(dtype=dtype)
+            q = self.q_ln(q).to(dtype=dtype)
 
         # Move head forward to be next to the batch dim.
         # shape (all): (B, nh, T, hs)
@@ -87,6 +111,55 @@ def forward(
         return y
 
 
+class FlashAttention(DolmaAttentionBase):
+    """
+    Triton implementation of FlashAttention.
+    """
+
+    def __init__(self, config: ModelConfig):
+        from flash_attn import flash_attn_triton  # type: ignore
+
+        super().__init__(config)
+
+        assert self.d_model / self.n_heads in {64, 128}, "FlashAttention requires head dim of 64 or 128 for now"
+        assert config.attention_dropout == 0, "FlashAttention does not support attention dropout for now"
+        self.flash_attn_qkvpacked_func = flash_attn_triton.flash_attn_qkvpacked_func
+
+    def forward(
+        self, x: torch.FloatTensor, attention_bias: Optional[torch.FloatTensor] = None
+    ) -> torch.FloatTensor:
+        """
+        :param x: A tensor of shape `(batch_size, seq_len, d_model)`.
+        :param attention_bias: A tensor of shape `(batch_size, n_heads, seq_len, seq_len)`
+            or an equivalently broadcastable shape. This is used to introduce causal or other biases
+            and it is simply added to the attention scores before the softmax.
+        """
+        # Calculate query, key, values for all heads in batch.
+        # shape: (batch_size, seq_length, d_model * 3)
+        qkv = self.c_attn(x)
+
+        # Optionally apply layer norm to keys and queries.
+        if self.q_ln is not None and self.k_ln is not None:
+            # Applying layernorm to qk
+            dtype = qkv.dtype
+            q, k, v = qkv.split(self.d_model, dim=-1)
+            q = self.q_ln(q).to(dtype=dtype)
+            k = self.k_ln(k).to(dtype=dtype)
+            qkv = torch.cat([q, k, v], dim=-1)
+
+        # Apply inner attention function.
+        qkv = rearrange(qkv, "b s (t h d) -> b s t h d", t=3, h=self.n_heads)
+        y = self.flash_attn_qkvpacked_func(qkv, attention_bias)
+
+        # Re-assemble all head outputs side by side.
+        y = rearrange(y, "b s h d -> b s (h d)")
+
+        # Apply output projection.
+        y = self.resid_dropout(self.c_proj(y))
+
+        return y
+
+
 class GPTMLP(nn.Module):
     def __init__(self, config: ModelConfig):
         super().__init__()
@@ -103,8 +176,11 @@ def forward(self, x):
 class GPTBlock(nn.Module):
     def __init__(self, config: ModelConfig):
         super().__init__()
+        self.config = config
         self.ln_1 = nn.LayerNorm(config.d_model, device=config.init_device)
-        self.attn = SelfAttention(config)
+        self.attn: DolmaAttentionBase = (
+            FlashAttention(config) if config.flash_attention else TorchAttention(config)
+        )
         self.ln_2 = nn.LayerNorm(config.d_model, device=config.init_device)
         self.mlp = GPTMLP(config)
 
@@ -357,16 +433,40 @@ def param_init_fn(self, module):
 
         init_fn = partial(torch.nn.init.normal_, mean=0.0, std=self.config.init_std)
 
+        def fused_init_fn(module):
+            # Parameter initialization is often based on the parameter's shape.
+            # If a layer is fused, initialization should be based on the shapes
+            # of the original tensor instead of the shape of the fused tensor.
+            # Layers that are fused should have the _fused attribute defined.
+            # The first element of _fused is the dimension along which the tensor is fused.
+            # This is followed by an iterable of split indices.
+            _fused = getattr(module, "_fused", None)
+            if _fused is None:
+                raise RuntimeError("Internal logic error")
+
+            dim, splits = _fused
+            splits = (0, *splits, module.weight.size(dim))
+            for s, e in zip(splits[:-1], splits[1:]):
+                slice_indices = [slice(None)] * module.weight.ndim
+                slice_indices[dim] = slice(s, e)
+                init_fn(module.weight[slice_indices])
+
         # Linear
         if isinstance(module, nn.Linear):
-            init_fn(module.weight)
+            if hasattr(module, "_fused"):
+                fused_init_fn(module)
+            else:
+                init_fn(module.weight)
+
             if module.bias is not None:
                 torch.nn.init.zeros_(module.bias)
 
             if getattr(module, "_is_residual", False):
-                module.weight.data.normal_(
-                    mean=0.0, std=(self.config.init_std / math.sqrt(2 * self.config.n_layers))
-                )
+                with torch.no_grad():
+                    module.weight.div_(math.sqrt(2 * self.config.n_layers))
+
+            if module.bias is not None:
+                torch.nn.init.zeros_(module.bias)
 
         # Embedding
         if isinstance(module, nn.Embedding):
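A quick aside on the `_fused` bookkeeping used by `param_init_fn` above: `c_attn` packs the query, key, and value projections into a single `nn.Linear`, so its weight has shape `(3 * d_model, d_model)` and is tagged with `_fused = (0, (d_model, 2 * d_model))`. The stand-alone sketch below (toy `d_model`, plain normal init as a stand-in for the configured `init_fn`) shows how each block then gets initialized as if it were its own `(d_model, d_model)` matrix rather than one oversized fused tensor.

```python
import torch
import torch.nn as nn

d_model = 8
c_attn = nn.Linear(d_model, 3 * d_model)  # weight shape: (3 * d_model, d_model)
dim, splits = 0, (d_model, 2 * d_model)   # what the model stores in c_attn._fused

# Pad the split points with the start and end of the fused dimension, then
# initialize each (d_model, d_model) block independently.
splits = (0, *splits, c_attn.weight.size(dim))
for s, e in zip(splits[:-1], splits[1:]):
    slice_indices = [slice(None)] * c_attn.weight.ndim
    slice_indices[dim] = slice(s, e)
    torch.nn.init.normal_(c_attn.weight[tuple(slice_indices)], mean=0.0, std=0.02)
```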
diff --git a/requirements.txt b/requirements.txt
index 23c0e2d78..61e8555ec 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,6 +2,7 @@
 # Docker images. See each Dockerfile for details on how to do that.
 numpy
 torch
+einops
 # bug with 0.13.0, see https://github.com/mosaicml/composer/issues/2030
 mosaicml!=0.13.0
 torchmetrics
@@ -12,3 +13,6 @@ cached-path
 beaker-gantry
 omegaconf
 wandb
+# Can't install these on a CPU-only environment:
+# triton
+# flash-attn
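`einops` is pulled in because `FlashAttention.forward` above uses `rearrange` to convert between the packed `(batch, seq, 3 * d_model)` projection and the `(batch, seq, 3, n_heads, head_dim)` layout the model passes to `flash_attn_qkvpacked_func`. A shape-only sketch with toy sizes (not taken from the diff):

```python
import torch
from einops import rearrange

batch_size, seq_len, n_heads, head_dim = 2, 5, 4, 16
d_model = n_heads * head_dim

# Packed output of c_attn: query, key, and value concatenated along the last dim.
qkv = torch.randn(batch_size, seq_len, 3 * d_model)

# Unpack into (..., 3, n_heads, head_dim) the same way FlashAttention.forward does.
qkv = rearrange(qkv, "b s (t h d) -> b s t h d", t=3, h=n_heads)
assert qkv.shape == (batch_size, seq_len, 3, n_heads, head_dim)

# After attention the per-head outputs are flattened back to d_model
# (using the query slice here just to demonstrate the shape).
y = rearrange(qkv[:, :, 0], "b s h d -> b s (h d)")
assert y.shape == (batch_size, seq_len, d_model)
```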
diff --git a/tests/model_test.py b/tests/model_test.py
index b2bd2bb8c..2f8a49442 100644
--- a/tests/model_test.py
+++ b/tests/model_test.py
@@ -7,36 +7,95 @@
 
 @pytest.mark.parametrize(
-    "alibi, cuda",
+    "alibi, flash_attn, cuda, dtype",
     [
-        pytest.param(True, False, id="alibi-emb-cpu"),
-        pytest.param(False, False, id="posit-emb-cpu"),
+        pytest.param(True, False, False, torch.bfloat16, id="alibi-emb-cpu-bf16"),
+        pytest.param(False, False, False, torch.bfloat16, id="posit-emb-cpu-bf16"),
+        pytest.param(True, False, False, torch.float32, id="alibi-emb-cpu-f32"),
+        pytest.param(False, False, False, torch.float32, id="posit-emb-cpu-f32"),
         pytest.param(
+            True,
+            False,
+            True,
+            torch.bfloat16,
+            id="alibi-emb-cuda-bf16",
+            marks=(
+                pytest.mark.gpu,
+                pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Requires CUDA device"),
+            ),
+        ),
+        pytest.param(
+            False,
+            False,
+            True,
+            torch.bfloat16,
+            id="posit-emb-cuda-bf16",
+            marks=(
+                pytest.mark.gpu,
+                pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Requires CUDA device"),
+            ),
+        ),
+        pytest.param(
+            True,
+            True,
+            True,
+            torch.bfloat16,
+            id="alibi-emb-flash-cuda-bf16",
+            marks=(
+                pytest.mark.gpu,
+                pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Requires CUDA device"),
+            ),
+        ),
+        pytest.param(
+            False,
+            True,
+            True,
+            torch.bfloat16,
+            id="posit-emb-flash-cuda-bf16",
+            marks=(
+                pytest.mark.gpu,
+                pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Requires CUDA device"),
+            ),
+        ),
+        pytest.param(
+            True,
             True,
             True,
-            id="alibi-emb-cuda",
+            torch.float16,
+            id="alibi-emb-flash-cuda-f16",
             marks=(
                 pytest.mark.gpu,
-                pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Requires CUDA devices"),
+                pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Requires CUDA device"),
             ),
         ),
         pytest.param(
             False,
             True,
-            id="posit-emb-cuda",
+            True,
+            torch.float16,
+            id="posit-emb-flash-cuda-f16",
             marks=(
                 pytest.mark.gpu,
-                pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Requires CUDA devices"),
+                pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Requires CUDA device"),
             ),
         ),
     ],
 )
-def test_forward(train_config: TrainConfig, tokenizer: Tokenizer, alibi: bool, cuda: bool):
+def test_forward(
+    train_config: TrainConfig, tokenizer: Tokenizer, alibi: bool, flash_attn: bool, cuda: bool, dtype
+):
     torch.manual_seed(0)
 
     train_config.model.alibi = alibi
+    train_config.model.flash_attention = flash_attn
+    if flash_attn:
+        train_config.model.attention_dropout = 0.0
     if cuda:
         train_config.model.init_device = "cuda"
+    else:
+        train_config.model.init_device = "cpu"
+
+    use_amp = dtype in {torch.float16, torch.bfloat16}
 
     model = DolmaGPT(train_config.model).eval()
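The rest of the test changes follow the usual PyTorch mixed-precision pattern: `torch.autocast` around the forward pass, and a `GradScaler` only when a half-precision dtype runs on CUDA. A minimal stand-alone sketch of that pattern (stand-in model and shapes, not the test fixtures):

```python
import torch

# Stand-in model and data; the real tests use DolmaGPT and the tokenizer fixtures.
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.bfloat16
use_amp = dtype in {torch.float16, torch.bfloat16}
scaler = torch.cuda.amp.GradScaler() if (device == "cuda" and use_amp) else None

model = torch.nn.Linear(8, 8).to(device)
x = torch.randn(4, 8, device=device)

# Autocast controls the compute dtype of the forward pass; parameters stay in float32.
with torch.autocast(device_type=device, enabled=use_amp, dtype=dtype if use_amp else None):
    loss = model(x).sum()

# Scale the loss when training in reduced precision on CUDA so small gradients don't underflow.
if scaler is not None:
    scaler.scale(loss).backward()
else:
    loss.backward()
```

In real training the scaler would also drive `scaler.step(optimizer)` and `scaler.update()`; the test, like this sketch, only runs backward and then checks that gradients exist.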
@@ -52,34 +111,123 @@ def test_forward(train_config: TrainConfig, tokenizer: Tokenizer, alibi: bool, c
         k: v.to(device=train_config.device) if isinstance(v, torch.Tensor) else v
         for k, v in batch_inputs.items()
     }
 
-    # Check that logits from individual inputs are equal to logits from batch.
+    # Run forward pass.
     with torch.inference_mode():
-        output1 = model(torch.tensor(input1, device=train_config.device).unsqueeze(0))
-        output2 = model(torch.tensor(input2, device=train_config.device).unsqueeze(0))
-        batch_output = model(**batch_inputs)
+        with torch.autocast(
+            device_type="cuda" if cuda else "cpu", enabled=use_amp, dtype=None if not use_amp else dtype
+        ):
+            output1 = model(torch.tensor(input1, device=train_config.device).unsqueeze(0))
+            output2 = model(torch.tensor(input2, device=train_config.device).unsqueeze(0))
+            batch_output = model(**batch_inputs)
 
-    torch.testing.assert_close(output1.logits[0][: len(input1)], batch_output.logits[0][: len(input1)])
-    torch.testing.assert_close(output2.logits[0][: len(input2)], batch_output.logits[1][: len(input2)])
+    # Check that logits from individual inputs are equal to logits from batch.
+    # When using half-precision dtypes, a small percentage of the elements can
+    # differ substantially.
+    atol = 1e-2 if use_amp else None
+    rtol = 1e3 if use_amp else None
+    torch.testing.assert_close(
+        output1.logits[0][: len(input1)], batch_output.logits[0][: len(input1)], rtol=rtol, atol=atol
+    )
+    torch.testing.assert_close(
+        output2.logits[0][: len(input2)], batch_output.logits[1][: len(input2)], rtol=rtol, atol=atol
+    )
 
 
-@pytest.mark.parametrize("alibi", [pytest.param(True, id="alibi-emb"), pytest.param(False, id="posit-emb")])
-def test_backward(train_config: TrainConfig, tokenizer: Tokenizer, alibi: bool):
+@pytest.mark.parametrize(
+    "alibi, flash_attn, cuda, dtype",
+    [
+        pytest.param(True, False, False, torch.bfloat16, id="alibi-emb-cpu-bf16"),
+        pytest.param(False, False, False, torch.bfloat16, id="posit-emb-cpu-bf16"),
+        pytest.param(
+            True,
+            False,
+            True,
+            torch.bfloat16,
+            id="alibi-emb-cuda-bf16",
+            marks=(
+                pytest.mark.gpu,
+                pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Requires CUDA device"),
+            ),
+        ),
+        pytest.param(
+            False,
+            False,
+            True,
+            torch.bfloat16,
+            id="posit-emb-cuda-bf16",
+            marks=(
+                pytest.mark.gpu,
+                pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Requires CUDA device"),
+            ),
+        ),
+        pytest.param(
+            True,
+            True,
+            True,
+            torch.bfloat16,
+            id="alibi-emb-flash-cuda-bf16",
+            marks=(
+                pytest.mark.gpu,
+                pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Requires CUDA device"),
+                pytest.mark.skipif(
+                    torch.cuda.device_count() < 1 or "A100" not in torch.cuda.get_device_name(),
+                    reason="Requires A100 GPU type",
+                ),
+            ),
+        ),
+        pytest.param(
+            False,
+            True,
+            True,
+            torch.bfloat16,
+            id="posit-emb-flash-cuda-bf16",
+            marks=(
+                pytest.mark.gpu,
+                pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Requires CUDA device"),
+                pytest.mark.skipif(
+                    torch.cuda.device_count() < 1 or "A100" not in torch.cuda.get_device_name(),
+                    reason="Requires A100 GPU type",
+                ),
+            ),
+        ),
+    ],
+)
+def test_backward(
+    train_config: TrainConfig, tokenizer: Tokenizer, alibi: bool, flash_attn: bool, cuda: bool, dtype
+):
     torch.manual_seed(0)
 
+    use_amp = dtype in {torch.float16, torch.bfloat16}
+    scaler = None if not (cuda and use_amp) else torch.cuda.amp.GradScaler()
+
     train_config.model.alibi = alibi
+    train_config.model.flash_attention = flash_attn
+    if flash_attn:
+        train_config.model.attention_dropout = 0.0
+    if cuda:
+        train_config.model.init_device = "cuda"
+    else:
+        train_config.model.init_device = "cpu"
+
     model = DolmaGPT(train_config.model).train()
 
-    # Forward pass to get logits.
-    input_ids = torch.tensor(tokenizer.encode("My name is DOLMA!"), device=train_config.device).unsqueeze(0)
-    logits = model(input_ids).logits
+    with torch.autocast(
+        device_type="cuda" if cuda else "cpu", enabled=use_amp, dtype=None if not use_amp else dtype
+    ):
+        # Forward pass to get logits.
+        input_ids = torch.tensor(tokenizer.encode("My name is DOLMA!"), device=train_config.device).unsqueeze(0)
+        logits = model(input_ids).logits
 
-    # Compute loss.
-    shift_logits = logits[..., :-1, :].contiguous()
-    shift_labels = input_ids[..., 1:].contiguous()
-    loss = CrossEntropyLoss()(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+        # Compute loss.
+        shift_logits = logits[..., :-1, :].contiguous()
+        shift_labels = input_ids[..., 1:].contiguous()
+        loss = CrossEntropyLoss()(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
 
     # Backward pass.
-    loss.backward()
+    if scaler is not None:
+        scaler.scale(loss).backward()  # type: ignore
+    else:
+        loss.backward()
 
     # Check gradients.
     for name, parameter in model.named_parameters():