allenai · dirkgr · Oct 27, 2023 · Oct 25, 2023 · Oct 25, 2023 · Oct 26, 2023
diff --git a/olmo/config.py b/olmo/config.py
@@ -404,6 +404,12 @@ class ModelConfig(BaseConfig):
     See :data:`TrainConfig.precision` instead.
     """
 
+    activation_checkpointing: bool = False
+    """
+    Use activation checkpointing on transformer blocks. You shouldn't set this directly.
+    See :data:`TrainConfig.activation_checkpointing` instead.
+    """
+
 
 class OptimizerType(StrEnum):
     lionw = "lionw"
@@ -808,11 +814,6 @@ class TrainConfig(BaseConfig):
     Settings for compiling the model with ``torch.compile()``.
     """
 
-    activation_checkpointing: bool = False
-    """
-    Use activation checkpointing on transformer blocks.
-    """
-
     fsdp: FSDPConfig = field(default_factory=FSDPConfig)
     """
     Fully sharded data parallel settings.
@@ -853,6 +854,11 @@ class TrainConfig(BaseConfig):
     Stop at a specific step.
     """
 
+    activation_checkpointing: bool = False
+    """
+    Use activation checkpointing on transformer blocks.
+    """
+
     @property
     def autocast_precision(self) -> torch.dtype:
         if self.precision == "amp_bf16":

diff --git a/olmo/model.py b/olmo/model.py
@@ -10,7 +10,8 @@
 import math
 from abc import abstractmethod
 from collections.abc import MutableMapping
-from typing import Dict, List, NamedTuple, Optional, Sequence, Tuple, cast
+from functools import partial
+from typing import Callable, Dict, List, NamedTuple, Optional, Sequence, Tuple, cast
 
 import torch
 import torch.backends.cuda
@@ -741,6 +742,27 @@ def __init__(self, config: ModelConfig, init_params: bool = True):
                     "Embedding size is not a multiple of 128! This could hurt throughput performance.", UserWarning
                 )
 
+        self.__activation_checkpoint_fn: Callable
+        if self.config.activation_checkpointing:
+            # Recomputed forwards must redraw the same dropout masks, so keep RNG state iff dropout is on.
+            preserve_rng_state = (
+                (self.config.attention_dropout != 0.0)
+                or (self.config.embedding_dropout != 0.0)
+                or (self.config.residual_dropout != 0.0)
+            )
+            from torch.utils.checkpoint import checkpoint
+
+            self.__activation_checkpoint_fn = partial(
+                checkpoint,
+                preserve_rng_state=preserve_rng_state,
+                use_reentrant=False,
+            )
+        else:
+            def pass_through_fn(fn, *args, **kwargs):
+                return fn(*args, **kwargs)
+
+            self.__activation_checkpoint_fn = pass_through_fn
+
         torch.backends.cuda.enable_flash_sdp(self.config.flash_attention)
         torch.backends.cuda.enable_mem_efficient_sdp(False)  # this is super slow so make sure torch won't use it
 
@@ -949,7 +971,10 @@ def forward(
             past_key_values or [None] * self.config.n_layers,  # type: ignore
         ):
             # shape: (batch_size, seq_len, d_model)
-            x, cache = block(x, attention_bias=attention_bias, layer_past=layer_past, use_cache=use_cache)
+            x, cache = self.__activation_checkpoint_fn(
+                block, x, attention_bias=attention_bias, layer_past=layer_past, use_cache=use_cache
+            )
+
             if attn_key_values is not None:
                 assert cache is not None
                 attn_key_values.append(cache)

diff --git a/scripts/train.py b/scripts/train.py
@@ -4,7 +4,6 @@
 import logging
 import os
 import sys
-from functools import partial
 from pathlib import Path
 from typing import Optional, TextIO
 
@@ -59,6 +58,7 @@ def main(cfg: TrainConfig) -> None:
 
     # Fill some configuration options.
     cfg.model.precision = cfg.precision
+    cfg.model.activation_checkpointing = cfg.activation_checkpointing
     cfg.device_train_batch_size = cfg.global_train_batch_size // get_world_size()
     assert cfg.device_train_batch_size is not None  # for mypy
     cfg.device_train_grad_accum = cfg.device_train_batch_size // cfg.device_train_microbatch_size
@@ -146,26 +146,6 @@ def dummy_init_fn(module: torch.nn.Module) -> None:
         olmo_model.reset_parameters()
 
     log.info(f"Peak GPU Memory (MB) after FSDP: {int(peak_gpu_memory() or 0)}")
-
-    if cfg.activation_checkpointing:
-        # verify we have FSDP activation support ready by importing:
-        from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
-            CheckpointImpl,
-            apply_activation_checkpointing,
-            checkpoint_wrapper,
-        )
-
-        non_reentrant_wrapper = partial(
-            checkpoint_wrapper,
-            offload_to_cpu=False,
-            checkpoint_impl=CheckpointImpl.NO_REENTRANT,
-        )
-        apply_activation_checkpointing(
-            fsdp_model,
-            checkpoint_wrapper_fn=non_reentrant_wrapper,  # type: ignore
-            check_fn=olmo_model.activation_checkpointing_fn,  # type: ignore
-        )
-
     log.info("Model:")
     log.info(fsdp_model)