diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9e4e8536b..b93d52fb9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## Unreleased
 
+### Changed
+
+- Rename `Olmo` to `OLMo` everywhere in the codebase
+
+### Removed
+
+- Removed `AMDLayerNorm`, since the original layer norm bug has been fixed and we don't need this workaround anymore.
+
+
 ## [v0.2.5](https://github.com/allenai/OLMo/releases/tag/v0.2.5) - 2024-03-06
 
 ### Fixed
@@ -28,10 +37,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Changed legacy checkpoint unsharding to use processes and shared memory instead of threads
 
-### Removed
-
-- Removed `AMDLayerNorm`, since the original layer norm bug has been fixed and we don't need this workaround anymore.
-
 
 ## [v0.2.4](https://github.com/allenai/OLMo/releases/tag/v0.2.4) - 2024-02-02
 
diff --git a/docs/NOTES.md b/docs/NOTES.md
index 6a8f3bfa7..c6611b33a 100644
--- a/docs/NOTES.md
+++ b/docs/NOTES.md
@@ -70,10 +70,10 @@ For example, checkpoints for the run [https://wandb.ai/ai2-llm/c4-small/runs/euo
 You can load a checkpoint like this:
 
 ```python
-from olmo import Olmo, Tokenizer
+from olmo import OLMo, Tokenizer
 
 checkpoint = "gs://ai2-olmo/ai2-llm/c4-small/euox4j8q/step73000-unsharded"
-model = Olmo.from_checkpoint(checkpoint, device="cuda")
+model = OLMo.from_checkpoint(checkpoint, device="cuda")
 tokenizer = Tokenizer.from_checkpoint(checkpoint)
 ```
 
diff --git a/hf_olmo/configuration_olmo.py b/hf_olmo/configuration_olmo.py
index 5b15fa194..cb7670f6c 100644
--- a/hf_olmo/configuration_olmo.py
+++ b/hf_olmo/configuration_olmo.py
@@ -21,8 +21,8 @@ def __init__(self, use_cache: bool = False, **kwargs):
         all_kwargs.update({"use_cache": use_cache})
         all_kwargs.update(
             {
-                "architectures": all_kwargs.get("architectures", ["OlmoModelForCausalLM"])
-                or ["OlmoModelForCausalLM"]
+                "architectures": all_kwargs.get("architectures", ["OLMoModelForCausalLM"])
+                or ["OLMoModelForCausalLM"]
             }
         )
         super().__init__(**all_kwargs)
diff --git a/hf_olmo/modeling_olmo.py b/hf_olmo/modeling_olmo.py
index 6a279cb10..a1cc569f7 100644
--- a/hf_olmo/modeling_olmo.py
+++ b/hf_olmo/modeling_olmo.py
@@ -7,7 +7,7 @@
 from transformers.models.auto import AutoModelForCausalLM
 
 from olmo.config import ModelConfig
-from olmo.model import Olmo
+from olmo.model import OLMo
 
 from .configuration_olmo import OLMoConfig
 
@@ -34,14 +34,14 @@ class OLMoForCausalLM(PreTrainedModel):
     base_model_prefix = "model"
     _no_split_modules = ["OLMoBlock"]
 
-    def __init__(self, config: OLMoConfig, model: Optional[Olmo] = None, init_params: bool = False):
+    def __init__(self, config: OLMoConfig, model: Optional[OLMo] = None, init_params: bool = False):
         super().__init__(config)
 
         if not model:
             model_config = create_model_config_from_pretrained_config(config)
             # Initialize model (always on CPU to start with so we don't run out of GPU memory).
             model_config.init_device = "cpu"
-            self.model = Olmo(model_config, init_params=init_params)
+            self.model = OLMo(model_config, init_params=init_params)
         else:
             self.model = model
 
diff --git a/hf_olmo/tokenization_olmo_fast.py b/hf_olmo/tokenization_olmo_fast.py
index e2bd665d1..19543a6c7 100644
--- a/hf_olmo/tokenization_olmo_fast.py
+++ b/hf_olmo/tokenization_olmo_fast.py
@@ -4,7 +4,7 @@
 
 
 class OLMoTokenizerFast(PreTrainedTokenizerFast):
-    # Note: Olmo's tokenizer is already a wrapper around huggingface. This is potentially unnecessary.
+    # Note: OLMo's tokenizer is already a wrapper around huggingface. This is potentially unnecessary.
     pass
 
     # def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
diff --git a/inference/NOTES.md b/inference/NOTES.md
index 0af0f9d09..ea57a322a 100644
--- a/inference/NOTES.md
+++ b/inference/NOTES.md
@@ -45,12 +45,12 @@ To add an `olmo.py` module, we can basically just imitate what was done for othe
 There's one important wrinkle here: some OLMo models use *fused linear attention*. I'm not sure how GPTQ handles this or whether any existing supported models implement attention the same way. This might be something to discuss with Dirk and Pete.
 
 ```python
-Olmo(
+OLMo(
   (transformer): ModuleDict(
     (wte): Embedding(50304, 768)
     (emb_drop): Dropout(p=0.1, inplace=False)
     (blocks): ModuleList(
-      (0-11): 12 x OlmoSequentialBlock(
+      (0-11): 12 x OLMoSequentialBlock(
         (dropout): Dropout(p=0.1, inplace=False)
         (norm): LayerNorm()
         (act): SwiGLU()
diff --git a/inference/compression/dependencies/AutoGPTQ/auto_gptq/modeling/auto.py b/inference/compression/dependencies/AutoGPTQ/auto_gptq/modeling/auto.py
index 46ce32e69..9beb2ff33 100644
--- a/inference/compression/dependencies/AutoGPTQ/auto_gptq/modeling/auto.py
+++ b/inference/compression/dependencies/AutoGPTQ/auto_gptq/modeling/auto.py
@@ -13,7 +13,7 @@
 from .internlm import InternLMGPTQForCausalLM
 from .llama import LlamaGPTQForCausalLM
 from .moss import MOSSGPTQForCausalLM
-from .olmo import OlmoGPTQForCausalLM
+from .olmo import OLMoGPTQForCausalLM
 from .opt import OPTGPTQForCausalLM
 from .qwen import QwenGPTQForCausalLM
 from .rw import RWGPTQForCausalLM
@@ -24,7 +24,7 @@
     "gptj": GPTJGPTQForCausalLM,
     "gpt2": GPT2GPTQForCausalLM,
     "llama": LlamaGPTQForCausalLM,
-    "olmo": OlmoGPTQForCausalLM,
+    "olmo": OLMoGPTQForCausalLM,
     "opt": OPTGPTQForCausalLM,
     "moss": MOSSGPTQForCausalLM,
     "gpt_bigcode": GPTBigCodeGPTQForCausalLM,
diff --git a/inference/compression/dependencies/AutoGPTQ/auto_gptq/modeling/olmo.py b/inference/compression/dependencies/AutoGPTQ/auto_gptq/modeling/olmo.py
index 0bf18fc8c..01264bfdb 100644
--- a/inference/compression/dependencies/AutoGPTQ/auto_gptq/modeling/olmo.py
+++ b/inference/compression/dependencies/AutoGPTQ/auto_gptq/modeling/olmo.py
@@ -1,7 +1,7 @@
 from ._base import *
 
 
-class OlmoGPTQForCausalLM(BaseGPTQForCausalLM):
+class OLMoGPTQForCausalLM(BaseGPTQForCausalLM):
     # Attribute name of Transformer layer block.
     layers_block_name = "model.transformer.blocks"
 
@@ -19,4 +19,4 @@ class OlmoGPTQForCausalLM(BaseGPTQForCausalLM):
     inside_layer_modules = [["att_proj"], ["attn_out"], ["ff_proj"], ["ff_out"]]
 
 
-__all__ = ["OlmoGPTQForCausalLM"]
+__all__ = ["OLMoGPTQForCausalLM"]
diff --git a/inference/compression/olmo_gptq_class.py b/inference/compression/olmo_gptq_class.py
index 645349d7b..0f6580a59 100644
--- a/inference/compression/olmo_gptq_class.py
+++ b/inference/compression/olmo_gptq_class.py
@@ -1,7 +1,7 @@
 from auto_gptq.modeling._base import BaseGPTQForCausalLM
 
 
-class OlmoGPTQForCausalLM(BaseGPTQForCausalLM):
+class OLMoGPTQForCausalLM(BaseGPTQForCausalLM):
     # Attribute name of Transformer layer block.
     layers_block_name = "model.transformer.blocks"
 
@@ -17,12 +17,12 @@ class OlmoGPTQForCausalLM(BaseGPTQForCausalLM):
     inside_layer_modules = [["att_proj"], ["attn_out"], ["ff_proj"], ["ff_out"]]
 
 
-__all__ = ["OlmoGPTQForCausalLM"]
+__all__ = ["OLMoGPTQForCausalLM"]
 
 
 # NOTE: In progress; may change if OLMo model is updated.
-# class OlmoGPTQForCausalLM(BaseGPTQForCausalLM):
+# class OLMoGPTQForCausalLM(BaseGPTQForCausalLM):
 #     # Attribute name of Transformer layer block.
 #     layers_block_name = "transformer.blocks"  # NOTE(wadden) Correct
 #
@@ -51,4 +51,4 @@ class OlmoGPTQForCausalLM(BaseGPTQForCausalLM):
 #     ]
 
 
-# __all__ = ["OlmoGPTQForCausalLM"]
+# __all__ = ["OLMoGPTQForCausalLM"]
diff --git a/olmo/config.py b/olmo/config.py
index d9e257f88..a68a10a99 100644
--- a/olmo/config.py
+++ b/olmo/config.py
@@ -23,7 +23,7 @@
 from torch.distributed.fsdp import MixedPrecision, ShardingStrategy
 
 from .aliases import PathOrStr
-from .exceptions import OlmoConfigurationError
+from .exceptions import OLMoConfigurationError
 from .util import StrEnum
 
 __all__ = [
@@ -116,7 +116,7 @@ def new(cls: Type[C], **kwargs) -> C:
             conf = om.merge(conf, kwargs)
             return cast(C, om.to_object(conf))
         except OmegaConfBaseException as e:
-            raise OlmoConfigurationError(str(e))
+            raise OLMoConfigurationError(str(e))
 
     @classmethod
     def load(
@@ -139,7 +139,7 @@ def load(
                 conf = om.merge(conf, om.from_dotlist(overrides))
             return cast(C, om.to_object(conf))
         except OmegaConfBaseException as e:
-            raise OlmoConfigurationError(str(e))
+            raise OLMoConfigurationError(str(e))
 
     def save(self, path: PathOrStr) -> None:
         """Save to a YAML file."""
diff --git a/olmo/data/__init__.py b/olmo/data/__init__.py
index 52421b57a..7d8fbb56b 100644
--- a/olmo/data/__init__.py
+++ b/olmo/data/__init__.py
@@ -5,7 +5,7 @@
 
 from ..aliases import PathOrStr
 from ..config import DataConfig, TrainConfig
-from ..exceptions import OlmoConfigurationError
+from ..exceptions import OLMoConfigurationError
 from ..torch_util import barrier, get_global_rank, get_world_size
 from .collator import DataCollator
 from .iterable_dataset import IterableDataset
@@ -21,7 +21,7 @@ def build_memmap_dataset(
     metadata: List[Dict[str, Any]] = []
     if data_config.paths:
         if data_config.datasets:
-            raise OlmoConfigurationError("DataConfig.paths is mutually exclusive with DataConfig.datasets")
+            raise OLMoConfigurationError("DataConfig.paths is mutually exclusive with DataConfig.datasets")
         paths = data_config.paths
         for path in paths:
             metadata.append({"path": str(path)})
@@ -32,7 +32,7 @@ def build_memmap_dataset(
             paths.extend(label_paths)
             metadata.extend([{"label": label}] * len(label_paths))
     else:
-        raise OlmoConfigurationError("One of DataConfig.paths or DataConfig.datasets is required")
+        raise OLMoConfigurationError("One of DataConfig.paths or DataConfig.datasets is required")
     return MemMapDataset(
         *paths,
         chunk_size=train_config.model.max_sequence_length,
@@ -87,7 +87,7 @@ def build_train_dataloader(train_config: TrainConfig) -> DataLoader:
     work_dir = Path(train_config.save_folder) / "train_data"
     if get_global_rank() == 0:
         if work_dir.is_dir() and not train_config.save_overwrite:
-            raise OlmoConfigurationError(
+            raise OLMoConfigurationError(
                 "train data working directory already exists, use --save_overwrite to overwrite"
             )
         else:
diff --git a/olmo/data/memmap_dataset.py b/olmo/data/memmap_dataset.py
index 5af73c277..c00f29e06 100644
--- a/olmo/data/memmap_dataset.py
+++ b/olmo/data/memmap_dataset.py
@@ -7,7 +7,7 @@
 import torch
 from torch.utils.data import Dataset
 
-from olmo.exceptions import OlmoEnvironmentError
+from olmo.exceptions import OLMoEnvironmentError
 
 from ..aliases import PathOrStr
 from ..util import _get_s3_client, file_size, get_bytes_range
@@ -93,7 +93,7 @@ def offsets(self) -> List[Tuple[int, int]]:
                 _get_s3_client("s3")
                 try:
                     _get_s3_client("r2")
-                except OlmoEnvironmentError:
+                except OLMoEnvironmentError:
                     # R2 might not be needed, so ignore this error. We will get an error
                     # later if R2 is needed.
                     pass
diff --git a/olmo/eval/__init__.py b/olmo/eval/__init__.py
index 748e8e46f..bc8313c78 100644
--- a/olmo/eval/__init__.py
+++ b/olmo/eval/__init__.py
@@ -5,7 +5,7 @@
 from torchmetrics import MeanMetric, Metric
 
 from ..config import EvaluatorConfig, EvaluatorType, TrainConfig
-from ..exceptions import OlmoConfigurationError
+from ..exceptions import OLMoConfigurationError
 from ..tokenizer import Tokenizer
 from ..torch_util import get_global_rank, get_world_size
 from .downstream import ICLMetric, label_to_task_map
@@ -93,7 +93,7 @@ def make_metric():
     elif eval_config.data.datasets:
         eval_metric = {label: make_metric() for label in eval_config.data.datasets.keys()}
     else:
-        raise OlmoConfigurationError("One of DataConfig.paths or DataConfig.datasets is required")
+        raise OLMoConfigurationError("One of DataConfig.paths or DataConfig.datasets is required")
 
     return Evaluator(
         label=eval_config.label,
diff --git a/olmo/exceptions.py b/olmo/exceptions.py
index 754580c95..5474facc3 100644
--- a/olmo/exceptions.py
+++ b/olmo/exceptions.py
@@ -1,37 +1,37 @@
-__all__ = ["OlmoError", "OlmoConfigurationError", "OlmoCliError", "OlmoEnvironmentError", "OlmoNetworkError"]
+__all__ = ["OLMoError", "OLMoConfigurationError", "OLMoCliError", "OLMoEnvironmentError", "OLMoNetworkError"]
 
 
-class OlmoError(Exception):
+class OLMoError(Exception):
     """
     Base class for all custom OLMo exceptions.
     """
 
 
-class OlmoConfigurationError(OlmoError):
+class OLMoConfigurationError(OLMoError):
     """
     An error with a configuration file.
     """
 
 
-class OlmoCliError(OlmoError):
+class OLMoCliError(OLMoError):
     """
     An error from incorrect CLI usage.
     """
 
 
-class OlmoEnvironmentError(OlmoError):
+class OLMoEnvironmentError(OLMoError):
     """
     An error from incorrect environment variables.
     """
 
 
-class OlmoNetworkError(OlmoError):
+class OLMoNetworkError(OLMoError):
     """
     An error with a network request.
     """
 
 
-class OlmoThreadError(Exception):
+class OLMoThreadError(Exception):
     """
     Raised when a thread fails.
     """
diff --git a/olmo/model.py b/olmo/model.py
index f975c7c98..4235f7561 100644
--- a/olmo/model.py
+++ b/olmo/model.py
@@ -42,7 +42,7 @@
     LayerNormType,
     ModelConfig,
 )
-from .exceptions import OlmoConfigurationError
+from .exceptions import OLMoConfigurationError
 from .initialization import ModuleType, init_weights
 from .torch_util import ensure_finite_
 
@@ -62,12 +62,12 @@
     "GELU",
     "ReLU",
     "SwiGLU",
-    "OlmoBlock",
-    "OlmoSequentialBlock",
-    "OlmoParallelBlock",
-    "Olmo",
-    "OlmoOutput",
-    "OlmoGenerateOutput",
+    "OLMoBlock",
+    "OLMoSequentialBlock",
+    "OLMoParallelBlock",
+    "OLMo",
+    "OLMoOutput",
+    "OLMoGenerateOutput",
 ]
 
@@ -386,7 +386,7 @@ def alibi_attention_bias(seq_len: int, config: ModelConfig, device: torch.device
     return alibi_bias * (1.0 / (2 ** m.view(1, config.n_heads, 1, 1)))  # type: ignore
 
 
-class OlmoBlock(nn.Module):
+class OLMoBlock(nn.Module):
     """
     A base class for transformer block implementations.
""" @@ -589,18 +589,18 @@ def forward( raise NotImplementedError @classmethod - def build(cls, layer_id: int, config: ModelConfig, cache: BufferCache) -> OlmoBlock: + def build(cls, layer_id: int, config: ModelConfig, cache: BufferCache) -> OLMoBlock: if config.block_type == BlockType.sequential: - return OlmoSequentialBlock(layer_id, config, cache) + return OLMoSequentialBlock(layer_id, config, cache) elif config.block_type == BlockType.parallel: - return OlmoParallelBlock(layer_id, config, cache) + return OLMoParallelBlock(layer_id, config, cache) elif config.block_type == BlockType.llama: - return OlmoLlamaBlock(layer_id, config, cache) + return OLMoLlamaBlock(layer_id, config, cache) else: raise NotImplementedError(f"Unknown block type: '{config.block_type}'") -class OlmoSequentialBlock(OlmoBlock): +class OLMoSequentialBlock(OLMoBlock): """ This is a typical transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))`` (plus another skip connection). @@ -689,11 +689,11 @@ def forward( return x, cache -class OlmoParallelBlock(OlmoBlock): +class OLMoParallelBlock(OLMoBlock): """ This is a transformer block where the output is computed as ``MLP(LN(x)) + Attention(LN(x))`` as in the PaLM architecture, as opposed to the typical ``MLP(LN(x + Attention(LN(x))))`` - as in :class:`OlmoSequentialBlock` (ignoring some skip connections). + as in :class:`OLMoSequentialBlock` (ignoring some skip connections). The decoupling of the MLP and Attention functions allow us to fuse the separate input projections into a single linear layer to increase throughput. In this configuration it's also straight-forward @@ -781,10 +781,10 @@ def forward( ) -class OlmoLlamaBlock(OlmoBlock): +class OLMoLlamaBlock(OLMoBlock): """ This is a transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))`` - (plus another skip connection). This block is similar to `OlmoSequentialBlock` + (plus another skip connection). This block is similar to `OLMoSequentialBlock` but some operations have slightly different implementations to imitate the behavior of Llama. """ @@ -904,7 +904,7 @@ def forward( return x, cache -class OlmoOutput(NamedTuple): +class OLMoOutput(NamedTuple): logits: torch.FloatTensor """ A tensor of shape `(batch_size, seq_len, vocab_size)` representing the log probabilities @@ -922,7 +922,7 @@ class OlmoOutput(NamedTuple): """ -class OlmoGenerateOutput(NamedTuple): +class OLMoGenerateOutput(NamedTuple): token_ids: torch.LongTensor """ The generated token IDs, a tensor of shape `(batch_size, beam_size, max_steps)`. @@ -935,7 +935,7 @@ class OlmoGenerateOutput(NamedTuple): """ -class OlmoBlockGroup(nn.ModuleList): +class OLMoBlockGroup(nn.ModuleList): def __init__(self, config: ModelConfig, layer_offset: int, modules: Optional[Iterable[nn.Module]] = None): super().__init__(modules) self.config = config @@ -991,7 +991,7 @@ def set_activation_checkpointing(self, strategy: Optional[ActivationCheckpointin block.set_activation_checkpointing(strategy) -class Olmo(nn.Module): +class OLMo(nn.Module): def __init__(self, config: ModelConfig, init_params: bool = True): super().__init__() self.config = config @@ -999,14 +999,14 @@ def __init__(self, config: ModelConfig, init_params: bool = True): # Validate config. 
         if self.config.alibi and self.config.flash_attention:
-            raise OlmoConfigurationError("ALiBi is currently not supported with FlashAttention")
+            raise OLMoConfigurationError("ALiBi is currently not supported with FlashAttention")
 
         if self.config.alibi and self.config.rope:
-            raise OlmoConfigurationError("ALiBi and RoPE are mutually exclusive")
+            raise OLMoConfigurationError("ALiBi and RoPE are mutually exclusive")
 
         if self.config.embedding_size is not None and self.config.embedding_size != self.config.vocab_size:
             if self.config.embedding_size < self.config.vocab_size:
-                raise OlmoConfigurationError("embedding size should be at least as big as vocab size")
+                raise OLMoConfigurationError("embedding size should be at least as big as vocab size")
             elif self.config.embedding_size % 128 != 0:
                 import warnings
 
@@ -1021,7 +1021,7 @@ def __init__(self, config: ModelConfig, init_params: bool = True):
             0 < self.config.block_group_size <= self.config.n_layers
             and self.config.n_layers % self.config.block_group_size == 0
         ):
-            raise OlmoConfigurationError("n layers must be divisible by block group size")
+            raise OLMoConfigurationError("n layers must be divisible by block group size")
 
         torch.backends.cuda.enable_flash_sdp(self.config.flash_attention)
         torch.backends.cuda.enable_mem_efficient_sdp(False)  # this is super slow so make sure torch won't use it
@@ -1036,10 +1036,10 @@ def __init__(self, config: ModelConfig, init_params: bool = True):
             )
         )
 
-        blocks = [OlmoBlock.build(i, config, self.__cache) for i in range(config.n_layers)]
+        blocks = [OLMoBlock.build(i, config, self.__cache) for i in range(config.n_layers)]
         if self.config.block_group_size > 1:
             block_groups = [
-                OlmoBlockGroup(config, i, blocks[i : i + config.block_group_size])
+                OLMoBlockGroup(config, i, blocks[i : i + config.block_group_size])
                 for i in range(0, config.n_layers, config.block_group_size)
             ]
             self.transformer.update({"block_groups": nn.ModuleList(block_groups)})
@@ -1138,7 +1138,7 @@ def forward(
         use_cache: bool = False,
         last_logits_only: bool = False,
         output_hidden_states: Optional[bool] = None,
-    ) -> OlmoOutput:
+    ) -> OLMoOutput:
         """
         :param input_ids: A tensor of shape `(batch_size, seq_len)`.
         :param input_embeddings: A tensor of shape `(batch_size, seq_len, d_model)` with input
@@ -1316,7 +1316,7 @@ def forward(
         if self.config.scale_logits:
             logits.mul_(1 / math.sqrt(self.config.d_model))
 
-        return OlmoOutput(logits=logits, attn_key_values=attn_key_values, hidden_states=tuple(all_hidden_states) if output_hidden_states else None)  # type: ignore[arg-type]
+        return OLMoOutput(logits=logits, attn_key_values=attn_key_values, hidden_states=tuple(all_hidden_states) if output_hidden_states else None)  # type: ignore[arg-type]
 
     def get_fsdp_wrap_policy(self, wrap_strategy: Optional[FSDPWrapStrategy] = None):
         if wrap_strategy is None:
@@ -1336,7 +1336,7 @@ def get_fsdp_wrap_policy(self, wrap_strategy: Optional[FSDPWrapStrategy] = None)
 
             def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0):
                 del nonwrapped_numel
-                wrap = isinstance(module, OlmoBlock)
+                wrap = isinstance(module, OLMoBlock)
                 if recurse:
                     return True
                 else:
@@ -1347,7 +1347,7 @@ def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0):
 
             def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0):
                 del nonwrapped_numel
-                wrap = isinstance(module, (OlmoBlock,)) or module in size_based_module_to_wrap
+                wrap = isinstance(module, (OLMoBlock,)) or module in size_based_module_to_wrap
                 if recurse:
                     return True
                 else:
@@ -1356,13 +1356,13 @@ def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0):
             return fsdp_wrap_fn
         elif wrap_strategy == FSDPWrapStrategy.by_block_group:
             if self.config.block_group_size <= 1:
-                raise OlmoConfigurationError(
+                raise OLMoConfigurationError(
                     "'by_block_group' FSDP wrapping strategy requires block group size greater than 1"
                 )
 
             def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0):
                 del nonwrapped_numel
-                wrap = isinstance(module, OlmoBlockGroup)
+                wrap = isinstance(module, OLMoBlockGroup)
                 if recurse:
                     return True
                 else:
@@ -1371,13 +1371,13 @@ def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0):
             return fsdp_wrap_fn
         elif wrap_strategy == FSDPWrapStrategy.by_block_group_and_size:
             if self.config.block_group_size <= 1:
-                raise OlmoConfigurationError(
+                raise OLMoConfigurationError(
                     "'by_block_group_and_size' FSDP wrapping strategy requires block group size greater than 1"
                 )
 
             def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0):
                 del nonwrapped_numel
-                wrap = isinstance(module, (OlmoBlockGroup,)) or module in size_based_module_to_wrap
+                wrap = isinstance(module, (OLMoBlockGroup,)) or module in size_based_module_to_wrap
                 if recurse:
                     return True
                 else:
@@ -1403,7 +1403,7 @@ def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0):
 
             def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0):
                 del nonwrapped_numel
-                wrap = isinstance(module, OlmoBlock) and module.layer_id % c == 0
+                wrap = isinstance(module, OLMoBlock) and module.layer_id % c == 0
                 if recurse:
                     return True
                 else:
@@ -1454,7 +1454,7 @@ def generate(
         min_steps: Optional[int] = None,
        final_sequence_scorer: Optional[FinalSequenceScorer] = None,
        constraints: Optional[List[Constraint]] = None,
-    ) -> OlmoGenerateOutput:
+    ) -> OLMoGenerateOutput:
         """
         Generate token IDs using beam search.
 
@@ -1564,7 +1564,7 @@ def step(
         with torch.no_grad():
             token_ids, scores = beam_search.search(initial_preds, state, step)
 
-        return OlmoGenerateOutput(
+        return OLMoGenerateOutput(
             token_ids=token_ids,  # type: ignore[arg-type]
             scores=scores,  # type: ignore[arg-type]
         )
@@ -1572,7 +1572,7 @@ def step(
     @classmethod
     def from_checkpoint(
         cls, checkpoint_dir: PathOrStr, device: str = "cpu", checkpoint_type: Optional[CheckpointType] = None
-    ) -> Olmo:
+    ) -> OLMo:
         """
         Load an OLMo model from a checkpoint.
         """
@@ -1595,7 +1595,7 @@ def from_checkpoint(
         if checkpoint_type == CheckpointType.unsharded:
             # Initialize model (always on CPU to start with so we don't run out of GPU memory).
             model_config.init_device = "cpu"
-            model = Olmo(model_config)
+            model = OLMo(model_config)
 
             # Load state dict directly to target device.
             state_dict_path = resource_path(checkpoint_dir, "model.pt")
@@ -1608,7 +1608,7 @@ def from_checkpoint(
             # Initialize model on target device. In this case the state dict is loaded in-place
             # so it's not necessary to start on CPU if the target device is a GPU.
             model_config.init_device = device
-            model = Olmo(model_config)
+            model = OLMo(model_config)
 
             # Load state dict in place.
             load_model_state(checkpoint_dir, model)
diff --git a/olmo/tokenizer.py b/olmo/tokenizer.py
index a833d3c21..3ed064097 100644
--- a/olmo/tokenizer.py
+++ b/olmo/tokenizer.py
@@ -8,7 +8,7 @@
 
 from .aliases import PathOrStr
 from .config import ModelConfig, TokenizerConfig, TrainConfig, TruncationDirection
-from .exceptions import OlmoConfigurationError
+from .exceptions import OLMoConfigurationError
 
 __all__ = ["Tokenizer"]
 
@@ -68,7 +68,7 @@ def from_train_config(cls, config: TrainConfig) -> Tokenizer:
                 pad_token_id=config.model.pad_token_id,
             )
         if config.model.vocab_size != tokenizer.vocab_size:
-            raise OlmoConfigurationError("vocab size mismatch between config and tokenizer")
+            raise OLMoConfigurationError("vocab size mismatch between config and tokenizer")
         return tokenizer
 
     @classmethod
@@ -117,7 +117,7 @@ def from_checkpoint(cls, checkpoint_dir: PathOrStr) -> Tokenizer:
             pad_token_id=model_config.pad_token_id,
         )
         if model_config.vocab_size != tokenizer.vocab_size:
-            raise OlmoConfigurationError("vocab size mismatch between config and tokenizer")
+            raise OLMoConfigurationError("vocab size mismatch between config and tokenizer")
         return tokenizer
 
     def add_special_tokens(self, input_ids: List[int]) -> List[int]:
diff --git a/olmo/train.py b/olmo/train.py
index 79132f0fc..43d4ee5fc 100644
--- a/olmo/train.py
+++ b/olmo/train.py
@@ -33,8 +33,8 @@
 )
 from .data import IterableDataset
 from .eval import Evaluator
-from .exceptions import OlmoConfigurationError
-from .model import Olmo
+from .exceptions import OLMoConfigurationError
+from .model import OLMo
 from .optim import Optimizer, Scheduler
 from .torch_util import (
     barrier,
@@ -96,7 +96,7 @@ def check(self) -> Dict[str, float]:
 @dataclass
 class Trainer:
     cfg: TrainConfig
-    model: Olmo
+    model: OLMo
     fsdp_model: FSDP
     optim: Optimizer
     scheduler: Scheduler
@@ -351,7 +351,7 @@ def _save_checkpoint(
                     upload_to=remote_checkpoint_dir,
                 )
             except FileExistsError:
-                raise OlmoConfigurationError(
+                raise OLMoConfigurationError(
                     f"Checkpoint for step {self.global_step} already exists, use --save-overwrite to overwrite it"
                 )
 
diff --git a/olmo/util.py b/olmo/util.py
index c0519e9a1..c13cc8086 100644
--- a/olmo/util.py
+++ b/olmo/util.py
@@ -25,11 +25,11 @@
 
 from .aliases import PathOrStr
 from .exceptions import (
-    OlmoCliError,
-    OlmoEnvironmentError,
-    OlmoError,
-    OlmoNetworkError,
-    OlmoThreadError,
+    OLMoCliError,
+    OLMoEnvironmentError,
+    OLMoError,
+    OLMoNetworkError,
+    OLMoThreadError,
 )
 from .torch_util import get_global_rank, get_local_rank, get_node_rank, is_distributed
 
@@ -150,9 +150,9 @@ def excepthook(exctype, value, traceback):
     """
     if issubclass(exctype, KeyboardInterrupt):
         sys.__excepthook__(exctype, value, traceback)
-    elif issubclass(exctype, OlmoCliError):
+    elif issubclass(exctype, OLMoCliError):
         rich.get_console().print(f"[yellow]{value}[/]", highlight=False)
-    elif issubclass(exctype, OlmoError):
+    elif issubclass(exctype, OLMoError):
         rich.get_console().print(Text(f"{exctype.__name__}:", style="red"), value, highlight=False)
     else:
         log.critical("Uncaught %s: %s", exctype.__name__, value, exc_info=(exctype, value, traceback))
@@ -450,7 +450,7 @@ def _get_s3_profile_name(scheme: str) -> Optional[str]:
     if scheme == "r2":
         profile_name = os.environ.get("R2_PROFILE")
         if profile_name is None:
-            raise OlmoEnvironmentError(
+            raise OLMoEnvironmentError(
                 "R2 profile name is not set. Did you forget to set the 'R2_PROFILE' env var?"
             )
 
@@ -465,7 +465,7 @@ def _get_s3_endpoint_url(scheme: str) -> Optional[str]:
     if scheme == "r2":
         r2_endpoint_url = os.environ.get("R2_ENDPOINT_URL")
         if r2_endpoint_url is None:
-            raise OlmoEnvironmentError(
+            raise OLMoEnvironmentError(
                 "R2 endpoint url is not set. Did you forget to set the 'R2_ENDPOINT_URL' env var?"
             )
 
@@ -511,12 +511,12 @@ def _s3_upload(
             _wait_before_retry(attempt)
 
     if err is not None:
-        raise OlmoNetworkError("Failed to check object existence during s3 upload") from err
+        raise OLMoNetworkError("Failed to check object existence during s3 upload") from err
 
     try:
         _get_s3_client(scheme).upload_file(source, bucket_name, key)
     except boto_exceptions.ClientError as e:
-        raise OlmoNetworkError("Failed to upload to s3") from e
+        raise OLMoNetworkError("Failed to upload to s3") from e
 
 
 def _s3_file_size(scheme: str, bucket_name: str, key: str, max_attempts: int = 3) -> int:
@@ -533,7 +533,7 @@ def _s3_file_size(scheme: str, bucket_name: str, key: str, max_attempts: int = 3
             log.warning("%s failed attempt %d with retriable error: %s", _s3_file_size.__name__, attempt, err)
             _wait_before_retry(attempt)
 
-    raise OlmoNetworkError("Failed to get s3 file size") from err
+    raise OLMoNetworkError("Failed to get s3 file size") from err
 
 
 def _s3_get_bytes_range(
@@ -572,7 +572,7 @@ def _s3_get_bytes_range(
         # This can cause an irrelevant exception (e.g. KeyError: 'error'), resulting
         # in us losing the true exception info. To avoid this, we change the exception
         # to a type that has a single-parameter constructor.
-        raise OlmoNetworkError("Failed to get bytes range from s3") from err
+        raise OLMoNetworkError("Failed to get bytes range from s3") from err
 
 
 def _s3_find_latest_checkpoint(scheme: str, bucket_name: str, prefix: str) -> Optional[str]:
@@ -626,7 +626,7 @@ def fill_queue():
 
     for x in iter(q.get, sentinel):
         if isinstance(x, Exception):
-            raise OlmoThreadError(f"generator thread {thread_name} failed") from x
+            raise OLMoThreadError(f"generator thread {thread_name} failed") from x
         else:
             yield x
 
diff --git a/olmo/version.py b/olmo/version.py
index 3f9d92c5b..e75c8373e 100644
--- a/olmo/version.py
+++ b/olmo/version.py
@@ -1,8 +1,8 @@
 _MAJOR = "0"
-_MINOR = "2"
+_MINOR = "3"
 # On main and in a nightly release the patch should be one ahead of the last
 # released build.
-_PATCH = "5"
+_PATCH = "0"
 # This is mainly for nightly builds which have the suffix ".dev$DATE". See
 # https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = "" diff --git a/scripts/init_config.py b/scripts/init_config.py index 22143d401..22c223d7b 100644 --- a/scripts/init_config.py +++ b/scripts/init_config.py @@ -7,7 +7,7 @@ from typing import List from olmo import TrainConfig -from olmo.exceptions import OlmoCliError +from olmo.exceptions import OLMoCliError from olmo.util import clean_opt, prepare_cli_environment log = logging.getLogger(__name__) @@ -27,6 +27,6 @@ def main(save_path: Path, args_list: List[str]) -> None: try: save_path, args_list = sys.argv[1], sys.argv[2:] except IndexError: - raise OlmoCliError(f"Usage: {sys.argv[0]} [SAVE_PATH] [OPTIONS]") + raise OLMoCliError(f"Usage: {sys.argv[0]} [SAVE_PATH] [OPTIONS]") main(Path(save_path), [clean_opt(s) for s in args_list]) diff --git a/scripts/inspect_train_data.py b/scripts/inspect_train_data.py index 871702b8c..bed8b432f 100644 --- a/scripts/inspect_train_data.py +++ b/scripts/inspect_train_data.py @@ -9,7 +9,7 @@ from olmo.config import TrainConfig from olmo.data import build_memmap_dataset -from olmo.exceptions import OlmoCliError +from olmo.exceptions import OLMoCliError from olmo.tokenizer import Tokenizer from olmo.util import clean_opt, prepare_cli_environment @@ -51,6 +51,6 @@ def main(save_folder: Path, *steps: int, rank: Optional[int] = None): try: save_folder, rank, steps = sys.argv[1], int(sys.argv[2]), [int(i) for i in sys.argv[3:]] except (IndexError, ValueError): - raise OlmoCliError(f"Usage: {sys.argv[0]} [SAVE_FOLDER] [RANK] [STEP_NUMBER...]") + raise OLMoCliError(f"Usage: {sys.argv[0]} [SAVE_FOLDER] [RANK] [STEP_NUMBER...]") main(Path(save_folder), *steps, rank=rank if rank >= 0 else None) diff --git a/scripts/show_model_size.py b/scripts/show_model_size.py index 3740137dc..cf2ca1e22 100644 --- a/scripts/show_model_size.py +++ b/scripts/show_model_size.py @@ -10,8 +10,8 @@ import logging import sys -from olmo import Olmo, TrainConfig -from olmo.exceptions import OlmoCliError +from olmo import OLMo, TrainConfig +from olmo.exceptions import OLMoCliError from olmo.util import clean_opt, prepare_cli_environment log = logging.getLogger(__name__) @@ -23,7 +23,7 @@ def main(cfg: TrainConfig) -> None: n_layers = cfg.model.n_layers cfg.model.n_layers = 1 - single_layer_model = Olmo(cfg.model) + single_layer_model = OLMo(cfg.model) block = single_layer_model.transformer.blocks[0] # type: ignore params_per_block = sum(p.numel() for p in block.parameters()) # type: ignore @@ -42,7 +42,7 @@ def main(cfg: TrainConfig) -> None: try: yaml_path, args_list = sys.argv[1], sys.argv[2:] except IndexError: - raise OlmoCliError(f"Usage: {sys.argv[0]} [CONFIG_PATH] [OPTIONS]") + raise OLMoCliError(f"Usage: {sys.argv[0]} [CONFIG_PATH] [OPTIONS]") cfg = TrainConfig.load( yaml_path, diff --git a/scripts/train.py b/scripts/train.py index fca309a7f..c59bc51f3 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -16,8 +16,8 @@ from olmo.config import CheckpointType, TrainConfig from olmo.data import build_train_dataloader from olmo.eval import build_evaluators -from olmo.exceptions import OlmoCliError, OlmoConfigurationError -from olmo.model import Olmo +from olmo.exceptions import OLMoCliError, OLMoConfigurationError +from olmo.model import OLMo from olmo.optim import BoltOnWarmupScheduler, build_optimizer, build_scheduler from olmo.torch_util import ( barrier, @@ -37,7 +37,7 @@ def main(cfg: TrainConfig) -> None: # Ensure run name set. 
     if cfg.run_name is None:
-        raise OlmoConfigurationError("--run_name is required")
+        raise OLMoConfigurationError("--run_name is required")
     log_extra_field("run_name", cfg.run_name)
 
     # Sanity check
@@ -77,7 +77,7 @@ def main(cfg: TrainConfig) -> None:
     # Save config.
     save_path = Path(cfg.save_folder) / "config.yaml"
     if save_path.is_file() and not cfg.save_overwrite:
-        raise OlmoConfigurationError(f"{save_path} already exists, use --save_overwrite to overwrite")
+        raise OLMoConfigurationError(f"{save_path} already exists, use --save_overwrite to overwrite")
     else:
         log.info(f"Saving config to {save_path}")
         save_path.parent.mkdir(exist_ok=True, parents=True)
@@ -114,7 +114,7 @@ def main(cfg: TrainConfig) -> None:
 
     # Initialize the model.
     log.info("Building model...")
-    olmo_model = Olmo(cfg.model)
+    olmo_model = OLMo(cfg.model)
     log.info(f"Total number of parameters: {olmo_model.num_params():,d}")
     log.info(f"Number of non-embedding parameters: {olmo_model.num_params(include_embedding=False):,d}")
     log.info(f"Peak GPU Memory (MB) before FSDP: {int(peak_gpu_memory() or 0)}")
@@ -159,7 +159,7 @@ def dummy_init_fn(module: torch.nn.Module) -> None:
     if cfg.save_data_indices:
         indices_file_path = Path(cfg.save_folder) / f"data-indices/rank{get_global_rank()}.tsv.gz"
         if indices_file_path.exists() and not cfg.save_overwrite:
-            raise OlmoConfigurationError(f"{indices_file_path} already exists, use --save_overwrite to overwrite")
+            raise OLMoConfigurationError(f"{indices_file_path} already exists, use --save_overwrite to overwrite")
         indices_file_path.parent.mkdir(exist_ok=True, parents=True)
         indices_file = gzip.open(indices_file_path, "wt")
 
@@ -250,7 +250,7 @@ def dummy_init_fn(module: torch.nn.Module) -> None:
     try:
         yaml_path, args_list = sys.argv[1], sys.argv[2:]
     except IndexError:
-        raise OlmoCliError(f"Usage: {sys.argv[0]} [CONFIG_PATH] [OPTIONS]")
+        raise OLMoCliError(f"Usage: {sys.argv[0]} [CONFIG_PATH] [OPTIONS]")
 
     cfg = TrainConfig.load(yaml_path, [clean_opt(s) for s in args_list])
     main(cfg)
diff --git a/test_fixtures/test-olmo-model/config.json b/test_fixtures/test-olmo-model/config.json
index 71e7b981e..352a4c976 100644
--- a/test_fixtures/test-olmo-model/config.json
+++ b/test_fixtures/test-olmo-model/config.json
@@ -3,7 +3,7 @@
   "alibi": false,
   "alibi_bias_max": 8.0,
   "architectures": [
-    "OlmoModelForCausalLM"
+    "OLMoModelForCausalLM"
   ],
   "attention_dropout": 0.1,
   "attention_layer_norm": false,
diff --git a/tests/hf_olmo/hf_olmo_test.py b/tests/hf_olmo/hf_olmo_test.py
index 0b323c4e8..6f70c0090 100644
--- a/tests/hf_olmo/hf_olmo_test.py
+++ b/tests/hf_olmo/hf_olmo_test.py
@@ -3,7 +3,7 @@
 
 from olmo import BlockType, Tokenizer, TrainConfig
 from olmo.data import DataCollator
-from olmo.model import Olmo
+from olmo.model import OLMo
 from olmo.torch_util import seed_all
 
 
@@ -188,7 +188,7 @@ def test_forward(
     use_amp = dtype in {torch.float16, torch.bfloat16}
 
     seed_all(1234)
-    model = Olmo(train_config.model).eval()
+    model = OLMo(train_config.model).eval()
 
     hf_config = OLMoConfig(**model.config.asdict())
 
diff --git a/tests/hf_olmo/modeling_olmo_test.py b/tests/hf_olmo/modeling_olmo_test.py
index fda1bd715..e4bb02f54 100644
--- a/tests/hf_olmo/modeling_olmo_test.py
+++ b/tests/hf_olmo/modeling_olmo_test.py
@@ -3,7 +3,7 @@
 import pytest
 import torch
 
-from olmo.model import Olmo
+from olmo.model import OLMo
 
 
 def test_olmo_model(model_path: str):
@@ -11,7 +11,7 @@ def test_olmo_model(model_path: str):
 
     from hf_olmo import OLMoForCausalLM, OLMoTokenizerFast  # noqa: F401
 
-    model = Olmo.from_checkpoint(model_path)
+    model = OLMo.from_checkpoint(model_path)
     hf_model = AutoModelForCausalLM.from_pretrained(model_path)
 
     tokenizer = AutoTokenizer.from_pretrained(model_path)
diff --git a/tests/model_test.py b/tests/model_test.py
index 79f2b1a26..1833223ce 100644
--- a/tests/model_test.py
+++ b/tests/model_test.py
@@ -3,7 +3,7 @@
 import torch.nn.functional as F
 from torch.nn import CrossEntropyLoss
 
-from olmo import BlockType, LayerNorm, Olmo, Tokenizer, TrainConfig
+from olmo import BlockType, LayerNorm, OLMo, Tokenizer, TrainConfig
 from olmo.config import ModelConfig, PaddingDirection
 from olmo.data import DataCollator
 
@@ -173,7 +173,7 @@ def test_forward(
 
     use_amp = dtype in {torch.float16, torch.bfloat16}
 
-    model = Olmo(train_config.model).eval()
+    model = OLMo(train_config.model).eval()
 
     input1 = tokenizer.encode("My name is OLMo!")
     input2 = tokenizer.encode("I'm a delightful large open language model :)")
@@ -293,7 +293,7 @@ def test_backward(
     else:
         train_config.model.init_device = "cpu"
 
-    model = Olmo(train_config.model).train()
+    model = OLMo(train_config.model).train()
 
     with torch.autocast(
         device_type="cuda" if cuda else "cpu", enabled=use_amp, dtype=None if not use_amp else dtype
@@ -364,7 +364,7 @@ def test_generate(
         train_config.model.init_device = "cpu"
     use_amp = dtype in {torch.float16, torch.bfloat16}
 
-    model = Olmo(train_config.model).eval()
+    model = OLMo(train_config.model).eval()
 
     input1 = tokenizer.encode("My name is OLMo! ", add_special_tokens=False)
     input2 = tokenizer.encode("I'm a delightful large open language model :) ", add_special_tokens=False)
@@ -426,8 +426,8 @@ def test_layer_norm(train_config: TrainConfig, elementwise_affine: bool, include
 
 
 def test_block_groups():
-    model_with_block_groups = Olmo(ModelConfig(d_model=128, n_heads=2, n_layers=9, block_group_size=3)).eval()
-    model_without_block_groups = Olmo(ModelConfig(d_model=128, n_heads=2, n_layers=9, block_group_size=1)).eval()
+    model_with_block_groups = OLMo(ModelConfig(d_model=128, n_heads=2, n_layers=9, block_group_size=3)).eval()
+    model_without_block_groups = OLMo(ModelConfig(d_model=128, n_heads=2, n_layers=9, block_group_size=1)).eval()
 
     # We should be able to load the state dict from one model into the other, and vice-versa.
     state_dict_to_load, og_keys_to_new_keys = model_with_block_groups._make_state_dict_compatible(