From 5d167cf1b50474d627055d41fee018edb9ff0eee Mon Sep 17 00:00:00 2001 From: Pete Date: Thu, 6 Apr 2023 10:12:46 -0700 Subject: [PATCH] rename DOLMA -> OLMo (#86) --- .github/ISSUE_TEMPLATE/bug_report.yml | 2 +- .github/workflows/main.yml | 4 +- .github/workflows/pr_checks.yml | 2 +- CONTRIBUTING.md | 8 ++-- Makefile | 10 ++--- README.md | 8 ++-- RELEASE_PROCESS.md | 2 +- conftest.py | 4 +- docker/Dockerfile.gantry | 4 +- docker/Dockerfile.test | 2 +- dolma/exceptions.py | 19 --------- {dolma => olmo}/__init__.py | 2 +- {dolma => olmo}/aliases.py | 0 {dolma => olmo}/beam_search.py | 0 {dolma => olmo}/composer.py | 22 +++++------ {dolma => olmo}/config.py | 10 ++--- {dolma => olmo}/data/__init__.py | 0 {dolma => olmo}/data/collator.py | 0 {dolma => olmo}/data/memmap_dataset.py | 0 olmo/exceptions.py | 19 +++++++++ {dolma => olmo}/model.py | 54 +++++++++++++------------- {dolma => olmo}/optim.py | 0 {dolma => olmo}/py.typed | 0 {dolma => olmo}/tokenizer.py | 4 +- {dolma => olmo}/util.py | 8 ++-- {dolma => olmo}/version.py | 0 scripts/init_config.py | 8 ++-- scripts/prepare_changelog.py | 2 +- scripts/prepare_memmap_dataset.py | 4 +- scripts/release.sh | 2 +- scripts/train.py | 34 ++++++++-------- scripts/upload_artifact.py | 2 +- setup.py | 6 +-- tests/beam_search_test.py | 2 +- tests/config_test.py | 2 +- tests/data/collator_test.py | 2 +- tests/data/memmap_dataset_test.py | 4 +- tests/model_test.py | 22 +++++------ tests/tokenizer_test.py | 2 +- 39 files changed, 138 insertions(+), 138 deletions(-) delete mode 100644 dolma/exceptions.py rename {dolma => olmo}/__init__.py (86%) rename {dolma => olmo}/aliases.py (100%) rename {dolma => olmo}/beam_search.py (100%) rename {dolma => olmo}/composer.py (95%) rename {dolma => olmo}/config.py (98%) rename {dolma => olmo}/data/__init__.py (100%) rename {dolma => olmo}/data/collator.py (100%) rename {dolma => olmo}/data/memmap_dataset.py (100%) create mode 100644 olmo/exceptions.py rename {dolma => olmo}/model.py (96%) rename {dolma => olmo}/optim.py (100%) rename {dolma => olmo}/py.typed (100%) rename {dolma => olmo}/tokenizer.py (97%) rename {dolma => olmo}/util.py (97%) rename {dolma => olmo}/version.py (100%) diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 05d1a4b9c..77ad24c4d 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -17,7 +17,7 @@ body: ```python # All necessary imports at the beginning - import dolma + import olmo # A succinct reproducing example trimmed down to the essential parts: assert False is True, "Oh no!" diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index ab3089822..346098cdf 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -101,7 +101,7 @@ jobs: if: always() run: | . 
.venv/bin/activate - pip uninstall -y dolma + pip uninstall -y olmo gpu_tests: name: GPU Tests @@ -109,7 +109,7 @@ jobs: timeout-minutes: 15 env: BEAKER_TOKEN: ${{ secrets.BEAKER_TOKEN }} - BEAKER_IMAGE: dolma-torch2-test + BEAKER_IMAGE: olmo-torch2-test BEAKER_WORKSPACE: ai2/llm-testing steps: - name: Determine current commit SHA (pull request) diff --git a/.github/workflows/pr_checks.yml b/.github/workflows/pr_checks.yml index f853fb446..12f918aaf 100644 --- a/.github/workflows/pr_checks.yml +++ b/.github/workflows/pr_checks.yml @@ -9,7 +9,7 @@ on: branches: - main paths: - - 'dolma/**' + - 'olmo/**' jobs: changelog: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5f8294129..f790ef6ac 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -77,8 +77,8 @@ When you're ready to contribute code to address an open issue, please follow the Then you can create and activate a new Python environment by running: - conda create -n dolma python=3.9 - conda activate dolma + conda create -n olmo python=3.9 + conda activate olmo Once your virtual environment is activated, you can install your local clone in "editable mode" with @@ -139,13 +139,13 @@ When you're ready to contribute code to address an open issue, please follow the We also strive to maintain high test coverage, so most contributions should include additions to [the unit tests](https://github.com/allenai/LLM/tree/main/tests). These tests are run with [`pytest`](https://docs.pytest.org/en/latest/), which you can use to locally run any test modules that you've added or changed. - For example, if you've fixed a bug in `dolma/a/b.py`, you can run the tests specific to that module with + For example, if you've fixed a bug in `olmo/a/b.py`, you can run the tests specific to that module with pytest -v tests/a/b_test.py To check the code coverage locally in this example, you could run - pytest -v --cov dolma.a.b tests/a/b_test.py + pytest -v --cov olmo.a.b tests/a/b_test.py If your contribution involves additions to any public part of the API, we require that you write docstrings for each function, method, class, or module that you add. diff --git a/Makefile b/Makefile index 3ac3be0f5..d6e37708f 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # If you update this, also update BEAKER_IMAGE in .github/workflows/main.yml -IMAGE_NAME_BASE = dolma-torch2 +IMAGE_NAME_BASE = olmo-torch2 # If you update this, also update BEAKER_WORKSPACE in .github/workflows/main.yml BEAKER_WORKSPACE = ai2/llm-testing @@ -24,7 +24,7 @@ beaker-info : .PHONY : images images : gantry-image test-image -PHONY : base-image +.PHONY : base-image base-image : docker build -f docker/Dockerfile.base -t $(IMAGE_NAME_BASE)-base . 
@@ -91,7 +91,7 @@ gantry-run-ib : --env NCCL_DEBUG=INFO \ --env SCRATCH_DIR=/tmp/scratch \ --env FLASH_DIR=/tmp/flash \ - --env WANDB_PROJECT=dolma-beaker-ib \ + --env WANDB_PROJECT=olmo-beaker-ib \ --env-secret WANDB_API_KEY=WANDB_API_KEY \ --replicas 4 \ --leader-selection \ @@ -103,8 +103,8 @@ gantry-run-ib : .PHONY : check-cpu-install check-cpu-install : - @python -c 'from dolma import check_install; check_install(cuda=False)' + @python -c 'from olmo import check_install; check_install(cuda=False)' .PHONY : check-cuda-install check-cuda-install : - @python -c 'from dolma import check_install; check_install(cuda=True)' + @python -c 'from olmo import check_install; check_install(cuda=True)' diff --git a/README.md b/README.md index 17d9c557a..64bd8a22d 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# DOLMA: Delightful Open Language Model from AI2 +# OLMo: Delightful Open Language Model from AI2 ## Setup @@ -24,7 +24,7 @@ gantry run \ --nfs \ --priority preemptible \ --gpus 8 \ - --beaker-image dolma-torch2-gantry \ + --beaker-image olmo-torch2-gantry \ --cluster 'ai2/*-cirrascale' \ --allow-dirty \ -- composer scripts/train.py configs/1.2b-c4.yaml @@ -36,7 +36,7 @@ Train the 70B model on c4 with gantry across multiple nodes: gantry run \ --workspace ai2/llm-testing \ --priority "high" \ - --beaker-image dolma-torch2-gantry \ + --beaker-image olmo-torch2-gantry \ --cluster ai2/general-cirrascale-a100-80g-ib \ --gpus 8 \ --nfs \ @@ -45,7 +45,7 @@ gantry run \ --env NCCL_DEBUG=INFO \ --env SCRATCH_DIR=/tmp/scratch \ --env FLASH_DIR=/tmp/flash \ - --env WANDB_PROJECT=dolma-beaker-ib \ + --env WANDB_PROJECT=olmo-beaker-ib \ --env-secret WANDB_API_KEY=WANDB_API_KEY \ --replicas 4 \ --leader-selection \ diff --git a/RELEASE_PROCESS.md b/RELEASE_PROCESS.md index 03bb4ba80..dc1eb9c94 100644 --- a/RELEASE_PROCESS.md +++ b/RELEASE_PROCESS.md @@ -2,7 +2,7 @@ ## Steps -1. Update the version in `dolma/version.py`. +1. Update the version in `olmo/version.py`. 3. Run the release script: diff --git a/conftest.py b/conftest.py index ea8c522bb..3aa87ca93 100644 --- a/conftest.py +++ b/conftest.py @@ -2,7 +2,7 @@ import pytest -from dolma.config import ( +from olmo.config import ( DataConfig, ModelConfig, OptimizerConfig, @@ -11,7 +11,7 @@ TokenizerConfig, TrainConfig, ) -from dolma.tokenizer import Tokenizer +from olmo.tokenizer import Tokenizer TEST_MODEL = "gpt2" diff --git a/docker/Dockerfile.gantry b/docker/Dockerfile.gantry index 14a902870..18fd894b0 100644 --- a/docker/Dockerfile.gantry +++ b/docker/Dockerfile.gantry @@ -4,11 +4,11 @@ # To build and push the image to Beaker, run 'make gantry-image'. # To test the image after pushing to Beaker, run 'make gantry-test'. -FROM dolma-torch2-base +FROM olmo-torch2-base WORKDIR /stage COPY requirements.txt . RUN pip install --no-cache-dir -r requirements.txt -WORKDIR /app/dolma +WORKDIR /app/olmo diff --git a/docker/Dockerfile.test b/docker/Dockerfile.test index 35614df8b..e4589f964 100644 --- a/docker/Dockerfile.test +++ b/docker/Dockerfile.test @@ -4,7 +4,7 @@ # # To build and push the image to Beaker, run 'make test-image'. 
-FROM dolma-torch2-base +FROM olmo-torch2-base COPY scripts/test_entrypoint.sh /entrypoint.sh RUN chmod +x /entrypoint.sh diff --git a/dolma/exceptions.py b/dolma/exceptions.py deleted file mode 100644 index 6f3383d7d..000000000 --- a/dolma/exceptions.py +++ /dev/null @@ -1,19 +0,0 @@ -__all__ = ["DolmaError", "DolmaConfigurationError", "DolmaCliError"] - - -class DolmaError(Exception): - """ - Base class for all custom DOLMA exceptions. - """ - - -class DolmaConfigurationError(DolmaError): - """ - An error with a configuration file. - """ - - -class DolmaCliError(DolmaError): - """ - An error from incorrect CLI usage. - """ diff --git a/dolma/__init__.py b/olmo/__init__.py similarity index 86% rename from dolma/__init__.py rename to olmo/__init__.py index 3d9043e83..e14934765 100644 --- a/dolma/__init__.py +++ b/olmo/__init__.py @@ -12,4 +12,4 @@ def check_install(cuda: bool = False): assert torch.cuda.is_available(), "CUDA is not available!" print("CUDA available") - print(f"DOLMA v{VERSION} installed") + print(f"OLMo v{VERSION} installed") diff --git a/dolma/aliases.py b/olmo/aliases.py similarity index 100% rename from dolma/aliases.py rename to olmo/aliases.py diff --git a/dolma/beam_search.py b/olmo/beam_search.py similarity index 100% rename from dolma/beam_search.py rename to olmo/beam_search.py diff --git a/dolma/composer.py b/olmo/composer.py similarity index 95% rename from dolma/composer.py rename to olmo/composer.py index 9f737551e..0952871c5 100644 --- a/dolma/composer.py +++ b/olmo/composer.py @@ -23,16 +23,16 @@ TrainConfig, ) from .data import DataCollator, MemMapDataset -from .exceptions import DolmaConfigurationError -from .model import Dolma, LayerNormBase +from .exceptions import OlmoConfigurationError +from .model import LayerNormBase, Olmo from .optim import DecoupledLionW log = logging.getLogger(__name__) __all__ = [ "TrainBatchPerplexity", - "ComposerDolmaLM", - "DolmaConsoleLogger", + "ComposerOlmoLM", + "OlmoConsoleLogger", "build_dataloader", "build_optimizer", "build_scheduler", @@ -76,10 +76,10 @@ def compute(self) -> torch.Tensor: return torch.exp(self.loss) -class ComposerDolmaLM(ComposerModel): - def __init__(self, model_or_config: Union[Dolma, ModelConfig]): +class ComposerOlmoLM(ComposerModel): + def __init__(self, model_or_config: Union[Olmo, ModelConfig]): super().__init__() - self.model = Dolma(model_or_config) if isinstance(model_or_config, ModelConfig) else model_or_config + self.model = Olmo(model_or_config) if isinstance(model_or_config, ModelConfig) else model_or_config self.config = self.model.config self.num_fwd_flops = self.model.num_fwd_flops @@ -131,7 +131,7 @@ def flops_per_batch(self, batch: BatchDict): return self.num_fwd_flops * 3 * batch["input_ids"].shape[0] -class DolmaConsoleLogger(ConsoleLogger): +class OlmoConsoleLogger(ConsoleLogger): metrics_to_log: Set[str] = {"loss/train/total", "trainer/global_step", "metrics/*"} def log_metrics(self, metrics: dict[str, float], step: Optional[int] = None) -> None: @@ -285,7 +285,7 @@ def calculate_batch_size_info( global_batch_size: int, device_microbatch_size: Union[int, str] ) -> Tuple[int, Union[str, int], Union[str, int]]: if global_batch_size % dist.get_world_size() != 0: - raise DolmaConfigurationError( + raise OlmoConfigurationError( f"Global batch size {global_batch_size} is not divisible by {dist.get_world_size()} " "as a result, the batch size would be truncated, please adjust `global_batch_size` " f"to be divisible by world size, {dist.get_world_size()}." 
@@ -303,7 +303,7 @@ def calculate_batch_size_info( device_microbatch_size = device_batch_size device_grad_accum = math.ceil(device_batch_size / device_microbatch_size) else: - raise DolmaConfigurationError(f"Not sure how to parse {device_microbatch_size=}") + raise OlmoConfigurationError(f"Not sure how to parse {device_microbatch_size=}") return device_batch_size, device_microbatch_size, device_grad_accum @@ -324,7 +324,7 @@ def update_batch_size_info(cfg: TrainConfig): elif isinstance(cfg.device_train_microbatch_size, int): cfg.device_eval_batch_size = cfg.device_train_microbatch_size else: - raise DolmaConfigurationError( + raise OlmoConfigurationError( f"Not sure how to parse device_train_microbatch_size={cfg.device_train_microbatch_size}" ) return cfg diff --git a/dolma/config.py b/olmo/config.py similarity index 98% rename from dolma/config.py rename to olmo/config.py index 1aabacc23..079da5486 100644 --- a/dolma/config.py +++ b/olmo/config.py @@ -22,7 +22,7 @@ from omegaconf.errors import OmegaConfBaseException from .aliases import PathOrStr -from .exceptions import DolmaConfigurationError +from .exceptions import OlmoConfigurationError __all__ = [ "ActivationType", @@ -90,7 +90,7 @@ def new(cls: Type[C], **kwargs) -> C: conf = om.merge(conf, kwargs) return cast(C, om.to_object(conf)) except OmegaConfBaseException as e: - raise DolmaConfigurationError(str(e)) + raise OlmoConfigurationError(str(e)) @classmethod def load(cls: Type[C], path: PathOrStr, overrides: Optional[List[str]] = None) -> C: @@ -103,7 +103,7 @@ def load(cls: Type[C], path: PathOrStr, overrides: Optional[List[str]] = None) - conf = om.merge(conf, om.from_dotlist(overrides)) return cast(C, om.to_object(conf)) except OmegaConfBaseException as e: - raise DolmaConfigurationError(str(e)) + raise OlmoConfigurationError(str(e)) def save(self, path: PathOrStr) -> None: """Save to a YAML file.""" @@ -155,7 +155,7 @@ class BlockType(StrEnum): @dataclass class ModelConfig(BaseConfig): """ - DOLMA (model) configuration. + OLMo (model) configuration. """ # Note that the defaults for these attributes are equivalent to the base GPT2 model. @@ -401,7 +401,7 @@ class CompilerConfig(BaseConfig): @dataclass class TrainConfig(BaseConfig): """ - DOLMA training configuration. + OLMo training configuration. """ run_name: Optional[str] = None diff --git a/dolma/data/__init__.py b/olmo/data/__init__.py similarity index 100% rename from dolma/data/__init__.py rename to olmo/data/__init__.py diff --git a/dolma/data/collator.py b/olmo/data/collator.py similarity index 100% rename from dolma/data/collator.py rename to olmo/data/collator.py diff --git a/dolma/data/memmap_dataset.py b/olmo/data/memmap_dataset.py similarity index 100% rename from dolma/data/memmap_dataset.py rename to olmo/data/memmap_dataset.py diff --git a/olmo/exceptions.py b/olmo/exceptions.py new file mode 100644 index 000000000..9b46995a5 --- /dev/null +++ b/olmo/exceptions.py @@ -0,0 +1,19 @@ +__all__ = ["OlmoError", "OlmoConfigurationError", "OlmoCliError"] + + +class OlmoError(Exception): + """ + Base class for all custom OLMo exceptions. + """ + + +class OlmoConfigurationError(OlmoError): + """ + An error with a configuration file. + """ + + +class OlmoCliError(OlmoError): + """ + An error from incorrect CLI usage. 
+ """ diff --git a/dolma/model.py b/olmo/model.py similarity index 96% rename from dolma/model.py rename to olmo/model.py index 98c69f841..577d72928 100644 --- a/dolma/model.py +++ b/olmo/model.py @@ -18,7 +18,7 @@ from .beam_search import BeamSearch, Constraint, FinalSequenceScorer, Sampler from .config import ActivationType, BlockType, LayerNormType, ModelConfig -from .exceptions import DolmaConfigurationError +from .exceptions import OlmoConfigurationError __all__ = [ "LayerNormBase", @@ -29,12 +29,12 @@ "GELU", "ReLU", "SwiGLU", - "DolmaBlock", - "DolmaSequentialBlock", - "DolmaParallelBlock", - "Dolma", - "DolmaOutput", - "DolmaGenerateOutput", + "OlmoBlock", + "OlmoSequentialBlock", + "OlmoParallelBlock", + "Olmo", + "OlmoOutput", + "OlmoGenerateOutput", ] @@ -215,7 +215,7 @@ def output_multiplier(self) -> float: return 0.5 -class DolmaBlock(nn.Module): +class OlmoBlock(nn.Module): """ A base class for transformer block implementations. """ @@ -317,16 +317,16 @@ def forward( raise NotImplementedError @classmethod - def build(cls, config: ModelConfig) -> DolmaBlock: + def build(cls, config: ModelConfig) -> OlmoBlock: if config.block_type == BlockType.sequential: - return DolmaSequentialBlock(config) + return OlmoSequentialBlock(config) elif config.block_type == BlockType.parallel: - return DolmaParallelBlock(config) + return OlmoParallelBlock(config) else: raise NotImplementedError(f"not sure how to handle block type '{config.block_type}'") -class DolmaSequentialBlock(DolmaBlock): +class OlmoSequentialBlock(OlmoBlock): """ This is a typical transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))`` (plus another skip connection). @@ -364,11 +364,11 @@ def forward( return x -class DolmaParallelBlock(DolmaBlock): +class OlmoParallelBlock(OlmoBlock): """ This is a transformer block where the output is computed as ``MLP(LN(x)) + Attention(LN(x))`` as in the PaLM architecture, as opposed to the typical ``MLP(LN(x + Attention(LN(x))))`` - as in :class:`DolmaSequentialBlock` (ignoring some skip connections). + as in :class:`OlmoSequentialBlock` (ignoring some skip connections). The decoupling of the MLP and Attention functions allow us to fuse the separate input projections into a single linear layer to increase throughput. In this configuration it's also straight-forward @@ -408,7 +408,7 @@ def forward( return x + self.dropout(self.ff_out(self.act(ff))) + self.dropout(att) -class DolmaOutput(NamedTuple): +class OlmoOutput(NamedTuple): logits: torch.FloatTensor """ A tensor of shape `(batch_size, seq_len, vocab_size)` representing the log probabilities @@ -416,7 +416,7 @@ class DolmaOutput(NamedTuple): """ -class DolmaGenerateOutput(NamedTuple): +class OlmoGenerateOutput(NamedTuple): token_ids: torch.LongTensor """ The generated token IDs, a tensor of shape `(batch_size, beam_size, max_steps)`. @@ -429,21 +429,21 @@ class DolmaGenerateOutput(NamedTuple): """ -class Dolma(nn.Module): +class Olmo(nn.Module): def __init__(self, config: ModelConfig, init_params: bool = True): super().__init__() self.config = config # Validate config. 
if self.config.alibi and self.config.flash_attention: - raise DolmaConfigurationError("ALiBi is currently not supported with FlashAttention") + raise OlmoConfigurationError("ALiBi is currently not supported with FlashAttention") if self.config.alibi and self.config.rope: - raise DolmaConfigurationError("ALiBi and RoPE are mutually exclusive") + raise OlmoConfigurationError("ALiBi and RoPE are mutually exclusive") if self.config.embedding_size is not None and self.config.embedding_size != self.config.vocab_size: if self.config.embedding_size < self.config.vocab_size: - raise DolmaConfigurationError("embedding size should be at least as big as vocab size") + raise OlmoConfigurationError("embedding size should be at least as big as vocab size") elif self.config.embedding_size % 128 != 0: import warnings @@ -460,7 +460,7 @@ def __init__(self, config: ModelConfig, init_params: bool = True): config.embedding_size or config.vocab_size, config.d_model, device=config.init_device ), emb_drop=nn.Dropout(config.embedding_dropout), - blocks=nn.ModuleList([DolmaBlock.build(config) for _ in range(config.n_layers)]), + blocks=nn.ModuleList([OlmoBlock.build(config) for _ in range(config.n_layers)]), ln_f=LayerNorm.build(config), ) ) @@ -541,7 +541,7 @@ def forward( input_ids: torch.LongTensor, attention_mask: Optional[torch.Tensor] = None, attention_bias: Optional[torch.Tensor] = None, - ) -> DolmaOutput: + ) -> OlmoOutput: """ :param input_ids: A tensor of shape `(batch_size, seq_len)`. :param attention_mask: A tensor of shape `(batch_size, seq_len)` that indicates @@ -625,13 +625,13 @@ def forward( # shape: (batch_size, seq_len, vocab_size) logits = F.linear(x, self.transformer.wte.weight, None) # type: ignore - return DolmaOutput(logits=logits) # type: ignore[arg-type] + return OlmoOutput(logits=logits) # type: ignore[arg-type] def fsdp_wrap_fn(self, module): - return isinstance(module, DolmaBlock) + return isinstance(module, OlmoBlock) def activation_checkpointing_fn(self, module): - return isinstance(module, DolmaBlock) + return isinstance(module, OlmoBlock) def param_init_fn(self, module): from functools import partial @@ -723,7 +723,7 @@ def generate( min_steps: Optional[int] = None, final_sequence_scorer: Optional[FinalSequenceScorer] = None, constraints: Optional[List[Constraint]] = None, - ) -> DolmaGenerateOutput: + ) -> OlmoGenerateOutput: """ Generate token IDs using beam search. 
@@ -803,7 +803,7 @@ def step( state["attention_bias"] = attention_bias token_ids, scores = beam_search.search(initial_preds, state, step) - return DolmaGenerateOutput( + return OlmoGenerateOutput( token_ids=token_ids, # type: ignore[arg-type] scores=scores, # type: ignore[arg-type] ) diff --git a/dolma/optim.py b/olmo/optim.py similarity index 100% rename from dolma/optim.py rename to olmo/optim.py diff --git a/dolma/py.typed b/olmo/py.typed similarity index 100% rename from dolma/py.typed rename to olmo/py.typed diff --git a/dolma/tokenizer.py b/olmo/tokenizer.py similarity index 97% rename from dolma/tokenizer.py rename to olmo/tokenizer.py index 41b896151..f22243ab6 100644 --- a/dolma/tokenizer.py +++ b/olmo/tokenizer.py @@ -6,7 +6,7 @@ from tokenizers import Tokenizer as BaseTokenizer from .config import TrainConfig, TruncationDirection -from .exceptions import DolmaConfigurationError +from .exceptions import OlmoConfigurationError __all__ = ["Tokenizer"] @@ -43,7 +43,7 @@ def vocab_size(self) -> int: def from_train_config(cls, config: TrainConfig) -> Tokenizer: tokenizer = cls.from_pretrained(config.tokenizer.identifier, eos_token_id=config.model.eos_token_id) if config.model.vocab_size != tokenizer.vocab_size: - raise DolmaConfigurationError("vocab size mismatch between config and tokenizer") + raise OlmoConfigurationError("vocab size mismatch between config and tokenizer") return tokenizer @classmethod diff --git a/dolma/util.py b/olmo/util.py similarity index 97% rename from dolma/util.py rename to olmo/util.py index 68977d1fc..5979de2fb 100644 --- a/dolma/util.py +++ b/olmo/util.py @@ -13,7 +13,7 @@ from rich.text import Text from rich.traceback import Traceback -from .exceptions import DolmaCliError, DolmaError +from .exceptions import OlmoCliError, OlmoError _log_extra_fields: Dict[str, Any] = {} @@ -43,7 +43,7 @@ def log_record_factory(*args, **kwargs) -> logging.LogRecord: handler: logging.Handler if ( - os.environ.get("DOLMA_NONINTERACTIVE", False) + os.environ.get("OLMo_NONINTERACTIVE", False) or os.environ.get("DEBIAN_FRONTEND", None) == "noninteractive" or not sys.stdout.isatty() ): @@ -74,9 +74,9 @@ def excepthook(exctype, value, traceback): """ if issubclass(exctype, KeyboardInterrupt): sys.__excepthook__(exctype, value, traceback) - elif issubclass(exctype, DolmaCliError): + elif issubclass(exctype, OlmoCliError): rich.get_console().print(f"[yellow]{value}[/]", highlight=False) - elif issubclass(exctype, DolmaError): + elif issubclass(exctype, OlmoError): rich.get_console().print(Text(f"{exctype.__name__}:", style="red"), value, highlight=False) else: logging.getLogger().critical( diff --git a/dolma/version.py b/olmo/version.py similarity index 100% rename from dolma/version.py rename to olmo/version.py diff --git a/scripts/init_config.py b/scripts/init_config.py index 05740b7da..22143d401 100644 --- a/scripts/init_config.py +++ b/scripts/init_config.py @@ -6,9 +6,9 @@ from pathlib import Path from typing import List -from dolma import TrainConfig -from dolma.exceptions import DolmaCliError -from dolma.util import clean_opt, prepare_cli_environment +from olmo import TrainConfig +from olmo.exceptions import OlmoCliError +from olmo.util import clean_opt, prepare_cli_environment log = logging.getLogger(__name__) @@ -27,6 +27,6 @@ def main(save_path: Path, args_list: List[str]) -> None: try: save_path, args_list = sys.argv[1], sys.argv[2:] except IndexError: - raise DolmaCliError(f"Usage: {sys.argv[0]} [SAVE_PATH] [OPTIONS]") + raise OlmoCliError(f"Usage: {sys.argv[0]} 
[SAVE_PATH] [OPTIONS]") main(Path(save_path), [clean_opt(s) for s in args_list]) diff --git a/scripts/prepare_changelog.py b/scripts/prepare_changelog.py index 768fb5caf..6a286df82 100644 --- a/scripts/prepare_changelog.py +++ b/scripts/prepare_changelog.py @@ -1,7 +1,7 @@ from datetime import datetime from pathlib import Path -from dolma.version import VERSION +from olmo.version import VERSION def main() -> None: diff --git a/scripts/prepare_memmap_dataset.py b/scripts/prepare_memmap_dataset.py index 04dfb40f9..6d38f20d2 100644 --- a/scripts/prepare_memmap_dataset.py +++ b/scripts/prepare_memmap_dataset.py @@ -29,8 +29,8 @@ TimeElapsedColumn, ) -from dolma import Tokenizer -from dolma.util import prepare_cli_environment +from olmo import Tokenizer +from olmo.util import prepare_cli_environment log = logging.getLogger(__name__) diff --git a/scripts/release.sh b/scripts/release.sh index 6577df6ff..7b61bfe26 100755 --- a/scripts/release.sh +++ b/scripts/release.sh @@ -2,7 +2,7 @@ set -e -TAG=$(python -c 'from dolma.version import VERSION; print("v" + VERSION)') +TAG=$(python -c 'from olmo.version import VERSION; print("v" + VERSION)') read -p "Creating new release for $TAG. Do you want to continue? [Y/n] " prompt diff --git a/scripts/train.py b/scripts/train.py index 3b276db4b..df1b9ffa3 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -1,5 +1,5 @@ """ -This is the script used to train DOLMA. +This is the script used to train OLMo. There is one required positional argument, the path to a YAML :class:`TrainConfig`. Following the YAML path, you could pass any number of options to override @@ -25,9 +25,9 @@ import torch -from dolma import Dolma, TrainConfig -from dolma.exceptions import DolmaCliError -from dolma.util import clean_opt, log_extra_field, prepare_cli_environment +from olmo import Olmo, TrainConfig +from olmo.exceptions import OlmoCliError +from olmo.util import clean_opt, log_extra_field, prepare_cli_environment log = logging.getLogger(__name__) @@ -41,9 +41,9 @@ def main(cfg: TrainConfig) -> None: from composer.utils import dist, get_device, reproducibility from composer.utils.dist import get_node_rank - from dolma.composer import ( - ComposerDolmaLM, - DolmaConsoleLogger, + from olmo.composer import ( + ComposerOlmoLM, + OlmoConsoleLogger, build_algorithm, build_dataloader, build_optimizer, @@ -78,11 +78,11 @@ def main(cfg: TrainConfig) -> None: ) # Initialize the model. - dolma_model = Dolma(cfg.model) + olmo_model = Olmo(cfg.model) if get_node_rank() == 0: - log.info(f"Total number of parameters: {dolma_model.num_params():,d}") + log.info(f"Total number of parameters: {olmo_model.num_params():,d}") log.info( - f"Number of non-embedding parameters: {dolma_model.num_params(include_embedding=False):,d}", + f"Number of non-embedding parameters: {olmo_model.num_params(include_embedding=False):,d}", ) # Compile it if necessary. @@ -90,11 +90,11 @@ def main(cfg: TrainConfig) -> None: compile_kwargs = cfg.compile.asdict() if compile_kwargs.get("fullgraph") is None: compile_kwargs["fullgraph"] = cfg.fsdp_config is None - # As far as duck typing is concerned, this is still a Dolma object. - dolma_model = cast(Dolma, torch.compile(dolma_model, **compile_kwargs)) + # As far as duck typing is concerned, this is still a Olmo object. + olmo_model = cast(Olmo, torch.compile(olmo_model, **compile_kwargs)) # Optimizer. - optimizer = build_optimizer(dolma_model, **cfg.optimizer.asdict()) + optimizer = build_optimizer(olmo_model, **cfg.optimizer.asdict()) # Scheduler. 
scheduler = build_scheduler(cfg.scheduler) @@ -117,13 +117,13 @@ def main(cfg: TrainConfig) -> None: ] # Loggers. - loggers: List[LoggerDestination] = [DolmaConsoleLogger(log_interval=cfg.console_log_interval)] + loggers: List[LoggerDestination] = [OlmoConsoleLogger(log_interval=cfg.console_log_interval)] if cfg.wandb is not None: loggers.append(WandBLogger(init_kwargs={"config": cfg.asdict(exclude=["wandb"])}, **cfg.wandb.asdict())) # Wrap model into composer model. - composer_model = ComposerDolmaLM(dolma_model) - del dolma_model + composer_model = ComposerOlmoLM(olmo_model) + del olmo_model # Trainer. trainer = Trainer( @@ -181,7 +181,7 @@ def main(cfg: TrainConfig) -> None: try: yaml_path, args_list = sys.argv[1], sys.argv[2:] except IndexError: - raise DolmaCliError(f"Usage: {sys.argv[0]} [CONFIG_PATH] [OPTIONS]") + raise OlmoCliError(f"Usage: {sys.argv[0]} [CONFIG_PATH] [OPTIONS]") cfg = TrainConfig.load(yaml_path, [clean_opt(s) for s in args_list]) main(cfg) diff --git a/scripts/upload_artifact.py b/scripts/upload_artifact.py index ee41f5a6b..e53dbcfd3 100644 --- a/scripts/upload_artifact.py +++ b/scripts/upload_artifact.py @@ -6,7 +6,7 @@ from google.cloud import storage from tqdm import tqdm -from dolma.util import prepare_cli_environment +from olmo.util import prepare_cli_environment log = logging.getLogger(__name__) diff --git a/setup.py b/setup.py index 9ca12cb45..7951268c2 100644 --- a/setup.py +++ b/setup.py @@ -15,11 +15,11 @@ def read_requirements(filename: str): # version.py defines the VERSION and VERSION_SHORT variables. # We use exec here so we don't import cached_path whilst setting up. VERSION = {} # type: ignore -with open("dolma/version.py", "r") as version_file: +with open("olmo/version.py", "r") as version_file: exec(version_file.read(), VERSION) setup( - name="dolma", + name="olmo", version=VERSION["VERSION"], description="", long_description=open("README.md").read(), @@ -39,7 +39,7 @@ def read_requirements(filename: str): packages=find_packages( exclude=["*.tests", "*.tests.*", "tests.*", "tests"], ), - package_data={"dolma": ["py.typed"]}, + package_data={"olmo": ["py.typed"]}, install_requires=read_requirements("requirements.txt"), extras_require={"dev": read_requirements("dev-requirements.txt")}, python_requires=">=3.8", diff --git a/tests/beam_search_test.py b/tests/beam_search_test.py index 872517c8a..1e6d4156b 100644 --- a/tests/beam_search_test.py +++ b/tests/beam_search_test.py @@ -4,7 +4,7 @@ import pytest import torch -from dolma.beam_search import ( +from olmo.beam_search import ( BeamSearch, GumbelSampler, LengthNormalizedSequenceLogProbabilityScorer, diff --git a/tests/config_test.py b/tests/config_test.py index 3a5607a1e..a0aedc5a3 100644 --- a/tests/config_test.py +++ b/tests/config_test.py @@ -1,6 +1,6 @@ from pathlib import Path -from dolma.config import StrEnum, TrainConfig +from olmo.config import StrEnum, TrainConfig def test_str_enum(): diff --git a/tests/data/collator_test.py b/tests/data/collator_test.py index 0e906caeb..2279570d5 100644 --- a/tests/data/collator_test.py +++ b/tests/data/collator_test.py @@ -1,7 +1,7 @@ import pytest import torch -from dolma.data.collator import DataCollator, PaddingDirection +from olmo.data.collator import DataCollator, PaddingDirection @pytest.mark.parametrize( diff --git a/tests/data/memmap_dataset_test.py b/tests/data/memmap_dataset_test.py index e3c97c862..bb1b1e3d4 100644 --- a/tests/data/memmap_dataset_test.py +++ b/tests/data/memmap_dataset_test.py @@ -3,8 +3,8 @@ import numpy as np -from 
dolma.data.memmap_dataset import MemMapDataset -from dolma.tokenizer import Tokenizer +from olmo.data.memmap_dataset import MemMapDataset +from olmo.tokenizer import Tokenizer def test_mmap_dataset(tokenizer: Tokenizer, tmp_path: Path, lorem_ipsum_docs: List[str]): diff --git a/tests/model_test.py b/tests/model_test.py index a952adb01..a84d4bb2c 100644 --- a/tests/model_test.py +++ b/tests/model_test.py @@ -2,10 +2,10 @@ import torch from torch.nn import CrossEntropyLoss -from dolma import BlockType, Dolma, ModelConfig, Tokenizer, TrainConfig -from dolma.composer import build_optimizer -from dolma.config import PaddingDirection -from dolma.data import DataCollator +from olmo import BlockType, ModelConfig, Olmo, Tokenizer, TrainConfig +from olmo.composer import build_optimizer +from olmo.config import PaddingDirection +from olmo.data import DataCollator @pytest.mark.parametrize( @@ -126,9 +126,9 @@ def test_forward( use_amp = dtype in {torch.float16, torch.bfloat16} - model = Dolma(train_config.model).eval() + model = Olmo(train_config.model).eval() - input1 = tokenizer.encode("My name is DOLMA!") + input1 = tokenizer.encode("My name is OLMo!") input2 = tokenizer.encode("I'm a delightful large open language model :)") batch_inputs = DataCollator.from_train_config(train_config)( [ # type: ignore @@ -223,13 +223,13 @@ def test_backward( else: train_config.model.init_device = "cpu" - model = Dolma(train_config.model).train() + model = Olmo(train_config.model).train() with torch.autocast( device_type="cuda" if cuda else "cpu", enabled=use_amp, dtype=None if not use_amp else dtype ): # Forward pass to get logits. - input_ids = torch.tensor(tokenizer.encode("My name is DOLMA!"), device=train_config.device).unsqueeze(0) + input_ids = torch.tensor(tokenizer.encode("My name is OLMo!"), device=train_config.device).unsqueeze(0) logits = model(input_ids).logits # Compute loss. @@ -255,7 +255,7 @@ def test_backward( def test_build_optimizer(model_config: ModelConfig): - build_optimizer(Dolma(model_config)) + build_optimizer(Olmo(model_config)) @pytest.mark.parametrize( @@ -297,9 +297,9 @@ def test_generate( train_config.model.init_device = "cpu" use_amp = dtype in {torch.float16, torch.bfloat16} - model = Dolma(train_config.model).eval() + model = Olmo(train_config.model).eval() - input1 = tokenizer.encode("My name is DOLMA! ", add_special_tokens=False) + input1 = tokenizer.encode("My name is OLMo! ", add_special_tokens=False) input2 = tokenizer.encode("I'm a delightful large open language model :) ", add_special_tokens=False) batch_inputs = DataCollator.from_train_config(train_config)( [ # type: ignore diff --git a/tests/tokenizer_test.py b/tests/tokenizer_test.py index 1e1110c07..a3c761413 100644 --- a/tests/tokenizer_test.py +++ b/tests/tokenizer_test.py @@ -2,7 +2,7 @@ import pytest -from dolma.tokenizer import Tokenizer +from olmo.tokenizer import Tokenizer @pytest.mark.parametrize("add_special_tokens", [pytest.param(x, id=f"specials={x}") for x in (True, False)])
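
Note for downstream users (not part of the patch itself): any code importing the old `dolma` package needs the matching rename. Below is a minimal, hypothetical migration sketch; it assumes the post-rename package from this commit is installed, and the config path is a placeholder borrowed from the README example above.

```python
# Hypothetical downstream migration sketch -- not part of this commit.
# Before this patch:
#   from dolma import Dolma, TrainConfig
#   from dolma.exceptions import DolmaConfigurationError
# After this patch:
from olmo import Olmo, TrainConfig
from olmo.exceptions import OlmoConfigurationError

try:
    # Placeholder path, borrowed from the README's gantry example.
    cfg = TrainConfig.load("configs/1.2b-c4.yaml")
    model = Olmo(cfg.model)  # model class renamed Dolma -> Olmo
except OlmoConfigurationError as err:
    print(f"Configuration problem: {err}")
```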