allenai · aman-17 · Feb 12, 2025 · Feb 12, 2025 · Feb 14, 2025 · Feb 15, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -18,6 +18,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - MPS support
 
+### Changed
+
+- Changed the default `memmap_dtype` from `uint16` to `uint32` for compatibility with OLMo-2-1124.
+
 ## [v0.6.0](https://github.com/allenai/OLMo/releases/tag/v0.6.0) - 2024-12-17
 
 ### Added

diff --git a/README.md b/README.md
@@ -15,6 +15,12 @@
   <a href="https://arxiv.org/pdf/2501.00656.pdf">
     <img alt="Paper URL" src="https://img.shields.io/badge/arxiv-2402.00838-blue">
   </a>
+  <a href="https://playground.allenai.org">
+    <img alt="Playground" src="https://img.shields.io/badge/Ai2-Playground-F0529C">
+  </a>
+  <a href="https://discord.gg/sZq3jTNVNG">
+    <img alt="Discord" src="https://img.shields.io/badge/Discord%20-%20blue?style=flat&logo=discord&label=Ai2&color=%235B65E9">
+  </a>
 </p>
 
 OLMo is a repository for training and using AI2's state-of-the-art open language models. It is designed by scientists, for scientists.
@@ -46,8 +52,8 @@ You can find *all* the checkpoints, at minimum every 1000 training steps in OLMo
 
 | Variant          | OLMo Format                                                                                          | Hugging Face Format                                                               |
 |------------------|-----------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------|
-| **OLMo 7B**      | [OLMo 7B](https://github.com/allenai/OLMo/blob/main/configs/official-1124/OLMo-2-1124-7B.csv)       | [Hugging Face for the 7B variant](https://huggingface.co/allenai/OLMo-2-1124-7B)  |
-| **OLMo 13B**     | [OLMo 13B](https://github.com/allenai/OLMo/blob/main/configs/official-1124/OLMo-2-1124-13B.csv)     | [Hugging Face for the 13B variant](https://huggingface.co/allenai/OLMo-2-1124-13B) |
+| **OLMo-2 7B**      | [OLMo-2 7B](https://github.com/allenai/OLMo/blob/main/configs/official-1124/OLMo-2-1124-7B.csv)       | [Hugging Face for the 7B variant](https://huggingface.co/allenai/OLMo-2-1124-7B)  |
+| **OLMo-2 13B**     | [OLMo-2 13B](https://github.com/allenai/OLMo/blob/main/configs/official-1124/OLMo-2-1124-13B.csv)     | [Hugging Face for the 13B variant](https://huggingface.co/allenai/OLMo-2-1124-13B) |
 
 ### Steps to reproduce
 
@@ -81,7 +87,7 @@ Example:
 ```bash
 python scripts/train.py configs/tiny/OLMo-20M.yaml --save_overwrite
 ```
-Note: You need to upgrade PyTorch to 2.5.x to run.
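+Note: You need to upgrade PyTorch to 2.5.x to run. The default `memmap_dtype` is now `uint32`, which is what OLMo-2-1124 uses; OLMo-0724 uses `uint16`, so set `memmap_dtype` to `uint16` explicitly when working with OLMo-0724 data.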
 
 ### Stage 1
 

diff --git a/olmo/config.py b/olmo/config.py
@@ -608,7 +608,7 @@ class InstanceFilterConfig(BaseConfig):
 @dataclass
 class DataConfig(BaseConfig):
     paths: Optional[List[str]] = None
-    memmap_dtype: str = "uint16"
+    memmap_dtype: str = "uint32"
     datasets: Optional[Dict[str, List[str]]] = None
     label_mask_paths: Optional[List[str]] = None
     pad_direction: PaddingDirection = PaddingDirection.right

diff --git a/olmo/data/memmap_dataset.py b/olmo/data/memmap_dataset.py
@@ -47,7 +47,7 @@ def __init__(
         self,
         *paths: PathOrStr,
         chunk_size: int = 1024,
-        memmap_dtype: Union[Type[np.uint8], Type[np.uint16], Type[np.uint32], Type[np.uint64]] = np.uint16,
+        memmap_dtype: Union[Type[np.uint8], Type[np.uint16], Type[np.uint32], Type[np.uint64]] = np.uint32,
         metadata: Optional[Union[List[Dict[str, Any]], Dict[str, Any]]] = None,
         include_instance_metadata: bool = True,
         generate_attention_mask: bool = False,

diff --git a/tests/data/memmap_dataset_test.py b/tests/data/memmap_dataset_test.py
@@ -8,12 +8,12 @@
 
 
 def test_mmap_dataset(tmp_path: Path):
-    mmap1 = np.memmap(tmp_path / "mmap1.npy", mode="w+", dtype=np.uint16, shape=(16,))
-    mmap1[:] = np.array(list(range(16)), dtype=np.uint16)
+    mmap1 = np.memmap(tmp_path / "mmap1.npy", mode="w+", dtype=np.uint32, shape=(16,))
+    mmap1[:] = np.array(list(range(16)), dtype=np.uint32)
     mmap1.flush()
 
-    mmap2 = np.memmap(tmp_path / "mmap2.npy", mode="w+", dtype=np.uint16, shape=(16,))
-    mmap2[:] = np.array(list(range(16, 32)), dtype=np.uint16)
+    mmap2 = np.memmap(tmp_path / "mmap2.npy", mode="w+", dtype=np.uint32, shape=(16,))
+    mmap2[:] = np.array(list(range(16, 32)), dtype=np.uint32)
     mmap2.flush()
 
     ds = MemMapDataset(tmp_path / "mmap1.npy", tmp_path / "mmap2.npy", chunk_size=4)
@@ -23,8 +23,8 @@ def test_mmap_dataset(tmp_path: Path):
 
 
 def test_mmap_dataset_with_label_mask(tmp_path: Path):
-    mmap1 = np.memmap(tmp_path / "mmap1.npy", mode="w+", dtype=np.uint16, shape=(16,))
-    mmap1[:] = np.array(list(range(16)), dtype=np.uint16)
+    mmap1 = np.memmap(tmp_path / "mmap1.npy", mode="w+", dtype=np.uint32, shape=(16,))
+    mmap1[:] = np.array(list(range(16)), dtype=np.uint32)
     mmap1.flush()
 
     mask1 = [True] * 16
@@ -33,8 +33,8 @@ def test_mmap_dataset_with_label_mask(tmp_path: Path):
     mask_mmap1[:] = np.array(mask1, dtype=np.bool_)
     mask_mmap1.flush()
 
-    mmap2 = np.memmap(tmp_path / "mmap2.npy", mode="w+", dtype=np.uint16, shape=(16,))
-    mmap2[:] = np.array(list(range(16, 32)), dtype=np.uint16)
+    mmap2 = np.memmap(tmp_path / "mmap2.npy", mode="w+", dtype=np.uint32, shape=(16,))
+    mmap2[:] = np.array(list(range(16, 32)), dtype=np.uint32)
     mmap2.flush()
 
     mask2 = [True] * 16
@@ -66,7 +66,7 @@ def test_mmap_dataset_with_metadata(tokenizer: Tokenizer, tmp_path: Path, lorem_
 
     # Write tokens to memory-mapped array.
     tokens_fname = tmp_path / "tokens.npy"
-    mmap = np.memmap(tokens_fname, dtype=np.uint16, mode="w+", shape=(len(all_token_ids),))
+    mmap = np.memmap(tokens_fname, dtype=np.uint32, mode="w+", shape=(len(all_token_ids),))
     mmap[:] = all_token_ids
     mmap.flush()
     del mmap
@@ -83,10 +83,10 @@ def test_mmap_dataset_with_metadata(tokenizer: Tokenizer, tmp_path: Path, lorem_
 
 def test_concat_mmap_datasets(tmp_path: Path):
     # Write some data to disk.
-    mmap1 = np.memmap(tmp_path / "tokens1.npy", dtype=np.uint16, mode="w+", shape=(16,))
+    mmap1 = np.memmap(tmp_path / "tokens1.npy", dtype=np.uint32, mode="w+", shape=(16,))
     mmap1[:] = list(range(16))
     mmap1.flush()
-    mmap2 = np.memmap(tmp_path / "tokens2.npy", dtype=np.uint16, mode="w+", shape=(8,))
+    mmap2 = np.memmap(tmp_path / "tokens2.npy", dtype=np.uint32, mode="w+", shape=(8,))
     mmap2[:] = list(range(8))
     mmap2.flush()
     del mmap1, mmap2