From 2fc3e05208b1fecf76cc3cdea3222a0ef9c8d1d9 Mon Sep 17 00:00:00 2001
From: Akshita Bhagia
Date: Wed, 2 Oct 2024 11:50:36 -0700
Subject: [PATCH] save sequentially

---
NOTE(review): text in this section is ignored when the patch is applied.
 * The commented-out np.memmap writer/reader lines were dropped from the
   added side (version control already preserves the old code) and the
   mmap_mode quoting was unified; hunk line counts and the diffstat below
   were recomputed for that. The post-image hash on the "index" line still
   refers to the originally submitted version.
 * On-disk format change: the file now carries an .npy header. Files written
   by the previous raw np.memmap code have no such header and need to be
   regenerated before get_global_indices() can read them.
 * np.memmap coerced the indices to uint32 on write; np.save stores whatever
   dtype `global_indices` already has -- confirm _build_global_indices()
   returns uint32.
 * Leading whitespace was lost in the received copy of this patch; the
   indentation and the two blank context lines of the second hunk are
   reconstructed -- confirm against the target file before applying.

 olmo/data/iterable_dataset.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/olmo/data/iterable_dataset.py b/olmo/data/iterable_dataset.py
index 4f86bfaf7..b7ce628ed 100644
--- a/olmo/data/iterable_dataset.py
+++ b/olmo/data/iterable_dataset.py
@@ -81,13 +81,11 @@ def _build_and_save_global_indices(self):
             log.info(f"dataset 0: {self.dataset[0]}")
             log.info(f"dataset 1: {self.dataset[1]}")
             log.info(f"global indices: {global_indices}")
-            global_indices_mmap = np.memmap(
-                self.global_indices_file, dtype=np.uint32, mode="w+", shape=(len(global_indices),)
-            )
-            global_indices_mmap[:] = global_indices
-            global_indices_mmap.flush()
-            del global_indices_mmap
-            sanity = np.memmap(self.global_indices_file, mode="r+", dtype=np.uint32)
+            # np.save() on an open handle writes to this exact path (no ".npy" suffix is appended).
+            with open(self.global_indices_file, "wb") as f:
+                np.save(f, global_indices)
+
+            sanity = np.load(self.global_indices_file, mmap_mode="r")
             log.info(f"sanity check: {sanity}")
             del sanity
             log.info("Global data order indices saved to '%s'", self.global_indices_file)
@@ -120,7 +118,7 @@ def _build_global_indices(self) -> np.ndarray:
 
     def get_global_indices(self) -> np.ndarray:
         if self.global_indices_file is not None:
-            return np.memmap(self.global_indices_file, mode="r", dtype=np.uint32)  # type: ignore
+            return np.load(self.global_indices_file, mmap_mode="r")
         else:
             return self._build_global_indices()
 