Skip to content

Commit

Permalink
save sequentially
Browse files (browse the repository at this point in the history)
  • Loading branch information
AkshitaB committed Oct 2, 2024
1 parent 9afc8f2 commit 2fc3e05
Showing 1 changed file with 13 additions and 8 deletions.
21 changes: 13 additions & 8 deletions olmo/data/iterable_dataset.py
Original file line number Diff line number Diff line change
Expand Up
@@ -81,13 +81,17 @@ def _build_and_save_global_indices(self):
log.info(f"dataset 0: {self.dataset[0]}")
log.info(f"dataset 1: {self.dataset[1]}")
log.info(f"global indices: {global_indices}")
global_indices_mmap = np.memmap(
self.global_indices_file, dtype=np.uint32, mode="w+", shape=(len(global_indices),)
)
global_indices_mmap[:] = global_indices
global_indices_mmap.flush()
del global_indices_mmap
sanity = np.memmap(self.global_indices_file, mode="r+", dtype=np.uint32)
# global_indices_mmap = np.memmap(
# self.global_indices_file, dtype=np.uint32, mode="w+", shape=(len(global_indices),)
# )
# global_indices_mmap[:] = global_indices
# global_indices_mmap.flush()
# del global_indices_mmap
with open(self.global_indices_file, "wb") as f:
np.save(f, global_indices)

#sanity = np.memmap(self.global_indices_file, mode="r+", dtype=np.uint32)
sanity = np.load(self.global_indices_file, mmap_mode='r')
log.info(f"sanity check: {sanity}")
del sanity
log.info("Global data order indices saved to '%s'", self.global_indices_file)
Expand Down
Expand Up
@@ -120,7 +124,8 @@ def _build_global_indices(self) -> np.ndarray:

def get_global_indices(self) -> np.ndarray:
if self.global_indices_file is not None:
return np.memmap(self.global_indices_file, mode="r", dtype=np.uint32) # type: ignore
#return np.memmap(self.global_indices_file, mode="r", dtype=np.uint32) # type: ignore
return np.load(self.global_indices_file, mmap_mode="r")
else:
return self._build_global_indices()

Expand Down

0 comments on commit 2fc3e05

Please sign in to comment.