Skip to content

Commit

Permalink
Merge branch 'train-olmo-large' of https://github.com/allenai/LLM int…
Browse files Browse the repository at this point in the history
…o train-olmo-large
  • Loading branch information
dirkgr committed Apr 8, 2024
2 parents bb07d50 + bce98bf commit 5cc5484
Show file tree
Hide file tree
Showing 3 changed files with 250 additions and 3 deletions.
126 changes: 124 additions & 2 deletions configs/mcli/mitchish70.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,131 @@ scheduling:
# max_retries: 3
compute:
cluster: r15z4
gpus: 384
gpus: 896
gpu_type: h100_80gb
instance: oci.bm.gpu.h100.8
node_names:
- inst-ij1rg-r15z3-workers
- inst-kx7fu-r15z3-workers
- inst-ubbqk-r15z3-workers
- inst-8jhc4-r15z3-workers
- inst-xh87c-r15z3-workers
- inst-cm3ec-r15z3-workers
- inst-vzhyo-r15z3-workers
- inst-tg5bs-r15z3-workers
- inst-eigqe-r15z3-workers
- inst-pzgox-r15z3-workers
- inst-i9qwf-r15z3-workers
- inst-yg289-r15z3-workers
- inst-go2bm-r15z3-workers
- inst-dpvjh-r15z3-workers
- inst-jmxxa-r15z3-workers
- inst-grtmk-r15z3-workers
- inst-fdyxp-r15z3-workers
- inst-5irk5-r15z3-workers
- inst-lwagu-r15z3-workers
- inst-bsgg4-r15z3-workers
- inst-2iaxk-r15z3-workers
- inst-lduqx-r15z3-workers
- inst-r01sx-r15z3-workers
- inst-wrucg-r15z3-workers
- inst-vvd97-r15z3-workers
- inst-bluc6-r15z3-workers
- inst-o186f-r15z3-workers
- inst-bw20d-r15z3-workers
- inst-f0kqy-r15z3-workers
- inst-ed8jl-r15z3-workers
- inst-vjsri-r15z3-workers
- inst-ndddu-r15z3-workers
- inst-edsue-r15z3-workers
- inst-rdvlq-r15z3-workers
- inst-ekaiy-r15z3-workers
- inst-cupyv-r15z3-workers
- inst-tfi9t-r15z3-workers
- inst-hzzsd-r15z3-workers
- inst-vy0zb-r15z3-workers
- inst-kdmu6-r15z3-workers
- inst-6tz4b-r15z3-workers
- inst-ih7jm-r15z3-workers
- inst-2oyig-r15z3-workers
- inst-rymxc-r15z3-workers
- inst-1nnph-r15z3-workers
- inst-dhjn2-r15z3-workers
- inst-kxpsv-r15z3-workers
- inst-v87vf-r15z3-workers
- inst-ivjqi-r15z3-workers
- inst-bv9yy-r15z3-workers
- inst-0mf4w-r15z3-workers
- inst-bg14o-r15z3-workers
- inst-bn5zq-r15z3-workers
- inst-glcak-r15z3-workers
- inst-xalw1-r15z3-workers
- inst-vwwku-r15z3-workers
- inst-ijtgf-r15z3-workers
- inst-21fqf-r15z3-workers
- inst-ht0xx-r15z3-workers
- inst-entnk-r15z3-workers
- inst-hvw6t-r15z3-workers
- inst-3to96-r15z3-workers
- inst-4ki3x-r15z3-workers
- inst-aixwt-r15z3-workers
- inst-pbivr-r15z3-workers
- inst-6yvq9-r15z3-workers
- inst-i1ted-r15z3-workers
- inst-nv70l-r15z3-workers
- inst-awtjo-r15z3-workers
- inst-olazl-r15z3-workers
- inst-qc1pa-r15z3-workers
- inst-daiox-r15z3-workers
- inst-5wqam-r15z3-workers
- inst-drkao-r15z3-workers
- inst-j8byk-r15z3-workers
- inst-csom5-r15z3-workers
- inst-mrxmj-r15z3-workers
- inst-g5ojd-r15z3-workers
- inst-irzic-r15z3-workers
- inst-gggd1-r15z3-workers
- inst-vwnx8-r15z3-workers
- inst-4zdz3-r15z3-workers
- inst-c6t2k-r15z3-workers
- inst-jhqyu-r15z3-workers
- inst-8z7hr-r15z3-workers
- inst-v8mxi-r15z3-workers
- inst-jeel7-r15z3-workers
- inst-zlnho-r15z3-workers
- inst-xdqqd-r15z3-workers
- inst-di0ri-r15z3-workers
- inst-lpz5k-r15z3-workers
- inst-jhhcv-r15z3-workers
- inst-fatfc-r15z3-workers
- inst-xoiov-r15z3-workers
- inst-rtaii-r15z3-workers
- inst-tcttd-r15z3-workers
- inst-likvg-r15z3-workers
- inst-gn4hg-r15z3-workers
- inst-rnyqr-r15z3-workers
- inst-zgb86-r15z3-workers
- inst-rpmhf-r15z3-workers
- inst-pfzsm-r15z3-workers
- inst-9hoiv-r15z3-workers
- inst-v2vx0-r15z3-workers
- inst-tw9i6-r15z3-workers
- inst-aj1o1-r15z3-workers
- inst-xmxc2-r15z3-workers
- inst-o3fxl-r15z3-workers
- inst-4vqjq-r15z3-workers
- inst-ll38i-r15z3-workers
- inst-j3mfc-r15z3-workers
- inst-e1ijl-r15z3-workers
# - inst-evbig-r15z3-workers
# - inst-tturo-r15z3-workers
# - inst-kc1z1-r15z3-workers
# - inst-97xv1-r15z3-workers
# - inst-vaqst-r15z3-workers
# - inst-i6mnk-r15z3-workers
# Bad nodes:
# - inst-hdlqg-r15z3-workers
# - inst-6jp2q-r15z3-workers
integrations:
- integration_type: git_repo
git_repo: allenai/OLMo
Expand Down Expand Up @@ -64,7 +186,7 @@ command: |-
'--load_path=${path.last_checkpoint:${remote_save_folder}}' \
--load_path_sharded_checkpointer=olmo_core \
--sharded_checkpointer=olmo_core \
--global_train_batch_size=1536 \
--global_train_batch_size=1792 \
--device_train_microbatch_size=2 \
--time_limit=604800 \
--save_overwrite
Expand Down
2 changes: 1 addition & 1 deletion olmo/data/iterable_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def __iter__(self) -> Iterator[Dict[str, Any]]:

# Start at the specified index.
if self.start_index > 0:
assert self.start_index % self.world_size == 0
# assert self.start_index % self.world_size == 0
indices = indices[self.start_index :]

# Slice indices by rank to avoid duplicates.
Expand Down
125 changes: 125 additions & 0 deletions scripts/s3_unshard_to_hf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
"""
Convenience script to take a model checkpoint stored on S3, unshard, and convert to HF
format. Requires the AWS CLI to be installed and configured.
Example usage for new-style checkpoint (circa April 2024):
python scripts/s3_unshard_to_hf.py \
--sharded_bucket s3://ai2-llm/checkpoints/OLMo-medium/mitchish7/step239000 \
--unsharded_bucket s3://ai2-llm/checkpoints/OLMo-medium/mitchish7/step239000-unsharded \
--hf_bucket s3://ai2-llm/checkpoints/OLMo-medium/mitchish7/step239000-huggingface \
--type olmo_core \
--tmp_dir /net/nfs.cirrascale/allennlp/davidw/tmp/unshard
"""

import argparse
import pathlib
import shlex
import shutil
import subprocess


def make_parser():
    """Build the CLI parser for the unshard-and-convert pipeline.

    Returns:
        argparse.ArgumentParser: parser with the three S3 bucket arguments
        and the local scratch directory marked required (main() dereferences
        all four unconditionally, so omitting any of them previously caused
        an opaque crash or a malformed ``aws s3 cp`` command), plus optional
        flags passed through to ``unshard.py``.
    """
    parser = argparse.ArgumentParser(
        description="Unshard S3 checkpoint and convert to HF format. Invoke this script from the root of the OLMo repo."
    )
    parser.add_argument(
        "--sharded_bucket",
        help="S3 bucket with sharded checkpoint.",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--unsharded_bucket",
        help="S3 bucket to save the unsharded checkpoint.",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--hf_bucket",
        help="S3 bucket to save the HF-converted checkpoint.",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--tmp_dir",
        help="""Temporary directory to store checkpoints locally. This will be deleted
        if everything runs successfully, but will keep files around otherwise to avoid
        re-downloads when possible.""",
        type=pathlib.Path,
        required=True,
    )
    parser.add_argument(
        "--quiet",
        action="store_true",
        help="If given, don't show progress for AWS commands.",
    )
    parser.add_argument("--type", default=None, help="If given, pass this argument on to `unshard.py`.")
    parser.add_argument("--model-only", action="store_true", help="If given, only unshard the model.")
    parser.add_argument(
        "--safe-tensors",
        action="store_true",
        help="Save unsharded safetensors as well.",
    )
    return parser


def aws_copy(src, dest, quiet):
    """Build an ``aws s3 cp --recursive`` command copying *src* to *dest*.

    Args:
        src: Source location (S3 URI string or local ``pathlib.Path``).
        dest: Destination location (S3 URI string or local ``pathlib.Path``).
        quiet: If true, add ``--quiet`` to suppress AWS CLI progress output.

    Returns:
        str: a command line suitable for ``subprocess.run(..., shell=True)``.
    """
    base = "aws s3 cp --recursive"
    if quiet:
        base += " --quiet"
    # Quote both paths: the command is run through a shell, so a tmp_dir
    # containing spaces or shell metacharacters would otherwise break it.
    # shlex.quote is a no-op for plain S3 URIs and simple paths.
    return f"{base} {shlex.quote(str(src))} {shlex.quote(str(dest))}"


def s3_unshard_to_hf(args):
    """Download a sharded checkpoint from S3, unshard it, convert to HF, and upload.

    Pipeline (each step shells out via ``subprocess.run(..., check=True)``, so
    any failure raises CalledProcessError and leaves ``args.tmp_dir`` in place
    for inspection / re-use):
      1. ``aws s3 cp`` the sharded checkpoint into ``tmp_dir/sharded``.
      2. Run ``scripts/unshard.py`` to produce ``tmp_dir/unsharded``.
      3. Run ``hf_olmo/convert_olmo_to_hf.py`` (writes HF files alongside the
         unsharded checkpoint), then move them into ``tmp_dir/hf``.
      4. Upload the unsharded and HF directories back to S3.

    Args:
        args: parsed CLI namespace from ``make_parser()`` — uses
            ``sharded_bucket``, ``unsharded_bucket``, ``hf_bucket``,
            ``tmp_dir``, ``quiet``, ``type``, ``model_only``, ``safe_tensors``.
    """
    # Set directories
    sharded_dir = args.tmp_dir / "sharded"
    unsharded_dir = args.tmp_dir / "unsharded"
    hf_dir = args.tmp_dir / "hf"
    # Only hf_dir needs to exist up front; the aws CLI and unshard.py create
    # the other two. Assumes tmp_dir itself was created by the caller (main()).
    hf_dir.mkdir()

    # Download sharded checkpoint.
    print("Downloading sharded checkpoint from S3.")
    download_cmd = aws_copy(args.sharded_bucket, sharded_dir, args.quiet)
    subprocess.run(download_cmd, shell=True, check=True)

    # Unshard.
    print("Unsharding.")
    unshard_cmd = f"python scripts/unshard.py {sharded_dir} {unsharded_dir}"
    # Add a `--type` argument if given.
    if args.type is not None:
        unshard_cmd += f" --type {args.type}"
    if args.model_only:
        unshard_cmd += " --model-only"
    if args.safe_tensors:
        unshard_cmd += " --safe-tensors"

    subprocess.run(unshard_cmd, shell=True, check=True)

    # Convert to HF
    print("Converting to HF.")
    hf_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir {unsharded_dir}"
    subprocess.run(hf_cmd, shell=True, check=True)

    # Move the HF files from the unsharded dir to their own.
    # NOTE(review): this assumes the converter writes exactly these five files
    # into unsharded_dir; a missing file raises FileNotFoundError here.
    for fname in [
        "config.json",
        "pytorch_model.bin",
        "special_tokens_map.json",
        "tokenizer.json",
        "tokenizer_config.json",
    ]:
        (unsharded_dir / fname).rename(hf_dir / fname)

    # Upload the unsharded and HF files back to S3.
    print("Uploading unsharded and HF files back to S3.")
    upload_unsharded_cmd = aws_copy(unsharded_dir, args.unsharded_bucket, args.quiet)
    subprocess.run(upload_unsharded_cmd, shell=True, check=True)

    upload_hf_cmd = aws_copy(hf_dir, args.hf_bucket, args.quiet)
    subprocess.run(upload_hf_cmd, shell=True, check=True)


def main():
    """Entry point: parse arguments, run the pipeline, clean up on success."""
    args = make_parser().parse_args()
    tmp_dir = args.tmp_dir

    # Never write into a pre-existing scratch directory; failing runs leave
    # it behind on purpose, so the user must clear or rename it explicitly.
    if tmp_dir.exists():
        raise ValueError(f"Temporary directory {tmp_dir} already exists; refusing to write.")
    tmp_dir.mkdir()

    s3_unshard_to_hf(args)

    # Reaching this point means every step succeeded, so the local scratch
    # space is no longer needed.
    shutil.rmtree(tmp_dir)


if __name__ == "__main__":
    main()

0 comments on commit 5cc5484

Please sign in to comment.