Skip to content

Commit

Permalink
Merge branch 'train-olmo-large' of https://github.com/allenai/LLM int…
Browse files Browse the repository at this point in the history
…o train-olmo-large
  • Loading branch information
dirkgr committed Apr 8, 2024
2 parents bb07d50 + bce98bf commit 5cc5484
Show file tree
Hide file tree
Showing 3 changed files with 250 additions and 3 deletions.
126 changes: 124 additions & 2 deletions configs/mcli/mitchish70.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,131 @@ scheduling:
# max_retries: 3
compute:
cluster: r15z4
gpus: 384
gpus: 896
gpu_type: h100_80gb
instance: oci.bm.gpu.h100.8
node_names:
- inst-ij1rg-r15z3-workers
- inst-kx7fu-r15z3-workers
- inst-ubbqk-r15z3-workers
- inst-8jhc4-r15z3-workers
- inst-xh87c-r15z3-workers
- inst-cm3ec-r15z3-workers
- inst-vzhyo-r15z3-workers
- inst-tg5bs-r15z3-workers
- inst-eigqe-r15z3-workers
- inst-pzgox-r15z3-workers
- inst-i9qwf-r15z3-workers
- inst-yg289-r15z3-workers
- inst-go2bm-r15z3-workers
- inst-dpvjh-r15z3-workers
- inst-jmxxa-r15z3-workers
- inst-grtmk-r15z3-workers
- inst-fdyxp-r15z3-workers
- inst-5irk5-r15z3-workers
- inst-lwagu-r15z3-workers
- inst-bsgg4-r15z3-workers
- inst-2iaxk-r15z3-workers
- inst-lduqx-r15z3-workers
- inst-r01sx-r15z3-workers
- inst-wrucg-r15z3-workers
- inst-vvd97-r15z3-workers
- inst-bluc6-r15z3-workers
- inst-o186f-r15z3-workers
- inst-bw20d-r15z3-workers
- inst-f0kqy-r15z3-workers
- inst-ed8jl-r15z3-workers
- inst-vjsri-r15z3-workers
- inst-ndddu-r15z3-workers
- inst-edsue-r15z3-workers
- inst-rdvlq-r15z3-workers
- inst-ekaiy-r15z3-workers
- inst-cupyv-r15z3-workers
- inst-tfi9t-r15z3-workers
- inst-hzzsd-r15z3-workers
- inst-vy0zb-r15z3-workers
- inst-kdmu6-r15z3-workers
- inst-6tz4b-r15z3-workers
- inst-ih7jm-r15z3-workers
- inst-2oyig-r15z3-workers
- inst-rymxc-r15z3-workers
- inst-1nnph-r15z3-workers
- inst-dhjn2-r15z3-workers
- inst-kxpsv-r15z3-workers
- inst-v87vf-r15z3-workers
- inst-ivjqi-r15z3-workers
- inst-bv9yy-r15z3-workers
- inst-0mf4w-r15z3-workers
- inst-bg14o-r15z3-workers
- inst-bn5zq-r15z3-workers
- inst-glcak-r15z3-workers
- inst-xalw1-r15z3-workers
- inst-vwwku-r15z3-workers
- inst-ijtgf-r15z3-workers
- inst-21fqf-r15z3-workers
- inst-ht0xx-r15z3-workers
- inst-entnk-r15z3-workers
- inst-hvw6t-r15z3-workers
- inst-3to96-r15z3-workers
- inst-4ki3x-r15z3-workers
- inst-aixwt-r15z3-workers
- inst-pbivr-r15z3-workers
- inst-6yvq9-r15z3-workers
- inst-i1ted-r15z3-workers
- inst-nv70l-r15z3-workers
- inst-awtjo-r15z3-workers
- inst-olazl-r15z3-workers
- inst-qc1pa-r15z3-workers
- inst-daiox-r15z3-workers
- inst-5wqam-r15z3-workers
- inst-drkao-r15z3-workers
- inst-j8byk-r15z3-workers
- inst-csom5-r15z3-workers
- inst-mrxmj-r15z3-workers
- inst-g5ojd-r15z3-workers
- inst-irzic-r15z3-workers
- inst-gggd1-r15z3-workers
- inst-vwnx8-r15z3-workers
- inst-4zdz3-r15z3-workers
- inst-c6t2k-r15z3-workers
- inst-jhqyu-r15z3-workers
- inst-8z7hr-r15z3-workers
- inst-v8mxi-r15z3-workers
- inst-jeel7-r15z3-workers
- inst-zlnho-r15z3-workers
- inst-xdqqd-r15z3-workers
- inst-di0ri-r15z3-workers
- inst-lpz5k-r15z3-workers
- inst-jhhcv-r15z3-workers
- inst-fatfc-r15z3-workers
- inst-xoiov-r15z3-workers
- inst-rtaii-r15z3-workers
- inst-tcttd-r15z3-workers
- inst-likvg-r15z3-workers
- inst-gn4hg-r15z3-workers
- inst-rnyqr-r15z3-workers
- inst-zgb86-r15z3-workers
- inst-rpmhf-r15z3-workers
- inst-pfzsm-r15z3-workers
- inst-9hoiv-r15z3-workers
- inst-v2vx0-r15z3-workers
- inst-tw9i6-r15z3-workers
- inst-aj1o1-r15z3-workers
- inst-xmxc2-r15z3-workers
- inst-o3fxl-r15z3-workers
- inst-4vqjq-r15z3-workers
- inst-ll38i-r15z3-workers
- inst-j3mfc-r15z3-workers
- inst-e1ijl-r15z3-workers
# - inst-evbig-r15z3-workers
# - inst-tturo-r15z3-workers
# - inst-kc1z1-r15z3-workers
# - inst-97xv1-r15z3-workers
# - inst-vaqst-r15z3-workers
# - inst-i6mnk-r15z3-workers
# Bad nodes:
# - inst-hdlqg-r15z3-workers
# - inst-6jp2q-r15z3-workers
integrations:
- integration_type: git_repo
git_repo: allenai/OLMo
Expand Down Expand Up @@ -64,7 +186,7 @@ command: |-
'--load_path=${path.last_checkpoint:${remote_save_folder}}' \
--load_path_sharded_checkpointer=olmo_core \
--sharded_checkpointer=olmo_core \
--global_train_batch_size=1536 \
--global_train_batch_size=1792 \
--device_train_microbatch_size=2 \
--time_limit=604800 \
--save_overwrite
Expand Down
2 changes: 1 addition & 1 deletion olmo/data/iterable_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def __iter__(self) -> Iterator[Dict[str, Any]]:

# Start at the specified index.
if self.start_index > 0:
assert self.start_index % self.world_size == 0
# assert self.start_index % self.world_size == 0
indices = indices[self.start_index :]

# Slice indices by rank to avoid duplicates.
Expand Down
125 changes: 125 additions & 0 deletions scripts/s3_unshard_to_hf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
"""
Convenience script to take a model checkpoint stored on S3, unshard, and convert to HF
format. Requires the AWS CLI to be installed and configured.
Example usage for new-style checkpoint (circa April 2024):
python scripts/s3_unshard_to_hf.py \
--sharded_bucket s3://ai2-llm/checkpoints/OLMo-medium/mitchish7/step239000 \
--unsharded_bucket s3://ai2-llm/checkpoints/OLMo-medium/mitchish7/step239000-unsharded \
--hf_bucket s3://ai2-llm/checkpoints/OLMo-medium/mitchish7/step239000-huggingface \
--type olmo_core \
--tmp_dir /net/nfs.cirrascale/allennlp/davidw/tmp/unshard
"""

import argparse
import pathlib
import shlex
import shutil
import subprocess


def make_parser():
    """Build the CLI parser for the unshard-and-convert pipeline.

    Returns:
        argparse.ArgumentParser: parser with the three S3 bucket arguments
        and the local scratch directory marked required (main() dereferences
        all four unconditionally, so omitting any of them previously caused
        an opaque crash or a malformed ``aws s3 cp`` command), plus optional
        flags passed through to ``unshard.py``.
    """
    parser = argparse.ArgumentParser(
        description="Unshard S3 checkpoint and convert to HF format. Invoke this script from the root of the OLMo repo."
    )
    parser.add_argument(
        "--sharded_bucket",
        help="S3 bucket with sharded checkpoint.",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--unsharded_bucket",
        help="S3 bucket to save the unsharded checkpoint.",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--hf_bucket",
        help="S3 bucket to save the HF-converted checkpoint.",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--tmp_dir",
        help="""Temporary directory to store checkpoints locally. This will be deleted
        if everything runs successfully, but will keep files around otherwise to avoid
        re-downloads when possible.""",
        type=pathlib.Path,
        required=True,
    )
    parser.add_argument(
        "--quiet",
        action="store_true",
        help="If given, don't show progress for AWS commands.",
    )
    parser.add_argument("--type", default=None, help="If given, pass this argument on to `unshard.py`.")
    parser.add_argument("--model-only", action="store_true", help="If given, only unshard the model.")
    parser.add_argument(
        "--safe-tensors",
        action="store_true",
        help="Save unsharded safetensors as well.",
    )
    return parser


def aws_copy(src, dest, quiet):
    """Build an ``aws s3 cp --recursive`` command copying *src* to *dest*.

    Args:
        src: Source location (S3 URI string or local ``pathlib.Path``).
        dest: Destination location (S3 URI string or local ``pathlib.Path``).
        quiet: If true, add ``--quiet`` to suppress AWS CLI progress output.

    Returns:
        str: a command line suitable for ``subprocess.run(..., shell=True)``.
    """
    base = "aws s3 cp --recursive"
    if quiet:
        base += " --quiet"
    # Quote both paths: the command is run through a shell, so a tmp_dir
    # containing spaces or shell metacharacters would otherwise break it.
    # shlex.quote is a no-op for plain S3 URIs and simple paths.
    return f"{base} {shlex.quote(str(src))} {shlex.quote(str(dest))}"


def s3_unshard_to_hf(args):
    """Download a sharded checkpoint from S3, unshard it, convert to HF, and upload.

    Pipeline (each step shells out via ``subprocess.run(..., check=True)``, so
    any failure raises CalledProcessError and leaves ``args.tmp_dir`` in place
    for inspection / re-use):
      1. ``aws s3 cp`` the sharded checkpoint into ``tmp_dir/sharded``.
      2. Run ``scripts/unshard.py`` to produce ``tmp_dir/unsharded``.
      3. Run ``hf_olmo/convert_olmo_to_hf.py`` (writes HF files alongside the
         unsharded checkpoint), then move them into ``tmp_dir/hf``.
      4. Upload the unsharded and HF directories back to S3.

    Args:
        args: parsed CLI namespace from ``make_parser()`` — uses
            ``sharded_bucket``, ``unsharded_bucket``, ``hf_bucket``,
            ``tmp_dir``, ``quiet``, ``type``, ``model_only``, ``safe_tensors``.
    """
    # Set directories
    sharded_dir = args.tmp_dir / "sharded"
    unsharded_dir = args.tmp_dir / "unsharded"
    hf_dir = args.tmp_dir / "hf"
    # Only hf_dir needs to exist up front; the aws CLI and unshard.py create
    # the other two. Assumes tmp_dir itself was created by the caller (main()).
    hf_dir.mkdir()

    # Download sharded checkpoint.
    print("Downloading sharded checkpoint from S3.")
    download_cmd = aws_copy(args.sharded_bucket, sharded_dir, args.quiet)
    subprocess.run(download_cmd, shell=True, check=True)

    # Unshard.
    print("Unsharding.")
    unshard_cmd = f"python scripts/unshard.py {sharded_dir} {unsharded_dir}"
    # Add a `--type` argument if given.
    if args.type is not None:
        unshard_cmd += f" --type {args.type}"
    if args.model_only:
        unshard_cmd += " --model-only"
    if args.safe_tensors:
        unshard_cmd += " --safe-tensors"

    subprocess.run(unshard_cmd, shell=True, check=True)

    # Convert to HF
    print("Converting to HF.")
    hf_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir {unsharded_dir}"
    subprocess.run(hf_cmd, shell=True, check=True)

    # Move the HF files from the unsharded dir to their own.
    # NOTE(review): this assumes the converter writes exactly these five files
    # into unsharded_dir; a missing file raises FileNotFoundError here.
    for fname in [
        "config.json",
        "pytorch_model.bin",
        "special_tokens_map.json",
        "tokenizer.json",
        "tokenizer_config.json",
    ]:
        (unsharded_dir / fname).rename(hf_dir / fname)

    # Upload the unsharded and HF files back to S3.
    print("Uploading unsharded and HF files back to S3.")
    upload_unsharded_cmd = aws_copy(unsharded_dir, args.unsharded_bucket, args.quiet)
    subprocess.run(upload_unsharded_cmd, shell=True, check=True)

    upload_hf_cmd = aws_copy(hf_dir, args.hf_bucket, args.quiet)
    subprocess.run(upload_hf_cmd, shell=True, check=True)


def main():
    """Entry point: parse arguments, run the pipeline, clean up on success."""
    args = make_parser().parse_args()
    tmp_dir = args.tmp_dir

    # Never write into a pre-existing scratch directory; failing runs leave
    # it behind on purpose, so the user must clear or rename it explicitly.
    if tmp_dir.exists():
        raise ValueError(f"Temporary directory {tmp_dir} already exists; refusing to write.")
    tmp_dir.mkdir()

    s3_unshard_to_hf(args)

    # Reaching this point means every step succeeded, so the local scratch
    # space is no longer needed.
    shutil.rmtree(tmp_dir)


if __name__ == "__main__":
    main()

0 comments on commit 5cc5484

Please sign in to comment.