From 3bf47d6430cc04689ea1d200b1e8ff5ed3948834 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 13 Dec 2024 15:04:55 -0800 Subject: [PATCH] replace create_seed_checkpoint.md with a note in docs/checkpoint.md ghstack-source-id: 6b70ca7604d6701fac0e34d623826d91922a0424 Pull Request resolved: https://github.com/pytorch/torchtitan/pull/736 --- create_seed_checkpoint.sh | 34 ---------------------------------- docs/checkpoint.md | 17 ++++++++++++++--- test_runner.py | 22 ++-------------------- torchtitan/utils.py | 2 +- 4 files changed, 17 insertions(+), 58 deletions(-) delete mode 100755 create_seed_checkpoint.sh diff --git a/create_seed_checkpoint.sh b/create_seed_checkpoint.sh deleted file mode 100755 index a9c2b216..00000000 --- a/create_seed_checkpoint.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/bash -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. - -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# -# create_seed_checkpoint.sh -# -# Run this script to create a seed checkpoint used to initialize a model from step-0. -# Seed checkpoints are used to initialize pipeline-parallel models since the model initializer -# functions don't cleanly run on chunked model parts after meta-initialization. -# -# Use the same model config to generate your seed checkpoint as you use for training. -# e.g. 
-# CONFIG=<path to model_config> ./create_seed_checkpoint.sh - -set -ex - -NGPU=1 -LOG_RANK=0 -CONFIG_FILE=${CONFIG_FILE:-"./train_configs/debug_model.toml"} - -seed_checkpoint="--checkpoint.enable_checkpoint --checkpoint.create_seed_checkpoint" -force_1d="--training.data_parallel_shard_degree 1 --training.tensor_parallel_degree 1 --experimental.pipeline_parallel_degree 1" -overrides="" -if [ $# -ne 0 ]; then - overrides="$*" -fi - -torchrun --nproc_per_node=${NGPU} --rdzv_backend c10d --rdzv_endpoint="localhost:0" \ ---local-ranks-filter ${LOG_RANK} --role rank --tee 3 \ -train.py --job.config_file ${CONFIG_FILE} $seed_checkpoint $force_1d $overrides diff --git a/docs/checkpoint.md b/docs/checkpoint.md index a4f2ecc8..3f66e5ac 100644 --- a/docs/checkpoint.md +++ b/docs/checkpoint.md @@ -1,4 +1,4 @@ -## How to convert a Llama3 checkpoint for use in torchtitan +## How to convert a Llama 3 checkpoint for use in torchtitan If you want to continue training from an existing model checkpoint, the checkpoint must be in the DCP format expected by the checkpoint manager. An example script for converting the original Llama3 checkpoints into the expected DCP format can be found in `scripts/convert_llama_to_dcp.py`. @@ -9,8 +9,7 @@ python3 scripts/convert_llama_to_dcp.py <input_dir> <output_dir> ``` - -## How to Convert a torchtitan Checkpoint for Use in torchtune +## How to convert a torchtitan checkpoint for use in torchtune This guide will walk you through the steps required to convert a checkpoint from torchtitan so that it can be loaded into torchtune. @@ -66,3 +65,15 @@ python -m torch.distributed.checkpoint.format_utils dcp_to_torch torchtitan/outp ``` That's it. You have now successfully converted a sharded torchtitan checkpoint for use in torchtune. + + +## How to create a seed checkpoint +Sometimes one needs to create a seed checkpoint to initialize a model from step 0. +E.g. it is hard, if not impossible, for meta initialization on multiple devices to reproduce the initialization on a single device. 
+A seed checkpoint does initialization of the model on a single CPU, and can be loaded from another job on an arbitrary number of GPUs via DCP resharding. + +To create a seed checkpoint, use the same model config as you use for training. +e.g. +```bash +NGPU=1 CONFIG_FILE=<path_to_model_config> ./run_llama_train.sh --checkpoint.enable_checkpoint --checkpoint.create_seed_checkpoint --training.data_parallel_shard_degree 1 +``` diff --git a/test_runner.py b/test_runner.py index 9a52ce19..a0c8fe4f 100755 --- a/test_runner.py +++ b/test_runner.py @@ -30,7 +30,6 @@ class OverrideDefinitions: override_args: Sequence[Sequence[str]] = tuple(tuple(" ")) test_descr: str = "default" test_name: str = "default" - requires_seed_checkpoint: bool = False ngpu: int = 4 model_flavor: str = "debugmodel" @@ -146,7 +145,6 @@ def build_test_list(): ], "PP looped zero bubble test", "pp_looped_zero_bubble", - requires_seed_checkpoint=True, ngpu=4, ), OverrideDefinitions( @@ -160,7 +158,6 @@ def build_test_list(): ], "PP 1D test 1F1B", "pp_1f1b", - requires_seed_checkpoint=True, ngpu=2, ), OverrideDefinitions( @@ -174,7 +171,6 @@ def build_test_list(): ], "PP 1D test GPipe", "pp_gpipe", - requires_seed_checkpoint=True, ngpu=2, ), OverrideDefinitions( @@ -188,7 +184,6 @@ def build_test_list(): ], "PP+DP 1F1B 2D test", "pp_dp_1f1b", - requires_seed_checkpoint=True, ), OverrideDefinitions( [ @@ -201,7 +196,6 @@ def build_test_list(): ], "PP+DP GPipe 2D test", "pp_dp_gpipe", - requires_seed_checkpoint=True, ), OverrideDefinitions( [ @@ -213,7 +207,6 @@ def build_test_list(): ], "PP+TP 2D test", "pp_tp", - requires_seed_checkpoint=True, ), OverrideDefinitions( [ @@ -233,7 +226,6 @@ def build_test_list(): ], "PP+DP+TP 3D test with save/load resume ckpt", "pp_dp_tp", - requires_seed_checkpoint=True, ngpu=8, ), OverrideDefinitions( @@ -247,7 +239,6 @@ def build_test_list(): ], "PP+DP+TP 3D test with torch.compile", "3d_compile", - requires_seed_checkpoint=True, ngpu=8, ), OverrideDefinitions( @@ -260,7 +251,6 @@ def 
build_test_list(): ], "PP looped 1F1B test", "pp_looped_1f1b", - requires_seed_checkpoint=True, ngpu=4, ), OverrideDefinitions( @@ -384,7 +374,7 @@ def build_test_list(): ] ], "FSDP2 Memory Tracking and Estimation", - "fsdp2_mem_tracker", + "fsdp2_memory_estimation", ngpu=2, ), OverrideDefinitions( @@ -421,17 +411,9 @@ def run_test(test_flavor: OverrideDefinitions, full_path: str, output_dir: str): model_flavor_arg = f"--model.flavor {test_flavor.model_flavor}" all_ranks = ",".join(map(str, range(test_flavor.ngpu))) - if test_flavor.requires_seed_checkpoint: - cmd = f"CONFIG_FILE={full_path} ./create_seed_checkpoint.sh {dump_folder_arg} {model_flavor_arg}" - logger.info( - f"=====Integration test, flavor : {test_flavor.test_descr}, command : {cmd}=====" - ) - result = _run_cmd(cmd) - logger.info(result.stdout) - for override_arg in test_flavor.override_args: cmd = f"CONFIG_FILE={full_path} NGPU={test_flavor.ngpu} LOG_RANK={all_ranks} ./run_llama_train.sh" - if test_name == "fsdp2_mem_tracker": + if test_name == "fsdp2_memory_estimation": cmd = ( f"CONFIG_FILE={full_path} NGPU={test_flavor.ngpu} LOG_RANK={all_ranks} " "./scripts/estimate/run_memory_estimation.sh" diff --git a/torchtitan/utils.py b/torchtitan/utils.py index 55062762..c377b7f5 100644 --- a/torchtitan/utils.py +++ b/torchtitan/utils.py @@ -111,7 +111,7 @@ def set_determinism( # As long as we are not in the 1-D (PP-only) case, we will have a seed to use for all ranks of the SPMD mesh. # IF PP is also used, this seed is unique per PP rank. - if spmd_mesh: + if spmd_mesh and spmd_mesh.get_coordinate() is not None: torch.distributed.tensor._random.manual_seed(seed, spmd_mesh)