diff --git a/submitit_train.py b/submitit_train.py
index 89a62b57..0b64937f 100644
--- a/submitit_train.py
+++ b/submitit_train.py
@@ -6,19 +6,22 @@ if __name__ == "__main__":
     executor = submitit.AutoExecutor(folder="~/slurm_jobs/titan/job_%j")
-    n_gpus = 4
+    n_gpus = 8
     executor.update_parameters(
-        name="titan", timeout_min=15,
+        name="titan", timeout_min=3 * 24 * 60,
         gpus_per_node=n_gpus,
-        nodes=1, mem_gb=40, cpus_per_task=n_gpus * 2
+        nodes=1, mem_gb=80, cpus_per_task=n_gpus * 4,
+        slurm_additional_parameters={
+            "partition": "h100"
+        }
     )

     jobs = []
     with executor.batch():
         for _ in range(1):
-            # train_config = './train_configs/chemlactica_125m.toml'
+            train_config = './train_configs/chemlactica_125m.toml'
             # train_config = './train_configs/chemlactica_1.3b.toml'
-            train_config = './train_configs/llama3_8b.toml'
+            # train_config = './train_configs/llama3_8b.toml'
             # train_config = './train_configs/debug_model.toml'
             function = submitit.helpers.CommandFunction([
                 'python3', '-m', 'torch.distributed.run',
diff --git a/torchtitan/checkpoint.py b/torchtitan/checkpoint.py
index 5431d93d..5086c30a 100644
--- a/torchtitan/checkpoint.py
+++ b/torchtitan/checkpoint.py
@@ -171,6 +171,7 @@ def __init__(
         lr_schedulers: List[torch.optim.lr_scheduler.LRScheduler],
         states: Dict[str, Any],
         job_config: JobConfig,
+        experiment_hash: str,
     ) -> None:
         ckpt_config = job_config.checkpoint
         self.enable_checkpoint = ckpt_config.enable_checkpoint
@@ -235,7 +236,7 @@ def __init__(
         for idx, lr_scheduler in enumerate(lr_schedulers):
             self.states[f"lr_scheduler_{idx}"] = lr_scheduler

-        self.save_folder = os.path.join(job_config.job.dump_folder, ckpt_config.save_folder)
+        self.save_folder = os.path.join(job_config.job.dump_folder, os.path.join(ckpt_config.save_folder, experiment_hash))
         self.load_folder = os.path.join(job_config.job.dump_folder, ckpt_config.load_folder)
         self.interval_type = (
             IntervalType.SECONDS
diff --git a/torchtitan/config_manager.py b/torchtitan/config_manager.py
index 85c7b687..14ef3a4e 100644
--- a/torchtitan/config_manager.py
+++ b/torchtitan/config_manager.py
@@ -392,12 +392,20 @@ def __init__(self):
             help="Whether to enable checkpoint",
         )
         self.parser.add_argument(
-            "--checkpoint.folder",
+            "--checkpoint.load_folder",
+            type=str,
+            default="",
+            help="""
+                The folder to load the checkpoints from.
+                When enable_checkpoint is set to true, checkpoints will be loaded from {--job.dump_folder}/{--checkpoint.load_folder}.
+            """,
+        )
+        self.parser.add_argument(
+            "--checkpoint.save_folder",
             type=str,
-            default="checkpoint",
             help="""
                 The folder to store the checkpoints.
-                When enable_checkpoint is set to true, checkpoints will be in {--job.dump_folder}/{--checkpoint.folder}.
+                When enable_checkpoint is set to true, checkpoints will be saved to {--job.dump_folder}/{--checkpoint.save_folder}.
""", ) self.parser.add_argument( @@ -643,13 +651,13 @@ def parse_args(self, args_list: list = sys.argv[1:]): args, cmd_args = self.parse_args_from_command_line(args_list) config_file = getattr(args, "job.config_file", None) # build up a two level dict - args_dict = self._args_to_two_level_dict(args) + self.args_dict = self._args_to_two_level_dict(args) if config_file is not None: try: with open(config_file, "rb") as f: for k, v in tomllib.load(f).items(): # to prevent overwrite of non-specified keys - args_dict[k] |= v + self.args_dict[k] |= v except (FileNotFoundError, tomllib.TOMLDecodeError) as e: logger.exception( f"Error while loading the configuration file: {config_file}" @@ -661,9 +669,9 @@ def parse_args(self, args_list: list = sys.argv[1:]): cmd_args_dict = self._args_to_two_level_dict(cmd_args) for section, section_args in cmd_args_dict.items(): for k, v in section_args.items(): - args_dict[section][k] = v + self.args_dict[section][k] = v - for k, v in args_dict.items(): + for k, v in self.args_dict.items(): class_type = type(k.title(), (), v) setattr(self, k, class_type()) self._validate_config() diff --git a/torchtitan/datasets/hf_datasets.py b/torchtitan/datasets/hf_datasets.py index fc98b35b..6840f469 100644 --- a/torchtitan/datasets/hf_datasets.py +++ b/torchtitan/datasets/hf_datasets.py @@ -6,6 +6,9 @@ import pickle from typing import Any, Dict, List, Optional +from pathlib import Path +import glob +import os import numpy as np @@ -33,7 +36,8 @@ _supported_datasets = { "c4_test": "test/assets/c4_test", "c4": "allenai/c4", - "chemlactica_train_mini": "test/assets/chemlactica_train_mini" + "chemlactica_train_mini": "test/assets/chemlactica_train_mini", + "chemlactica_train": "/nfs/dgx/raid/chem/data/rdkit_computed_rel+form/train_rdkit_computed_rel+form" } _supported_data_processing_styles = { @@ -111,13 +115,16 @@ def __init__( # c4 is huge, and requires both streaming and language selection # (we default to en) ds = load_dataset(dataset_path, name="en", split="train", streaming=True) - else: + elif dataset_name == "c4_test": ds = load_dataset(dataset_path, split="train") - + else: + dataset_files = glob.glob(os.path.join(dataset_path, "*.jsonl")) + ds = load_dataset("text", data_files=dataset_files, split="train", streaming=True) try: data_processing_fn = _supported_data_processing_styles[data_processing_style] except KeyError as e: raise ValueError(f"Unsupported data processing style: {data_processing_style}") + # data_processing_fn = lambda x, e: str(x) # TODO: support shuffling and checkpointing self.dataset_name = dataset_name @@ -217,9 +224,8 @@ class DPAwareDataLoader(StatefulDataLoader, Stateful): """ A wrapper around the StatefulDataLoader that ensures that the state is stored only once per DP rank. 
""" - def __init__(self, dp_rank: int, hf_ds: IterableDataset, batch_size: int, pin_memory: bool, num_workers: int): - super().__init__(hf_ds, batch_size) + super().__init__(hf_ds, batch_size, num_workers=num_workers) self._dp_rank = dp_rank self._rank_id = f"dp_rank_{dp_rank}" @@ -251,7 +257,7 @@ def build_hf_data_loader( rank, infinite: bool = True, pin_memory: bool = False, - num_workers: int = 0, + num_workers: int = 2, special_mode = None, context = "train", ): diff --git a/torchtitan/logging.py b/torchtitan/logging.py index 32d2ee1c..8bdddacf 100644 --- a/torchtitan/logging.py +++ b/torchtitan/logging.py @@ -27,6 +27,9 @@ def init_logger(log_level): # suppress verbose torch.profiler logging os.environ["KINETO_LOG_LEVEL"] = "5" + # enable dataloading logging for logging the type of dataloading used + enable_dataloader_logging(log_level) + class LogLevel(Enum): DEBUG = "DEBUG" @@ -46,3 +49,6 @@ def from_string(cls, value: str): def validate_log_level(value): return LogLevel.from_string(value) + +def enable_dataloader_logging(log_level): + logging.getLogger('datasets.iterable_dataset').setLevel(log_level) \ No newline at end of file diff --git a/torchtitan/metrics.py b/torchtitan/metrics.py index 732d0e8f..ad0570e1 100644 --- a/torchtitan/metrics.py +++ b/torchtitan/metrics.py @@ -116,6 +116,12 @@ def log_hparams(self, config): if self.writer is not None: self.writer.experiment['hparams'] = config + @property + def experiment_hash(self): + if self.writer is None: + return "default" + return self.writer._run.hash + def build_metric_logger( job_config: JobConfig, parallel_dims: ParallelDims ): @@ -127,7 +133,7 @@ def build_metric_logger( """ dump_dir = job_config.job.dump_folder aim_config = job_config.metrics - save_aim_folder = aim_config.save_aim_folder + save_aim_folder = os.path.join(job_config.job.dump_folder, aim_config.save_aim_folder) # since we don't have run id, use current minute as the identifier datetime_str = datetime.now().strftime("%Y%m%d-%H%M") log_dir = os.path.join(dump_dir, datetime_str) diff --git a/torchtitan/models/opt/model.py b/torchtitan/models/opt/model.py index 8eefc2b4..28c9eefc 100644 --- a/torchtitan/models/opt/model.py +++ b/torchtitan/models/opt/model.py @@ -24,10 +24,9 @@ class ModelArgs: n_heads: int = 12 n_kv_heads: Optional[int] = None vocab_size: int = -1 # defined later by tokenizer - multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2 + multiple_of: int = 256 ffn_dim_multiplier: Optional[float] = None norm_eps: float = 1e-5 - rope_theta: float = 10000 dropout_p: float = 0.1 max_batch_size: int = 32 @@ -35,7 +34,7 @@ class ModelArgs: # If `True`, then each transformer block init uses its layer ID, and if # `False`, each uses the total number of transformer blocks depth_init: bool = True - norm_type: str = "layersnorm" + norm_type: str = "layernorm_bias" def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor: diff --git a/torchtitan/models/opt/utils.py b/torchtitan/models/opt/utils.py index 8ec295ed..0248462d 100644 --- a/torchtitan/models/opt/utils.py +++ b/torchtitan/models/opt/utils.py @@ -73,7 +73,7 @@ def export_opt_weights(model: OPT, save_dir: str, token_embedding_size: int): """ write docs """ - hf_model = OPTForCausalLM.from_pretrained(map_n_layers_to_model_name(model.n_layers)) + hf_model = OPTForCausalLM.from_pretrained(map_n_layers_to_model_name(model.n_layers), tie_word_embeddings=False) hf_model.resize_token_embeddings(new_num_tokens=token_embedding_size) keys_mapping = 
     state_dict = model.state_dict()
diff --git a/torchtitan/tokenizers/tokenizer/custom.py b/torchtitan/tokenizers/tokenizer/custom.py
index 386b6c22..7f38deb8 100644
--- a/torchtitan/tokenizers/tokenizer/custom.py
+++ b/torchtitan/tokenizers/tokenizer/custom.py
@@ -7,10 +7,13 @@
 # copied and adjusted from https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py

 from typing import List
+import os

 from torchtitan.logging import logger
 from transformers import AutoTokenizer

+os.environ["TOKENIZERS_PARALLELISM"] = "true"
+

 class CustomTokenizer:
     """
diff --git a/torchtitan/utils/dataset_utils.py b/torchtitan/utils/dataset_utils.py
index 5a68ce3b..40ef7aae 100644
--- a/torchtitan/utils/dataset_utils.py
+++ b/torchtitan/utils/dataset_utils.py
@@ -32,13 +32,14 @@ def load_jsonl_line(jsonl_line):

 def chemlactica_style_data_processing(sample_json, rng):
     try:
+        sample_json = json.loads(sample_json["text"])
         compound = delete_empty_tags(sample_json)
         sample_json = generate_formatted_string(
             compound, rng
         )
     except Exception as e:
         print(e)
-        sample_json = {}
+        sample_json = ""
     return sample_json

diff --git a/train.py b/train.py
index 808c1813..b7ee0d23 100644
--- a/train.py
+++ b/train.py
@@ -7,6 +7,7 @@
 import contextlib
 import os
 import time
+import logging
 from datetime import timedelta

 import torch
@@ -186,6 +187,9 @@ def loss_fn(pred, labels):

     train_state = TrainState()

+    metric_logger = build_metric_logger(job_config, parallel_dims)
+    metric_logger.log_hparams(job_config.args_dict)
+
     # load initial checkpoint
     checkpoint = CheckpointManager(
         dataloader=data_loader,
@@ -194,6 +198,7 @@ def loss_fn(pred, labels):
         lr_schedulers=lr_schedulers.schedulers,
         states={"train_state": train_state},
         job_config=job_config,
+        experiment_hash=metric_logger.experiment_hash
     )

     if job_config.model_download_export.to_titan:
@@ -218,11 +223,6 @@ def loss_fn(pred, labels):
             logger.info("Created huggingface checkpoint")
         return

-    metric_logger = build_metric_logger(job_config, parallel_dims)
-    args, cmd_args = job_config.parse_args_from_command_line(job_config.args_list)
-    job_config_dict = job_config._args_to_two_level_dict(args)
-    metric_logger.log_hparams(job_config_dict)
-
     data_iterator = iter(data_loader)

     train_context = get_train_context(
@@ -284,12 +284,14 @@ def loss_fn(pred, labels):
                 # need to free to before bwd to avoid peaking memory
                 del pred
                 loss.backward()
+
+                for m in model_parts:
+                    torch.nn.utils.clip_grad_norm_(
+                        m.parameters(), job_config.training.max_norm, foreach=True
+                    )
+
                 if force_finish_train:
                     break
-                for m in model_parts:
-                    torch.nn.utils.clip_grad_norm_(
-                        m.parameters(), job_config.training.max_norm, foreach=True
-                    )

                 # sync float8 amaxes and scales
                 float8_handler.sync_float8_amax_and_scale_history(model_parts)
diff --git a/train_configs/chemlactica_1.3b.toml b/train_configs/chemlactica_1.3b.toml
index 4e4e0546..58d5bf15 100644
--- a/train_configs/chemlactica_1.3b.toml
+++ b/train_configs/chemlactica_1.3b.toml
@@ -15,31 +15,35 @@ save_memory_snapshot_folder = "memory_snapshot"
 [metrics]
 log_freq = 1
 enable_color_printing = true
-enable_tensorboard = true
-save_tb_folder = "tb"
+enable_aim = true
+save_aim_folder = "aim"

 [model]
 name = "opt"
 flavor = "1.3B"
-# norm_type = "layernorm_bias" # layernorm / np_layernorm / rmsnorm / compiled_rmsnorm / fused_rmsnorm
-norm_type = "rmsnorm"
+norm_type = "layernorm_bias" # layernorm / np_layernorm / rmsnorm / compiled_rmsnorm / fused_rmsnorm
 # test tokenizer.model, for debug purpose only
-tokenizer_path = "./test/assets/test_tiktoken.model"
+# tokenizer_path = "./test/assets/test_tiktoken.model"
+tokenizer_path = "./torchtitan/tokenizers/chemlactica-125m"

 [optimizer]
 name = "AdamW"
-lr = 8e-4
+lr = 1.0e-4

 [training]
-batch_size = 10
+batch_size = 13
+gradient_accumulation_steps = 9
 seq_len = 2048
-warmup_steps = 2 # lr scheduler warm up, normally 20% of the train steps
+warmup_steps = 500 # lr scheduler warm up, normally 20% of the train steps
 max_norm = 1.0 # grad norm clipping
-steps = 50
+steps = 18000
 data_parallel_degree = -1
 tensor_parallel_degree = 1
-compile = false
-dataset = "c4" # supported datasets: c4_test (2K), c4 (177M)
+compile = true
+# dataset = "c4" # supported datasets: c4_test (2K), c4 (177M)
+# dataset = "chemlactica_train_mini" # supported datasets: c4_test (2K), c4 (177M), chemlactica_train_mini (4K)
+dataset = "chemlactica_train"
+data_process_style="chemlactica_style"

 [experimental]
 pipeline_parallel_degree = 1
@@ -47,17 +51,16 @@ enable_async_tensor_parallel = false

 [checkpoint]
 enable_checkpoint = true
-create_seed_checkpoint = false
 load_folder = "facebook/galactica-1.3b"
 save_folder = "yerevann/chemlactica-1.3b"
 interval_type = "steps"
-interval = 100
+interval = 2000
 model_weights_only = false
 export_dtype = "float32"
 async_mode = "async_with_pinned_mem" # ["disabled", "async", "async_with_pinned_mem"]

 [activation_checkpoint]
-mode = 'selective' # ['none', 'selective', 'full']
+mode = 'none' # ['none', 'selective', 'full']
 selective_ac_option = '2' # 'int' = ac every positive int layer or 'op', ac based on ops policy

 [float8]
diff --git a/train_configs/chemlactica_125m.toml b/train_configs/chemlactica_125m.toml
index 1546d069..35d19872 100644
--- a/train_configs/chemlactica_125m.toml
+++ b/train_configs/chemlactica_125m.toml
@@ -15,7 +15,7 @@ save_memory_snapshot_folder = "memory_snapshot"
 [metrics]
 log_freq = 1
 enable_color_printing = true
-enable_aim = false
+enable_aim = true
 save_aim_folder = "aim"
 #aim_hash = "c6b4d8b340f74287b82ef928"
 #aim_experiment_name = "hello"
@@ -29,19 +29,21 @@ tokenizer_path = "./torchtitan/tokenizers/chemlactica-125m"

 [optimizer]
 name = "AdamW"
-lr = 8e-4
+lr = 1.4e-3

 [training]
-batch_size = 1
+batch_size = 20
+gradient_accumulation_steps = 8
 seq_len = 2048
-warmup_steps = 2 # lr scheduler warm up, normally 20% of the train steps
+warmup_steps = 500 # lr scheduler warm up, normally 20% of the train steps
 max_norm = 1.0 # grad norm clipping
-steps = 50
+steps = 18000
 data_parallel_degree = -1
 tensor_parallel_degree = 1
-compile = false
+compile = true
 # dataset = "c4" # supported datasets: c4_test (2K), c4 (177M)
-dataset = "chemlactica_train_mini" # supported datasets: c4_test (2K), c4 (177M), chemlactica_train_mini (4K)
+# dataset = "chemlactica_train_mini" # supported datasets: c4_test (2K), c4 (177M), chemlactica_train_mini (4K)
+dataset = "chemlactica_train"
 data_process_style="chemlactica_style"

 [experimental]
@@ -54,13 +56,13 @@ create_seed_checkpoint = false
 load_folder = "facebook/galactica-125m"
 save_folder = "yerevann/chemlactica-125m"
 interval_type = "steps"
-interval = 100
+interval = 2000
 model_weights_only = false
 export_dtype = "float32"
 async_mode = "async_with_pinned_mem" # ["disabled", "async", "async_with_pinned_mem"]

 [activation_checkpoint]
-mode = 'selective' # ['none', 'selective', 'full']
+mode = 'none' # ['none', 'selective', 'full']
 selective_ac_option = '2' # 'int' = ac every positive int layer or 'op', ac based on ops policy

 [float8]
diff --git a/train_configs/chemlactica_debug.toml b/train_configs/chemlactica_debug.toml
new file mode 100644
index 00000000..ae40acb0
--- /dev/null
+++ b/train_configs/chemlactica_debug.toml
@@ -0,0 +1,72 @@
+# torchtitan Config.toml
+
+[job]
+dump_folder = "/nfs/dgx/raid/chem/titan_outputs"
+description = "Galactica training"
+use_for_integration_test = false
+
+[profiling]
+enable_profiling = false
+save_traces_folder = "profile_trace"
+profile_freq = 10
+enable_memory_snapshot = false
+save_memory_snapshot_folder = "memory_snapshot"
+
+[metrics]
+log_freq = 1
+enable_color_printing = true
+enable_aim = true
+save_aim_folder = "aim"
+#aim_hash = "c6b4d8b340f74287b82ef928"
+#aim_experiment_name = "hello"
+
+[model]
+name = "opt"
+flavor = "debugmodel"
+norm_type = "layernorm_bias" # layernorm / np_layernorm / rmsnorm / compiled_rmsnorm / fused_rmsnorm
+# test tokenizer.model, for debug purpose only
+tokenizer_path = "./torchtitan/tokenizers/chemlactica-125m"
+
+[optimizer]
+name = "AdamW"
+lr = 1.4e-3
+
+[training]
+batch_size = 1
+gradient_accumulation_steps = 1
+seq_len = 2048
+warmup_steps = 500 # lr scheduler warm up, normally 20% of the train steps
+max_norm = 1.0 # grad norm clipping
+steps = 50
+data_parallel_degree = -1
+tensor_parallel_degree = 1
+compile = true
+# dataset = "c4" # supported datasets: c4_test (2K), c4 (177M)
+dataset = "chemlactica_train_mini" # supported datasets: c4_test (2K), c4 (177M), chemlactica_train_mini (4K)
+# dataset = "chemlactica_train"
+data_process_style="chemlactica_style"
+
+[dataloader]
+num_workers = 1
+
+[experimental]
+pipeline_parallel_degree = 1
+enable_async_tensor_parallel = false
+
+[checkpoint]
+enable_checkpoint = true
+create_seed_checkpoint = false
+# load_folder = "facebook/galactica-125m"
+save_folder = "yerevann/chemlactica-125m"
+interval_type = "steps"
+interval = 2000
+model_weights_only = false
+export_dtype = "float32"
+async_mode = "async_with_pinned_mem" # ["disabled", "async", "async_with_pinned_mem"]
+
+[activation_checkpoint]
+mode = 'none' # ['none', 'selective', 'full']
+selective_ac_option = '2' # 'int' = ac every positive int layer or 'op', ac based on ops policy
+
+[float8]
+enable_float8_linear = false
diff --git a/train_configs/debug_model.toml b/train_configs/debug_model.toml
index fdd360d0..2829d098 100644
--- a/train_configs/debug_model.toml
+++ b/train_configs/debug_model.toml
@@ -41,7 +41,7 @@ max_norm = 1.0 # grad norm clipping
 steps = 30
 data_parallel_degree = -1
 tensor_parallel_degree = 1
-compile = false
+compile = true
 dataset = "chemlactica_train_mini" # supported datasets: c4_test (2K), c4 (177M), chemlactica_train_mini (4K)
 data_process_style="chemlactica_style"

diff --git a/train_configs/galactica_1.3b_conversion.toml b/train_configs/galactica_1.3b_conversion.toml
new file mode 100644
index 00000000..8f671e8c
--- /dev/null
+++ b/train_configs/galactica_1.3b_conversion.toml
@@ -0,0 +1,72 @@
+# torchtitan Config.toml
+
+[job]
+dump_folder = "/nfs/dgx/raid/chem/titan_outputs"
+description = "Galactica training"
+use_for_integration_test = false
+
+[profiling]
+enable_profiling = false
+save_traces_folder = "profile_trace"
+profile_freq = 10
+enable_memory_snapshot = false
+save_memory_snapshot_folder = "memory_snapshot"
+
+[metrics]
+log_freq = 1
+enable_color_printing = true
+enable_aim = false
+save_aim_folder = "aim"
+
+[model]
+name = "opt"
+flavor = "1.3B"
+norm_type = "layernorm_bias" # layernorm / np_layernorm / rmsnorm / compiled_rmsnorm / fused_rmsnorm
+# test tokenizer.model, for debug purpose only
+# tokenizer_path = "./test/assets/test_tiktoken.model"
"./test/assets/test_tiktoken.model" +tokenizer_path = "./torchtitan/tokenizers/chemlactica-125m" + +[optimizer] +name = "AdamW" +lr = 1.0e-4 + +[training] +batch_size = 20 +gradient_accumulation_steps = 3 +seq_len = 2048 +warmup_steps = 500 # lr scheduler warm up, normally 20% of the train steps +max_norm = 1.0 # grad norm clipping +steps = 10000 +data_parallel_degree = -1 +tensor_parallel_degree = 1 +compile = true +# dataset = "c4" # supported datasets: c4_test (2K), c4 (177M) +# dataset = "chemlactica_train_mini" # supported datasets: c4_test (2K), c4 (177M), chemlactica_train_mini (4K) +dataset = "chemlactica_train" +data_process_style="chemlactica_style" + +[experimental] +pipeline_parallel_degree = 1 +enable_async_tensor_parallel = false + +[checkpoint] +enable_checkpoint = true +load_folder = "facebook/galactica-1.3b" +save_folder = "facebook/galactica-1.3b" +interval_type = "steps" +interval = 1000 +model_weights_only = false +export_dtype = "float32" +async_mode = "async_with_pinned_mem" # ["disabled", "async", "async_with_pinned_mem"] + +[model_download_export] +to_titan = true +weights_source = "huggingface" +# to_hf = true + +[activation_checkpoint] +mode = 'selective' # ['none', 'selective', 'full'] +selective_ac_option = '2' # 'int' = ac every positive int layer or 'op', ac based on ops policy + +[float8] +enable_float8_linear = false diff --git a/train_configs/galactica_125m_conversion.toml b/train_configs/galactica_125m_conversion.toml index 74119aae..21368c86 100644 --- a/train_configs/galactica_125m_conversion.toml +++ b/train_configs/galactica_125m_conversion.toml @@ -51,9 +51,12 @@ enable_async_tensor_parallel = false [checkpoint] enable_checkpoint = true -load_folder = "facebook/galactica-125m" -save_folder = "facebook/galactica-125m" -# save_folder = "hf/facebook/galactica-125m" +# use for hf to titan +# load_folder = "facebook/galactica-125m" +# save_folder = "facebook/galactica-125m" +# use for titan to hf +load_folder = "yerevann/chemlactica-125m" +save_folder = "hf/yerevann/chemlactica-125m" interval_type = "steps" interval = 5 model_weights_only = false @@ -61,9 +64,9 @@ export_dtype = "float32" async_mode = "async_with_pinned_mem" # ["disabled", "async", "async_with_pinned_mem"] [model_download_export] -to_titan = true -weights_source = "huggingface" -# to_hf = true +# to_titan = true +# weights_source = "huggingface" +to_hf = true [activation_checkpoint] mode = 'selective' # ['none', 'selective', 'full']