From 9798c088f820885f1cb3ce6a03ecc4bb8b72ec4c Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Sun, 25 Feb 2024 10:22:36 +0100 Subject: [PATCH] Rm confs --- configs/c4-medium_ckptfine.yaml | 184 ------------------ configs/c4-medium_ckptoneintwo.yaml | 184 ------------------ configs/c4-medium_ckptwhole.yaml | 184 ------------------ configs/c4-medium_nockpt.yaml | 182 ----------------- configs/c4-small_nockpt.yaml | 183 ----------------- configs/olmo_nockpt.yml | 86 -------- configs/olmo_wholeckpt.yml | 88 --------- configs/v1_5-mix-medium-mitch-ish_nockpt.yaml | 98 ---------- 8 files changed, 1189 deletions(-) delete mode 100644 configs/c4-medium_ckptfine.yaml delete mode 100644 configs/c4-medium_ckptoneintwo.yaml delete mode 100644 configs/c4-medium_ckptwhole.yaml delete mode 100644 configs/c4-medium_nockpt.yaml delete mode 100644 configs/c4-small_nockpt.yaml delete mode 100644 configs/olmo_nockpt.yml delete mode 100644 configs/olmo_wholeckpt.yml delete mode 100644 configs/v1_5-mix-medium-mitch-ish_nockpt.yaml diff --git a/configs/c4-medium_ckptfine.yaml b/configs/c4-medium_ckptfine.yaml deleted file mode 100644 index f5e77a958..000000000 --- a/configs/c4-medium_ckptfine.yaml +++ /dev/null @@ -1,184 +0,0 @@ -run_name: c4-medium-run-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - -activation_checkpointing: fine_grained - -model: - d_model: 4096 - n_heads: 16 - n_layers: 30 - mlp_ratio: 8 - alibi: true - alibi_bias_max: 8.0 - attention_dropout: 0.0 - attention_layer_norm: true - multi_query_attention: true - block_type: sequential - layer_norm_type: low_precision # if not compiling, use 'low_precision' - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50257 - embedding_size: 50304 - eos_token_id: 50256 - pad_token_id: 50256 - init_device: meta - init_std: 0.02 - -compile: null # causes instability on AMD GPUs - -optimizer: - name: lionw - learning_rate: 1.0e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.95 - -scheduler: - name: cosine_with_warmup - t_warmup: 2000 - t_max: null - -data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-train.*.npy} - pad_direction: right - num_workers: 2 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0 - -tokenizer: - identifier: gpt2 - truncate_direction: right - -save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: 2 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: 50000 -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -# max_duration: 476837 # 2T tokens -max_duration: 50 # 200B tokens -global_train_batch_size: 2048 -device_train_microbatch_size: 2 - -precision: amp_bf16 - -max_grad_norm: 1.0 - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - ########################## - # Perplexity evaluations # - ########################## - - label: c4-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy} - num_workers: 1 - drop_last: true - pin_memory: true - persistent_workers: true - prefetch_factor: 4 - - - label: rp-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy} - num_workers: 1 - drop_last: true - pin_memory: true - persistent_workers: true - prefetch_factor: 4 - - # lump all the small datasets together (we still get separate metrics). - - label: all-small-ppl-validation - data: - datasets: - 4chan-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/4chan/val.npy - c4_100_domains-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_100_domains/val.npy - c4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_en/val.npy - gab-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/gab/val.npy - ice-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ice/val.npy - m2d2_s2orc-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_wiki/val.npy - manosphere-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/manosphere/val.npy - mc4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/mc4_en/val.npy - pile-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/pile/val.npy - ptb-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ptb/val.npy - twitterAEE-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/twitterAEE/val.npy - wikitext_103-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/wikitext_103/val.npy - drop_last: true - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: mrpc - type: downstream - - - label: sst2 - type: downstream diff --git a/configs/c4-medium_ckptoneintwo.yaml b/configs/c4-medium_ckptoneintwo.yaml deleted file mode 100644 index 4ceb2901f..000000000 --- a/configs/c4-medium_ckptoneintwo.yaml +++ /dev/null @@ -1,184 +0,0 @@ -run_name: c4-medium-run-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - -activation_checkpointing: one_in_two - -model: - d_model: 4096 - n_heads: 16 - n_layers: 30 - mlp_ratio: 8 - alibi: true - alibi_bias_max: 8.0 - attention_dropout: 0.0 - attention_layer_norm: true - multi_query_attention: true - block_type: sequential - layer_norm_type: low_precision # if not compiling, use 'low_precision' - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50257 - embedding_size: 50304 - eos_token_id: 50256 - pad_token_id: 50256 - init_device: meta - init_std: 0.02 - -compile: null # causes instability on AMD GPUs - -optimizer: - name: lionw - learning_rate: 1.0e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.95 - -scheduler: - name: cosine_with_warmup - t_warmup: 2000 - t_max: null - -data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-train.*.npy} - pad_direction: right - num_workers: 2 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0 - -tokenizer: - identifier: gpt2 - truncate_direction: right - -save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: 2 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: 50000 -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -# max_duration: 476837 # 2T tokens -max_duration: 50 # 200B tokens -global_train_batch_size: 2048 -device_train_microbatch_size: 2 - -precision: amp_bf16 - -max_grad_norm: 1.0 - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - ########################## - # Perplexity evaluations # - ########################## - - label: c4-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy} - num_workers: 1 - drop_last: true - pin_memory: true - persistent_workers: true - prefetch_factor: 4 - - - label: rp-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy} - num_workers: 1 - drop_last: true - pin_memory: true - persistent_workers: true - prefetch_factor: 4 - - # lump all the small datasets together (we still get separate metrics). - - label: all-small-ppl-validation - data: - datasets: - 4chan-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/4chan/val.npy - c4_100_domains-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_100_domains/val.npy - c4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_en/val.npy - gab-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/gab/val.npy - ice-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ice/val.npy - m2d2_s2orc-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_wiki/val.npy - manosphere-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/manosphere/val.npy - mc4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/mc4_en/val.npy - pile-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/pile/val.npy - ptb-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ptb/val.npy - twitterAEE-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/twitterAEE/val.npy - wikitext_103-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/wikitext_103/val.npy - drop_last: true - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: mrpc - type: downstream - - - label: sst2 - type: downstream diff --git a/configs/c4-medium_ckptwhole.yaml b/configs/c4-medium_ckptwhole.yaml deleted file mode 100644 index 2e8084d32..000000000 --- a/configs/c4-medium_ckptwhole.yaml +++ /dev/null @@ -1,184 +0,0 @@ -run_name: c4-medium-run-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - -activation_checkpointing: whole_layer - -model: - d_model: 4096 - n_heads: 16 - n_layers: 30 - mlp_ratio: 8 - alibi: true - alibi_bias_max: 8.0 - attention_dropout: 0.0 - attention_layer_norm: true - multi_query_attention: true - block_type: sequential - layer_norm_type: low_precision # if not compiling, use 'low_precision' - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50257 - embedding_size: 50304 - eos_token_id: 50256 - pad_token_id: 50256 - init_device: meta - init_std: 0.02 - -compile: null # causes instability on AMD GPUs - -optimizer: - name: lionw - learning_rate: 1.0e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.95 - -scheduler: - name: cosine_with_warmup - t_warmup: 2000 - t_max: null - -data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-train.*.npy} - pad_direction: right - num_workers: 2 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0 - -tokenizer: - identifier: gpt2 - truncate_direction: right - -save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: 2 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: 50000 -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -# max_duration: 476837 # 2T tokens -max_duration: 50 # 200B tokens -global_train_batch_size: 2048 -device_train_microbatch_size: 2 - -precision: amp_bf16 - -max_grad_norm: 1.0 - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - ########################## - # Perplexity evaluations # - ########################## - - label: c4-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy} - num_workers: 1 - drop_last: true - pin_memory: true - persistent_workers: true - prefetch_factor: 4 - - - label: rp-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy} - num_workers: 1 - drop_last: true - pin_memory: true - persistent_workers: true - prefetch_factor: 4 - - # lump all the small datasets together (we still get separate metrics). - - label: all-small-ppl-validation - data: - datasets: - 4chan-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/4chan/val.npy - c4_100_domains-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_100_domains/val.npy - c4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_en/val.npy - gab-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/gab/val.npy - ice-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ice/val.npy - m2d2_s2orc-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_wiki/val.npy - manosphere-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/manosphere/val.npy - mc4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/mc4_en/val.npy - pile-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/pile/val.npy - ptb-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ptb/val.npy - twitterAEE-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/twitterAEE/val.npy - wikitext_103-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/wikitext_103/val.npy - drop_last: true - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: mrpc - type: downstream - - - label: sst2 - type: downstream diff --git a/configs/c4-medium_nockpt.yaml b/configs/c4-medium_nockpt.yaml deleted file mode 100644 index 0d862117b..000000000 --- a/configs/c4-medium_nockpt.yaml +++ /dev/null @@ -1,182 +0,0 @@ -run_name: c4-medium-run-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - -model: - d_model: 4096 - n_heads: 16 - n_layers: 30 - mlp_ratio: 8 - alibi: true - alibi_bias_max: 8.0 - attention_dropout: 0.0 - attention_layer_norm: true - multi_query_attention: true - block_type: sequential - layer_norm_type: low_precision # if not compiling, use 'low_precision' - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50257 - embedding_size: 50304 - eos_token_id: 50256 - pad_token_id: 50256 - init_device: meta - init_std: 0.02 - -compile: null # causes instability on AMD GPUs - -optimizer: - name: lionw - learning_rate: 1.0e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.95 - -scheduler: - name: cosine_with_warmup - t_warmup: 2000 - t_max: null - -data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-train.*.npy} - pad_direction: right - num_workers: 2 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0 - -tokenizer: - identifier: gpt2 - truncate_direction: right - -save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: 2 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: 50000 -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -# max_duration: 476837 # 2T tokens -max_duration: 50 # 200B tokens -global_train_batch_size: 2048 -device_train_microbatch_size: 2 - -precision: amp_bf16 - -max_grad_norm: 1.0 - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - ########################## - # Perplexity evaluations # - ########################## - - label: c4-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy} - num_workers: 1 - drop_last: true - pin_memory: true - persistent_workers: true - prefetch_factor: 4 - - - label: rp-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy} - num_workers: 1 - drop_last: true - pin_memory: true - persistent_workers: true - prefetch_factor: 4 - - # lump all the small datasets together (we still get separate metrics). - - label: all-small-ppl-validation - data: - datasets: - 4chan-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/4chan/val.npy - c4_100_domains-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_100_domains/val.npy - c4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_en/val.npy - gab-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/gab/val.npy - ice-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ice/val.npy - m2d2_s2orc-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_wiki/val.npy - manosphere-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/manosphere/val.npy - mc4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/mc4_en/val.npy - pile-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/pile/val.npy - ptb-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ptb/val.npy - twitterAEE-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/twitterAEE/val.npy - wikitext_103-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/wikitext_103/val.npy - drop_last: true - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: mrpc - type: downstream - - - label: sst2 - type: downstream diff --git a/configs/c4-small_nockpt.yaml b/configs/c4-small_nockpt.yaml deleted file mode 100644 index bdc2e04a9..000000000 --- a/configs/c4-small_nockpt.yaml +++ /dev/null @@ -1,183 +0,0 @@ -run_name: c4-small-run-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: c4-small - -model: - d_model: 2048 - n_heads: 16 - n_layers: 16 - mlp_ratio: 8 - alibi: true - alibi_bias_max: 8.0 - attention_dropout: 0.0 - attention_layer_norm: true - multi_query_attention: true - block_type: sequential - layer_norm_type: low_precision # if not compiling, use 'low_precision' - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50257 - embedding_size: 50304 - eos_token_id: 50256 - pad_token_id: 50256 - init_device: meta - init_std: 0.02 - -compile: null # causes instability on AMD GPUs - -optimizer: - name: lionw - learning_rate: 2.0e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.95 - -scheduler: - name: cosine_with_warmup - t_warmup: 2000 - t_max: null - -data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-train.*.npy} - pad_direction: right - num_workers: 2 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0 - -tokenizer: - identifier: gpt2 - truncate_direction: right - -save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: 9 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: 10000 -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -# max_duration: 476837 # 2T tokens -max_duration: 50 # 200B tokens -global_train_batch_size: 2048 -device_train_microbatch_size: 8 - -precision: amp_bf16 - -max_grad_norm: 1.0 - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - ########################## - # Perplexity evaluations # - ########################## - - label: c4-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy} - num_workers: 1 - drop_last: true - pin_memory: true - persistent_workers: true - prefetch_factor: 4 - - - label: rp-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy} - num_workers: 1 - drop_last: true - pin_memory: true - persistent_workers: true - prefetch_factor: 4 - - # lump all the small datasets together (we still get separate metrics). - - label: all-small-ppl-validation - data: - datasets: - 4chan-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/4chan/val.npy - c4_100_domains-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_100_domains/val.npy - c4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_en/val.npy - gab-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/gab/val.npy - ice-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ice/val.npy - m2d2_s2orc-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_wiki/val.npy - manosphere-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/manosphere/val.npy - mc4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/mc4_en/val.npy - pile-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/pile/val.npy - ptb-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ptb/val.npy - twitterAEE-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/twitterAEE/val.npy - wikitext_103-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/wikitext_103/val.npy - drop_last: true - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: mrpc - type: downstream - - - label: sst2 - type: downstream diff --git a/configs/olmo_nockpt.yml b/configs/olmo_nockpt.yml deleted file mode 100644 index f09396738..000000000 --- a/configs/olmo_nockpt.yml +++ /dev/null @@ -1,86 +0,0 @@ -run_name: olmo-small-ablation -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: c4-small - -model: - d_model: 2048 - n_heads: 16 - n_layers: 16 - mlp_ratio: 8 - alibi: true - alibi_bias_max: 8.0 - attention_dropout: 0.0 - attention_layer_norm: true - multi_query_attention: true - block_type: sequential - layer_norm_type: low_precision # if not compiling, use 'low_precision' - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50277 - embedding_size: 50304 - eos_token_id: 50276 - pad_token_id: 50276 - init_device: meta - init_std: 0.02 - -compile: null # causes instability on AMD GPUs - -optimizer: - name: lionw - learning_rate: 1.0e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.95 - -scheduler: - name: cosine_with_warmup - t_warmup: 2000 - t_max: null - -data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4_en/gpt-neox-20b/c4-train.*.npy} - pad_direction: right - num_workers: 4 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0 - -tokenizer: - identifier: EleutherAI/gpt-neox-20b - truncate_direction: right - -save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: 9 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: 10000 -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -# max_duration: 953674 # 2T tokens -max_duration: 50 # 200B tokens -global_train_batch_size: 1024 -device_train_microbatch_size: 8 - -precision: amp_bf16 - -max_grad_norm: 1.0 - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} \ No newline at end of file diff --git a/configs/olmo_wholeckpt.yml b/configs/olmo_wholeckpt.yml deleted file mode 100644 index efc53dbf2..000000000 --- a/configs/olmo_wholeckpt.yml +++ /dev/null @@ -1,88 +0,0 @@ -run_name: olmo-small-ablation -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: c4-small - -activation_checkpointing: whole_layer - -model: - d_model: 2048 - n_heads: 16 - n_layers: 16 - mlp_ratio: 8 - alibi: true - alibi_bias_max: 8.0 - attention_dropout: 0.0 - attention_layer_norm: true - multi_query_attention: true - block_type: sequential - layer_norm_type: low_precision # if not compiling, use 'low_precision' - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50277 - embedding_size: 50304 - eos_token_id: 50276 - pad_token_id: 50276 - init_device: meta - init_std: 0.02 - -compile: null # causes instability on AMD GPUs - -optimizer: - name: lionw - learning_rate: 1.0e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.95 - -scheduler: - name: cosine_with_warmup - t_warmup: 2000 - t_max: null - -data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4_en/gpt-neox-20b/c4-train.*.npy} - pad_direction: right - num_workers: 4 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0 - -tokenizer: - identifier: EleutherAI/gpt-neox-20b - truncate_direction: right - -save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: 9 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: 10000 -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -# max_duration: 953674 # 2T tokens -max_duration: 50 # 200B tokens -global_train_batch_size: 1024 -device_train_microbatch_size: 8 - -precision: amp_bf16 - -max_grad_norm: 1.0 - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} \ No newline at end of file diff --git a/configs/v1_5-mix-medium-mitch-ish_nockpt.yaml b/configs/v1_5-mix-medium-mitch-ish_nockpt.yaml deleted file mode 100644 index 0c0974f0e..000000000 --- a/configs/v1_5-mix-medium-mitch-ish_nockpt.yaml +++ /dev/null @@ -1,98 +0,0 @@ -run_name: v1_5-mix-medium-mitch-ish -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-medium - group: v1_5-mix - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: false # not available on AMD - attention_dropout: 0.0 - attention_layer_norm: false - multi_query_attention: false - include_bias: false - block_type: sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 3.0e-4 - weight_decay: 0.1 - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 5000 - alpha_f: 0.1 - grad_clip_warmup_steps: 1000 - grad_clip_warmup_factor: 10.0 - -tokenizer: - identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json - truncate_direction: right - -save_folder: ${oc.env:CHECKPOINTS_PATH}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: -1 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: null # getting errors on LUMI right now -save_num_unsharded_checkpoints_to_keep: -1 -no_pre_train_checkpoint: true - -load_path: null - -max_duration: 50 # 2T tokens -global_train_batch_size: 2048 -device_train_microbatch_size: 2 - -precision: amp_bf16 - -fsdp: - wrapping_strategy: null - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 20 - -data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4_en/gpt-neox-20b/c4-train.*.npy} - pad_direction: right - num_workers: 0 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0