From 9798c088f820885f1cb3ce6a03ecc4bb8b72ec4c Mon Sep 17 00:00:00 2001
From: Muennighoff <n.muennighoff@gmail.com>
Date: Sun, 25 Feb 2024 10:22:36 +0100
Subject: [PATCH] Remove activation-checkpointing config variants

---
 configs/c4-medium_ckptfine.yaml               | 184 ------------------
 configs/c4-medium_ckptoneintwo.yaml           | 184 ------------------
 configs/c4-medium_ckptwhole.yaml              | 184 ------------------
 configs/c4-medium_nockpt.yaml                 | 182 -----------------
 configs/c4-small_nockpt.yaml                  | 183 -----------------
 configs/olmo_nockpt.yml                       |  86 --------
 configs/olmo_wholeckpt.yml                    |  88 ---------
 configs/v1_5-mix-medium-mitch-ish_nockpt.yaml |  98 ----------
 8 files changed, 1189 deletions(-)
 delete mode 100644 configs/c4-medium_ckptfine.yaml
 delete mode 100644 configs/c4-medium_ckptoneintwo.yaml
 delete mode 100644 configs/c4-medium_ckptwhole.yaml
 delete mode 100644 configs/c4-medium_nockpt.yaml
 delete mode 100644 configs/c4-small_nockpt.yaml
 delete mode 100644 configs/olmo_nockpt.yml
 delete mode 100644 configs/olmo_wholeckpt.yml
 delete mode 100644 configs/v1_5-mix-medium-mitch-ish_nockpt.yaml

diff --git a/configs/c4-medium_ckptfine.yaml b/configs/c4-medium_ckptfine.yaml
deleted file mode 100644
index f5e77a958..000000000
--- a/configs/c4-medium_ckptfine.yaml
+++ /dev/null
@@ -1,184 +0,0 @@
-run_name: c4-medium-run-001
-seed: 6198
-dry_run: false
-
-wandb:
-  name: ${run_name}
-
-activation_checkpointing: fine_grained
-
-model:
-  d_model: 4096
-  n_heads: 16
-  n_layers: 30
-  mlp_ratio: 8
-  alibi: true
-  alibi_bias_max: 8.0
-  attention_dropout: 0.0
-  attention_layer_norm: true
-  multi_query_attention: true
-  block_type: sequential
-  layer_norm_type: low_precision  # if not compiling, use 'low_precision'
-  activation_type: swiglu
-  residual_dropout: 0.0
-  embedding_dropout: 0.0
-  max_sequence_length: 2048
-  vocab_size: 50257
-  embedding_size: 50304
-  eos_token_id: 50256
-  pad_token_id: 50256
-  init_device: meta
-  init_std: 0.02
-
-compile: null  # causes instability on AMD GPUs
-
-optimizer:
-  name: lionw
-  learning_rate: 1.0e-4
-  weight_decay: 0.01
-  betas:
-  - 0.9
-  - 0.95
-
-scheduler:
-  name: cosine_with_warmup
-  t_warmup: 2000
-  t_max: null
-
-data:
-  paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-train.*.npy}
-  pad_direction: right
-  num_workers: 2
-  drop_last: true
-  pin_memory: true
-  prefetch_factor: 16
-  persistent_workers: true
-  timeout: 0
-
-tokenizer:
-  identifier: gpt2
-  truncate_direction: right
-
-save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}}
-save_overwrite: false
-# Sharded checkpoints (best for restarts)
-save_interval: 1000
-save_num_checkpoints_to_keep: 2
-# Unsharded checkpoints (for final storage)
-save_interval_unsharded: 50000
-save_num_unsharded_checkpoints_to_keep: -1
-
-load_path: null
-
-# max_duration: 476837  # 2T tokens
-max_duration: 50  # 200B tokens
-global_train_batch_size: 2048
-device_train_microbatch_size: 2
-
-precision: amp_bf16
-
-max_grad_norm: 1.0
-
-speed_monitor:
-  window_size: 20
-
-eval_interval: ${save_interval}
-eval_subset_num_batches: -1
-device_eval_batch_size: ${device_train_microbatch_size}
-evaluators:
-  ##########################
-  # Perplexity evaluations #
-  ##########################
-  - label: c4-validation
-    subset_num_batches: 10
-    data:
-      paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy}
-      num_workers: 1
-      drop_last: true
-      pin_memory: true
-      persistent_workers: true
-      prefetch_factor: 4
-
-  - label: rp-validation
-    subset_num_batches: 10
-    data:
-      paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy}
-      num_workers: 1
-      drop_last: true
-      pin_memory: true
-      persistent_workers: true
-      prefetch_factor: 4
-
-  # lump all the small datasets together (we still get separate metrics).
-  - label: all-small-ppl-validation
-    data:
-      datasets:
-        4chan-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/4chan/val.npy
-        c4_100_domains-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_100_domains/val.npy
-        c4_en-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_en/val.npy
-        gab-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/gab/val.npy
-        ice-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ice/val.npy
-        m2d2_s2orc-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_s2orc/val.npy
-        m2d2_wiki-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_wiki/val.npy
-        manosphere-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/manosphere/val.npy
-        mc4_en-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/mc4_en/val.npy
-        pile-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/pile/val.npy
-        ptb-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ptb/val.npy
-        twitterAEE-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/twitterAEE/val.npy
-        wikitext_103-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/wikitext_103/val.npy
-      drop_last: true
-
-  ##########################
-  # Downstream evaluations #
-  ##########################
-  - label: piqa
-    type: downstream
-
-  - label: hellaswag
-    type: downstream
-
-  - label: winogrande
-    type: downstream
-
-  - label: openbook_qa
-    type: downstream
-
-  # - label: boolq  # requires implemention of the pmi_dc matrix
-    # type: downstream
-    #
-  - label: sciq
-    type: downstream
-
-  - label: arc_easy
-    type: downstream
-
-  # - label: arc_challenge  # requires implemention of the pmi_dc matrix
-    # type: downstream
-    #
-  - label: copa
-    type: downstream
-
-  - label: rte
-    type: downstream
-
-  - label: commitment_bank
-    type: downstream
-
-  - label: mrpc
-    type: downstream
-
-  - label: sst2
-    type: downstream
diff --git a/configs/c4-medium_ckptoneintwo.yaml b/configs/c4-medium_ckptoneintwo.yaml
deleted file mode 100644
index 4ceb2901f..000000000
--- a/configs/c4-medium_ckptoneintwo.yaml
+++ /dev/null
@@ -1,184 +0,0 @@
-run_name: c4-medium-run-001
-seed: 6198
-dry_run: false
-
-wandb:
-  name: ${run_name}
-
-activation_checkpointing: one_in_two
-
-model:
-  d_model: 4096
-  n_heads: 16
-  n_layers: 30
-  mlp_ratio: 8
-  alibi: true
-  alibi_bias_max: 8.0
-  attention_dropout: 0.0
-  attention_layer_norm: true
-  multi_query_attention: true
-  block_type: sequential
-  layer_norm_type: low_precision  # if not compiling, use 'low_precision'
-  activation_type: swiglu
-  residual_dropout: 0.0
-  embedding_dropout: 0.0
-  max_sequence_length: 2048
-  vocab_size: 50257
-  embedding_size: 50304
-  eos_token_id: 50256
-  pad_token_id: 50256
-  init_device: meta
-  init_std: 0.02
-
-compile: null  # causes instability on AMD GPUs
-
-optimizer:
-  name: lionw
-  learning_rate: 1.0e-4
-  weight_decay: 0.01
-  betas:
-  - 0.9
-  - 0.95
-
-scheduler:
-  name: cosine_with_warmup
-  t_warmup: 2000
-  t_max: null
-
-data:
-  paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-train.*.npy}
-  pad_direction: right
-  num_workers: 2
-  drop_last: true
-  pin_memory: true
-  prefetch_factor: 16
-  persistent_workers: true
-  timeout: 0
-
-tokenizer:
-  identifier: gpt2
-  truncate_direction: right
-
-save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}}
-save_overwrite: false
-# Sharded checkpoints (best for restarts)
-save_interval: 1000
-save_num_checkpoints_to_keep: 2
-# Unsharded checkpoints (for final storage)
-save_interval_unsharded: 50000
-save_num_unsharded_checkpoints_to_keep: -1
-
-load_path: null
-
-# max_duration: 476837  # 2T tokens
-max_duration: 50  # 200B tokens
-global_train_batch_size: 2048
-device_train_microbatch_size: 2
-
-precision: amp_bf16
-
-max_grad_norm: 1.0
-
-speed_monitor:
-  window_size: 20
-
-eval_interval: ${save_interval}
-eval_subset_num_batches: -1
-device_eval_batch_size: ${device_train_microbatch_size}
-evaluators:
-  ##########################
-  # Perplexity evaluations #
-  ##########################
-  - label: c4-validation
-    subset_num_batches: 10
-    data:
-      paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy}
-      num_workers: 1
-      drop_last: true
-      pin_memory: true
-      persistent_workers: true
-      prefetch_factor: 4
-
-  - label: rp-validation
-    subset_num_batches: 10
-    data:
-      paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy}
-      num_workers: 1
-      drop_last: true
-      pin_memory: true
-      persistent_workers: true
-      prefetch_factor: 4
-
-  # lump all the small datasets together (we still get separate metrics).
-  - label: all-small-ppl-validation
-    data:
-      datasets:
-        4chan-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/4chan/val.npy
-        c4_100_domains-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_100_domains/val.npy
-        c4_en-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_en/val.npy
-        gab-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/gab/val.npy
-        ice-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ice/val.npy
-        m2d2_s2orc-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_s2orc/val.npy
-        m2d2_wiki-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_wiki/val.npy
-        manosphere-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/manosphere/val.npy
-        mc4_en-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/mc4_en/val.npy
-        pile-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/pile/val.npy
-        ptb-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ptb/val.npy
-        twitterAEE-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/twitterAEE/val.npy
-        wikitext_103-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/wikitext_103/val.npy
-      drop_last: true
-
-  ##########################
-  # Downstream evaluations #
-  ##########################
-  - label: piqa
-    type: downstream
-
-  - label: hellaswag
-    type: downstream
-
-  - label: winogrande
-    type: downstream
-
-  - label: openbook_qa
-    type: downstream
-
-  # - label: boolq  # requires implemention of the pmi_dc matrix
-    # type: downstream
-    #
-  - label: sciq
-    type: downstream
-
-  - label: arc_easy
-    type: downstream
-
-  # - label: arc_challenge  # requires implemention of the pmi_dc matrix
-    # type: downstream
-    #
-  - label: copa
-    type: downstream
-
-  - label: rte
-    type: downstream
-
-  - label: commitment_bank
-    type: downstream
-
-  - label: mrpc
-    type: downstream
-
-  - label: sst2
-    type: downstream
diff --git a/configs/c4-medium_ckptwhole.yaml b/configs/c4-medium_ckptwhole.yaml
deleted file mode 100644
index 2e8084d32..000000000
--- a/configs/c4-medium_ckptwhole.yaml
+++ /dev/null
@@ -1,184 +0,0 @@
-run_name: c4-medium-run-001
-seed: 6198
-dry_run: false
-
-wandb:
-  name: ${run_name}
-
-activation_checkpointing: whole_layer
-
-model:
-  d_model: 4096
-  n_heads: 16
-  n_layers: 30
-  mlp_ratio: 8
-  alibi: true
-  alibi_bias_max: 8.0
-  attention_dropout: 0.0
-  attention_layer_norm: true
-  multi_query_attention: true
-  block_type: sequential
-  layer_norm_type: low_precision  # if not compiling, use 'low_precision'
-  activation_type: swiglu
-  residual_dropout: 0.0
-  embedding_dropout: 0.0
-  max_sequence_length: 2048
-  vocab_size: 50257
-  embedding_size: 50304
-  eos_token_id: 50256
-  pad_token_id: 50256
-  init_device: meta
-  init_std: 0.02
-
-compile: null  # causes instability on AMD GPUs
-
-optimizer:
-  name: lionw
-  learning_rate: 1.0e-4
-  weight_decay: 0.01
-  betas:
-  - 0.9
-  - 0.95
-
-scheduler:
-  name: cosine_with_warmup
-  t_warmup: 2000
-  t_max: null
-
-data:
-  paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-train.*.npy}
-  pad_direction: right
-  num_workers: 2
-  drop_last: true
-  pin_memory: true
-  prefetch_factor: 16
-  persistent_workers: true
-  timeout: 0
-
-tokenizer:
-  identifier: gpt2
-  truncate_direction: right
-
-save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}}
-save_overwrite: false
-# Sharded checkpoints (best for restarts)
-save_interval: 1000
-save_num_checkpoints_to_keep: 2
-# Unsharded checkpoints (for final storage)
-save_interval_unsharded: 50000
-save_num_unsharded_checkpoints_to_keep: -1
-
-load_path: null
-
-# max_duration: 476837  # 2T tokens
-max_duration: 50  # 200B tokens
-global_train_batch_size: 2048
-device_train_microbatch_size: 2
-
-precision: amp_bf16
-
-max_grad_norm: 1.0
-
-speed_monitor:
-  window_size: 20
-
-eval_interval: ${save_interval}
-eval_subset_num_batches: -1
-device_eval_batch_size: ${device_train_microbatch_size}
-evaluators:
-  ##########################
-  # Perplexity evaluations #
-  ##########################
-  - label: c4-validation
-    subset_num_batches: 10
-    data:
-      paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy}
-      num_workers: 1
-      drop_last: true
-      pin_memory: true
-      persistent_workers: true
-      prefetch_factor: 4
-
-  - label: rp-validation
-    subset_num_batches: 10
-    data:
-      paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy}
-      num_workers: 1
-      drop_last: true
-      pin_memory: true
-      persistent_workers: true
-      prefetch_factor: 4
-
-  # lump all the small datasets together (we still get separate metrics).
-  - label: all-small-ppl-validation
-    data:
-      datasets:
-        4chan-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/4chan/val.npy
-        c4_100_domains-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_100_domains/val.npy
-        c4_en-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_en/val.npy
-        gab-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/gab/val.npy
-        ice-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ice/val.npy
-        m2d2_s2orc-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_s2orc/val.npy
-        m2d2_wiki-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_wiki/val.npy
-        manosphere-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/manosphere/val.npy
-        mc4_en-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/mc4_en/val.npy
-        pile-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/pile/val.npy
-        ptb-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ptb/val.npy
-        twitterAEE-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/twitterAEE/val.npy
-        wikitext_103-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/wikitext_103/val.npy
-      drop_last: true
-
-  ##########################
-  # Downstream evaluations #
-  ##########################
-  - label: piqa
-    type: downstream
-
-  - label: hellaswag
-    type: downstream
-
-  - label: winogrande
-    type: downstream
-
-  - label: openbook_qa
-    type: downstream
-
-  # - label: boolq  # requires implemention of the pmi_dc matrix
-    # type: downstream
-    #
-  - label: sciq
-    type: downstream
-
-  - label: arc_easy
-    type: downstream
-
-  # - label: arc_challenge  # requires implemention of the pmi_dc matrix
-    # type: downstream
-    #
-  - label: copa
-    type: downstream
-
-  - label: rte
-    type: downstream
-
-  - label: commitment_bank
-    type: downstream
-
-  - label: mrpc
-    type: downstream
-
-  - label: sst2
-    type: downstream
diff --git a/configs/c4-medium_nockpt.yaml b/configs/c4-medium_nockpt.yaml
deleted file mode 100644
index 0d862117b..000000000
--- a/configs/c4-medium_nockpt.yaml
+++ /dev/null
@@ -1,182 +0,0 @@
-run_name: c4-medium-run-001
-seed: 6198
-dry_run: false
-
-wandb:
-  name: ${run_name}
-
-model:
-  d_model: 4096
-  n_heads: 16
-  n_layers: 30
-  mlp_ratio: 8
-  alibi: true
-  alibi_bias_max: 8.0
-  attention_dropout: 0.0
-  attention_layer_norm: true
-  multi_query_attention: true
-  block_type: sequential
-  layer_norm_type: low_precision  # if not compiling, use 'low_precision'
-  activation_type: swiglu
-  residual_dropout: 0.0
-  embedding_dropout: 0.0
-  max_sequence_length: 2048
-  vocab_size: 50257
-  embedding_size: 50304
-  eos_token_id: 50256
-  pad_token_id: 50256
-  init_device: meta
-  init_std: 0.02
-
-compile: null  # causes instability on AMD GPUs
-
-optimizer:
-  name: lionw
-  learning_rate: 1.0e-4
-  weight_decay: 0.01
-  betas:
-  - 0.9
-  - 0.95
-
-scheduler:
-  name: cosine_with_warmup
-  t_warmup: 2000
-  t_max: null
-
-data:
-  paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-train.*.npy}
-  pad_direction: right
-  num_workers: 2
-  drop_last: true
-  pin_memory: true
-  prefetch_factor: 16
-  persistent_workers: true
-  timeout: 0
-
-tokenizer:
-  identifier: gpt2
-  truncate_direction: right
-
-save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}}
-save_overwrite: false
-# Sharded checkpoints (best for restarts)
-save_interval: 1000
-save_num_checkpoints_to_keep: 2
-# Unsharded checkpoints (for final storage)
-save_interval_unsharded: 50000
-save_num_unsharded_checkpoints_to_keep: -1
-
-load_path: null
-
-# max_duration: 476837  # 2T tokens
-max_duration: 50  # 200B tokens
-global_train_batch_size: 2048
-device_train_microbatch_size: 2
-
-precision: amp_bf16
-
-max_grad_norm: 1.0
-
-speed_monitor:
-  window_size: 20
-
-eval_interval: ${save_interval}
-eval_subset_num_batches: -1
-device_eval_batch_size: ${device_train_microbatch_size}
-evaluators:
-  ##########################
-  # Perplexity evaluations #
-  ##########################
-  - label: c4-validation
-    subset_num_batches: 10
-    data:
-      paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy}
-      num_workers: 1
-      drop_last: true
-      pin_memory: true
-      persistent_workers: true
-      prefetch_factor: 4
-
-  - label: rp-validation
-    subset_num_batches: 10
-    data:
-      paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy}
-      num_workers: 1
-      drop_last: true
-      pin_memory: true
-      persistent_workers: true
-      prefetch_factor: 4
-
-  # lump all the small datasets together (we still get separate metrics).
-  - label: all-small-ppl-validation
-    data:
-      datasets:
-        4chan-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/4chan/val.npy
-        c4_100_domains-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_100_domains/val.npy
-        c4_en-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_en/val.npy
-        gab-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/gab/val.npy
-        ice-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ice/val.npy
-        m2d2_s2orc-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_s2orc/val.npy
-        m2d2_wiki-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_wiki/val.npy
-        manosphere-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/manosphere/val.npy
-        mc4_en-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/mc4_en/val.npy
-        pile-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/pile/val.npy
-        ptb-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ptb/val.npy
-        twitterAEE-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/twitterAEE/val.npy
-        wikitext_103-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/wikitext_103/val.npy
-      drop_last: true
-
-  ##########################
-  # Downstream evaluations #
-  ##########################
-  - label: piqa
-    type: downstream
-
-  - label: hellaswag
-    type: downstream
-
-  - label: winogrande
-    type: downstream
-
-  - label: openbook_qa
-    type: downstream
-
-  # - label: boolq  # requires implemention of the pmi_dc matrix
-    # type: downstream
-    #
-  - label: sciq
-    type: downstream
-
-  - label: arc_easy
-    type: downstream
-
-  # - label: arc_challenge  # requires implemention of the pmi_dc matrix
-    # type: downstream
-    #
-  - label: copa
-    type: downstream
-
-  - label: rte
-    type: downstream
-
-  - label: commitment_bank
-    type: downstream
-
-  - label: mrpc
-    type: downstream
-
-  - label: sst2
-    type: downstream
diff --git a/configs/c4-small_nockpt.yaml b/configs/c4-small_nockpt.yaml
deleted file mode 100644
index bdc2e04a9..000000000
--- a/configs/c4-small_nockpt.yaml
+++ /dev/null
@@ -1,183 +0,0 @@
-run_name: c4-small-run-001
-seed: 6198
-dry_run: false
-
-wandb:
-  name: ${run_name}
-  project: c4-small
-
-model:
-  d_model: 2048
-  n_heads: 16
-  n_layers: 16
-  mlp_ratio: 8
-  alibi: true
-  alibi_bias_max: 8.0
-  attention_dropout: 0.0
-  attention_layer_norm: true
-  multi_query_attention: true
-  block_type: sequential
-  layer_norm_type: low_precision  # if not compiling, use 'low_precision'
-  activation_type: swiglu
-  residual_dropout: 0.0
-  embedding_dropout: 0.0
-  max_sequence_length: 2048
-  vocab_size: 50257
-  embedding_size: 50304
-  eos_token_id: 50256
-  pad_token_id: 50256
-  init_device: meta
-  init_std: 0.02
-
-compile: null  # causes instability on AMD GPUs
-
-optimizer:
-  name: lionw
-  learning_rate: 2.0e-4
-  weight_decay: 0.01
-  betas:
-  - 0.9
-  - 0.95
-
-scheduler:
-  name: cosine_with_warmup
-  t_warmup: 2000
-  t_max: null
-
-data:
-  paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-train.*.npy}
-  pad_direction: right
-  num_workers: 2
-  drop_last: true
-  pin_memory: true
-  prefetch_factor: 16
-  persistent_workers: true
-  timeout: 0
-
-tokenizer:
-  identifier: gpt2
-  truncate_direction: right
-
-save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}}
-save_overwrite: false
-# Sharded checkpoints (best for restarts)
-save_interval: 1000
-save_num_checkpoints_to_keep: 9
-# Unsharded checkpoints (for final storage)
-save_interval_unsharded: 10000
-save_num_unsharded_checkpoints_to_keep: -1
-
-load_path: null
-
-# max_duration: 476837  # 2T tokens
-max_duration: 50  # 200B tokens
-global_train_batch_size: 2048
-device_train_microbatch_size: 8
-
-precision: amp_bf16
-  
-max_grad_norm: 1.0
-
-speed_monitor:
-  window_size: 20
-
-eval_interval: ${save_interval}
-eval_subset_num_batches: -1
-device_eval_batch_size: ${device_train_microbatch_size}
-evaluators:
-  ##########################
-  # Perplexity evaluations #
-  ##########################
-  - label: c4-validation
-    subset_num_batches: 10
-    data:
-      paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy}
-      num_workers: 1
-      drop_last: true
-      pin_memory: true
-      persistent_workers: true
-      prefetch_factor: 4
-
-  - label: rp-validation
-    subset_num_batches: 10
-    data:
-      paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy}
-      num_workers: 1
-      drop_last: true
-      pin_memory: true
-      persistent_workers: true
-      prefetch_factor: 4
-
-  # lump all the small datasets together (we still get separate metrics).
-  - label: all-small-ppl-validation
-    data:
-      datasets:
-        4chan-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/4chan/val.npy
-        c4_100_domains-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_100_domains/val.npy
-        c4_en-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_en/val.npy
-        gab-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/gab/val.npy
-        ice-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ice/val.npy
-        m2d2_s2orc-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_s2orc/val.npy
-        m2d2_wiki-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_wiki/val.npy
-        manosphere-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/manosphere/val.npy
-        mc4_en-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/mc4_en/val.npy
-        pile-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/pile/val.npy
-        ptb-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ptb/val.npy
-        twitterAEE-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/twitterAEE/val.npy
-        wikitext_103-validation:
-          - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/wikitext_103/val.npy
-      drop_last: true
-
-  ##########################
-  # Downstream evaluations #
-  ##########################
-  - label: piqa
-    type: downstream
-
-  - label: hellaswag
-    type: downstream
-
-  - label: winogrande
-    type: downstream
-
-  - label: openbook_qa
-    type: downstream
-
-  # - label: boolq  # requires implemention of the pmi_dc matrix
-    # type: downstream
-    #
-  - label: sciq
-    type: downstream
-
-  - label: arc_easy
-    type: downstream
-
-  # - label: arc_challenge  # requires implemention of the pmi_dc matrix
-    # type: downstream
-    #
-  - label: copa
-    type: downstream
-
-  - label: rte
-    type: downstream
-
-  - label: commitment_bank
-    type: downstream
-
-  - label: mrpc
-    type: downstream
-
-  - label: sst2
-    type: downstream
diff --git a/configs/olmo_nockpt.yml b/configs/olmo_nockpt.yml
deleted file mode 100644
index f09396738..000000000
--- a/configs/olmo_nockpt.yml
+++ /dev/null
@@ -1,86 +0,0 @@
-run_name: olmo-small-ablation
-seed: 6198
-dry_run: false
-
-wandb:
-  name: ${run_name}
-  project: c4-small
-
-model:
-  d_model: 2048
-  n_heads: 16
-  n_layers: 16
-  mlp_ratio: 8
-  alibi: true
-  alibi_bias_max: 8.0
-  attention_dropout: 0.0
-  attention_layer_norm: true
-  multi_query_attention: true
-  block_type: sequential
-  layer_norm_type: low_precision  # if not compiling, use 'low_precision'
-  activation_type: swiglu
-  residual_dropout: 0.0
-  embedding_dropout: 0.0
-  max_sequence_length: 2048
-  vocab_size: 50277
-  embedding_size: 50304
-  eos_token_id: 50276
-  pad_token_id: 50276
-  init_device: meta
-  init_std: 0.02
-
-compile: null  # causes instability on AMD GPUs
-
-optimizer:
-  name: lionw
-  learning_rate: 1.0e-4
-  weight_decay: 0.01
-  betas:
-  - 0.9
-  - 0.95
-
-scheduler:
-  name: cosine_with_warmup
-  t_warmup: 2000
-  t_max: null
-
-data:
-  paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4_en/gpt-neox-20b/c4-train.*.npy}
-  pad_direction: right
-  num_workers: 4
-  drop_last: true
-  pin_memory: true
-  prefetch_factor: 16
-  persistent_workers: true
-  timeout: 0
-
-tokenizer:
-  identifier: EleutherAI/gpt-neox-20b
-  truncate_direction: right
-
-save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}}
-save_overwrite: false
-# Sharded checkpoints (best for restarts)
-save_interval: 1000
-save_num_checkpoints_to_keep: 9
-# Unsharded checkpoints (for final storage)
-save_interval_unsharded: 10000
-save_num_unsharded_checkpoints_to_keep: -1
-
-load_path: null
-
-# max_duration: 953674  # 2T tokens
-max_duration: 50  # 200B tokens
-global_train_batch_size: 1024
-device_train_microbatch_size: 8
-
-precision: amp_bf16
-
-max_grad_norm: 1.0
-
-speed_monitor:
-  window_size: 20
-
-eval_interval: ${save_interval}
-eval_subset_num_batches: -1
-device_eval_batch_size: ${device_train_microbatch_size}
\ No newline at end of file
diff --git a/configs/olmo_wholeckpt.yml b/configs/olmo_wholeckpt.yml
deleted file mode 100644
index efc53dbf2..000000000
--- a/configs/olmo_wholeckpt.yml
+++ /dev/null
@@ -1,88 +0,0 @@
-run_name: olmo-small-ablation
-seed: 6198
-dry_run: false
-
-wandb:
-  name: ${run_name}
-  project: c4-small
-
-activation_checkpointing: whole_layer
-
-model:
-  d_model: 2048
-  n_heads: 16
-  n_layers: 16
-  mlp_ratio: 8
-  alibi: true
-  alibi_bias_max: 8.0
-  attention_dropout: 0.0
-  attention_layer_norm: true
-  multi_query_attention: true
-  block_type: sequential
-  layer_norm_type: low_precision  # if not compiling, use 'low_precision'
-  activation_type: swiglu
-  residual_dropout: 0.0
-  embedding_dropout: 0.0
-  max_sequence_length: 2048
-  vocab_size: 50277
-  embedding_size: 50304
-  eos_token_id: 50276
-  pad_token_id: 50276
-  init_device: meta
-  init_std: 0.02
-
-compile: null  # causes instability on AMD GPUs
-
-optimizer:
-  name: lionw
-  learning_rate: 1.0e-4
-  weight_decay: 0.01
-  betas:
-  - 0.9
-  - 0.95
-
-scheduler:
-  name: cosine_with_warmup
-  t_warmup: 2000
-  t_max: null
-
-data:
-  paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4_en/gpt-neox-20b/c4-train.*.npy}
-  pad_direction: right
-  num_workers: 4
-  drop_last: true
-  pin_memory: true
-  prefetch_factor: 16
-  persistent_workers: true
-  timeout: 0
-
-tokenizer:
-  identifier: EleutherAI/gpt-neox-20b
-  truncate_direction: right
-
-save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}}
-save_overwrite: false
-# Sharded checkpoints (best for restarts)
-save_interval: 1000
-save_num_checkpoints_to_keep: 9
-# Unsharded checkpoints (for final storage)
-save_interval_unsharded: 10000
-save_num_unsharded_checkpoints_to_keep: -1
-
-load_path: null
-
-# max_duration: 953674  # 2T tokens
-max_duration: 50  # 200B tokens
-global_train_batch_size: 1024
-device_train_microbatch_size: 8
-
-precision: amp_bf16
-
-max_grad_norm: 1.0
-
-speed_monitor:
-  window_size: 20
-
-eval_interval: ${save_interval}
-eval_subset_num_batches: -1
-device_eval_batch_size: ${device_train_microbatch_size}
\ No newline at end of file
diff --git a/configs/v1_5-mix-medium-mitch-ish_nockpt.yaml b/configs/v1_5-mix-medium-mitch-ish_nockpt.yaml
deleted file mode 100644
index 0c0974f0e..000000000
--- a/configs/v1_5-mix-medium-mitch-ish_nockpt.yaml
+++ /dev/null
@@ -1,98 +0,0 @@
-run_name: v1_5-mix-medium-mitch-ish
-seed: 6198
-dry_run: false
-
-wandb:
-  name: ${run_name}
-  project: olmo-medium
-  group: v1_5-mix
-
-model:
-  d_model: 4096
-  n_heads: 32
-  n_layers: 32
-  # mlp_ratio: 6
-  mlp_hidden_size: 22016
-  weight_tying: false
-  alibi: false
-  rope: true
-  flash_attention: false  # not available on AMD
-  attention_dropout: 0.0
-  attention_layer_norm: false
-  multi_query_attention: false
-  include_bias: false
-  block_type: sequential
-  layer_norm_type: default
-  layer_norm_with_affine: false
-  bias_for_layer_norm: false
-  attention_layer_norm_with_affine: false
-  activation_type: swiglu
-  residual_dropout: 0.0
-  embedding_dropout: 0.0
-  max_sequence_length: 2048
-  vocab_size: 50280
-  embedding_size: 50304
-  eos_token_id: 0
-  pad_token_id: 1
-  init_device: meta
-  init_fn: mitchell
-
-compile: null
-
-optimizer:
-  name: adamw
-  learning_rate: 3.0e-4
-  weight_decay: 0.1
-  betas:
-  - 0.9
-  - 0.95
-  metrics_log_interval: 10
-
-scheduler:
-  name: linear_with_warmup
-  t_warmup: 5000
-  alpha_f: 0.1
-  grad_clip_warmup_steps: 1000
-  grad_clip_warmup_factor: 10.0
-
-tokenizer:
-  identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json
-  truncate_direction: right
-
-save_folder: ${oc.env:CHECKPOINTS_PATH}/${oc.env:SLURM_JOB_ID,${run_name}}
-save_overwrite: false
-# Sharded checkpoints (best for restarts)
-save_interval: 1000
-save_num_checkpoints_to_keep: -1
-# Unsharded checkpoints (for final storage)
-save_interval_unsharded: null  # getting errors on LUMI right now
-save_num_unsharded_checkpoints_to_keep: -1
-no_pre_train_checkpoint: true
-
-load_path: null
-
-max_duration: 50  # 2T tokens
-global_train_batch_size: 2048
-device_train_microbatch_size: 2
-
-precision: amp_bf16
-
-fsdp:
-  wrapping_strategy: null
-  precision: mixed
-
-max_grad_norm: 1.0
-max_grad_norm_ratio: null
-
-speed_monitor:
-  window_size: 20
-
-data:
-  paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4_en/gpt-neox-20b/c4-train.*.npy}
-  pad_direction: right
-  num_workers: 0
-  drop_last: true
-  pin_memory: true
-  prefetch_factor: 16
-  persistent_workers: true
-  timeout: 0