From d69e598e0dfd34f8d7ff3083924fd88a0eb07836 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Sun, 25 Feb 2024 10:20:51 +0100 Subject: [PATCH 1/7] Olmo > OLMo --- configs/c4-medium_ckptfine.yaml | 184 ++++++++++++++++++ configs/c4-medium_ckptoneintwo.yaml | 184 ++++++++++++++++++ configs/c4-medium_ckptwhole.yaml | 184 ++++++++++++++++++ configs/c4-medium_nockpt.yaml | 182 +++++++++++++++++ configs/c4-small_nockpt.yaml | 183 +++++++++++++++++ configs/olmo_nockpt.yml | 86 ++++++++ configs/olmo_wholeckpt.yml | 88 +++++++++ configs/v1_5-mix-medium-mitch-ish_nockpt.yaml | 98 ++++++++++ docs/NOTES.md | 4 +- hf_olmo/configuration_olmo.py | 4 +- hf_olmo/modeling_olmo.py | 6 +- hf_olmo/tokenization_olmo_fast.py | 2 +- inference/NOTES.md | 4 +- .../AutoGPTQ/auto_gptq/modeling/auto.py | 4 +- .../AutoGPTQ/auto_gptq/modeling/olmo.py | 4 +- inference/compression/olmo_gptq_class.py | 8 +- olmo/ckptavg.py | 17 ++ olmo/config.py | 6 +- olmo/data/__init__.py | 8 +- olmo/data/memmap_dataset.py | 4 +- olmo/eval/__init__.py | 4 +- olmo/exceptions.py | 14 +- olmo/model.py | 82 ++++---- olmo/tokenizer.py | 6 +- olmo/train.py | 8 +- olmo/util.py | 28 +-- scripts/average_ckpts.py | 71 +++++++ scripts/average_ckpts_advanced.py | 123 ++++++++++++ scripts/avgckpts_stepbystep.py | 127 ++++++++++++ scripts/init_config.py | 4 +- scripts/inspect_train_data.py | 4 +- .../lumi/v1_5-mix-medium-mitch-ish_nockpt.sh | 55 ++++++ scripts/show_model_size.py | 8 +- scripts/train.py | 14 +- test_fixtures/test-olmo-model/config.json | 2 +- tests/hf_olmo/hf_olmo_test.py | 4 +- tests/hf_olmo/modeling_olmo_test.py | 4 +- tests/model_test.py | 12 +- 38 files changed, 1706 insertions(+), 124 deletions(-) create mode 100644 configs/c4-medium_ckptfine.yaml create mode 100644 configs/c4-medium_ckptoneintwo.yaml create mode 100644 configs/c4-medium_ckptwhole.yaml create mode 100644 configs/c4-medium_nockpt.yaml create mode 100644 configs/c4-small_nockpt.yaml create mode 100644 configs/olmo_nockpt.yml create mode 
100644 configs/olmo_wholeckpt.yml create mode 100644 configs/v1_5-mix-medium-mitch-ish_nockpt.yaml create mode 100644 olmo/ckptavg.py create mode 100644 scripts/average_ckpts.py create mode 100644 scripts/average_ckpts_advanced.py create mode 100644 scripts/avgckpts_stepbystep.py create mode 100644 scripts/lumi/v1_5-mix-medium-mitch-ish_nockpt.sh diff --git a/configs/c4-medium_ckptfine.yaml b/configs/c4-medium_ckptfine.yaml new file mode 100644 index 000000000..f5e77a958 --- /dev/null +++ b/configs/c4-medium_ckptfine.yaml @@ -0,0 +1,184 @@ +run_name: c4-medium-run-001 +seed: 6198 +dry_run: false + +wandb: + name: ${run_name} + +activation_checkpointing: fine_grained + +model: + d_model: 4096 + n_heads: 16 + n_layers: 30 + mlp_ratio: 8 + alibi: true + alibi_bias_max: 8.0 + attention_dropout: 0.0 + attention_layer_norm: true + multi_query_attention: true + block_type: sequential + layer_norm_type: low_precision # if not compiling, use 'low_precision' + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 50257 + embedding_size: 50304 + eos_token_id: 50256 + pad_token_id: 50256 + init_device: meta + init_std: 0.02 + +compile: null # causes instability on AMD GPUs + +optimizer: + name: lionw + learning_rate: 1.0e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + +scheduler: + name: cosine_with_warmup + t_warmup: 2000 + t_max: null + +data: + paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-train.*.npy} + pad_direction: right + num_workers: 2 + drop_last: true + pin_memory: true + prefetch_factor: 16 + persistent_workers: true + timeout: 0 + +tokenizer: + identifier: gpt2 + truncate_direction: right + +save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} +save_overwrite: false +# Sharded checkpoints (best for restarts) +save_interval: 1000 
+save_num_checkpoints_to_keep: 2 +# Unsharded checkpoints (for final storage) +save_interval_unsharded: 50000 +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +# max_duration: 476837 # 2T tokens +max_duration: 50 # 200B tokens +global_train_batch_size: 2048 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +max_grad_norm: 1.0 + +speed_monitor: + window_size: 20 + +eval_interval: ${save_interval} +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + ########################## + # Perplexity evaluations # + ########################## + - label: c4-validation + subset_num_batches: 10 + data: + paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy} + num_workers: 1 + drop_last: true + pin_memory: true + persistent_workers: true + prefetch_factor: 4 + + - label: rp-validation + subset_num_batches: 10 + data: + paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy} + num_workers: 1 + drop_last: true + pin_memory: true + persistent_workers: true + prefetch_factor: 4 + + # lump all the small datasets together (we still get separate metrics). 
+ - label: all-small-ppl-validation + data: + datasets: + 4chan-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/4chan/val.npy + c4_100_domains-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_100_domains/val.npy + c4_en-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_en/val.npy + gab-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/gab/val.npy + ice-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ice/val.npy + m2d2_s2orc-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_s2orc/val.npy + m2d2_wiki-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_wiki/val.npy + manosphere-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/manosphere/val.npy + mc4_en-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/mc4_en/val.npy + pile-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/pile/val.npy + ptb-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ptb/val.npy + twitterAEE-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/twitterAEE/val.npy + wikitext_103-validation: + - 
${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/wikitext_103/val.npy + drop_last: true + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + # - label: boolq # requires implemention of the pmi_dc matrix + # type: downstream + # + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + # - label: arc_challenge # requires implemention of the pmi_dc matrix + # type: downstream + # + - label: copa + type: downstream + + - label: rte + type: downstream + + - label: commitment_bank + type: downstream + + - label: mrpc + type: downstream + + - label: sst2 + type: downstream diff --git a/configs/c4-medium_ckptoneintwo.yaml b/configs/c4-medium_ckptoneintwo.yaml new file mode 100644 index 000000000..4ceb2901f --- /dev/null +++ b/configs/c4-medium_ckptoneintwo.yaml @@ -0,0 +1,184 @@ +run_name: c4-medium-run-001 +seed: 6198 +dry_run: false + +wandb: + name: ${run_name} + +activation_checkpointing: one_in_two + +model: + d_model: 4096 + n_heads: 16 + n_layers: 30 + mlp_ratio: 8 + alibi: true + alibi_bias_max: 8.0 + attention_dropout: 0.0 + attention_layer_norm: true + multi_query_attention: true + block_type: sequential + layer_norm_type: low_precision # if not compiling, use 'low_precision' + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 50257 + embedding_size: 50304 + eos_token_id: 50256 + pad_token_id: 50256 + init_device: meta + init_std: 0.02 + +compile: null # causes instability on AMD GPUs + +optimizer: + name: lionw + learning_rate: 1.0e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + +scheduler: + name: cosine_with_warmup + t_warmup: 2000 + t_max: null + +data: + paths: 
${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-train.*.npy} + pad_direction: right + num_workers: 2 + drop_last: true + pin_memory: true + prefetch_factor: 16 + persistent_workers: true + timeout: 0 + +tokenizer: + identifier: gpt2 + truncate_direction: right + +save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} +save_overwrite: false +# Sharded checkpoints (best for restarts) +save_interval: 1000 +save_num_checkpoints_to_keep: 2 +# Unsharded checkpoints (for final storage) +save_interval_unsharded: 50000 +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +# max_duration: 476837 # 2T tokens +max_duration: 50 # 200B tokens +global_train_batch_size: 2048 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +max_grad_norm: 1.0 + +speed_monitor: + window_size: 20 + +eval_interval: ${save_interval} +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + ########################## + # Perplexity evaluations # + ########################## + - label: c4-validation + subset_num_batches: 10 + data: + paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy} + num_workers: 1 + drop_last: true + pin_memory: true + persistent_workers: true + prefetch_factor: 4 + + - label: rp-validation + subset_num_batches: 10 + data: + paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy} + num_workers: 1 + drop_last: true + pin_memory: true + persistent_workers: true + prefetch_factor: 4 + + # lump all the small datasets together (we still get separate metrics). 
+ - label: all-small-ppl-validation + data: + datasets: + 4chan-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/4chan/val.npy + c4_100_domains-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_100_domains/val.npy + c4_en-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_en/val.npy + gab-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/gab/val.npy + ice-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ice/val.npy + m2d2_s2orc-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_s2orc/val.npy + m2d2_wiki-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_wiki/val.npy + manosphere-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/manosphere/val.npy + mc4_en-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/mc4_en/val.npy + pile-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/pile/val.npy + ptb-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ptb/val.npy + twitterAEE-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/twitterAEE/val.npy + wikitext_103-validation: + - 
${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/wikitext_103/val.npy + drop_last: true + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + # - label: boolq # requires implemention of the pmi_dc matrix + # type: downstream + # + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + # - label: arc_challenge # requires implemention of the pmi_dc matrix + # type: downstream + # + - label: copa + type: downstream + + - label: rte + type: downstream + + - label: commitment_bank + type: downstream + + - label: mrpc + type: downstream + + - label: sst2 + type: downstream diff --git a/configs/c4-medium_ckptwhole.yaml b/configs/c4-medium_ckptwhole.yaml new file mode 100644 index 000000000..2e8084d32 --- /dev/null +++ b/configs/c4-medium_ckptwhole.yaml @@ -0,0 +1,184 @@ +run_name: c4-medium-run-001 +seed: 6198 +dry_run: false + +wandb: + name: ${run_name} + +activation_checkpointing: whole_layer + +model: + d_model: 4096 + n_heads: 16 + n_layers: 30 + mlp_ratio: 8 + alibi: true + alibi_bias_max: 8.0 + attention_dropout: 0.0 + attention_layer_norm: true + multi_query_attention: true + block_type: sequential + layer_norm_type: low_precision # if not compiling, use 'low_precision' + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 50257 + embedding_size: 50304 + eos_token_id: 50256 + pad_token_id: 50256 + init_device: meta + init_std: 0.02 + +compile: null # causes instability on AMD GPUs + +optimizer: + name: lionw + learning_rate: 1.0e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + +scheduler: + name: cosine_with_warmup + t_warmup: 2000 + t_max: null + +data: + paths: 
${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-train.*.npy} + pad_direction: right + num_workers: 2 + drop_last: true + pin_memory: true + prefetch_factor: 16 + persistent_workers: true + timeout: 0 + +tokenizer: + identifier: gpt2 + truncate_direction: right + +save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} +save_overwrite: false +# Sharded checkpoints (best for restarts) +save_interval: 1000 +save_num_checkpoints_to_keep: 2 +# Unsharded checkpoints (for final storage) +save_interval_unsharded: 50000 +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +# max_duration: 476837 # 2T tokens +max_duration: 50 # 200B tokens +global_train_batch_size: 2048 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +max_grad_norm: 1.0 + +speed_monitor: + window_size: 20 + +eval_interval: ${save_interval} +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + ########################## + # Perplexity evaluations # + ########################## + - label: c4-validation + subset_num_batches: 10 + data: + paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy} + num_workers: 1 + drop_last: true + pin_memory: true + persistent_workers: true + prefetch_factor: 4 + + - label: rp-validation + subset_num_batches: 10 + data: + paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy} + num_workers: 1 + drop_last: true + pin_memory: true + persistent_workers: true + prefetch_factor: 4 + + # lump all the small datasets together (we still get separate metrics). 
+ - label: all-small-ppl-validation + data: + datasets: + 4chan-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/4chan/val.npy + c4_100_domains-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_100_domains/val.npy + c4_en-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_en/val.npy + gab-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/gab/val.npy + ice-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ice/val.npy + m2d2_s2orc-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_s2orc/val.npy + m2d2_wiki-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_wiki/val.npy + manosphere-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/manosphere/val.npy + mc4_en-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/mc4_en/val.npy + pile-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/pile/val.npy + ptb-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ptb/val.npy + twitterAEE-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/twitterAEE/val.npy + wikitext_103-validation: + - 
${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/wikitext_103/val.npy + drop_last: true + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + # - label: boolq # requires implemention of the pmi_dc matrix + # type: downstream + # + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + # - label: arc_challenge # requires implemention of the pmi_dc matrix + # type: downstream + # + - label: copa + type: downstream + + - label: rte + type: downstream + + - label: commitment_bank + type: downstream + + - label: mrpc + type: downstream + + - label: sst2 + type: downstream diff --git a/configs/c4-medium_nockpt.yaml b/configs/c4-medium_nockpt.yaml new file mode 100644 index 000000000..0d862117b --- /dev/null +++ b/configs/c4-medium_nockpt.yaml @@ -0,0 +1,182 @@ +run_name: c4-medium-run-001 +seed: 6198 +dry_run: false + +wandb: + name: ${run_name} + +model: + d_model: 4096 + n_heads: 16 + n_layers: 30 + mlp_ratio: 8 + alibi: true + alibi_bias_max: 8.0 + attention_dropout: 0.0 + attention_layer_norm: true + multi_query_attention: true + block_type: sequential + layer_norm_type: low_precision # if not compiling, use 'low_precision' + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 50257 + embedding_size: 50304 + eos_token_id: 50256 + pad_token_id: 50256 + init_device: meta + init_std: 0.02 + +compile: null # causes instability on AMD GPUs + +optimizer: + name: lionw + learning_rate: 1.0e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + +scheduler: + name: cosine_with_warmup + t_warmup: 2000 + t_max: null + +data: + paths: 
${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-train.*.npy} + pad_direction: right + num_workers: 2 + drop_last: true + pin_memory: true + prefetch_factor: 16 + persistent_workers: true + timeout: 0 + +tokenizer: + identifier: gpt2 + truncate_direction: right + +save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} +save_overwrite: false +# Sharded checkpoints (best for restarts) +save_interval: 1000 +save_num_checkpoints_to_keep: 2 +# Unsharded checkpoints (for final storage) +save_interval_unsharded: 50000 +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +# max_duration: 476837 # 2T tokens +max_duration: 50 # 200B tokens +global_train_batch_size: 2048 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +max_grad_norm: 1.0 + +speed_monitor: + window_size: 20 + +eval_interval: ${save_interval} +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + ########################## + # Perplexity evaluations # + ########################## + - label: c4-validation + subset_num_batches: 10 + data: + paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy} + num_workers: 1 + drop_last: true + pin_memory: true + persistent_workers: true + prefetch_factor: 4 + + - label: rp-validation + subset_num_batches: 10 + data: + paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy} + num_workers: 1 + drop_last: true + pin_memory: true + persistent_workers: true + prefetch_factor: 4 + + # lump all the small datasets together (we still get separate metrics). 
+ - label: all-small-ppl-validation + data: + datasets: + 4chan-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/4chan/val.npy + c4_100_domains-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_100_domains/val.npy + c4_en-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_en/val.npy + gab-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/gab/val.npy + ice-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ice/val.npy + m2d2_s2orc-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_s2orc/val.npy + m2d2_wiki-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_wiki/val.npy + manosphere-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/manosphere/val.npy + mc4_en-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/mc4_en/val.npy + pile-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/pile/val.npy + ptb-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ptb/val.npy + twitterAEE-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/twitterAEE/val.npy + wikitext_103-validation: + - 
${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/wikitext_103/val.npy + drop_last: true + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + # - label: boolq # requires implemention of the pmi_dc matrix + # type: downstream + # + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + # - label: arc_challenge # requires implemention of the pmi_dc matrix + # type: downstream + # + - label: copa + type: downstream + + - label: rte + type: downstream + + - label: commitment_bank + type: downstream + + - label: mrpc + type: downstream + + - label: sst2 + type: downstream diff --git a/configs/c4-small_nockpt.yaml b/configs/c4-small_nockpt.yaml new file mode 100644 index 000000000..bdc2e04a9 --- /dev/null +++ b/configs/c4-small_nockpt.yaml @@ -0,0 +1,183 @@ +run_name: c4-small-run-001 +seed: 6198 +dry_run: false + +wandb: + name: ${run_name} + project: c4-small + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + alibi: true + alibi_bias_max: 8.0 + attention_dropout: 0.0 + attention_layer_norm: true + multi_query_attention: true + block_type: sequential + layer_norm_type: low_precision # if not compiling, use 'low_precision' + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 50257 + embedding_size: 50304 + eos_token_id: 50256 + pad_token_id: 50256 + init_device: meta + init_std: 0.02 + +compile: null # causes instability on AMD GPUs + +optimizer: + name: lionw + learning_rate: 2.0e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + +scheduler: + name: cosine_with_warmup + t_warmup: 2000 + t_max: null + +data: + paths: 
${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-train.*.npy} + pad_direction: right + num_workers: 2 + drop_last: true + pin_memory: true + prefetch_factor: 16 + persistent_workers: true + timeout: 0 + +tokenizer: + identifier: gpt2 + truncate_direction: right + +save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} +save_overwrite: false +# Sharded checkpoints (best for restarts) +save_interval: 1000 +save_num_checkpoints_to_keep: 9 +# Unsharded checkpoints (for final storage) +save_interval_unsharded: 10000 +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +# max_duration: 476837 # 2T tokens +max_duration: 50 # 200B tokens +global_train_batch_size: 2048 +device_train_microbatch_size: 8 + +precision: amp_bf16 + +max_grad_norm: 1.0 + +speed_monitor: + window_size: 20 + +eval_interval: ${save_interval} +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + ########################## + # Perplexity evaluations # + ########################## + - label: c4-validation + subset_num_batches: 10 + data: + paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy} + num_workers: 1 + drop_last: true + pin_memory: true + persistent_workers: true + prefetch_factor: 4 + + - label: rp-validation + subset_num_batches: 10 + data: + paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy} + num_workers: 1 + drop_last: true + pin_memory: true + persistent_workers: true + prefetch_factor: 4 + + # lump all the small datasets together (we still get separate metrics). 
+ - label: all-small-ppl-validation + data: + datasets: + 4chan-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/4chan/val.npy + c4_100_domains-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_100_domains/val.npy + c4_en-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_en/val.npy + gab-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/gab/val.npy + ice-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ice/val.npy + m2d2_s2orc-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_s2orc/val.npy + m2d2_wiki-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_wiki/val.npy + manosphere-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/manosphere/val.npy + mc4_en-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/mc4_en/val.npy + pile-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/pile/val.npy + ptb-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ptb/val.npy + twitterAEE-validation: + - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/twitterAEE/val.npy + wikitext_103-validation: + - 
${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/wikitext_103/val.npy + drop_last: true + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + # - label: boolq # requires implemention of the pmi_dc matrix + # type: downstream + # + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + # - label: arc_challenge # requires implemention of the pmi_dc matrix + # type: downstream + # + - label: copa + type: downstream + + - label: rte + type: downstream + + - label: commitment_bank + type: downstream + + - label: mrpc + type: downstream + + - label: sst2 + type: downstream diff --git a/configs/olmo_nockpt.yml b/configs/olmo_nockpt.yml new file mode 100644 index 000000000..f09396738 --- /dev/null +++ b/configs/olmo_nockpt.yml @@ -0,0 +1,86 @@ +run_name: olmo-small-ablation +seed: 6198 +dry_run: false + +wandb: + name: ${run_name} + project: c4-small + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + alibi: true + alibi_bias_max: 8.0 + attention_dropout: 0.0 + attention_layer_norm: true + multi_query_attention: true + block_type: sequential + layer_norm_type: low_precision # if not compiling, use 'low_precision' + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 50277 + embedding_size: 50304 + eos_token_id: 50276 + pad_token_id: 50276 + init_device: meta + init_std: 0.02 + +compile: null # causes instability on AMD GPUs + +optimizer: + name: lionw + learning_rate: 1.0e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + +scheduler: + name: cosine_with_warmup + t_warmup: 2000 + t_max: null + +data: + paths: 
${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4_en/gpt-neox-20b/c4-train.*.npy} + pad_direction: right + num_workers: 4 + drop_last: true + pin_memory: true + prefetch_factor: 16 + persistent_workers: true + timeout: 0 + +tokenizer: + identifier: EleutherAI/gpt-neox-20b + truncate_direction: right + +save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} +save_overwrite: false +# Sharded checkpoints (best for restarts) +save_interval: 1000 +save_num_checkpoints_to_keep: 9 +# Unsharded checkpoints (for final storage) +save_interval_unsharded: 10000 +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +# max_duration: 953674 # 2T tokens +max_duration: 50 # 200B tokens +global_train_batch_size: 1024 +device_train_microbatch_size: 8 + +precision: amp_bf16 + +max_grad_norm: 1.0 + +speed_monitor: + window_size: 20 + +eval_interval: ${save_interval} +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} \ No newline at end of file diff --git a/configs/olmo_wholeckpt.yml b/configs/olmo_wholeckpt.yml new file mode 100644 index 000000000..efc53dbf2 --- /dev/null +++ b/configs/olmo_wholeckpt.yml @@ -0,0 +1,88 @@ +run_name: olmo-small-ablation +seed: 6198 +dry_run: false + +wandb: + name: ${run_name} + project: c4-small + +activation_checkpointing: whole_layer + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + alibi: true + alibi_bias_max: 8.0 + attention_dropout: 0.0 + attention_layer_norm: true + multi_query_attention: true + block_type: sequential + layer_norm_type: low_precision # if not compiling, use 'low_precision' + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 50277 + embedding_size: 50304 + eos_token_id: 50276 + pad_token_id: 50276 + init_device: meta + init_std: 0.02 + +compile: null # causes instability on AMD 
GPUs + +optimizer: + name: lionw + learning_rate: 1.0e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + +scheduler: + name: cosine_with_warmup + t_warmup: 2000 + t_max: null + +data: + paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4_en/gpt-neox-20b/c4-train.*.npy} + pad_direction: right + num_workers: 4 + drop_last: true + pin_memory: true + prefetch_factor: 16 + persistent_workers: true + timeout: 0 + +tokenizer: + identifier: EleutherAI/gpt-neox-20b + truncate_direction: right + +save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} +save_overwrite: false +# Sharded checkpoints (best for restarts) +save_interval: 1000 +save_num_checkpoints_to_keep: 9 +# Unsharded checkpoints (for final storage) +save_interval_unsharded: 10000 +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +# max_duration: 953674 # 2T tokens +max_duration: 50 # 200B tokens +global_train_batch_size: 1024 +device_train_microbatch_size: 8 + +precision: amp_bf16 + +max_grad_norm: 1.0 + +speed_monitor: + window_size: 20 + +eval_interval: ${save_interval} +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} \ No newline at end of file diff --git a/configs/v1_5-mix-medium-mitch-ish_nockpt.yaml b/configs/v1_5-mix-medium-mitch-ish_nockpt.yaml new file mode 100644 index 000000000..0c0974f0e --- /dev/null +++ b/configs/v1_5-mix-medium-mitch-ish_nockpt.yaml @@ -0,0 +1,98 @@ +run_name: v1_5-mix-medium-mitch-ish +seed: 6198 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: v1_5-mix + +model: + d_model: 4096 + n_heads: 32 + n_layers: 32 + # mlp_ratio: 6 + mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + flash_attention: false # not available on AMD + attention_dropout: 0.0 + attention_layer_norm: false + multi_query_attention: false + include_bias: false + block_type: 
sequential + layer_norm_type: default + layer_norm_with_affine: false + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 50280 + embedding_size: 50304 + eos_token_id: 0 + pad_token_id: 1 + init_device: meta + init_fn: mitchell + +compile: null + +optimizer: + name: adamw + learning_rate: 3.0e-4 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +scheduler: + name: linear_with_warmup + t_warmup: 5000 + alpha_f: 0.1 + grad_clip_warmup_steps: 1000 + grad_clip_warmup_factor: 10.0 + +tokenizer: + identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json + truncate_direction: right + +save_folder: ${oc.env:CHECKPOINTS_PATH}/${oc.env:SLURM_JOB_ID,${run_name}} +save_overwrite: false +# Sharded checkpoints (best for restarts) +save_interval: 1000 +save_num_checkpoints_to_keep: -1 +# Unsharded checkpoints (for final storage) +save_interval_unsharded: null # getting errors on LUMI right now +save_num_unsharded_checkpoints_to_keep: -1 +no_pre_train_checkpoint: true + +load_path: null + +max_duration: 50 # 2T tokens +global_train_batch_size: 2048 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: null + precision: mixed + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 20 + +data: + paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4_en/gpt-neox-20b/c4-train.*.npy} + pad_direction: right + num_workers: 0 + drop_last: true + pin_memory: true + prefetch_factor: 16 + persistent_workers: true + timeout: 0 diff --git a/docs/NOTES.md b/docs/NOTES.md index 6a8f3bfa7..c6611b33a 100644 --- a/docs/NOTES.md +++ b/docs/NOTES.md @@ -70,10 +70,10 @@ For example, checkpoints for the run [https://wandb.ai/ai2-llm/c4-small/runs/euo You can load a checkpoint like this: ```python -from olmo 
import Olmo, Tokenizer +from olmo import OLMo, Tokenizer checkpoint = "gs://ai2-olmo/ai2-llm/c4-small/euox4j8q/step73000-unsharded" -model = Olmo.from_checkpoint(checkpoint, device="cuda") +model = OLMo.from_checkpoint(checkpoint, device="cuda") tokenizer = Tokenizer.from_checkpoint(checkpoint) ``` diff --git a/hf_olmo/configuration_olmo.py b/hf_olmo/configuration_olmo.py index 5b15fa194..cb7670f6c 100644 --- a/hf_olmo/configuration_olmo.py +++ b/hf_olmo/configuration_olmo.py @@ -21,8 +21,8 @@ def __init__(self, use_cache: bool = False, **kwargs): all_kwargs.update({"use_cache": use_cache}) all_kwargs.update( { - "architectures": all_kwargs.get("architectures", ["OlmoModelForCausalLM"]) - or ["OlmoModelForCausalLM"] + "architectures": all_kwargs.get("architectures", ["OLMoModelForCausalLM"]) + or ["OLMoModelForCausalLM"] } ) super().__init__(**all_kwargs) diff --git a/hf_olmo/modeling_olmo.py b/hf_olmo/modeling_olmo.py index 6a279cb10..a1cc569f7 100644 --- a/hf_olmo/modeling_olmo.py +++ b/hf_olmo/modeling_olmo.py @@ -7,7 +7,7 @@ from transformers.models.auto import AutoModelForCausalLM from olmo.config import ModelConfig -from olmo.model import Olmo +from olmo.model import OLMo from .configuration_olmo import OLMoConfig @@ -34,14 +34,14 @@ class OLMoForCausalLM(PreTrainedModel): base_model_prefix = "model" _no_split_modules = ["OLMoBlock"] - def __init__(self, config: OLMoConfig, model: Optional[Olmo] = None, init_params: bool = False): + def __init__(self, config: OLMoConfig, model: Optional[OLMo] = None, init_params: bool = False): super().__init__(config) if not model: model_config = create_model_config_from_pretrained_config(config) # Initialize model (always on CPU to start with so we don't run out of GPU memory). 
model_config.init_device = "cpu" - self.model = Olmo(model_config, init_params=init_params) + self.model = OLMo(model_config, init_params=init_params) else: self.model = model diff --git a/hf_olmo/tokenization_olmo_fast.py b/hf_olmo/tokenization_olmo_fast.py index e2bd665d1..19543a6c7 100644 --- a/hf_olmo/tokenization_olmo_fast.py +++ b/hf_olmo/tokenization_olmo_fast.py @@ -4,7 +4,7 @@ class OLMoTokenizerFast(PreTrainedTokenizerFast): - # Note: Olmo's tokenizer is already a wrapper around huggingface. This is potentially unnecessary. + # Note: OLMo's tokenizer is already a wrapper around huggingface. This is potentially unnecessary. pass # def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: diff --git a/inference/NOTES.md b/inference/NOTES.md index 0af0f9d09..ea57a322a 100644 --- a/inference/NOTES.md +++ b/inference/NOTES.md @@ -45,12 +45,12 @@ To add an `olmo.py` module, we can basically just imitate what was done for othe There's one important wrinkle here: some OLMo models use *fused linear attention*. I'm not sure how GPTQ handles this or whether any existing supported models implement attention the same way. This might be something to discuss with Dirk and Pete. 
```python -Olmo( +OLMo( (transformer): ModuleDict( (wte): Embedding(50304, 768) (emb_drop): Dropout(p=0.1, inplace=False) (blocks): ModuleList( - (0-11): 12 x OlmoSequentialBlock( + (0-11): 12 x OLMoSequentialBlock( (dropout): Dropout(p=0.1, inplace=False) (norm): LayerNorm() (act): SwiGLU() diff --git a/inference/compression/dependencies/AutoGPTQ/auto_gptq/modeling/auto.py b/inference/compression/dependencies/AutoGPTQ/auto_gptq/modeling/auto.py index 46ce32e69..9beb2ff33 100644 --- a/inference/compression/dependencies/AutoGPTQ/auto_gptq/modeling/auto.py +++ b/inference/compression/dependencies/AutoGPTQ/auto_gptq/modeling/auto.py @@ -13,7 +13,7 @@ from .internlm import InternLMGPTQForCausalLM from .llama import LlamaGPTQForCausalLM from .moss import MOSSGPTQForCausalLM -from .olmo import OlmoGPTQForCausalLM +from .olmo import OLMoGPTQForCausalLM from .opt import OPTGPTQForCausalLM from .qwen import QwenGPTQForCausalLM from .rw import RWGPTQForCausalLM @@ -24,7 +24,7 @@ "gptj": GPTJGPTQForCausalLM, "gpt2": GPT2GPTQForCausalLM, "llama": LlamaGPTQForCausalLM, - "olmo": OlmoGPTQForCausalLM, + "olmo": OLMoGPTQForCausalLM, "opt": OPTGPTQForCausalLM, "moss": MOSSGPTQForCausalLM, "gpt_bigcode": GPTBigCodeGPTQForCausalLM, diff --git a/inference/compression/dependencies/AutoGPTQ/auto_gptq/modeling/olmo.py b/inference/compression/dependencies/AutoGPTQ/auto_gptq/modeling/olmo.py index 0bf18fc8c..01264bfdb 100644 --- a/inference/compression/dependencies/AutoGPTQ/auto_gptq/modeling/olmo.py +++ b/inference/compression/dependencies/AutoGPTQ/auto_gptq/modeling/olmo.py @@ -1,7 +1,7 @@ from ._base import * -class OlmoGPTQForCausalLM(BaseGPTQForCausalLM): +class OLMoGPTQForCausalLM(BaseGPTQForCausalLM): # Attribute name of Transformer layer block. 
layers_block_name = "model.transformer.blocks" @@ -19,4 +19,4 @@ class OlmoGPTQForCausalLM(BaseGPTQForCausalLM): inside_layer_modules = [["att_proj"], ["attn_out"], ["ff_proj"], ["ff_out"]] -__all__ = ["OlmoGPTQForCausalLM"] +__all__ = ["OLMoGPTQForCausalLM"] diff --git a/inference/compression/olmo_gptq_class.py b/inference/compression/olmo_gptq_class.py index 645349d7b..0f6580a59 100644 --- a/inference/compression/olmo_gptq_class.py +++ b/inference/compression/olmo_gptq_class.py @@ -1,7 +1,7 @@ from auto_gptq.modeling._base import BaseGPTQForCausalLM -class OlmoGPTQForCausalLM(BaseGPTQForCausalLM): +class OLMoGPTQForCausalLM(BaseGPTQForCausalLM): # Attribute name of Transformer layer block. layers_block_name = "model.transformer.blocks" @@ -17,12 +17,12 @@ class OlmoGPTQForCausalLM(BaseGPTQForCausalLM): inside_layer_modules = [["att_proj"], ["attn_out"], ["ff_proj"], ["ff_out"]] -__all__ = ["OlmoGPTQForCausalLM"] +__all__ = ["OLMoGPTQForCausalLM"] # NOTE: In progress; may change if OLMo model is updated. -# class OlmoGPTQForCausalLM(BaseGPTQForCausalLM): +# class OLMoGPTQForCausalLM(BaseGPTQForCausalLM): # # Attribute name of Transformer layer block. 
# layers_block_name = "transformer.blocks" # NOTE(wadden) Correct # @@ -51,4 +51,4 @@ class OlmoGPTQForCausalLM(BaseGPTQForCausalLM): # ] -# __all__ = ["OlmoGPTQForCausalLM"] +# __all__ = ["OLMoGPTQForCausalLM"] diff --git a/olmo/ckptavg.py b/olmo/ckptavg.py new file mode 100644 index 000000000..8531f62da --- /dev/null +++ b/olmo/ckptavg.py @@ -0,0 +1,17 @@ +import torch + +STATEDICTS = [ + "advaveraged25.pt", + "advaveraged2550.pt", + "advaveraged5075.pt", + "advaveraged75.pt", +] + +sd = torch.load(STATEDICTS[0]) +for state_dict in STATEDICTS[1:]: + sd2 = torch.load(state_dict) + for k,v in sd2.items(): + assert k not in sd + sd[k] = v + +torch.save(sd, "advaveraged.pt") diff --git a/olmo/config.py b/olmo/config.py index c0f26b08b..618ee97dd 100644 --- a/olmo/config.py +++ b/olmo/config.py @@ -23,7 +23,7 @@ from torch.distributed.fsdp import MixedPrecision, ShardingStrategy from .aliases import PathOrStr -from .exceptions import OlmoConfigurationError +from .exceptions import OLMoConfigurationError from .util import StrEnum __all__ = [ @@ -116,7 +116,7 @@ def new(cls: Type[C], **kwargs) -> C: conf = om.merge(conf, kwargs) return cast(C, om.to_object(conf)) except OmegaConfBaseException as e: - raise OlmoConfigurationError(str(e)) + raise OLMoConfigurationError(str(e)) @classmethod def load( @@ -139,7 +139,7 @@ def load( conf = om.merge(conf, om.from_dotlist(overrides)) return cast(C, om.to_object(conf)) except OmegaConfBaseException as e: - raise OlmoConfigurationError(str(e)) + raise OLMoConfigurationError(str(e)) def save(self, path: PathOrStr) -> None: """Save to a YAML file.""" diff --git a/olmo/data/__init__.py b/olmo/data/__init__.py index 52421b57a..7d8fbb56b 100644 --- a/olmo/data/__init__.py +++ b/olmo/data/__init__.py @@ -5,7 +5,7 @@ from ..aliases import PathOrStr from ..config import DataConfig, TrainConfig -from ..exceptions import OlmoConfigurationError +from ..exceptions import OLMoConfigurationError from ..torch_util import barrier, 
get_global_rank, get_world_size from .collator import DataCollator from .iterable_dataset import IterableDataset @@ -21,7 +21,7 @@ def build_memmap_dataset( metadata: List[Dict[str, Any]] = [] if data_config.paths: if data_config.datasets: - raise OlmoConfigurationError("DataConfig.paths is mutually exclusive with DataConfig.datasets") + raise OLMoConfigurationError("DataConfig.paths is mutually exclusive with DataConfig.datasets") paths = data_config.paths for path in paths: metadata.append({"path": str(path)}) @@ -32,7 +32,7 @@ def build_memmap_dataset( paths.extend(label_paths) metadata.extend([{"label": label}] * len(label_paths)) else: - raise OlmoConfigurationError("One of DataConfig.paths or DataConfig.datasets is required") + raise OLMoConfigurationError("One of DataConfig.paths or DataConfig.datasets is required") return MemMapDataset( *paths, chunk_size=train_config.model.max_sequence_length, @@ -87,7 +87,7 @@ def build_train_dataloader(train_config: TrainConfig) -> DataLoader: work_dir = Path(train_config.save_folder) / "train_data" if get_global_rank() == 0: if work_dir.is_dir() and not train_config.save_overwrite: - raise OlmoConfigurationError( + raise OLMoConfigurationError( "train data working directory already exists, use --save_overwrite to overwrite" ) else: diff --git a/olmo/data/memmap_dataset.py b/olmo/data/memmap_dataset.py index 5af73c277..c00f29e06 100644 --- a/olmo/data/memmap_dataset.py +++ b/olmo/data/memmap_dataset.py @@ -7,7 +7,7 @@ import torch from torch.utils.data import Dataset -from olmo.exceptions import OlmoEnvironmentError +from olmo.exceptions import OLMoEnvironmentError from ..aliases import PathOrStr from ..util import _get_s3_client, file_size, get_bytes_range @@ -93,7 +93,7 @@ def offsets(self) -> List[Tuple[int, int]]: _get_s3_client("s3") try: _get_s3_client("r2") - except OlmoEnvironmentError: + except OLMoEnvironmentError: # R2 might not be needed, so ignore this error. We will get an error # later if R2 is needed. 
pass diff --git a/olmo/eval/__init__.py b/olmo/eval/__init__.py index 4c53f4b25..17dcc77fe 100644 --- a/olmo/eval/__init__.py +++ b/olmo/eval/__init__.py @@ -5,7 +5,7 @@ from torchmetrics import MeanMetric, Metric from ..config import EvaluatorConfig, EvaluatorType, TrainConfig -from ..exceptions import OlmoConfigurationError +from ..exceptions import OLMoConfigurationError from ..tokenizer import Tokenizer from ..torch_util import get_global_rank, get_world_size from .downstream import ICLMetric, label_to_task_map @@ -90,7 +90,7 @@ def make_metric(): elif eval_config.data.datasets: eval_metric = {label: make_metric() for label in eval_config.data.datasets.keys()} else: - raise OlmoConfigurationError("One of DataConfig.paths or DataConfig.datasets is required") + raise OLMoConfigurationError("One of DataConfig.paths or DataConfig.datasets is required") return Evaluator( label=eval_config.label, diff --git a/olmo/exceptions.py b/olmo/exceptions.py index 754580c95..5474facc3 100644 --- a/olmo/exceptions.py +++ b/olmo/exceptions.py @@ -1,37 +1,37 @@ -__all__ = ["OlmoError", "OlmoConfigurationError", "OlmoCliError", "OlmoEnvironmentError", "OlmoNetworkError"] +__all__ = ["OLMoError", "OLMoConfigurationError", "OLMoCliError", "OLMoEnvironmentError", "OLMoNetworkError"] -class OlmoError(Exception): +class OLMoError(Exception): """ Base class for all custom OLMo exceptions. """ -class OlmoConfigurationError(OlmoError): +class OLMoConfigurationError(OLMoError): """ An error with a configuration file. """ -class OlmoCliError(OlmoError): +class OLMoCliError(OLMoError): """ An error from incorrect CLI usage. """ -class OlmoEnvironmentError(OlmoError): +class OLMoEnvironmentError(OLMoError): """ An error from incorrect environment variables. """ -class OlmoNetworkError(OlmoError): +class OLMoNetworkError(OLMoError): """ An error with a network request. """ -class OlmoThreadError(Exception): +class OLMoThreadError(Exception): """ Raised when a thread fails. 
""" diff --git a/olmo/model.py b/olmo/model.py index a11eceb71..6100005ee 100644 --- a/olmo/model.py +++ b/olmo/model.py @@ -42,7 +42,7 @@ LayerNormType, ModelConfig, ) -from .exceptions import OlmoConfigurationError +from .exceptions import OLMoConfigurationError from .initialization import ModuleType, init_weights from .torch_util import ensure_finite_ @@ -63,12 +63,12 @@ "GELU", "ReLU", "SwiGLU", - "OlmoBlock", - "OlmoSequentialBlock", - "OlmoParallelBlock", - "Olmo", - "OlmoOutput", - "OlmoGenerateOutput", + "OLMoBlock", + "OLMoSequentialBlock", + "OLMoParallelBlock", + "OLMo", + "OLMoOutput", + "OLMoGenerateOutput", ] @@ -421,7 +421,7 @@ def alibi_attention_bias(seq_len: int, config: ModelConfig, device: torch.device return alibi_bias * (1.0 / (2 ** m.view(1, config.n_heads, 1, 1))) # type: ignore -class OlmoBlock(nn.Module): +class OLMoBlock(nn.Module): """ A base class for transformer block implementations. """ @@ -620,18 +620,18 @@ def forward( raise NotImplementedError @classmethod - def build(cls, layer_id: int, config: ModelConfig, cache: BufferCache) -> OlmoBlock: + def build(cls, layer_id: int, config: ModelConfig, cache: BufferCache) -> OLMoBlock: if config.block_type == BlockType.sequential: - return OlmoSequentialBlock(layer_id, config, cache) + return OLMoSequentialBlock(layer_id, config, cache) elif config.block_type == BlockType.parallel: - return OlmoParallelBlock(layer_id, config, cache) + return OLMoParallelBlock(layer_id, config, cache) elif config.block_type == BlockType.llama: - return OlmoLlamaBlock(layer_id, config, cache) + return OLMoLlamaBlock(layer_id, config, cache) else: raise NotImplementedError(f"Unknown block type: '{config.block_type}'") -class OlmoSequentialBlock(OlmoBlock): +class OLMoSequentialBlock(OLMoBlock): """ This is a typical transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))`` (plus another skip connection). 
@@ -717,11 +717,11 @@ def forward( return x, cache -class OlmoParallelBlock(OlmoBlock): +class OLMoParallelBlock(OLMoBlock): """ This is a transformer block where the output is computed as ``MLP(LN(x)) + Attention(LN(x))`` as in the PaLM architecture, as opposed to the typical ``MLP(LN(x + Attention(LN(x))))`` - as in :class:`OlmoSequentialBlock` (ignoring some skip connections). + as in :class:`OLMoSequentialBlock` (ignoring some skip connections). The decoupling of the MLP and Attention functions allow us to fuse the separate input projections into a single linear layer to increase throughput. In this configuration it's also straight-forward @@ -804,10 +804,10 @@ def forward( ) -class OlmoLlamaBlock(OlmoBlock): +class OLMoLlamaBlock(OLMoBlock): """ This is a transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))`` - (plus another skip connection). This block is similar to `OlmoSequentialBlock` + (plus another skip connection). This block is similar to `OLMoSequentialBlock` but some operations have slightly different implementations to imitate the behavior of Llama. """ @@ -922,7 +922,7 @@ def forward( return x, cache -class OlmoOutput(NamedTuple): +class OLMoOutput(NamedTuple): logits: torch.FloatTensor """ A tensor of shape `(batch_size, seq_len, vocab_size)` representing the log probabilities @@ -940,7 +940,7 @@ class OlmoOutput(NamedTuple): """ -class OlmoGenerateOutput(NamedTuple): +class OLMoGenerateOutput(NamedTuple): token_ids: torch.LongTensor """ The generated token IDs, a tensor of shape `(batch_size, beam_size, max_steps)`. 
@@ -953,7 +953,7 @@ class OlmoGenerateOutput(NamedTuple): """ -class OlmoBlockGroup(nn.ModuleList): +class OLMoBlockGroup(nn.ModuleList): def __init__(self, config: ModelConfig, layer_offset: int, modules: Optional[Iterable[nn.Module]] = None): super().__init__(modules) self.config = config @@ -1009,7 +1009,7 @@ def set_activation_checkpointing(self, strategy: Optional[ActivationCheckpointin block.set_activation_checkpointing(strategy) -class Olmo(nn.Module): +class OLMo(nn.Module): def __init__(self, config: ModelConfig, init_params: bool = True): super().__init__() self.config = config @@ -1017,14 +1017,14 @@ def __init__(self, config: ModelConfig, init_params: bool = True): # Validate config. if self.config.alibi and self.config.flash_attention: - raise OlmoConfigurationError("ALiBi is currently not supported with FlashAttention") + raise OLMoConfigurationError("ALiBi is currently not supported with FlashAttention") if self.config.alibi and self.config.rope: - raise OlmoConfigurationError("ALiBi and RoPE are mutually exclusive") + raise OLMoConfigurationError("ALiBi and RoPE are mutually exclusive") if self.config.embedding_size is not None and self.config.embedding_size != self.config.vocab_size: if self.config.embedding_size < self.config.vocab_size: - raise OlmoConfigurationError("embedding size should be at least as big as vocab size") + raise OLMoConfigurationError("embedding size should be at least as big as vocab size") elif self.config.embedding_size % 128 != 0: import warnings @@ -1039,7 +1039,7 @@ def __init__(self, config: ModelConfig, init_params: bool = True): 0 < self.config.block_group_size <= self.config.n_layers and self.config.n_layers % self.config.block_group_size == 0 ): - raise OlmoConfigurationError("n layers must be divisible by block group size") + raise OLMoConfigurationError("n layers must be divisible by block group size") torch.backends.cuda.enable_flash_sdp(self.config.flash_attention) 
torch.backends.cuda.enable_mem_efficient_sdp(False) # this is super slow so make sure torch won't use it @@ -1054,10 +1054,10 @@ def __init__(self, config: ModelConfig, init_params: bool = True): ) ) - blocks = [OlmoBlock.build(i, config, self.__cache) for i in range(config.n_layers)] + blocks = [OLMoBlock.build(i, config, self.__cache) for i in range(config.n_layers)] if self.config.block_group_size > 1: block_groups = [ - OlmoBlockGroup(config, i, blocks[i : i + config.block_group_size]) + OLMoBlockGroup(config, i, blocks[i : i + config.block_group_size]) for i in range(0, config.n_layers, config.block_group_size) ] self.transformer.update({"block_groups": nn.ModuleList(block_groups)}) @@ -1156,7 +1156,7 @@ def forward( use_cache: bool = False, last_logits_only: bool = False, output_hidden_states: Optional[bool] = None, - ) -> OlmoOutput: + ) -> OLMoOutput: """ :param input_ids: A tensor of shape `(batch_size, seq_len)`. :param input_embeddings: A tensor of shape `(batch_size, seq_len, d_model)` with input @@ -1334,7 +1334,7 @@ def forward( if self.config.scale_logits: logits.mul_(1 / math.sqrt(self.config.d_model)) - return OlmoOutput(logits=logits, attn_key_values=attn_key_values, hidden_states=tuple(all_hidden_states) if output_hidden_states else None) # type: ignore[arg-type] + return OLMoOutput(logits=logits, attn_key_values=attn_key_values, hidden_states=tuple(all_hidden_states) if output_hidden_states else None) # type: ignore[arg-type] def get_fsdp_wrap_policy(self, wrap_strategy: Optional[FSDPWrapStrategy] = None): if wrap_strategy is None: @@ -1354,7 +1354,7 @@ def get_fsdp_wrap_policy(self, wrap_strategy: Optional[FSDPWrapStrategy] = None) def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0): del nonwrapped_numel - wrap = isinstance(module, OlmoBlock) + wrap = isinstance(module, OLMoBlock) if recurse: return True else: @@ -1365,7 +1365,7 @@ def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0): def 
fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0): del nonwrapped_numel - wrap = isinstance(module, (OlmoBlock,)) or module in size_based_module_to_wrap + wrap = isinstance(module, (OLMoBlock,)) or module in size_based_module_to_wrap if recurse: return True else: @@ -1374,13 +1374,13 @@ def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0): return fsdp_wrap_fn elif wrap_strategy == FSDPWrapStrategy.by_block_group: if self.config.block_group_size <= 1: - raise OlmoConfigurationError( + raise OLMoConfigurationError( "'by_block_group' FSDP wrapping strategy requires block group size greater than 1" ) def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0): del nonwrapped_numel - wrap = isinstance(module, OlmoBlockGroup) + wrap = isinstance(module, OLMoBlockGroup) if recurse: return True else: @@ -1389,13 +1389,13 @@ def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0): return fsdp_wrap_fn elif wrap_strategy == FSDPWrapStrategy.by_block_group_and_size: if self.config.block_group_size <= 1: - raise OlmoConfigurationError( + raise OLMoConfigurationError( "'by_block_group_and_size' FSDP wrapping strategy requires block group size greater than 1" ) def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0): del nonwrapped_numel - wrap = isinstance(module, (OlmoBlockGroup,)) or module in size_based_module_to_wrap + wrap = isinstance(module, (OLMoBlockGroup,)) or module in size_based_module_to_wrap if recurse: return True else: @@ -1421,7 +1421,7 @@ def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0): def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0): del nonwrapped_numel - wrap = isinstance(module, OlmoBlock) and module.layer_id % c == 0 + wrap = isinstance(module, OLMoBlock) and module.layer_id % c == 0 if recurse: return True else: @@ -1472,7 +1472,7 @@ def generate( min_steps: Optional[int] = None, final_sequence_scorer: 
Optional[FinalSequenceScorer] = None, constraints: Optional[List[Constraint]] = None, - ) -> OlmoGenerateOutput: + ) -> OLMoGenerateOutput: """ Generate token IDs using beam search. @@ -1582,7 +1582,7 @@ def step( with torch.no_grad(): token_ids, scores = beam_search.search(initial_preds, state, step) - return OlmoGenerateOutput( + return OLMoGenerateOutput( token_ids=token_ids, # type: ignore[arg-type] scores=scores, # type: ignore[arg-type] ) @@ -1590,7 +1590,7 @@ def step( @classmethod def from_checkpoint( cls, checkpoint_dir: PathOrStr, device: str = "cpu", checkpoint_type: Optional[CheckpointType] = None - ) -> Olmo: + ) -> OLMo: """ Load an OLMo model from a checkpoint. """ @@ -1613,7 +1613,7 @@ def from_checkpoint( if checkpoint_type == CheckpointType.unsharded: # Initialize model (always on CPU to start with so we don't run out of GPU memory). model_config.init_device = "cpu" - model = Olmo(model_config) + model = OLMo(model_config) # Load state dict directly to target device. state_dict_path = resource_path(checkpoint_dir, "model.pt") @@ -1626,7 +1626,7 @@ def from_checkpoint( # Initialize model on target device. In this case the state dict is loaded in-place # so it's not necessary to start on CPU if the target device is a GPU. model_config.init_device = device - model = Olmo(model_config) + model = OLMo(model_config) # Load state dict in place. 
load_model_state(checkpoint_dir, model) diff --git a/olmo/tokenizer.py b/olmo/tokenizer.py index a833d3c21..3ed064097 100644 --- a/olmo/tokenizer.py +++ b/olmo/tokenizer.py @@ -8,7 +8,7 @@ from .aliases import PathOrStr from .config import ModelConfig, TokenizerConfig, TrainConfig, TruncationDirection -from .exceptions import OlmoConfigurationError +from .exceptions import OLMoConfigurationError __all__ = ["Tokenizer"] @@ -68,7 +68,7 @@ def from_train_config(cls, config: TrainConfig) -> Tokenizer: pad_token_id=config.model.pad_token_id, ) if config.model.vocab_size != tokenizer.vocab_size: - raise OlmoConfigurationError("vocab size mismatch between config and tokenizer") + raise OLMoConfigurationError("vocab size mismatch between config and tokenizer") return tokenizer @classmethod @@ -117,7 +117,7 @@ def from_checkpoint(cls, checkpoint_dir: PathOrStr) -> Tokenizer: pad_token_id=model_config.pad_token_id, ) if model_config.vocab_size != tokenizer.vocab_size: - raise OlmoConfigurationError("vocab size mismatch between config and tokenizer") + raise OLMoConfigurationError("vocab size mismatch between config and tokenizer") return tokenizer def add_special_tokens(self, input_ids: List[int]) -> List[int]: diff --git a/olmo/train.py b/olmo/train.py index 79132f0fc..43d4ee5fc 100644 --- a/olmo/train.py +++ b/olmo/train.py @@ -33,8 +33,8 @@ ) from .data import IterableDataset from .eval import Evaluator -from .exceptions import OlmoConfigurationError -from .model import Olmo +from .exceptions import OLMoConfigurationError +from .model import OLMo from .optim import Optimizer, Scheduler from .torch_util import ( barrier, @@ -96,7 +96,7 @@ def check(self) -> Dict[str, float]: @dataclass class Trainer: cfg: TrainConfig - model: Olmo + model: OLMo fsdp_model: FSDP optim: Optimizer scheduler: Scheduler @@ -351,7 +351,7 @@ def _save_checkpoint( upload_to=remote_checkpoint_dir, ) except FileExistsError: - raise OlmoConfigurationError( + raise OLMoConfigurationError( f"Checkpoint 
for step {self.global_step} already exists, use --save-overwrite to overwrite it" ) diff --git a/olmo/util.py b/olmo/util.py index 71ee67e60..3473ff43f 100644 --- a/olmo/util.py +++ b/olmo/util.py @@ -25,11 +25,11 @@ from .aliases import PathOrStr from .exceptions import ( - OlmoCliError, - OlmoEnvironmentError, - OlmoError, - OlmoNetworkError, - OlmoThreadError, + OLMoCliError, + OLMoEnvironmentError, + OLMoError, + OLMoNetworkError, + OLMoThreadError, ) from .torch_util import get_global_rank, get_local_rank, get_node_rank, is_distributed @@ -148,9 +148,9 @@ def excepthook(exctype, value, traceback): """ if issubclass(exctype, KeyboardInterrupt): sys.__excepthook__(exctype, value, traceback) - elif issubclass(exctype, OlmoCliError): + elif issubclass(exctype, OLMoCliError): rich.get_console().print(f"[yellow]{value}[/]", highlight=False) - elif issubclass(exctype, OlmoError): + elif issubclass(exctype, OLMoError): rich.get_console().print(Text(f"{exctype.__name__}:", style="red"), value, highlight=False) else: log.critical("Uncaught %s: %s", exctype.__name__, value, exc_info=(exctype, value, traceback)) @@ -448,7 +448,7 @@ def _get_s3_profile_name(scheme: str) -> Optional[str]: if scheme == "r2": profile_name = os.environ.get("R2_PROFILE") if profile_name is None: - raise OlmoEnvironmentError( + raise OLMoEnvironmentError( "R2 profile name is not set. Did you forget to set the 'R2_PROFILE' env var?" ) @@ -463,7 +463,7 @@ def _get_s3_endpoint_url(scheme: str) -> Optional[str]: if scheme == "r2": r2_endpoint_url = os.environ.get("R2_ENDPOINT_URL") if r2_endpoint_url is None: - raise OlmoEnvironmentError( + raise OLMoEnvironmentError( "R2 endpoint url is not set. Did you forget to set the 'R2_ENDPOINT_URL' env var?" 
) @@ -509,12 +509,12 @@ def _s3_upload( _wait_before_retry(attempt) if err is not None: - raise OlmoNetworkError("Failed to check object existence during s3 upload") from err + raise OLMoNetworkError("Failed to check object existence during s3 upload") from err try: _get_s3_client(scheme).upload_file(source, bucket_name, key) except boto_exceptions.ClientError as e: - raise OlmoNetworkError("Failed to upload to s3") from e + raise OLMoNetworkError("Failed to upload to s3") from e def _s3_file_size(scheme: str, bucket_name: str, key: str, max_attempts: int = 3) -> int: @@ -531,7 +531,7 @@ def _s3_file_size(scheme: str, bucket_name: str, key: str, max_attempts: int = 3 log.warning("%s failed attempt %d with retriable error: %s", _s3_file_size.__name__, attempt, err) _wait_before_retry(attempt) - raise OlmoNetworkError("Failed to get s3 file size") from err + raise OLMoNetworkError("Failed to get s3 file size") from err def _s3_get_bytes_range( @@ -570,7 +570,7 @@ def _s3_get_bytes_range( # This can cause an irrelevant exception (e.g. KeyError: 'error'), resulting # in us losing the true exception info. To avoid this, we change the exception # to a type that has a single-parameter constructor. 
"""Average a set of unsharded OLMo model checkpoints into a single state dict.

The previous version of this file contained three copy-pasted variants of the
same script; only the last one ever took effect. This keeps that effective
behavior (average the checkpoints listed in CKPTS into OUTDIR/model.pt) but
loads each checkpoint exactly once instead of once per parameter key.
"""
import os

import torch

# Unsharded checkpoints to average (paths to model.pt files).
CKPTS = [
    "step551000-unsharded/model.pt",
    "step552000-unsharded/model.pt",
    "step553000-unsharded/model.pt",
    "step554000-unsharded/model.pt",
    "step555000-unsharded/model.pt",
    "step556000-unsharded/model.pt",
    "step557000-unsharded/model.pt",
]

# Directory that receives the averaged model.pt.
OUTDIR = "last7_avg"


def average_checkpoints(ckpt_paths):
    """Return the element-wise mean of the state dicts at *ckpt_paths*.

    Each checkpoint is loaded exactly once (the old code re-loaded every
    checkpoint for every key). Accumulation happens in float64 for numerical
    stability, and the result is cast back to each tensor's original dtype.

    Args:
        ckpt_paths: paths to ``model.pt`` files holding plain state dicts
            with identical key sets.

    Returns:
        A dict mapping each key to the averaged tensor.

    Raises:
        ValueError: if *ckpt_paths* is empty.
    """
    if not ckpt_paths:
        raise ValueError("ckpt_paths must not be empty")
    sums = {}
    dtypes = {}
    for path in ckpt_paths:
        state_dict = torch.load(path, map_location="cpu")
        for key, value in state_dict.items():
            if key not in sums:
                sums[key] = value.to(torch.float64).clone()
                dtypes[key] = value.dtype
            else:
                sums[key] += value.to(torch.float64)
    count = len(ckpt_paths)
    return {key: (total / count).to(dtypes[key]) for key, total in sums.items()}


if __name__ == "__main__":
    averaged = average_checkpoints(CKPTS)
    os.makedirs(OUTDIR, exist_ok=True)
    torch.save(averaged, os.path.join(OUTDIR, "model.pt"))
#!/usr/bin/env python
"""Checkpoint Averaging Script

This script averages all model weights for checkpoints in specified path that match
the specified filter wildcard. All checkpoints must be from the exact same model.

For any hope of decent results, the checkpoints should be from the same or child
(via resumes) training session. This can be viewed as similar to maintaining running
EMA (exponential moving average) of the model weights or performing SWA (stochastic
weight averaging), but post-training.

Hacked together by Ross Wightman (https://github.com/rwightman)

Example:
    python average_ckpts_advanced.py --input ./ --filter "step*/model.pt" --output advaveraged.pt --no-sort
"""
import argparse
import glob
import hashlib
import os

import torch

parser = argparse.ArgumentParser(description='PyTorch Checkpoint Averager')
parser.add_argument('--input', default='', type=str, metavar='PATH',
                    help='path to base input folder containing checkpoints')
parser.add_argument('--filter', default='*.pth.tar', type=str, metavar='WILDCARD',
                    help='checkpoint filter (path wildcard)')
parser.add_argument('--output', default='./averaged.pth', type=str, metavar='PATH',
                    help='output filename')
parser.add_argument('--no-use-ema', dest='no_use_ema', action='store_true',
                    help='Force not using ema version of weights (if present)')
parser.add_argument('--descending', dest='descending', action='store_true',
                    help='Set if eval metric is descending (like loss)')
parser.add_argument('--no-sort', dest='no_sort', action='store_true',
                    help='Do not sort and select by checkpoint metric, also makes "n" argument irrelevant')
parser.add_argument('-n', type=int, default=10, metavar='N',
                    help='Number of checkpoints to average')


def load_state_dict(checkpoint_path, use_ema=True):
    """Return the model state dict stored at *checkpoint_path*.

    Replaces the previous dependency on ``timm.models.helpers.load_state_dict``
    so the script only needs torch. Prefers the EMA weights when present and
    *use_ema* is true, then a nested ``state_dict`` entry; otherwise the file
    is assumed to be a plain state dict (the OLMo ``model.pt`` case).
    """
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    if isinstance(checkpoint, dict):
        if use_ema and 'state_dict_ema' in checkpoint:
            return checkpoint['state_dict_ema']
        if 'state_dict' in checkpoint:
            return checkpoint['state_dict']
    return checkpoint


def checkpoint_metric(checkpoint_path):
    """Return the eval metric stored in a checkpoint, or None if unavailable.

    Bug fix: the old version returned ``{}`` for a missing/empty path, which
    passed the caller's ``metric is not None`` filter and polluted the sort.
    """
    if not checkpoint_path or not os.path.isfile(checkpoint_path):
        return None
    print("=> Extracting metric from checkpoint '{}'".format(checkpoint_path))
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    metric = None
    if isinstance(checkpoint, dict) and 'metric' in checkpoint:
        metric = checkpoint['metric']
    return metric


def main():
    args = parser.parse_args()
    # by default use the EMA weights (if present)
    args.use_ema = not args.no_use_ema
    # by default sort by checkpoint metric (if present) and avg top n checkpoints
    args.sort = not args.no_sort

    if os.path.exists(args.output):
        print("Error: Output filename ({}) already exists.".format(args.output))
        exit(1)

    # Build the glob pattern, joining input dir and filter with a separator
    # unless one side already supplies it.
    pattern = args.input
    if not args.input.endswith(os.path.sep) and not args.filter.startswith(os.path.sep):
        pattern += os.path.sep
    pattern += args.filter
    checkpoints = glob.glob(pattern, recursive=True)
    if not checkpoints:
        print("Error: No checkpoints to average.")
        exit(1)

    if args.sort:
        checkpoint_metrics = []
        for c in checkpoints:
            metric = checkpoint_metric(c)
            if metric is not None:
                checkpoint_metrics.append((metric, c))
        checkpoint_metrics = list(sorted(checkpoint_metrics, reverse=not args.descending))
        checkpoint_metrics = checkpoint_metrics[:args.n]
        print("Selected checkpoints:")
        for m, c in checkpoint_metrics:
            print(m, c)
        avg_checkpoints = [c for m, c in checkpoint_metrics]
    else:
        avg_checkpoints = checkpoints
        print("Selected checkpoints:")
        for c in checkpoints:
            print(c)

    # Running sum + per-key count, accumulated in float32 (matches original).
    avg_state_dict = {}
    avg_counts = {}
    for c in avg_checkpoints:
        new_state_dict = load_state_dict(c, args.use_ema)
        if not new_state_dict:
            # Bug fix: this message used to reference non-existent ``args.checkpoint``.
            print("Error: Checkpoint ({}) doesn't exist".format(c))
            continue

        for k, v in new_state_dict.items():
            if k not in avg_state_dict:
                avg_state_dict[k] = v.clone().to(dtype=torch.float32)
                avg_counts[k] = 1
            else:
                avg_state_dict[k] += v.to(dtype=torch.float32)
                avg_counts[k] += 1

    for k, v in avg_state_dict.items():
        v.div_(avg_counts[k])

    # float32 overflow seems unlikely based on weights seen to date, but who knows
    float32_info = torch.finfo(torch.float32)
    final_state_dict = {}
    for k, v in avg_state_dict.items():
        v = v.clamp(float32_info.min, float32_info.max)
        final_state_dict[k] = v.to(dtype=torch.float32)

    try:
        torch.save(final_state_dict, args.output, _use_new_zipfile_serialization=False)
    except TypeError:
        # Some torch versions reject the legacy-serialization kwarg; fall back.
        torch.save(final_state_dict, args.output)

    with open(args.output, 'rb') as f:
        sha_hash = hashlib.sha256(f.read()).hexdigest()
    print("=> Saved state_dict to '{}, SHA256: {}'".format(args.output, sha_hash))


if __name__ == '__main__':
    main()
#!/usr/bin/env python
"""Checkpoint Averaging Script (step-by-step / key-subset variant)

This script averages all model weights for checkpoints in specified path that match
the specified filter wildcard. All checkpoints must be from the exact same model.

For any hope of decent results, the checkpoints should be from the same or child
(via resumes) training session. This can be viewed as similar to maintaining running
EMA (exponential moving average) of the model weights or performing SWA (stochastic
weight averaging), but post-training.

Unlike average_ckpts_advanced.py, this variant only averages the first
SUBSET_FRACTION of each checkpoint's keys, to bound peak memory.

Hacked together by Ross Wightman (https://github.com/rwightman)

Example:
    python avgckpts_stepbystep.py --input ./ --filter "step*/model.pt" --output advaveraged25.pt --no-sort
"""
import argparse
import glob
import hashlib
import os

import torch

# Fraction of each checkpoint's keys to average. The old code's comment said
# "50%" while the code took 25%; the code's behavior (25%) is kept.
SUBSET_FRACTION = 0.25

parser = argparse.ArgumentParser(description='PyTorch Checkpoint Averager')
parser.add_argument('--input', default='', type=str, metavar='PATH',
                    help='path to base input folder containing checkpoints')
parser.add_argument('--filter', default='*.pth.tar', type=str, metavar='WILDCARD',
                    help='checkpoint filter (path wildcard)')
parser.add_argument('--output', default='./averaged.pth', type=str, metavar='PATH',
                    help='output filename')
parser.add_argument('--no-use-ema', dest='no_use_ema', action='store_true',
                    help='Force not using ema version of weights (if present)')
parser.add_argument('--descending', dest='descending', action='store_true',
                    help='Set if eval metric is descending (like loss)')
parser.add_argument('--no-sort', dest='no_sort', action='store_true',
                    help='Do not sort and select by checkpoint metric, also makes "n" argument irrelevant')
parser.add_argument('-n', type=int, default=10, metavar='N',
                    help='Number of checkpoints to average')


def checkpoint_metric(checkpoint_path):
    """Return the eval metric stored in a checkpoint, or None if unavailable.

    Bug fix: the old version returned ``{}`` for a missing/empty path, which
    passed the caller's ``metric is not None`` filter and polluted the sort.
    """
    if not checkpoint_path or not os.path.isfile(checkpoint_path):
        return None
    print("=> Extracting metric from checkpoint '{}'".format(checkpoint_path))
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    metric = None
    if isinstance(checkpoint, dict) and 'metric' in checkpoint:
        metric = checkpoint['metric']
    return metric


def main():
    args = parser.parse_args()
    # by default use the EMA weights (if present)
    args.use_ema = not args.no_use_ema
    # by default sort by checkpoint metric (if present) and avg top n checkpoints
    args.sort = not args.no_sort

    if os.path.exists(args.output):
        print("Error: Output filename ({}) already exists.".format(args.output))
        exit(1)

    # Build the glob pattern, joining input dir and filter with a separator
    # unless one side already supplies it.
    pattern = args.input
    if not args.input.endswith(os.path.sep) and not args.filter.startswith(os.path.sep):
        pattern += os.path.sep
    pattern += args.filter
    checkpoints = glob.glob(pattern, recursive=True)
    if not checkpoints:
        print("Error: No checkpoints to average.")
        exit(1)

    if args.sort:
        checkpoint_metrics = []
        for c in checkpoints:
            metric = checkpoint_metric(c)
            if metric is not None:
                checkpoint_metrics.append((metric, c))
        checkpoint_metrics = list(sorted(checkpoint_metrics, reverse=not args.descending))
        checkpoint_metrics = checkpoint_metrics[:args.n]
        print("Selected checkpoints:")
        for m, c in checkpoint_metrics:
            print(m, c)
        avg_checkpoints = [c for m, c in checkpoint_metrics]
    else:
        avg_checkpoints = checkpoints
        print("Selected checkpoints:")
        for c in checkpoints:
            print(c)

    # Running sum + per-key count, accumulated in float64 for stability.
    avg_state_dict = {}
    avg_counts = {}
    for c in avg_checkpoints:
        new_state_dict = torch.load(c)
        keys = list(new_state_dict.keys())
        # Subselect the first SUBSET_FRACTION (25%) of keys.
        keys = keys[:int(len(keys) * SUBSET_FRACTION)]
        new_state_dict = {k: new_state_dict[k] for k in keys}
        if not new_state_dict:
            # Bug fix: this message used to reference non-existent ``args.checkpoint``.
            print("Error: Checkpoint ({}) doesn't exist".format(c))
            continue

        for k in keys:
            if k not in avg_state_dict:
                avg_state_dict[k] = new_state_dict[k].clone().to(dtype=torch.float64)
                avg_counts[k] = 1
            else:
                avg_state_dict[k] += new_state_dict[k].to(dtype=torch.float64)
                avg_counts[k] += 1
        # Release the source tensors before loading the next checkpoint.
        del new_state_dict

    for k, v in avg_state_dict.items():
        v.div_(avg_counts[k])

    # float32 overflow seems unlikely based on weights seen to date, but who knows
    float32_info = torch.finfo(torch.float32)
    final_state_dict = {}
    for k, v in avg_state_dict.items():
        v = v.clamp(float32_info.min, float32_info.max)
        final_state_dict[k] = v.to(dtype=torch.float32)

    try:
        torch.save(final_state_dict, args.output, _use_new_zipfile_serialization=False)
    except TypeError:
        # Some torch versions reject the legacy-serialization kwarg; fall back.
        torch.save(final_state_dict, args.output)

    with open(args.output, 'rb') as f:
        sha_hash = hashlib.sha256(f.read()).hexdigest()
    print("=> Saved state_dict to '{}, SHA256: {}'".format(args.output, sha_hash))


if __name__ == '__main__':
    main()
#!/bin/bash
# SLURM launcher for the v1_5-mix-medium-mitch-ish (no-checkpoint) training run
# on LUMI. Runs scripts/train.py inside a Singularity container via srun.
#SBATCH --job-name=v1-mix-medium
#SBATCH --account=project_462000229
#SBATCH --output=/pfs/lustref1/flash/project_462000229/logs/%j.log
#SBATCH --nodes=32              # Total number of nodes
#SBATCH --ntasks-per-node=8
#SBATCH --gpus-per-node=8       # Allocate one gpu per MPI rank
#SBATCH --cpus-per-task=6
#SBATCH --time=1:00:00
#SBATCH --mem=0                 # All memory on the node
#SBATCH --partition=standard-g

module load LUMI/22.08 partition/G

# Container image name; resolved under $PROJECT_DIR/containers below.
export OLMO_CONTAINER=llm-lumi_latest.sif

export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
export MPICH_GPU_SUPPORT_ENABLED=1
# NCCL over the Slingshot high-speed network.
export NCCL_SOCKET_IFNAME=hsn
export NCCL_NET_GDR_LEVEL=3
# Per-job MIOpen kernel cache in /tmp to avoid shared-FS contention.
export MIOPEN_USER_DB_PATH=/tmp/${USER}-miopen-cache-${SLURM_JOB_ID}
export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH}
# CXI/libfabric fork-safety knobs for data-loader worker processes.
export CXI_FORK_SAFE=1
export CXI_FORK_SAFE_HP=1
export FI_CXI_DISABLE_CQ_HUGETLB=1

# We need to set this to avoid "Cassini Event Queue overflow detected." errors.
export FI_CXI_DEFAULT_CQ_SIZE=131072

#export NCCL_DEBUG=INFO
export PYTHONPATH=.:${PYTHONPATH}
export ROCM_PATH=/opt/rocm
# Host libfabric must be visible inside the container.
export SINGULARITYENV_LD_LIBRARY_PATH=/usr/local/lib:/opt/cray/libfabric/1.15.2.0/lib64

# Try playing with max_split_size_mb if you run into OOM errors.
#export PYTORCH_HIP_ALLOC_CONF=max_split_size_mb:128

export DATA_PATH=$FLASH_DIR/preprocessed/olmo-mix
export CHECKPOINTS_PATH=$FLASH_DIR/checkpoints
export EVAL_DATA_PATH=$SCRATCH_DIR/eval-data

# Launch one task per GPU; run_with_environment.sh maps SLURM vars to
# torch.distributed ones, then the container runs the training script.
# Extra CLI args ("$@") are forwarded to train.py.
srun \
  --cpus-per-task=$SLURM_CPUS_PER_TASK \
  --distribution=block:block \
  --kill-on-bad-exit \
  scripts/run_with_environment.sh \
  singularity exec \
    -B"$PROJECT_DIR:$PROJECT_DIR" \
    -B"$FLASH_DIR:$FLASH_DIR" \
    -B"$SCRATCH_DIR:$SCRATCH_DIR" \
    -B /opt/cray:/opt/cray \
    -B /usr/lib64/libcxi.so.1:/usr/lib64/libcxi.so.1 \
    -B /usr/lib64/libjson-c.so.3:/usr/lib64/libjson-c.so.3 \
    $PROJECT_DIR/containers/$OLMO_CONTAINER \
    python scripts/train.py configs/v1_5-mix-medium-mitch-ish_nockpt.yaml --run_name=${SLURM_JOB_ID} ${@}
import CheckpointType, TrainConfig from olmo.data import build_train_dataloader from olmo.eval import build_evaluators -from olmo.exceptions import OlmoCliError, OlmoConfigurationError -from olmo.model import Olmo +from olmo.exceptions import OLMoCliError, OLMoConfigurationError +from olmo.model import OLMo from olmo.optim import BoltOnWarmupScheduler, build_optimizer, build_scheduler from olmo.torch_util import ( barrier, @@ -36,7 +36,7 @@ def main(cfg: TrainConfig) -> None: # Ensure run name set. if cfg.run_name is None: - raise OlmoConfigurationError("--run_name is required") + raise OLMoConfigurationError("--run_name is required") log_extra_field("run_name", cfg.run_name) # Sanity check @@ -76,7 +76,7 @@ def main(cfg: TrainConfig) -> None: # Save config. save_path = Path(cfg.save_folder) / "config.yaml" if save_path.is_file() and not cfg.save_overwrite: - raise OlmoConfigurationError(f"{save_path} already exists, use --save_overwrite to overwrite") + raise OLMoConfigurationError(f"{save_path} already exists, use --save_overwrite to overwrite") else: log.info(f"Saving config to {save_path}") save_path.parent.mkdir(exist_ok=True, parents=True) @@ -113,7 +113,7 @@ def main(cfg: TrainConfig) -> None: # Initialize the model. 
log.info("Building model...") - olmo_model = Olmo(cfg.model) + olmo_model = OLMo(cfg.model) log.info(f"Total number of parameters: {olmo_model.num_params():,d}") log.info(f"Number of non-embedding parameters: {olmo_model.num_params(include_embedding=False):,d}") log.info(f"Peak GPU Memory (MB) before FSDP: {int(peak_gpu_memory() or 0)}") @@ -158,7 +158,7 @@ def dummy_init_fn(module: torch.nn.Module) -> None: if cfg.save_data_indices: indices_file_path = Path(cfg.save_folder) / f"data-indices/rank{get_global_rank()}.tsv.gz" if indices_file_path.exists() and not cfg.save_overwrite: - raise OlmoConfigurationError(f"{indices_file_path} already exists, use --save_overwrite to overwrite") + raise OLMoConfigurationError(f"{indices_file_path} already exists, use --save_overwrite to overwrite") indices_file_path.parent.mkdir(exist_ok=True, parents=True) indices_file = gzip.open(indices_file_path, "wt") @@ -248,7 +248,7 @@ def dummy_init_fn(module: torch.nn.Module) -> None: try: yaml_path, args_list = sys.argv[1], sys.argv[2:] except IndexError: - raise OlmoCliError(f"Usage: {sys.argv[0]} [CONFIG_PATH] [OPTIONS]") + raise OLMoCliError(f"Usage: {sys.argv[0]} [CONFIG_PATH] [OPTIONS]") cfg = TrainConfig.load(yaml_path, [clean_opt(s) for s in args_list]) main(cfg) diff --git a/test_fixtures/test-olmo-model/config.json b/test_fixtures/test-olmo-model/config.json index 71e7b981e..352a4c976 100644 --- a/test_fixtures/test-olmo-model/config.json +++ b/test_fixtures/test-olmo-model/config.json @@ -3,7 +3,7 @@ "alibi": false, "alibi_bias_max": 8.0, "architectures": [ - "OlmoModelForCausalLM" + "OLMoModelForCausalLM" ], "attention_dropout": 0.1, "attention_layer_norm": false, diff --git a/tests/hf_olmo/hf_olmo_test.py b/tests/hf_olmo/hf_olmo_test.py index 0b323c4e8..6f70c0090 100644 --- a/tests/hf_olmo/hf_olmo_test.py +++ b/tests/hf_olmo/hf_olmo_test.py @@ -3,7 +3,7 @@ from olmo import BlockType, Tokenizer, TrainConfig from olmo.data import DataCollator -from olmo.model import Olmo 
+from olmo.model import OLMo from olmo.torch_util import seed_all @@ -188,7 +188,7 @@ def test_forward( use_amp = dtype in {torch.float16, torch.bfloat16} seed_all(1234) - model = Olmo(train_config.model).eval() + model = OLMo(train_config.model).eval() hf_config = OLMoConfig(**model.config.asdict()) diff --git a/tests/hf_olmo/modeling_olmo_test.py b/tests/hf_olmo/modeling_olmo_test.py index fda1bd715..e4bb02f54 100644 --- a/tests/hf_olmo/modeling_olmo_test.py +++ b/tests/hf_olmo/modeling_olmo_test.py @@ -3,7 +3,7 @@ import pytest import torch -from olmo.model import Olmo +from olmo.model import OLMo def test_olmo_model(model_path: str): @@ -11,7 +11,7 @@ def test_olmo_model(model_path: str): from hf_olmo import OLMoForCausalLM, OLMoTokenizerFast # noqa: F401 - model = Olmo.from_checkpoint(model_path) + model = OLMo.from_checkpoint(model_path) hf_model = AutoModelForCausalLM.from_pretrained(model_path) tokenizer = AutoTokenizer.from_pretrained(model_path) diff --git a/tests/model_test.py b/tests/model_test.py index 18dd5401f..ce1100037 100644 --- a/tests/model_test.py +++ b/tests/model_test.py @@ -3,7 +3,7 @@ import torch.nn.functional as F from torch.nn import CrossEntropyLoss -from olmo import BlockType, LayerNorm, Olmo, Tokenizer, TrainConfig +from olmo import BlockType, LayerNorm, OLMo, Tokenizer, TrainConfig from olmo.config import ModelConfig, PaddingDirection from olmo.data import DataCollator from olmo.model import AMDLayerNorm @@ -174,7 +174,7 @@ def test_forward( use_amp = dtype in {torch.float16, torch.bfloat16} - model = Olmo(train_config.model).eval() + model = OLMo(train_config.model).eval() input1 = tokenizer.encode("My name is OLMo!") input2 = tokenizer.encode("I'm a delightful large open language model :)") @@ -294,7 +294,7 @@ def test_backward( else: train_config.model.init_device = "cpu" - model = Olmo(train_config.model).train() + model = OLMo(train_config.model).train() with torch.autocast( device_type="cuda" if cuda else "cpu", 
enabled=use_amp, dtype=None if not use_amp else dtype @@ -365,7 +365,7 @@ def test_generate( train_config.model.init_device = "cpu" use_amp = dtype in {torch.float16, torch.bfloat16} - model = Olmo(train_config.model).eval() + model = OLMo(train_config.model).eval() input1 = tokenizer.encode("My name is OLMo! ", add_special_tokens=False) input2 = tokenizer.encode("I'm a delightful large open language model :) ", add_special_tokens=False) @@ -435,8 +435,8 @@ def test_layer_norm(train_config: TrainConfig, elementwise_affine: bool, include def test_block_groups(): - model_with_block_groups = Olmo(ModelConfig(d_model=128, n_heads=2, n_layers=9, block_group_size=3)).eval() - model_without_block_groups = Olmo(ModelConfig(d_model=128, n_heads=2, n_layers=9, block_group_size=1)).eval() + model_with_block_groups = OLMo(ModelConfig(d_model=128, n_heads=2, n_layers=9, block_group_size=3)).eval() + model_without_block_groups = OLMo(ModelConfig(d_model=128, n_heads=2, n_layers=9, block_group_size=1)).eval() # We should be able to load the state dict from one model into the other, and vice-versa. 
state_dict_to_load, og_keys_to_new_keys = model_with_block_groups._make_state_dict_compatible( From 9798c088f820885f1cb3ce6a03ecc4bb8b72ec4c Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Sun, 25 Feb 2024 10:22:36 +0100 Subject: [PATCH 2/7] Rm confs --- configs/c4-medium_ckptfine.yaml | 184 ------------------ configs/c4-medium_ckptoneintwo.yaml | 184 ------------------ configs/c4-medium_ckptwhole.yaml | 184 ------------------ configs/c4-medium_nockpt.yaml | 182 ----------------- configs/c4-small_nockpt.yaml | 183 ----------------- configs/olmo_nockpt.yml | 86 -------- configs/olmo_wholeckpt.yml | 88 --------- configs/v1_5-mix-medium-mitch-ish_nockpt.yaml | 98 ---------- 8 files changed, 1189 deletions(-) delete mode 100644 configs/c4-medium_ckptfine.yaml delete mode 100644 configs/c4-medium_ckptoneintwo.yaml delete mode 100644 configs/c4-medium_ckptwhole.yaml delete mode 100644 configs/c4-medium_nockpt.yaml delete mode 100644 configs/c4-small_nockpt.yaml delete mode 100644 configs/olmo_nockpt.yml delete mode 100644 configs/olmo_wholeckpt.yml delete mode 100644 configs/v1_5-mix-medium-mitch-ish_nockpt.yaml diff --git a/configs/c4-medium_ckptfine.yaml b/configs/c4-medium_ckptfine.yaml deleted file mode 100644 index f5e77a958..000000000 --- a/configs/c4-medium_ckptfine.yaml +++ /dev/null @@ -1,184 +0,0 @@ -run_name: c4-medium-run-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - -activation_checkpointing: fine_grained - -model: - d_model: 4096 - n_heads: 16 - n_layers: 30 - mlp_ratio: 8 - alibi: true - alibi_bias_max: 8.0 - attention_dropout: 0.0 - attention_layer_norm: true - multi_query_attention: true - block_type: sequential - layer_norm_type: low_precision # if not compiling, use 'low_precision' - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50257 - embedding_size: 50304 - eos_token_id: 50256 - pad_token_id: 50256 - init_device: meta - init_std: 0.02 - -compile: null # causes 
instability on AMD GPUs - -optimizer: - name: lionw - learning_rate: 1.0e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.95 - -scheduler: - name: cosine_with_warmup - t_warmup: 2000 - t_max: null - -data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-train.*.npy} - pad_direction: right - num_workers: 2 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0 - -tokenizer: - identifier: gpt2 - truncate_direction: right - -save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: 2 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: 50000 -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -# max_duration: 476837 # 2T tokens -max_duration: 50 # 200B tokens -global_train_batch_size: 2048 -device_train_microbatch_size: 2 - -precision: amp_bf16 - -max_grad_norm: 1.0 - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - ########################## - # Perplexity evaluations # - ########################## - - label: c4-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy} - num_workers: 1 - drop_last: true - pin_memory: true - persistent_workers: true - prefetch_factor: 4 - - - label: rp-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy} - num_workers: 1 - drop_last: true - pin_memory: true - 
persistent_workers: true - prefetch_factor: 4 - - # lump all the small datasets together (we still get separate metrics). - - label: all-small-ppl-validation - data: - datasets: - 4chan-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/4chan/val.npy - c4_100_domains-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_100_domains/val.npy - c4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_en/val.npy - gab-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/gab/val.npy - ice-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ice/val.npy - m2d2_s2orc-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_wiki/val.npy - manosphere-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/manosphere/val.npy - mc4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/mc4_en/val.npy - pile-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/pile/val.npy - ptb-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ptb/val.npy - twitterAEE-validation: - - 
${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/twitterAEE/val.npy - wikitext_103-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/wikitext_103/val.npy - drop_last: true - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: mrpc - type: downstream - - - label: sst2 - type: downstream diff --git a/configs/c4-medium_ckptoneintwo.yaml b/configs/c4-medium_ckptoneintwo.yaml deleted file mode 100644 index 4ceb2901f..000000000 --- a/configs/c4-medium_ckptoneintwo.yaml +++ /dev/null @@ -1,184 +0,0 @@ -run_name: c4-medium-run-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - -activation_checkpointing: one_in_two - -model: - d_model: 4096 - n_heads: 16 - n_layers: 30 - mlp_ratio: 8 - alibi: true - alibi_bias_max: 8.0 - attention_dropout: 0.0 - attention_layer_norm: true - multi_query_attention: true - block_type: sequential - layer_norm_type: low_precision # if not compiling, use 'low_precision' - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50257 - embedding_size: 50304 - eos_token_id: 50256 - pad_token_id: 50256 - init_device: meta - init_std: 0.02 - -compile: null # causes instability on AMD GPUs - -optimizer: - name: lionw - learning_rate: 1.0e-4 - 
weight_decay: 0.01 - betas: - - 0.9 - - 0.95 - -scheduler: - name: cosine_with_warmup - t_warmup: 2000 - t_max: null - -data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-train.*.npy} - pad_direction: right - num_workers: 2 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0 - -tokenizer: - identifier: gpt2 - truncate_direction: right - -save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: 2 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: 50000 -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -# max_duration: 476837 # 2T tokens -max_duration: 50 # 200B tokens -global_train_batch_size: 2048 -device_train_microbatch_size: 2 - -precision: amp_bf16 - -max_grad_norm: 1.0 - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - ########################## - # Perplexity evaluations # - ########################## - - label: c4-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy} - num_workers: 1 - drop_last: true - pin_memory: true - persistent_workers: true - prefetch_factor: 4 - - - label: rp-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy} - num_workers: 1 - drop_last: true - pin_memory: true - persistent_workers: true - prefetch_factor: 4 - - # lump all the small datasets together (we 
still get separate metrics). - - label: all-small-ppl-validation - data: - datasets: - 4chan-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/4chan/val.npy - c4_100_domains-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_100_domains/val.npy - c4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_en/val.npy - gab-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/gab/val.npy - ice-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ice/val.npy - m2d2_s2orc-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_wiki/val.npy - manosphere-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/manosphere/val.npy - mc4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/mc4_en/val.npy - pile-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/pile/val.npy - ptb-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ptb/val.npy - twitterAEE-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/twitterAEE/val.npy - wikitext_103-validation: - - 
${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/wikitext_103/val.npy - drop_last: true - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: mrpc - type: downstream - - - label: sst2 - type: downstream diff --git a/configs/c4-medium_ckptwhole.yaml b/configs/c4-medium_ckptwhole.yaml deleted file mode 100644 index 2e8084d32..000000000 --- a/configs/c4-medium_ckptwhole.yaml +++ /dev/null @@ -1,184 +0,0 @@ -run_name: c4-medium-run-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - -activation_checkpointing: whole_layer - -model: - d_model: 4096 - n_heads: 16 - n_layers: 30 - mlp_ratio: 8 - alibi: true - alibi_bias_max: 8.0 - attention_dropout: 0.0 - attention_layer_norm: true - multi_query_attention: true - block_type: sequential - layer_norm_type: low_precision # if not compiling, use 'low_precision' - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50257 - embedding_size: 50304 - eos_token_id: 50256 - pad_token_id: 50256 - init_device: meta - init_std: 0.02 - -compile: null # causes instability on AMD GPUs - -optimizer: - name: lionw - learning_rate: 1.0e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.95 - -scheduler: - name: cosine_with_warmup - t_warmup: 2000 - t_max: null - -data: - paths: 
${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-train.*.npy} - pad_direction: right - num_workers: 2 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0 - -tokenizer: - identifier: gpt2 - truncate_direction: right - -save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: 2 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: 50000 -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -# max_duration: 476837 # 2T tokens -max_duration: 50 # 200B tokens -global_train_batch_size: 2048 -device_train_microbatch_size: 2 - -precision: amp_bf16 - -max_grad_norm: 1.0 - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - ########################## - # Perplexity evaluations # - ########################## - - label: c4-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy} - num_workers: 1 - drop_last: true - pin_memory: true - persistent_workers: true - prefetch_factor: 4 - - - label: rp-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy} - num_workers: 1 - drop_last: true - pin_memory: true - persistent_workers: true - prefetch_factor: 4 - - # lump all the small datasets together (we still get separate metrics). 
- - label: all-small-ppl-validation - data: - datasets: - 4chan-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/4chan/val.npy - c4_100_domains-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_100_domains/val.npy - c4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_en/val.npy - gab-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/gab/val.npy - ice-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ice/val.npy - m2d2_s2orc-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_wiki/val.npy - manosphere-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/manosphere/val.npy - mc4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/mc4_en/val.npy - pile-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/pile/val.npy - ptb-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ptb/val.npy - twitterAEE-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/twitterAEE/val.npy - wikitext_103-validation: - - 
${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/wikitext_103/val.npy - drop_last: true - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: mrpc - type: downstream - - - label: sst2 - type: downstream diff --git a/configs/c4-medium_nockpt.yaml b/configs/c4-medium_nockpt.yaml deleted file mode 100644 index 0d862117b..000000000 --- a/configs/c4-medium_nockpt.yaml +++ /dev/null @@ -1,182 +0,0 @@ -run_name: c4-medium-run-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - -model: - d_model: 4096 - n_heads: 16 - n_layers: 30 - mlp_ratio: 8 - alibi: true - alibi_bias_max: 8.0 - attention_dropout: 0.0 - attention_layer_norm: true - multi_query_attention: true - block_type: sequential - layer_norm_type: low_precision # if not compiling, use 'low_precision' - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50257 - embedding_size: 50304 - eos_token_id: 50256 - pad_token_id: 50256 - init_device: meta - init_std: 0.02 - -compile: null # causes instability on AMD GPUs - -optimizer: - name: lionw - learning_rate: 1.0e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.95 - -scheduler: - name: cosine_with_warmup - t_warmup: 2000 - t_max: null - -data: - paths: 
${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-train.*.npy} - pad_direction: right - num_workers: 2 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0 - -tokenizer: - identifier: gpt2 - truncate_direction: right - -save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: 2 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: 50000 -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -# max_duration: 476837 # 2T tokens -max_duration: 50 # 200B tokens -global_train_batch_size: 2048 -device_train_microbatch_size: 2 - -precision: amp_bf16 - -max_grad_norm: 1.0 - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - ########################## - # Perplexity evaluations # - ########################## - - label: c4-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy} - num_workers: 1 - drop_last: true - pin_memory: true - persistent_workers: true - prefetch_factor: 4 - - - label: rp-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy} - num_workers: 1 - drop_last: true - pin_memory: true - persistent_workers: true - prefetch_factor: 4 - - # lump all the small datasets together (we still get separate metrics). 
- - label: all-small-ppl-validation - data: - datasets: - 4chan-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/4chan/val.npy - c4_100_domains-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_100_domains/val.npy - c4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_en/val.npy - gab-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/gab/val.npy - ice-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ice/val.npy - m2d2_s2orc-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_wiki/val.npy - manosphere-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/manosphere/val.npy - mc4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/mc4_en/val.npy - pile-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/pile/val.npy - ptb-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ptb/val.npy - twitterAEE-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/twitterAEE/val.npy - wikitext_103-validation: - - 
${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/wikitext_103/val.npy - drop_last: true - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: mrpc - type: downstream - - - label: sst2 - type: downstream diff --git a/configs/c4-small_nockpt.yaml b/configs/c4-small_nockpt.yaml deleted file mode 100644 index bdc2e04a9..000000000 --- a/configs/c4-small_nockpt.yaml +++ /dev/null @@ -1,183 +0,0 @@ -run_name: c4-small-run-001 -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: c4-small - -model: - d_model: 2048 - n_heads: 16 - n_layers: 16 - mlp_ratio: 8 - alibi: true - alibi_bias_max: 8.0 - attention_dropout: 0.0 - attention_layer_norm: true - multi_query_attention: true - block_type: sequential - layer_norm_type: low_precision # if not compiling, use 'low_precision' - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50257 - embedding_size: 50304 - eos_token_id: 50256 - pad_token_id: 50256 - init_device: meta - init_std: 0.02 - -compile: null # causes instability on AMD GPUs - -optimizer: - name: lionw - learning_rate: 2.0e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.95 - -scheduler: - name: cosine_with_warmup - t_warmup: 2000 - t_max: null - -data: - paths: 
${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-train.*.npy} - pad_direction: right - num_workers: 2 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0 - -tokenizer: - identifier: gpt2 - truncate_direction: right - -save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: 9 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: 10000 -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -# max_duration: 476837 # 2T tokens -max_duration: 50 # 200B tokens -global_train_batch_size: 2048 -device_train_microbatch_size: 8 - -precision: amp_bf16 - -max_grad_norm: 1.0 - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} -evaluators: - ########################## - # Perplexity evaluations # - ########################## - - label: c4-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy} - num_workers: 1 - drop_last: true - pin_memory: true - persistent_workers: true - prefetch_factor: 4 - - - label: rp-validation - subset_num_batches: 10 - data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy} - num_workers: 1 - drop_last: true - pin_memory: true - persistent_workers: true - prefetch_factor: 4 - - # lump all the small datasets together (we still get separate metrics). 
- - label: all-small-ppl-validation - data: - datasets: - 4chan-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/4chan/val.npy - c4_100_domains-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_100_domains/val.npy - c4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/c4_en/val.npy - gab-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/gab/val.npy - ice-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ice/val.npy - m2d2_s2orc-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_s2orc/val.npy - m2d2_wiki-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/m2d2_wiki/val.npy - manosphere-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/manosphere/val.npy - mc4_en-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/mc4_en/val.npy - pile-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/pile/val.npy - ptb-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/ptb/val.npy - twitterAEE-validation: - - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/twitterAEE/val.npy - wikitext_103-validation: - - 
${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small/wikitext_103/val.npy - drop_last: true - - ########################## - # Downstream evaluations # - ########################## - - label: piqa - type: downstream - - - label: hellaswag - type: downstream - - - label: winogrande - type: downstream - - - label: openbook_qa - type: downstream - - # - label: boolq # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: sciq - type: downstream - - - label: arc_easy - type: downstream - - # - label: arc_challenge # requires implemention of the pmi_dc matrix - # type: downstream - # - - label: copa - type: downstream - - - label: rte - type: downstream - - - label: commitment_bank - type: downstream - - - label: mrpc - type: downstream - - - label: sst2 - type: downstream diff --git a/configs/olmo_nockpt.yml b/configs/olmo_nockpt.yml deleted file mode 100644 index f09396738..000000000 --- a/configs/olmo_nockpt.yml +++ /dev/null @@ -1,86 +0,0 @@ -run_name: olmo-small-ablation -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: c4-small - -model: - d_model: 2048 - n_heads: 16 - n_layers: 16 - mlp_ratio: 8 - alibi: true - alibi_bias_max: 8.0 - attention_dropout: 0.0 - attention_layer_norm: true - multi_query_attention: true - block_type: sequential - layer_norm_type: low_precision # if not compiling, use 'low_precision' - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50277 - embedding_size: 50304 - eos_token_id: 50276 - pad_token_id: 50276 - init_device: meta - init_std: 0.02 - -compile: null # causes instability on AMD GPUs - -optimizer: - name: lionw - learning_rate: 1.0e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.95 - -scheduler: - name: cosine_with_warmup - t_warmup: 2000 - t_max: null - -data: - paths: 
${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4_en/gpt-neox-20b/c4-train.*.npy} - pad_direction: right - num_workers: 4 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0 - -tokenizer: - identifier: EleutherAI/gpt-neox-20b - truncate_direction: right - -save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: 9 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: 10000 -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -# max_duration: 953674 # 2T tokens -max_duration: 50 # 200B tokens -global_train_batch_size: 1024 -device_train_microbatch_size: 8 - -precision: amp_bf16 - -max_grad_norm: 1.0 - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} \ No newline at end of file diff --git a/configs/olmo_wholeckpt.yml b/configs/olmo_wholeckpt.yml deleted file mode 100644 index efc53dbf2..000000000 --- a/configs/olmo_wholeckpt.yml +++ /dev/null @@ -1,88 +0,0 @@ -run_name: olmo-small-ablation -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: c4-small - -activation_checkpointing: whole_layer - -model: - d_model: 2048 - n_heads: 16 - n_layers: 16 - mlp_ratio: 8 - alibi: true - alibi_bias_max: 8.0 - attention_dropout: 0.0 - attention_layer_norm: true - multi_query_attention: true - block_type: sequential - layer_norm_type: low_precision # if not compiling, use 'low_precision' - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50277 - embedding_size: 50304 - eos_token_id: 50276 - pad_token_id: 50276 - init_device: meta - init_std: 0.02 - -compile: null # causes instability on 
AMD GPUs - -optimizer: - name: lionw - learning_rate: 1.0e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.95 - -scheduler: - name: cosine_with_warmup - t_warmup: 2000 - t_max: null - -data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4_en/gpt-neox-20b/c4-train.*.npy} - pad_direction: right - num_workers: 4 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0 - -tokenizer: - identifier: EleutherAI/gpt-neox-20b - truncate_direction: right - -save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: 9 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: 10000 -save_num_unsharded_checkpoints_to_keep: -1 - -load_path: null - -# max_duration: 953674 # 2T tokens -max_duration: 50 # 200B tokens -global_train_batch_size: 1024 -device_train_microbatch_size: 8 - -precision: amp_bf16 - -max_grad_norm: 1.0 - -speed_monitor: - window_size: 20 - -eval_interval: ${save_interval} -eval_subset_num_batches: -1 -device_eval_batch_size: ${device_train_microbatch_size} \ No newline at end of file diff --git a/configs/v1_5-mix-medium-mitch-ish_nockpt.yaml b/configs/v1_5-mix-medium-mitch-ish_nockpt.yaml deleted file mode 100644 index 0c0974f0e..000000000 --- a/configs/v1_5-mix-medium-mitch-ish_nockpt.yaml +++ /dev/null @@ -1,98 +0,0 @@ -run_name: v1_5-mix-medium-mitch-ish -seed: 6198 -dry_run: false - -wandb: - name: ${run_name} - project: olmo-medium - group: v1_5-mix - -model: - d_model: 4096 - n_heads: 32 - n_layers: 32 - # mlp_ratio: 6 - mlp_hidden_size: 22016 - weight_tying: false - alibi: false - rope: true - flash_attention: false # not available on AMD - attention_dropout: 0.0 - attention_layer_norm: false - multi_query_attention: false - include_bias: false - block_type: 
sequential - layer_norm_type: default - layer_norm_with_affine: false - bias_for_layer_norm: false - attention_layer_norm_with_affine: false - activation_type: swiglu - residual_dropout: 0.0 - embedding_dropout: 0.0 - max_sequence_length: 2048 - vocab_size: 50280 - embedding_size: 50304 - eos_token_id: 0 - pad_token_id: 1 - init_device: meta - init_fn: mitchell - -compile: null - -optimizer: - name: adamw - learning_rate: 3.0e-4 - weight_decay: 0.1 - betas: - - 0.9 - - 0.95 - metrics_log_interval: 10 - -scheduler: - name: linear_with_warmup - t_warmup: 5000 - alpha_f: 0.1 - grad_clip_warmup_steps: 1000 - grad_clip_warmup_factor: 10.0 - -tokenizer: - identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json - truncate_direction: right - -save_folder: ${oc.env:CHECKPOINTS_PATH}/${oc.env:SLURM_JOB_ID,${run_name}} -save_overwrite: false -# Sharded checkpoints (best for restarts) -save_interval: 1000 -save_num_checkpoints_to_keep: -1 -# Unsharded checkpoints (for final storage) -save_interval_unsharded: null # getting errors on LUMI right now -save_num_unsharded_checkpoints_to_keep: -1 -no_pre_train_checkpoint: true - -load_path: null - -max_duration: 50 # 2T tokens -global_train_batch_size: 2048 -device_train_microbatch_size: 2 - -precision: amp_bf16 - -fsdp: - wrapping_strategy: null - precision: mixed - -max_grad_norm: 1.0 -max_grad_norm_ratio: null - -speed_monitor: - window_size: 20 - -data: - paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4_en/gpt-neox-20b/c4-train.*.npy} - pad_direction: right - num_workers: 0 - drop_last: true - pin_memory: true - prefetch_factor: 16 - persistent_workers: true - timeout: 0 From 9be1519cd6ccc781b5ec8c034d66a48f8d07e5ba Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Sun, 25 Feb 2024 10:25:19 +0100 Subject: [PATCH 3/7] Rm scripts --- olmo/ckptavg.py | 17 ---- scripts/average_ckpts.py | 71 ----------------- scripts/average_ckpts_advanced.py | 123 
----------------------------- scripts/avgckpts_stepbystep.py | 127 ------------------------------ 4 files changed, 338 deletions(-) delete mode 100644 olmo/ckptavg.py delete mode 100644 scripts/average_ckpts.py delete mode 100644 scripts/average_ckpts_advanced.py delete mode 100644 scripts/avgckpts_stepbystep.py diff --git a/olmo/ckptavg.py b/olmo/ckptavg.py deleted file mode 100644 index 8531f62da..000000000 --- a/olmo/ckptavg.py +++ /dev/null @@ -1,17 +0,0 @@ -import torch - -STATEDICTS = [ - "advaveraged25.pt", - "advaveraged2550.pt", - "advaveraged5075.pt", - "advaveraged75.pt", -] - -sd = torch.load(STATEDICTS[0]) -for state_dict in STATEDICTS[1:]: - sd2 = torch.load(state_dict) - for k,v in sd2.items(): - assert k not in sd - sd[k] = v - -torch.save(sd, "advaveraged.pt") diff --git a/scripts/average_ckpts.py b/scripts/average_ckpts.py deleted file mode 100644 index 2be4533fe..000000000 --- a/scripts/average_ckpts.py +++ /dev/null @@ -1,71 +0,0 @@ -import os - -import torch - -CKPTS = [ - "step456000-unsharded-lumi/model.pt", - "step556000-unsharded-mosaic/model.pt" -] - -OUTDIR = "step456000-unsharded-lumi-mosaic" - -CKPTS = [ - "step456000-unsharded-lumi/model.pt", - "step432410-unsharded-mosaic/model.pt" -] - - - - - - -import os - -import torch - -CKPTS = [ -"step551000-unsharded/model.pt", -"step552000-unsharded/model.pt", -"step553000-unsharded/model.pt", -"step554000-unsharded/model.pt", -"step555000-unsharded/model.pt", -"step556000-unsharded/model.pt", -"step557000-unsharded/model.pt", -] - -OUTDIR = "last7_avg" - -first_sd = torch.load(CKPTS[0]) -for k in first_sd: - first_sd[k] = torch.stack([sd[k] for sd in [torch.load(ckpt) for ckpt in CKPTS]], dim=0).mean(dim=0) - -os.makedirs(OUTDIR, exist_ok=True) -torch.save(first_sd, os.path.join(OUTDIR, "model.pt")) - - - - - -import os - -import torch - -CKPTS = [ -"step551000-unsharded/model.pt", -"step552000-unsharded/model.pt", -"step553000-unsharded/model.pt", -"step554000-unsharded/model.pt", 
-"step555000-unsharded/model.pt", -"step556000-unsharded/model.pt", -"step557000-unsharded/model.pt", -] - -OUTDIR = "last7_avg" - -keys = list(torch.load(CKPTS[0]).keys()) -new_sd = {} -for k in keys: - new_sd[k] = torch.stack([torch.load(ckpt)[k] for ckpt in CKPTS], dim=0).mean(dim=0) - -os.makedirs(OUTDIR, exist_ok=True) -torch.save(new_sd, os.path.join(OUTDIR, "model.pt")) diff --git a/scripts/average_ckpts_advanced.py b/scripts/average_ckpts_advanced.py deleted file mode 100644 index ebde8b0a3..000000000 --- a/scripts/average_ckpts_advanced.py +++ /dev/null @@ -1,123 +0,0 @@ -#python avgadvanced.py --input ./ --filter "step*/model.pt" --output advaveraged.pt --no-sort -#!/usr/bin/env python -""" Checkpoint Averaging Script - -This script averages all model weights for checkpoints in specified path that match -the specified filter wildcard. All checkpoints must be from the exact same model. - -For any hope of decent results, the checkpoints should be from the same or child -(via resumes) training session. This can be viewed as similar to maintaining running -EMA (exponential moving average) of the model weights or performing SWA (stochastic -weight averaging), but post-training. 
- -Hacked together by Ross Wightman (https://github.com/rwightman) -""" -import torch -import argparse -import os -import glob -import hashlib -from timm.models.helpers import load_state_dict - -parser = argparse.ArgumentParser(description='PyTorch Checkpoint Averager') -parser.add_argument('--input', default='', type=str, metavar='PATH', - help='path to base input folder containing checkpoints') -parser.add_argument('--filter', default='*.pth.tar', type=str, metavar='WILDCARD', - help='checkpoint filter (path wildcard)') -parser.add_argument('--output', default='./averaged.pth', type=str, metavar='PATH', - help='output filename') -parser.add_argument('--no-use-ema', dest='no_use_ema', action='store_true', - help='Force not using ema version of weights (if present)') -parser.add_argument('--descending', dest='descending', action='store_true', - help='Set if eval metric is descending (like loss)') -parser.add_argument('--no-sort', dest='no_sort', action='store_true', - help='Do not sort and select by checkpoint metric, also makes "n" argument irrelevant') -parser.add_argument('-n', type=int, default=10, metavar='N', - help='Number of checkpoints to average') - - -def checkpoint_metric(checkpoint_path): - if not checkpoint_path or not os.path.isfile(checkpoint_path): - return {} - print("=> Extracting metric from checkpoint '{}'".format(checkpoint_path)) - checkpoint = torch.load(checkpoint_path, map_location='cpu') - metric = None - if 'metric' in checkpoint: - metric = checkpoint['metric'] - return metric - - -def main(): - args = parser.parse_args() - # by default use the EMA weights (if present) - args.use_ema = not args.no_use_ema - # by default sort by checkpoint metric (if present) and avg top n checkpoints - args.sort = not args.no_sort - - if os.path.exists(args.output): - print("Error: Output filename ({}) already exists.".format(args.output)) - exit(1) - - pattern = args.input - if not args.input.endswith(os.path.sep) and not 
args.filter.startswith(os.path.sep): - pattern += os.path.sep - pattern += args.filter - checkpoints = glob.glob(pattern, recursive=True) - if not checkpoints: - print("Error: No checkpoints to average.") - exit(1) - - if args.sort: - checkpoint_metrics = [] - for c in checkpoints: - metric = checkpoint_metric(c) - if metric is not None: - checkpoint_metrics.append((metric, c)) - checkpoint_metrics = list(sorted(checkpoint_metrics, reverse=not args.descending)) - checkpoint_metrics = checkpoint_metrics[:args.n] - print("Selected checkpoints:") - [print(m, c) for m, c in checkpoint_metrics] - avg_checkpoints = [c for m, c in checkpoint_metrics] - else: - avg_checkpoints = checkpoints - print("Selected checkpoints:") - [print(c) for c in checkpoints] - - avg_state_dict = {} - avg_counts = {} - for c in avg_checkpoints: - new_state_dict = load_state_dict(c, args.use_ema) - if not new_state_dict: - print("Error: Checkpoint ({}) doesn't exist".format(args.checkpoint)) - continue - - for k, v in new_state_dict.items(): - if k not in avg_state_dict: - avg_state_dict[k] = v.clone().to(dtype=torch.float32) - avg_counts[k] = 1 - else: - avg_state_dict[k] += v.to(dtype=torch.float32) - avg_counts[k] += 1 - - for k, v in avg_state_dict.items(): - v.div_(avg_counts[k]) - - # float32 overflow seems unlikely based on weights seen to date, but who knows - float32_info = torch.finfo(torch.float32) - final_state_dict = {} - for k, v in avg_state_dict.items(): - v = v.clamp(float32_info.min, float32_info.max) - final_state_dict[k] = v.to(dtype=torch.float32) - - try: - torch.save(final_state_dict, args.output, _use_new_zipfile_serialization=False) - except: - torch.save(final_state_dict, args.output) - - with open(args.output, 'rb') as f: - sha_hash = hashlib.sha256(f.read()).hexdigest() - print("=> Saved state_dict to '{}, SHA256: {}'".format(args.output, sha_hash)) - - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/scripts/avgckpts_stepbystep.py 
b/scripts/avgckpts_stepbystep.py deleted file mode 100644 index 4694fa0ba..000000000 --- a/scripts/avgckpts_stepbystep.py +++ /dev/null @@ -1,127 +0,0 @@ -#python avckpt25.py --input ./ --filter "step*/model.pt" --output advaveraged25.pt --no-sort -#!/usr/bin/env python -""" Checkpoint Averaging Script - -This script averages all model weights for checkpoints in specified path that match -the specified filter wildcard. All checkpoints must be from the exact same model. - -For any hope of decent results, the checkpoints should be from the same or child -(via resumes) training session. This can be viewed as similar to maintaining running -EMA (exponential moving average) of the model weights or performing SWA (stochastic -weight averaging), but post-training. - -Hacked together by Ross Wightman (https://github.com/rwightman) -""" -import torch -import argparse -import os -import glob -import hashlib - -parser = argparse.ArgumentParser(description='PyTorch Checkpoint Averager') -parser.add_argument('--input', default='', type=str, metavar='PATH', - help='path to base input folder containing checkpoints') -parser.add_argument('--filter', default='*.pth.tar', type=str, metavar='WILDCARD', - help='checkpoint filter (path wildcard)') -parser.add_argument('--output', default='./averaged.pth', type=str, metavar='PATH', - help='output filename') -parser.add_argument('--no-use-ema', dest='no_use_ema', action='store_true', - help='Force not using ema version of weights (if present)') -parser.add_argument('--descending', dest='descending', action='store_true', - help='Set if eval metric is descending (like loss)') -parser.add_argument('--no-sort', dest='no_sort', action='store_true', - help='Do not sort and select by checkpoint metric, also makes "n" argument irrelevant') -parser.add_argument('-n', type=int, default=10, metavar='N', - help='Number of checkpoints to average') - - -def checkpoint_metric(checkpoint_path): - if not checkpoint_path or not 
os.path.isfile(checkpoint_path): - return {} - print("=> Extracting metric from checkpoint '{}'".format(checkpoint_path)) - checkpoint = torch.load(checkpoint_path, map_location='cpu') - metric = None - if 'metric' in checkpoint: - metric = checkpoint['metric'] - return metric - - -def main(): - args = parser.parse_args() - # by default use the EMA weights (if present) - args.use_ema = not args.no_use_ema - # by default sort by checkpoint metric (if present) and avg top n checkpoints - args.sort = not args.no_sort - - if os.path.exists(args.output): - print("Error: Output filename ({}) already exists.".format(args.output)) - exit(1) - - pattern = args.input - if not args.input.endswith(os.path.sep) and not args.filter.startswith(os.path.sep): - pattern += os.path.sep - pattern += args.filter - checkpoints = glob.glob(pattern, recursive=True) - if not checkpoints: - print("Error: No checkpoints to average.") - exit(1) - - if args.sort: - checkpoint_metrics = [] - for c in checkpoints: - metric = checkpoint_metric(c) - if metric is not None: - checkpoint_metrics.append((metric, c)) - checkpoint_metrics = list(sorted(checkpoint_metrics, reverse=not args.descending)) - checkpoint_metrics = checkpoint_metrics[:args.n] - print("Selected checkpoints:") - [print(m, c) for m, c in checkpoint_metrics] - avg_checkpoints = [c for m, c in checkpoint_metrics] - else: - avg_checkpoints = checkpoints - print("Selected checkpoints:") - [print(c) for c in checkpoints] - - avg_state_dict = {} - avg_counts = {} - for c in avg_checkpoints: - new_state_dict = torch.load(c) - keys = list(new_state_dict.keys()) - # Subselect 50% - keys = keys[:int(len(keys) * 0.25)] - new_state_dict = {k: new_state_dict[k] for k in keys} - if not new_state_dict: - print("Error: Checkpoint ({}) doesn't exist".format(args.checkpoint)) - continue - - for k in keys: - if k not in avg_state_dict: - avg_state_dict[k] = new_state_dict[k].clone().to(dtype=torch.float64) - avg_counts[k] = 1 - else: - 
avg_state_dict[k] += new_state_dict[k].to(dtype=torch.float64) - avg_counts[k] += 1 - del new_state_dict[k] - - for k, v in avg_state_dict.items(): - v.div_(avg_counts[k]) - - # float32 overflow seems unlikely based on weights seen to date, but who knows - float32_info = torch.finfo(torch.float32) - final_state_dict = {} - for k, v in avg_state_dict.items(): - v = v.clamp(float32_info.min, float32_info.max) - final_state_dict[k] = v.to(dtype=torch.float32) - - try: - torch.save(final_state_dict, args.output, _use_new_zipfile_serialization=False) - except: - torch.save(final_state_dict, args.output) - - with open(args.output, 'rb') as f: - sha_hash = hashlib.sha256(f.read()).hexdigest() - print("=> Saved state_dict to '{}, SHA256: {}'".format(args.output, sha_hash)) - - -if __name__ == '__main__': - main() \ No newline at end of file From 98904685973ddbc58e52bc5262face42678caa04 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Sun, 25 Feb 2024 10:26:45 +0100 Subject: [PATCH 4/7] Rm scripts --- .../lumi/v1_5-mix-medium-mitch-ish_nockpt.sh | 55 ------------------- 1 file changed, 55 deletions(-) delete mode 100644 scripts/lumi/v1_5-mix-medium-mitch-ish_nockpt.sh diff --git a/scripts/lumi/v1_5-mix-medium-mitch-ish_nockpt.sh b/scripts/lumi/v1_5-mix-medium-mitch-ish_nockpt.sh deleted file mode 100644 index 6b70307f5..000000000 --- a/scripts/lumi/v1_5-mix-medium-mitch-ish_nockpt.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=v1-mix-medium -#SBATCH --account=project_462000229 -#SBATCH --output=/pfs/lustref1/flash/project_462000229/logs/%j.log -#SBATCH --nodes=32 # Total number of nodes -#SBATCH --ntasks-per-node=8 -#SBATCH --gpus-per-node=8 # Allocate one gpu per MPI rank -#SBATCH --cpus-per-task=6 -#SBATCH --time=1:00:00 -#SBATCH --mem=0 # All memory on the node -#SBATCH --partition=standard-g - -module load LUMI/22.08 partition/G - -export OLMO_CONTAINER=llm-lumi_latest.sif - -export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -export 
MPICH_GPU_SUPPORT_ENABLED=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET_GDR_LEVEL=3 -export MIOPEN_USER_DB_PATH=/tmp/${USER}-miopen-cache-${SLURM_JOB_ID} -export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH} -export CXI_FORK_SAFE=1 -export CXI_FORK_SAFE_HP=1 -export FI_CXI_DISABLE_CQ_HUGETLB=1 - -# We need to set this to avoid "Cassini Event Queue overflow detected." errors. -export FI_CXI_DEFAULT_CQ_SIZE=131072 - -#export NCCL_DEBUG=INFO -export PYTHONPATH=.:${PYTHONPATH} -export ROCM_PATH=/opt/rocm -export SINGULARITYENV_LD_LIBRARY_PATH=/usr/local/lib:/opt/cray/libfabric/1.15.2.0/lib64 - -# Try playing with max_split_size_mb if you run into OOM errors. -#export PYTORCH_HIP_ALLOC_CONF=max_split_size_mb:128 - -export DATA_PATH=$FLASH_DIR/preprocessed/olmo-mix -export CHECKPOINTS_PATH=$FLASH_DIR/checkpoints -export EVAL_DATA_PATH=$SCRATCH_DIR/eval-data - -srun \ - --cpus-per-task=$SLURM_CPUS_PER_TASK \ - --distribution=block:block \ - --kill-on-bad-exit \ - scripts/run_with_environment.sh \ - singularity exec \ - -B"$PROJECT_DIR:$PROJECT_DIR" \ - -B"$FLASH_DIR:$FLASH_DIR" \ - -B"$SCRATCH_DIR:$SCRATCH_DIR" \ - -B /opt/cray:/opt/cray \ - -B /usr/lib64/libcxi.so.1:/usr/lib64/libcxi.so.1 \ - -B /usr/lib64/libjson-c.so.3:/usr/lib64/libjson-c.so.3 \ - $PROJECT_DIR/containers/$OLMO_CONTAINER \ - python scripts/train.py configs/v1_5-mix-medium-mitch-ish_nockpt.yaml --run_name=${SLURM_JOB_ID} ${@} From 1d97aa5f2b42bc595c953f38245b20cd4eeb2989 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Sun, 25 Feb 2024 10:29:32 +0100 Subject: [PATCH 5/7] Add OLMo change --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index fae7e99eb..9070f9579 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added the option to directly pass input embeddings to `OLMo` and `OLMoForCausalLM`. - Added support for Python 3.8. 
- Added code to throw an error if `output_attentions` is set to `True` in forward call to `OLMoForCausalLM`. This functionality hasn't been implemented yet. +- Rename `Olmo` to `OLMo` everywhere in the codebase ### Added - Added `output_hidden_states` argument and associated functionality to `OLMo` and `OLMoForCausalLM` to return model intermediate hidden states. From 9e9e9c0dc0780fc7855ab50c9a1de98057584c47 Mon Sep 17 00:00:00 2001 From: Dirk Groeneveld Date: Thu, 7 Mar 2024 17:24:58 -0800 Subject: [PATCH 6/7] Bump version --- olmo/version.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/olmo/version.py b/olmo/version.py index 3f9d92c5b..e75c8373e 100644 --- a/olmo/version.py +++ b/olmo/version.py @@ -1,8 +1,8 @@ _MAJOR = "0" -_MINOR = "2" +_MINOR = "3" # On main and in a nightly release the patch should be one ahead of the last # released build. -_PATCH = "5" +_PATCH = "0" # This is mainly for nightly builds which have the suffix ".dev$DATE". See # https://semver.org/#is-v123-a-semantic-version for the semantics. _SUFFIX = "" From afb547368f167c3c6781c54283d9a395b653d3c5 Mon Sep 17 00:00:00 2001 From: Dirk Groeneveld Date: Thu, 7 Mar 2024 21:33:26 -0800 Subject: [PATCH 7/7] Fix changelog --- CHANGELOG.md | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index be127a61e..b93d52fb9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +### Changed + +- Rename `Olmo` to `OLMo` everywhere in the codebase + +### Removed + +- Removed `AMDLayerNorm`, since the original layer norm bug has been fixed and we don't need this workaround anymore. 
+ + ## [v0.2.5](https://github.com/allenai/OLMo/releases/tag/v0.2.5) - 2024-03-06 ### Fixed @@ -15,7 +24,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added the option to directly pass input embeddings to `OLMo` and `OLMoForCausalLM`. - Added support for Python 3.8. - Added code to throw an error if `output_attentions` is set to `True` in forward call to `OLMoForCausalLM`. This functionality hasn't been implemented yet. -- Rename `Olmo` to `OLMo` everywhere in the codebase - Fixed running with data loading workers on LUMI ### Added @@ -29,10 +37,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Changed legacy checkpoint unsharding to use processes and shared memory instead of threads -### Removed - -- Removed `AMDLayerNorm`, since the original layer norm bug has been fixed and we don't need this workaround anymore. - ## [v0.2.4](https://github.com/allenai/OLMo/releases/tag/v0.2.4) - 2024-02-02