From ab38f6095838d35df65270f3b1cd08d7b7758bfa Mon Sep 17 00:00:00 2001 From: zhangir-azerbayev Date: Wed, 9 Aug 2023 03:27:25 -0400 Subject: [PATCH 01/64] add support for flash attention 2 --- megatron/model/transformer.py | 67 +++++++------------- requirements/requirements-flashattention.txt | 2 +- 2 files changed, 24 insertions(+), 45 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 4e81b70b6..bd2d6af81 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -344,15 +344,10 @@ def __init__( ) else: if self.use_flash_attention: - from megatron.model.flash_attention import ( - flash_attn_unpadded_qkvpacked_func_cuda, - flash_attn_unpadded_kvpacked_func_cuda, - flash_attn_unpadded_unpacked_func_triton, - ) - - self.flash_triton_fn = flash_attn_unpadded_unpacked_func_triton - self.flash_qkv_fn = flash_attn_unpadded_qkvpacked_func_cuda - self.flash_kv_fn = flash_attn_unpadded_kvpacked_func_cuda + from flash_attn.flash_attn_interface import flash_attn_func, flash_attn_varlen_func + self.flash_triton_fn = None + self.flash_qkv_fn = flash_attn_func + self.flash_varlen_qkv_fn = flash_attn_varlen_func else: self.scale_mask_softmax = FusedScaleMaskSoftmax( input_in_fp16=self.fp16, @@ -497,10 +492,10 @@ def flash_attention(self, query_layer, key_layer, value_layer): # [sk, b, np, hn] -> [b, sk, np, hn] -> [b * sk, 1, np, hn] key_layer = key_layer.transpose(0, 1).reshape( - output_size[0] * output_size[3], 1, output_size[1], -1 + output_size[0], output_size[3], output_size[1], -1 ) value_layer = value_layer.transpose(0, 1).reshape( - output_size[0] * output_size[3], 1, output_size[1], -1 + output_size[0], output_size[3], output_size[1], -1 ) batch_size = output_size[0] @@ -523,51 +518,35 @@ def flash_attention(self, query_layer, key_layer, value_layer): device=key_layer.device, ) - if not self.training: - - # [sq, b, np, hn] -> [b * sq, np, hn] - query_layer = query_layer.transpose(0, 1).reshape( - output_size[0] * output_size[2], output_size[1], -1 - ) + # [sq, b, np, hn] -> [b, sq, np, hn] + query_layer = query_layer.transpose(0, 1).reshape( + output_size[0], output_size[2], output_size[1], -1 + ) - # Combined k/v into [b * sk, 2, np, hn]. - kv = torch.cat([key_layer, value_layer], dim=1) - output = self.flash_kv_fn( - query_layer, - kv, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - self.dropout_p if self.training else 0.0, + if not self.training: + q_shape = query_layer.shape + k_shape = key_layer.shape + v_shape = value_layer.shape + output = self.flash_varlen_qkv_fn( + query_layer.reshape((q_shape[0]*q_shape[1], q_shape[2], q_shape[3])), + key_layer.reshape((k_shape[0]*k_shape[1], k_shape[2], k_shape[3])), + value_layer.reshape((v_shape[0]*v_shape[1], v_shape[2], v_shape[3])), + cu_seqlens_q, cu_seqlens_k, + max_seqlen_q, max_seqlen_k, softmax_scale=None, causal=True, ) - + output = output.reshape(q_shape) else: - - # [sq, b, np, hn] -> [b * sq, 1, np, hn] - query_layer = query_layer.transpose(0, 1).reshape( - output_size[0] * output_size[2], 1, output_size[1], -1 - ) - - # Combined q/k/v into [b * s, 3, np, hn]. 
- qkv = torch.cat([query_layer, key_layer, value_layer], dim=1) - output = self.flash_qkv_fn( - qkv, - cu_seqlens_q, - max_seqlen_q, + query_layer, key_layer, value_layer, self.dropout_p if self.training else 0.0, softmax_scale=None, causal=True, ) - # [b * sq, np, hn] -> [b, sq, np, hn] - matmul_result = output.view( - output_size[0], output_size[2], output.shape[1], output.shape[2] - ) + matmul_result = output # [b, sq, np, hn] -> [b, np, sq, hn] matmul_result = matmul_result.transpose(1, 2) diff --git a/requirements/requirements-flashattention.txt b/requirements/requirements-flashattention.txt index 0c7d41e59..8397ebe75 100644 --- a/requirements/requirements-flashattention.txt +++ b/requirements/requirements-flashattention.txt @@ -1 +1 @@ -flash-attn==0.2.2 +flash-attn==2.0.0.post1 From 840c09f5db5bc92b3d259f8954fccbd412b0c9ec Mon Sep 17 00:00:00 2001 From: zhangir-azerbayev Date: Wed, 9 Aug 2023 03:50:05 -0400 Subject: [PATCH 02/64] change cosine decay to chinchilla style --- megatron/learning_rates.py | 18 +++++++++++++----- megatron/neox_arguments/neox_args.py | 10 ++++++++++ megatron/training.py | 8 +++++++- 3 files changed, 30 insertions(+), 6 deletions(-) diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index cbf2cc2fc..ffecbb95f 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -34,6 +34,7 @@ def __init__( decay_style, last_iter, min_lr=0.0, + final_lr=None, use_checkpoint_lr_scheduler=True, override_lr_scheduler=False, use_mup=False, @@ -55,6 +56,8 @@ def __init__( assert not self.use_checkpoint_lr_scheduler, ( "both override and " "use-checkpoint are set." ) + assert not (final_lr and decay_style!="cosine") + self.final_lr = final_lr # Set the learning rate self.step(self.num_iters) @@ -73,11 +76,16 @@ def get_lr(self): if self.decay_style == "linear": lr = self.start_lr * (self.end_iter - num_iters_) / self.end_iter elif self.decay_style == "cosine": - lr = self.min_lr + ( - (self.start_lr-self.min_lr) - / 2.0 - * (math.cos(math.pi * num_iters_ / self.end_iter) + 1) - ) + half_period = self.end_iter - self.warmup_iter + lr = ( + self.start_lr * ( + self.decay_lr_to + + (1 - self.final_lr) * + 0.5 * ( + math.cos(math.pi * num_iters_/half_period) + 1 + ) + ) + ) elif self.decay_style == "exponential": # exp(-0.693) = 1/2 lr = self.start_lr * math.exp(-0.693 * num_iters_ / self.end_iter) diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index e1ea16a16..baa1af13b 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -451,11 +451,21 @@ class NeoXArgsLRScheduler(NeoXArgsTemplate): Minimum value for learning rate. The scheduler clips values below this threshold. """ + final_lr: float = 0.1 + """ + If using cosine decay, the learning rate at the final step, prior to any clipping based on the value of `min_lr`. + """ + warmup: float = 0.01 """ Percentage of total iterations to warmup on (.01 = 1 percent of all training iters). """ + warmup_iters: int = None + """ + Number of iterations to warm up for. Incompatible with `warmup`. + """ + override_lr_scheduler: bool = False """ Reset the values of the scheduler (learning rate,warmup iterations, minimum learning rate, maximum number of iterations, and decay style from input arguments and ignore values from checkpoints. Note that all the above values will be reset. 
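For reference, the Chinchilla-style cosine schedule that this patch wires into `AnnealingLR` can be sketched as a standalone function. This is an illustration only, not the library code: it uses the `decay_lr_to` naming that later commits in this series settle on, and it assumes the usual convention that the cosine is applied to the post-warmup step count.

```python
import math


def chinchilla_cosine_lr(step, start_lr, decay_lr_to, warmup_iters, total_iters):
    """Cosine decay from start_lr down to decay_lr_to * start_lr after a linear warmup."""
    if step < warmup_iters:
        # Linear warmup, mirroring the existing warmup branch of AnnealingLR.
        return start_lr * step / warmup_iters
    # Fraction of the post-warmup "half period" completed so far.
    half_period = total_iters - warmup_iters
    progress = (step - warmup_iters) / half_period
    return start_lr * (
        decay_lr_to + (1 - decay_lr_to) * 0.5 * (math.cos(math.pi * progress) + 1)
    )
```

With `decay_lr_to = 0.1`, for example, the learning rate falls from `start_lr` at the end of warmup to `0.1 * start_lr` at `total_iters`, instead of decaying all the way down to `min_lr` as the previous cosine branch did.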
diff --git a/megatron/training.py b/megatron/training.py index 96a94a1d0..65fc84d9a 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -593,7 +593,12 @@ def get_learning_rate_scheduler(optimizer, neox_args): num_iters = neox_args.train_iters num_iters = max(1, num_iters) init_step = 0 - warmup_iter = neox_args.warmup * num_iters + + assert not (neox_args.warmup_iters and neox_args.warmup) + if neox_args.warmup: + warmup_iter = neox_args.warmup * num_iters + elif neox_args.warmup_iters: + warmup_iter = neox_args.warmup_iters lr_scheduler = AnnealingLR( optimizer, start_lr=neox_args.lr, @@ -602,6 +607,7 @@ def get_learning_rate_scheduler(optimizer, neox_args): decay_style=neox_args.lr_decay_style, last_iter=init_step, min_lr=neox_args.min_lr, + final_lr=neox_args.final_lr, use_checkpoint_lr_scheduler=neox_args.use_checkpoint_lr_scheduler, override_lr_scheduler=neox_args.override_lr_scheduler, use_mup=neox_args.use_mup, From ae26360c83bc5b24136c12f6b798bde6693e3363 Mon Sep 17 00:00:00 2001 From: zhangir-azerbayev Date: Wed, 9 Aug 2023 02:27:13 -0600 Subject: [PATCH 03/64] set default warmup to none so that warmup_iters can be set --- megatron/neox_arguments/neox_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index baa1af13b..bf5a64c5c 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -456,7 +456,7 @@ class NeoXArgsLRScheduler(NeoXArgsTemplate): If using cosine decay, the learning rate at the final step, prior to any clipping based on the value of `min_lr`. """ - warmup: float = 0.01 + warmup: float = None """ Percentage of total iterations to warmup on (.01 = 1 percent of all training iters). """ From bf4cab508ca62d957a20e4da51716fcb1f0f4056 Mon Sep 17 00:00:00 2001 From: zhangir-azerbayev Date: Wed, 9 Aug 2023 14:11:35 -0600 Subject: [PATCH 04/64] fixed bug --- megatron/learning_rates.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index ffecbb95f..7d0aeacd4 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -79,7 +79,7 @@ def get_lr(self): half_period = self.end_iter - self.warmup_iter lr = ( self.start_lr * ( - self.decay_lr_to + + self.final_lr + (1 - self.final_lr) * 0.5 * ( math.cos(math.pi * num_iters_/half_period) + 1 From ff86462599eacd7433e6dfa646cc70f9b665fc58 Mon Sep 17 00:00:00 2001 From: zhangir-azerbayev Date: Wed, 9 Aug 2023 15:09:40 -0600 Subject: [PATCH 05/64] fixed chinchilla lr --- megatron/learning_rates.py | 10 +++++----- megatron/neox_arguments/neox_args.py | 4 ++-- megatron/training.py | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index 7d0aeacd4..e68cb7a56 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -34,7 +34,7 @@ def __init__( decay_style, last_iter, min_lr=0.0, - final_lr=None, + decay_lr_to=None, use_checkpoint_lr_scheduler=True, override_lr_scheduler=False, use_mup=False, @@ -56,8 +56,8 @@ def __init__( assert not self.use_checkpoint_lr_scheduler, ( "both override and " "use-checkpoint are set." 
) - assert not (final_lr and decay_style!="cosine") - self.final_lr = final_lr + assert not (decay_lr_to and decay_style!="cosine") + self.decay_lr_to = decay_lr_to # Set the learning rate self.step(self.num_iters) @@ -79,8 +79,8 @@ def get_lr(self): half_period = self.end_iter - self.warmup_iter lr = ( self.start_lr * ( - self.final_lr + - (1 - self.final_lr) * + self.decay_lr_to + + (1 - self.decay_lr_to) * 0.5 * ( math.cos(math.pi * num_iters_/half_period) + 1 ) diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index bf5a64c5c..a47c6a269 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -451,9 +451,9 @@ class NeoXArgsLRScheduler(NeoXArgsTemplate): Minimum value for learning rate. The scheduler clips values below this threshold. """ - final_lr: float = 0.1 + decay_lr_to: float = None """ - If using cosine decay, the learning rate at the final step, prior to any clipping based on the value of `min_lr`. + If using cosine decay, the ratio of max lr that the lr is decayed to, prior to any clipping based on the value of `min_lr`. """ warmup: float = None diff --git a/megatron/training.py b/megatron/training.py index 65fc84d9a..58ee564d1 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -607,7 +607,7 @@ def get_learning_rate_scheduler(optimizer, neox_args): decay_style=neox_args.lr_decay_style, last_iter=init_step, min_lr=neox_args.min_lr, - final_lr=neox_args.final_lr, + decay_lr_to=neox_args.decay_lr_to, use_checkpoint_lr_scheduler=neox_args.use_checkpoint_lr_scheduler, override_lr_scheduler=neox_args.override_lr_scheduler, use_mup=neox_args.use_mup, From 757320b154767bd038de2d97658950034344081d Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Wed, 9 Aug 2023 23:36:58 +0000 Subject: [PATCH 06/64] add s3 checkpoint syncing --- megatron/checkpointing.py | 122 ++++++++++++++++++++++++++- megatron/neox_arguments/neox_args.py | 10 +++ megatron/training.py | 1 + requirements/requirements.txt | 2 + 4 files changed, 133 insertions(+), 2 deletions(-) diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index 8ad2a88c7..2c08f46c2 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -18,13 +18,17 @@ """Input/output checkpointing.""" import json +import math import os import re import shutil +import time import random import sys import numpy as np +import boto3 +import hf_transfer import torch from glob import glob @@ -137,6 +141,10 @@ def get_checkpoint_name(checkpoints_path, iteration, release=False, mp_rank=None ) +def get_checkpoint_tag(iteration: int) -> str: + return f"global_step{iteration}" + + def delete_old_checkpoints(save_dir, n_to_keep): if torch.distributed.get_rank() == 0: ckpt_dir_regex = r"global_step[\d]*" @@ -188,7 +196,7 @@ def save_ds_checkpoint(iteration, model, neox_args): sd["checkpoint_validation_logits"] = logits # checkpoint folder name - tag = f"global_step{iteration}" + tag = get_checkpoint_tag(iteration) # save checkpoint model.save_checkpoint(neox_args.save, tag=tag, client_state=sd) @@ -203,6 +211,111 @@ def save_ds_checkpoint(iteration, model, neox_args): f.write(config_data) else: json.dump(config_data, f) +def multiprocessing_starmap(func, args, num_processes=None): + """Wrapper to allow for re-usable multiprocessing pools with `spawn` context handling + Args: + func (Callable): Function to call + args (Iterable): Iterable of arguments to pass to `func` + num_processes (int, optional): Number of processes to spawn. 
Defaults to `multiprocessing.cpu_count() - 1` + """ + import multiprocessing + num_processes = num_processes or (multiprocessing.cpu_count() - 1) + with multiprocessing.get_context("spawn").Pool(processes=num_processes) as process_pool: + process_pool.starmap(func, args) + process_pool.terminate() + process_pool.join() + del process_pool + + +def _upload( + file_path: str, + s3_key: str, + chunk_size: int = 104_857_600, + max_files: int = 64, + parallel_failures: int = 63, + max_retries: int = 5, +): + """Upload local file to S3 using `hf_transfer` library + Args: + file_path (str): Local filename to upload + s3_key (str): S3 key to upload to. E.g. `s3://bucket-name/path/to/file` + chunk_size (int, optional): Chunk size to use for multipart upload. + Defaults to 100MiB = 104_857_600 + max_files (int, optional): Number of open file handles, which determines + the maximum number of parallel downloads. Defaults to 64 + parallel_failures (int, optional): Number of maximum failures of different + chunks in parallel (cannot exceed max_files). Defaults to 63 + max_retries (int, optional): Number of retries for each chunk. Defaults to 5 + """ + s3 = boto3.client('s3') + bucket = s3_key.split("s3://")[1].split("/")[0] + key = s3_key.split(bucket)[1].lstrip("/") + + # 1. Init multipart upload and obtain unique upload identifier + upload = s3.create_multipart_upload( + ACL="bucket-owner-full-control", + Bucket=bucket, + Key=key, + ) + upload_id = upload["UploadId"] + + # 2. Generate presigned URLs for each part + file_size = os.stat(file_path).st_size + urls = [] + nb_parts = math.ceil(file_size / chunk_size) + for part_number in range(1, nb_parts + 1): + params = { + "Bucket": bucket, + "Key": key, + "PartNumber": part_number, + "UploadId": upload_id, + } + urls.append( + s3.generate_presigned_url( + ClientMethod="upload_part", Params=params, ExpiresIn=86400 + ) + ) + + # 3. Upload parts in parallel + responses = hf_transfer.multipart_upload( + file_path=file_path, + parts_urls=urls, + chunk_size=chunk_size, + max_files=max_files, + parallel_failures=parallel_failures, + max_retries=max_retries, + ) + + # 4. 
Complete multipart upload request with ETag values + etag_with_parts = [] + for part_number, header in enumerate(responses): + etag = header.get("etag") + etag_with_parts.append({"ETag": etag, "PartNumber": part_number + 1}) + parts = {"Parts": etag_with_parts} + s3.complete_multipart_upload( + Bucket=bucket, Key=key, MultipartUpload=parts, UploadId=upload_id + ) + + +def upload_checkpoint(iteration, neox_args): + local_checkpoint_path = os.path.join(os.path.abspath(neox_args.save), get_checkpoint_tag(iteration)) + local_checkpoint_list = sorted(filter( + lambda x: os.path.isfile(x), + [str(p) for p in Path(local_checkpoint_path).rglob("*")], + )) + remote_checkpoint_path = os.path.join( + neox_args.s3_path, os.path.basename(neox_args.save), get_checkpoint_tag(iteration)) + remote_checkpoint_list = [ + os.path.join(remote_checkpoint_path, os.path.relpath(local_checkpoint, local_checkpoint_path)) + for local_checkpoint in local_checkpoint_list + ] + inputs = zip(local_checkpoint_list, remote_checkpoint_list, [neox_args.s3_chunk_size] * len(local_checkpoint_list)) + + print_rank_0(f"[RANK {torch.distributed.get_rank()}] Uploading checkpoint `{local_checkpoint_path}` to `{remote_checkpoint_path}`...") + start = time.time() + multiprocessing_starmap(_upload, inputs) + total_time = time.time() - start + print_rank_0(f"[RANK {torch.distributed.get_rank()}] Uploaded checkpoint `{local_checkpoint_path}` to `{remote_checkpoint_path}` in {total_time:.2f}s") def save_checkpoint(neox_args, iteration, model, optimizer, lr_scheduler): @@ -213,6 +326,11 @@ def save_checkpoint(neox_args, iteration, model, optimizer, lr_scheduler): else: raise ValueError("Must be using deepspeed to use neox") + torch.distributed.barrier() + upload_to_s3 = torch.distributed.get_rank() == 0 and neox_args.s3_path is not None + if upload_to_s3: + upload_checkpoint(iteration, neox_args) + # Wait so everyone is done (necessary) torch.distributed.barrier() if neox_args.keep_last_n_checkpoints is not None: @@ -233,7 +351,7 @@ def load_checkpoint( if neox_args.finetune: load_optim_and_scheduler = False if iteration is not None: - tag = f"global_step{iteration}" + tag = get_checkpoint_tag(iteration) else: tag = None checkpoint_name, state_dict = model.load_checkpoint( diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index e1ea16a16..a3173246e 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -793,6 +793,16 @@ class NeoXArgsTraining(NeoXArgsTemplate): Output directory to save checkpoints to. """ + s3_path: str = None + """ + Path to s3 bucket for saving checkpoints. + """ + + s3_chunk_size: int = 104_857_600 + """ + The number of bytes in each file chunk when uploading to s3. Defaults to 100MiB. 
+ """ + config_files: dict = None """ Store of original config files mapping config filename to file contents diff --git a/megatron/training.py b/megatron/training.py index 96a94a1d0..9e03262f3 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -30,6 +30,7 @@ from deepspeed.runtime.data_pipeline.curriculum_scheduler import CurriculumScheduler import numpy as np +from CPCargo import Heartbeat from megatron.utils import ( Timers, init_wandb, diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 3f3a70882..443f162e6 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -3,6 +3,7 @@ git+https://github.com/EleutherAI/DeeperSpeed.git#egg=deepspeed ftfy>=6.0.1 git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836 huggingface_hub>=0.11.0 +hf-transfer>=0.1.3 lm_eval>=0.3.0 mpi4py>=3.0.3 numpy>=1.22.0 @@ -13,3 +14,4 @@ six tiktoken>=0.1.2 tokenizers>=0.12.1 transformers>=4.24.0 +git+https://github.com/samikama/CPCargo@main From 8a11029f37cf5a42be44cb8f9cbeb8a91c703f5f Mon Sep 17 00:00:00 2001 From: zhangir-azerbayev Date: Thu, 10 Aug 2023 18:16:11 -0400 Subject: [PATCH 07/64] rotary embedding in fp32 --- megatron/model/positional_embeddings.py | 44 ++++++++++++++----------- megatron/model/transformer.py | 5 ++- 2 files changed, 28 insertions(+), 21 deletions(-) diff --git a/megatron/model/positional_embeddings.py b/megatron/model/positional_embeddings.py index 68815075a..c91a59db2 100644 --- a/megatron/model/positional_embeddings.py +++ b/megatron/model/positional_embeddings.py @@ -36,31 +36,35 @@ def forward(self, x, seq_dim=1): class RotaryEmbedding(torch.nn.Module): - def __init__(self, dim, base=10000, precision=torch.half): + def __init__(self, dim, max_seq_len, base=10000, precision=torch.half): super().__init__() + self.precision = precision + self.max_seq_len = max_seq_len + + # precompute cos_cached, sin_cached in fp32 inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim)) self.register_buffer("inv_freq", inv_freq) - self.seq_len_cached = None - self.cos_cached = None - self.sin_cached = None - self.precision = precision + + t = torch.arange(max_seq_len).type_as(self.inv_freq) + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + emb = torch.cat((freqs, freqs), dim=-1) + + self.cos_cached = torch.nn.parameter( + emb.cos()[:, None, None, :].to(precision), + requires_grad=False, + ) + self.sin_cached = torch.nn.parameter( + emb.sin()[:, None, None, :].to(precision), + requires_grad=False, + ) def forward(self, x, seq_dim=1, seq_len=None): - if seq_len is None: - seq_len = x.shape[seq_dim] - if seq_len != self.seq_len_cached: - self.seq_len_cached = seq_len - t = torch.arange(seq_len, device=x.device).type_as(self.inv_freq) - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - emb = torch.cat((freqs, freqs), dim=-1).to(x.device) - if self.precision == torch.bfloat16: - emb = emb.float() - self.cos_cached = emb.cos()[:, None, None, :] - self.sin_cached = emb.sin()[:, None, None, :] - if self.precision == torch.bfloat16: - self.cos_cached = self.cos_cached.bfloat16() - self.sin_cached = self.sin_cached.bfloat16() - return self.cos_cached, self.sin_cached + seq_len = x.shape[seq_dim] + assert seq_len <= self.max_seq_len + + cos_trunced = self.cos_cached[:seq_len].to(x.device) + sin_trunced = self.sin_cached[:seq_len].to(x.device) + return cos_trunced, sin_trunced # rotary pos emb helpers: diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 
4e81b70b6..e5d198d1f 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -327,7 +327,10 @@ def __init__( else self.hidden_size_per_attention_head ) self.rotary_emb = RotaryEmbedding( - dim, base=neox_args.rotary_emb_base, precision=neox_args.params_dtype + dim, + base=neox_args.rotary_emb_base, + max_seq_len=neox_args.seq_len, + precision=neox_args.params_dtype ) else: self.rotary_emb = None From d869e47c3507e3e27ff736a9e48e6072f586bd1a Mon Sep 17 00:00:00 2001 From: zhangir-azerbayev Date: Thu, 10 Aug 2023 18:33:53 -0400 Subject: [PATCH 08/64] fix for seq_len < max_seq_len --- megatron/model/positional_embeddings.py | 35 ++++++++++++++----------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/megatron/model/positional_embeddings.py b/megatron/model/positional_embeddings.py index c91a59db2..2f077ffa9 100644 --- a/megatron/model/positional_embeddings.py +++ b/megatron/model/positional_embeddings.py @@ -40,31 +40,36 @@ def __init__(self, dim, max_seq_len, base=10000, precision=torch.half): super().__init__() self.precision = precision self.max_seq_len = max_seq_len + self.base = base # precompute cos_cached, sin_cached in fp32 - inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim)) + cos_cached, sin_cached, inv_freq = self._prepare_cache(max_seq_len, precision, base) + self.register_buffer("inv_freq", inv_freq) + self.cos_cached = torch.nn.Parameter(cos_cached, requires_grad=False) + self.sin_cached = torch.nn.Parameter(sin_cached, requires_grad=False) + + def _prepare_cache(seq_len, precision, base): + # precompute cos_cached, sin_cached in fp32 + inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim)) - t = torch.arange(max_seq_len).type_as(self.inv_freq) - freqs = torch.einsum("i,j->ij", t, self.inv_freq) + t = torch.arange(seq_len).type_as(inv_freq) + freqs = torch.einsum("i,j->ij", t, inv_freq) emb = torch.cat((freqs, freqs), dim=-1) - self.cos_cached = torch.nn.parameter( - emb.cos()[:, None, None, :].to(precision), - requires_grad=False, - ) - self.sin_cached = torch.nn.parameter( - emb.sin()[:, None, None, :].to(precision), - requires_grad=False, - ) + cos_cached = emb.cos()[:, None, None, :] + sin_cached = emb.sin()[:, None, None, :] + + return cos_cached.to(precision), sin_cached.to(precision), inv_freq.to(precision) def forward(self, x, seq_dim=1, seq_len=None): seq_len = x.shape[seq_dim] assert seq_len <= self.max_seq_len - - cos_trunced = self.cos_cached[:seq_len].to(x.device) - sin_trunced = self.sin_cached[:seq_len].to(x.device) - return cos_trunced, sin_trunced + if seq_len != self.max_seq_len: + cos, sin, _ = self._prepare_cache(seq_len, self.precision, self.base) + return cos.to(x.device), sin.to(x.device) + else: + return self.cos_cached.to(x.device), self.sin_cached(x.device) # rotary pos emb helpers: From 52ba5e44b645c7f42098bffb16fcc556b1b538f7 Mon Sep 17 00:00:00 2001 From: zhangir-azerbayev Date: Fri, 11 Aug 2023 17:25:33 -0600 Subject: [PATCH 09/64] some fixes, still not working --- megatron/model/positional_embeddings.py | 13 +++++++------ megatron/model/transformer.py | 2 +- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/megatron/model/positional_embeddings.py b/megatron/model/positional_embeddings.py index 2f077ffa9..dd9ecc756 100644 --- a/megatron/model/positional_embeddings.py +++ b/megatron/model/positional_embeddings.py @@ -41,17 +41,18 @@ def __init__(self, dim, max_seq_len, base=10000, precision=torch.half): self.precision = precision self.max_seq_len = max_seq_len self.base 
= base + self.dim = dim # precompute cos_cached, sin_cached in fp32 cos_cached, sin_cached, inv_freq = self._prepare_cache(max_seq_len, precision, base) self.register_buffer("inv_freq", inv_freq) - self.cos_cached = torch.nn.Parameter(cos_cached, requires_grad=False) - self.sin_cached = torch.nn.Parameter(sin_cached, requires_grad=False) - - def _prepare_cache(seq_len, precision, base): + self.cos_cached = cos_cached + self.sin_cached = sin_cached + + def _prepare_cache(self, seq_len, precision, base): # precompute cos_cached, sin_cached in fp32 - inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim)) + inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float() / self.dim)) t = torch.arange(seq_len).type_as(inv_freq) freqs = torch.einsum("i,j->ij", t, inv_freq) @@ -69,7 +70,7 @@ def forward(self, x, seq_dim=1, seq_len=None): cos, sin, _ = self._prepare_cache(seq_len, self.precision, self.base) return cos.to(x.device), sin.to(x.device) else: - return self.cos_cached.to(x.device), self.sin_cached(x.device) + return self.cos_cached.to(x.device), self.sin_cached.to(x.device) # rotary pos emb helpers: diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index e5d198d1f..9723d0312 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -329,7 +329,7 @@ def __init__( self.rotary_emb = RotaryEmbedding( dim, base=neox_args.rotary_emb_base, - max_seq_len=neox_args.seq_len, + max_seq_len=neox_args.seq_length, precision=neox_args.params_dtype ) else: From dfedf0511d066b278122a0290a29edc084fd3372 Mon Sep 17 00:00:00 2001 From: zhangir-azerbayev Date: Fri, 11 Aug 2023 17:29:09 -0600 Subject: [PATCH 10/64] ?' : --- megatron/model/positional_embeddings.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/model/positional_embeddings.py b/megatron/model/positional_embeddings.py index dd9ecc756..50ee68985 100644 --- a/megatron/model/positional_embeddings.py +++ b/megatron/model/positional_embeddings.py @@ -67,8 +67,8 @@ def forward(self, x, seq_dim=1, seq_len=None): seq_len = x.shape[seq_dim] assert seq_len <= self.max_seq_len if seq_len != self.max_seq_len: - cos, sin, _ = self._prepare_cache(seq_len, self.precision, self.base) - return cos.to(x.device), sin.to(x.device) + y, z, _ = self._prepare_cache(seq_len, self.precision, self.base) + return y.to(x.device), z.to(x.device) else: return self.cos_cached.to(x.device), self.sin_cached.to(x.device) From fcbd8a117d3cc589fd619cab03f34513226862a7 Mon Sep 17 00:00:00 2001 From: zhangir-azerbayev Date: Sun, 13 Aug 2023 01:39:16 -0600 Subject: [PATCH 11/64] fix bugs; evaluate on step 0 --- megatron/model/positional_embeddings.py | 11 +++++++---- megatron/training.py | 12 ++++++++++++ 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/megatron/model/positional_embeddings.py b/megatron/model/positional_embeddings.py index 50ee68985..28e64fc5f 100644 --- a/megatron/model/positional_embeddings.py +++ b/megatron/model/positional_embeddings.py @@ -60,16 +60,19 @@ def _prepare_cache(self, seq_len, precision, base): cos_cached = emb.cos()[:, None, None, :] sin_cached = emb.sin()[:, None, None, :] - + return cos_cached.to(precision), sin_cached.to(precision), inv_freq.to(precision) def forward(self, x, seq_dim=1, seq_len=None): - seq_len = x.shape[seq_dim] + if seq_len is None: + seq_len = x.shape[seq_dim] assert seq_len <= self.max_seq_len if seq_len != self.max_seq_len: - y, z, _ = self._prepare_cache(seq_len, self.precision, self.base) - return y.to(x.device), 
z.to(x.device) + cos_new, sin_new, _ = self._prepare_cache(seq_len, self.precision, self.base) + # print(f"##############\nseq_len!=max_seq_len\ninput: {x.shape}\ncos_new: {cos_new.shape}\nsin_new: {sin_new.shape}\nseq_dim:{seq_dim}") + return cos_new.to(x.device), sin_new.to(x.device) else: + # print(f"##############\nseq_len==max_seq_len\ninput: {x.shape}\ncos_cached: {self.cos_cached.shape}\nsin_cached: {self.cos_cached.shape}") return self.cos_cached.to(x.device), self.sin_cached.to(x.device) diff --git a/megatron/training.py b/megatron/training.py index 58ee564d1..1257a2793 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -778,6 +778,18 @@ def train( valid_data_iterator, ): """Train the model function.""" + # evaluate on step 0 + prefix = "iteration 0" + evaluate_and_print_results( + neox_args=neox_args, + prefix=prefix, + forward_step_func=forward_step, + data_iterator=valid_data_iterator, + model=model, + iteration=0, + verbose=False, + timers=timers, + ) # Turn on training mode which enables dropout. model.train() From 334bbd5815efe6e1472a4c90bdd8e30d1581fb6e Mon Sep 17 00:00:00 2001 From: zhangir-azerbayev Date: Wed, 23 Aug 2023 17:40:35 -0400 Subject: [PATCH 12/64] first attempt at gqa --- megatron/model/transformer.py | 103 +++++++++++++++++++++------ megatron/neox_arguments/neox_args.py | 12 +++- 2 files changed, 93 insertions(+), 22 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 958cff3e3..8a75d51a7 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -284,15 +284,34 @@ def __init__( ) self.pos_emb = neox_args.pos_emb - # Strided linear layer. - self.query_key_value = mpu.ColumnParallelLinear( - neox_args=neox_args, - input_size=neox_args.hidden_size, - output_size=3 * neox_args.hidden_size, - gather_output=False, - init_method=init_method, - bias=neox_args.use_bias_in_attn_linear, - ) + self.attention_type = neox_args.attention_type + if self.attention_type != "multihead": + self.num_kv_heads_per_partition = mpu.divide(neox_args.num_kv_heads, world_size) # TODO: we want to clone single-kv heads across ranks... + self.kv_hidden_size = neox_args.num_kv_heads * self.hidden_size_per_attention_head + else: + self.num_kv_heads_per_partition = None + self.kv_hidden_size = None + + if self.attention_type == "multihead": + # Strided linear layer. 
+ self.query_key_value = mpu.ColumnParallelLinear( + neox_args=neox_args, + input_size=neox_args.hidden_size, + output_size=3 * neox_args.hidden_size, + gather_output=False, + init_method=init_method, + bias=neox_args.use_bias_in_attn_linear, + ) + else: + self.query_key_value = mpu.ColumnParallelLinear( + neox_args=neox_args, + input_size=neox_args.hidden_size, + output_size=neox_args.hidden_size + 2 * self.kv_hidden_size, + gather_output=False, + init_method=init_method, + bias=neox_args.use_bias_in_attn_linear, + ) + coeff = None self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) @@ -599,20 +618,62 @@ def forward(self, hidden_states, attention_mask, layer_past=None): # Query, Key, and Value # ===================== - # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] - mixed_x_layer, _ = self.query_key_value(hidden_states) + if self.attention_type=="multihead": + # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] + mixed_x_layer, _ = self.query_key_value(hidden_states) - # [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn] - new_tensor_shape = mixed_x_layer.size()[:-1] + ( - self.num_attention_heads_per_partition, - 3 * self.hidden_size_per_attention_head, - ) - mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) + # [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn] + new_tensor_shape = mixed_x_layer.size()[:-1] + ( + self.num_attention_heads_per_partition, + 3 * self.hidden_size_per_attention_head, + ) + mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) + + # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] + (query_layer, key_layer, value_layer) = mpu.split_tensor_along_last_dim( + mixed_x_layer, 3 + ) + else: + # Attention heads [sq, b, h] --> [sq, b, (np + 2 * num. (query / num. kv)) * hn)] + mixed_x_layer, _ = self.query_key_value(hidden_states) + + # TODO: instead split here into [sq, b, np * hn], 2 [sq, b, np/kv_ratio * hn] and then reshape? + # TODO: check equivalence (in the multihead case(?)) + # TODO: refactor this out into an mpu.utils fn like split_tensor_along_last_dim + (query_layer, key_layer, value_layer) = [ + x.contiguous() for x in torch.split( + mixed_x_layer, [ + self.num_attention_heads_per_partition * self.hidden_size_per_attention_head, + self.num_kv_heads_per_partition * self.hidden_size_per_attention_head, + self.num_kv_heads_per_partition * self.hidden_size_per_attention_head + ], + dim=mixed_x_layer.dim() - 1 + ) + ] + + # [sq, b, (np * (1 + 2 * num. (query / num. 
kv)) * hn)] --> [sq, b, np, (1 + 2 * nq / nkv) * hn] + #new_tensor_shape = mixed_x_layer.size()[:-1] + ( + # self.num_attention_heads_per_partition + ???, + # self.hidden_size_per_attention_head, + + # [sq, b, np * hn] --> [sq, b, np, hn] + new_query_shape = (query_layer.size(0), query_layer.size(1), self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) + + query_layer = query_layer.view(*new_query_shape) + + new_kv_shape = new_query_shape[:2] + (self.num_kv_heads_per_partition, self.hidden_size_per_attention_head,) + + key_layer = key_layer.view(*new_kv_shape) + + value_layer = value_layer.view(*new_kv_shape) + + # mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) + + ## [sq, b, np, 3 * hn + #(query_layer, key_layer, value_layer) = mpu.split_tensor_along_last_dim( + # mixed_x_layer, 3 + #) - # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] - (query_layer, key_layer, value_layer) = mpu.split_tensor_along_last_dim( - mixed_x_layer, 3 - ) if exists(self.rotary_emb): if exists(self.rotary_ndims): diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index d7a14b458..b707724e8 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -107,7 +107,17 @@ class NeoXArgsModel(NeoXArgsTemplate): num_attention_heads: int = None """ - Number of transformer attention heads. + Number of transformer attention heads. Used for number of query heads only, if num_kv_heads is set. + """ + + num_kv_heads: int = None + """ + Number of transformer key/value attention heads. Need not be set if using attention_type `multihead` or `multiquery`. + """ + + attention_type: Literal["multihead", "multiquery", "groupedquery"] = "multihead" + """ + Whether to use multi-head, multi-query (https://arxiv.org/pdf/1911.02150.pdf), or grouped-query (https://arxiv.org/pdf/2305.13245.pdf) attention. """ seq_length: int = None From 3c8616fece37cbe7aad6466ec23cb51e28df9779 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Fri, 8 Sep 2023 16:12:54 +0000 Subject: [PATCH 13/64] gqa works in kv_heads==query_heads case --- megatron/model/transformer.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 8a75d51a7..ee9c8c0d5 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -289,8 +289,8 @@ def __init__( self.num_kv_heads_per_partition = mpu.divide(neox_args.num_kv_heads, world_size) # TODO: we want to clone single-kv heads across ranks... self.kv_hidden_size = neox_args.num_kv_heads * self.hidden_size_per_attention_head else: - self.num_kv_heads_per_partition = None - self.kv_hidden_size = None + self.num_kv_heads_per_partition = self.num_attention_heads_per_partition #None + self.kv_hidden_size = neox_args.hidden_size #None if self.attention_type == "multihead": # Strided linear layer. @@ -640,12 +640,13 @@ def forward(self, hidden_states, attention_mask, layer_past=None): # TODO: instead split here into [sq, b, np * hn], 2 [sq, b, np/kv_ratio * hn] and then reshape? 
# TODO: check equivalence (in the multihead case(?)) # TODO: refactor this out into an mpu.utils fn like split_tensor_along_last_dim + mixed_x_layer = mixed_x_layer.reshape((mixed_x_layer.shape[0], mixed_x_layer.shape[1], self.num_attention_heads_per_partition, self.hidden_size_per_attention_head * (1 + 2 * (self.num_kv_heads_per_partition // self.num_attention_heads_per_partition)))) (query_layer, key_layer, value_layer) = [ x.contiguous() for x in torch.split( mixed_x_layer, [ - self.num_attention_heads_per_partition * self.hidden_size_per_attention_head, - self.num_kv_heads_per_partition * self.hidden_size_per_attention_head, - self.num_kv_heads_per_partition * self.hidden_size_per_attention_head + self.hidden_size_per_attention_head, + (self.num_kv_heads_per_partition // self.num_attention_heads_per_partition) * self.hidden_size_per_attention_head, + (self.num_kv_heads_per_partition // self.num_attention_heads_per_partition) * self.hidden_size_per_attention_head ], dim=mixed_x_layer.dim() - 1 ) From e59c873ee779df2d7f182deb6ad34f290a077ea4 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Fri, 15 Sep 2023 19:45:07 +0000 Subject: [PATCH 14/64] gqa working --- megatron/model/transformer.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index ee9c8c0d5..aaeefb5ec 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -514,10 +514,10 @@ def flash_attention(self, query_layer, key_layer, value_layer): # [sk, b, np, hn] -> [b, sk, np, hn] -> [b * sk, 1, np, hn] key_layer = key_layer.transpose(0, 1).reshape( - output_size[0], output_size[3], output_size[1], -1 + output_size[0], output_size[3], self.num_kv_heads_per_partition, -1 ) value_layer = value_layer.transpose(0, 1).reshape( - output_size[0], output_size[3], output_size[1], -1 + output_size[0], output_size[3], self.num_kv_heads_per_partition, -1 ) batch_size = output_size[0] @@ -544,7 +544,9 @@ def flash_attention(self, query_layer, key_layer, value_layer): query_layer = query_layer.transpose(0, 1).reshape( output_size[0], output_size[2], output_size[1], -1 ) - + + #print(key_layer.shape) + #print(value_layer.shape) if not self.training: q_shape = query_layer.shape @@ -640,13 +642,13 @@ def forward(self, hidden_states, attention_mask, layer_past=None): # TODO: instead split here into [sq, b, np * hn], 2 [sq, b, np/kv_ratio * hn] and then reshape? 
# TODO: check equivalence (in the multihead case(?)) # TODO: refactor this out into an mpu.utils fn like split_tensor_along_last_dim - mixed_x_layer = mixed_x_layer.reshape((mixed_x_layer.shape[0], mixed_x_layer.shape[1], self.num_attention_heads_per_partition, self.hidden_size_per_attention_head * (1 + 2 * (self.num_kv_heads_per_partition // self.num_attention_heads_per_partition)))) + mixed_x_layer = mixed_x_layer.reshape((mixed_x_layer.shape[0], mixed_x_layer.shape[1], self.num_attention_heads_per_partition, int(self.hidden_size_per_attention_head * (1 + 2 * (self.num_kv_heads_per_partition / self.num_attention_heads_per_partition))))) (query_layer, key_layer, value_layer) = [ x.contiguous() for x in torch.split( mixed_x_layer, [ self.hidden_size_per_attention_head, - (self.num_kv_heads_per_partition // self.num_attention_heads_per_partition) * self.hidden_size_per_attention_head, - (self.num_kv_heads_per_partition // self.num_attention_heads_per_partition) * self.hidden_size_per_attention_head + int((self.num_kv_heads_per_partition / self.num_attention_heads_per_partition) * self.hidden_size_per_attention_head), + int((self.num_kv_heads_per_partition / self.num_attention_heads_per_partition) * self.hidden_size_per_attention_head) ], dim=mixed_x_layer.dim() - 1 ) @@ -662,7 +664,7 @@ def forward(self, hidden_states, attention_mask, layer_past=None): query_layer = query_layer.view(*new_query_shape) - new_kv_shape = new_query_shape[:2] + (self.num_kv_heads_per_partition, self.hidden_size_per_attention_head,) + new_kv_shape = (key_layer.size(0), key_layer.size(1), self.num_kv_heads_per_partition, self.hidden_size_per_attention_head,) key_layer = key_layer.view(*new_kv_shape) From 801192e8fe3715c3a35b76000c12a06032e166b8 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Fri, 15 Sep 2023 19:45:24 +0000 Subject: [PATCH 15/64] workaround for FSX quota --- megatron/checkpointing.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index 2c08f46c2..74a0ec491 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -327,19 +327,19 @@ def save_checkpoint(neox_args, iteration, model, optimizer, lr_scheduler): raise ValueError("Must be using deepspeed to use neox") torch.distributed.barrier() - upload_to_s3 = torch.distributed.get_rank() == 0 and neox_args.s3_path is not None - if upload_to_s3: - upload_checkpoint(iteration, neox_args) - # Wait so everyone is done (necessary) - torch.distributed.barrier() if neox_args.keep_last_n_checkpoints is not None: delete_old_checkpoints(neox_args.save, neox_args.keep_last_n_checkpoints) # Wait so everyone is done (not necessary) torch.distributed.barrier() + upload_to_s3 = torch.distributed.get_rank() == 0 and neox_args.s3_path is not None + if upload_to_s3: + upload_checkpoint(iteration, neox_args) - + # Wait so everyone is done (necessary) + torch.distributed.barrier() + def load_checkpoint( neox_args, model, optimizer, lr_scheduler, inference=False, iteration=None ): From e52b7495abf2cb2ab75ccb0ed2044e3e5d08eecd Mon Sep 17 00:00:00 2001 From: zhangir-azerbayev Date: Thu, 5 Oct 2023 17:42:13 -0400 Subject: [PATCH 16/64] update with llemma --- 34b_launch_script.sh | 39 + README.md | 415 +--- configs/1-3B.yml | 91 - configs/125M-json.yml | 78 - configs/125M.yml | 94 - configs/13B.yml | 92 - configs/175B.yml | 90 - configs/19M.yml | 95 - configs/2-7B.yml | 91 - configs/20B.yml | 111 - configs/350M.yml | 90 - configs/49M.yml | 91 - configs/6-7B.yml | 91 - 
configs/760M.yml | 91 - configs/800M.yml | 84 - configs/README.md | 314 --- configs/autotuning_configs/small_tune.json | 78 - configs/autotuning_configs/tune.json | 72 - configs/autotuning_configs/tune_1-3B.json | 86 - configs/autotuning_configs/tune_6-7B.json | 77 - configs/bf16_125M.yml | 78 - configs/bnb_125M.yml | 85 - configs/cpu_mock_config.yml | 5 - configs/data_mixture.yml | 6 + configs/eleutherai_cluster.yml | 29 - configs/finetuning_configs/6-9B.yml | 89 - configs/gen_docs.py | 93 - configs/gmlp_small.yml | 72 - configs/llama/13B.yml | 26 - configs/llama/30B.yml | 26 - configs/llama/65B.yml | 26 - configs/llama/7B.yml | 26 - configs/llemma_34b.yml | 108 + configs/llemma_7b.yml | 105 + configs/local_setup.yml | 30 - configs/neox_arguments.md | 2023 ----------------- configs/pythia/1-4B.yml | 85 - configs/pythia/12B.yml | 84 - configs/pythia/160M.yml | 85 - configs/pythia/1B.yml | 86 - configs/pythia/2-8B.yml | 87 - configs/pythia/410M.yml | 85 - configs/pythia/6-9B.yml | 84 - configs/pythia/70M.yml | 85 - configs/slurm_125M.yml | 64 - configs/slurm_local.json | 11 - configs/slurm_local.yml | 12 - configs/sparse.yml | 15 - configs/text_generation.yml | 21 - requirements.txt | 116 + requirements/requirements-dev.txt | 7 - requirements/requirements-flashattention.txt | 1 - requirements/requirements-onebitadam.txt | 1 - requirements/requirements-sparseattention.txt | 1 - requirements/requirements-tensorboard.txt | 1 - requirements/requirements-wandb.txt | 1 - requirements/requirements.txt | 17 - 57 files changed, 410 insertions(+), 5636 deletions(-) create mode 100644 34b_launch_script.sh delete mode 100644 configs/1-3B.yml delete mode 100644 configs/125M-json.yml delete mode 100644 configs/125M.yml delete mode 100644 configs/13B.yml delete mode 100644 configs/175B.yml delete mode 100644 configs/19M.yml delete mode 100644 configs/2-7B.yml delete mode 100644 configs/20B.yml delete mode 100644 configs/350M.yml delete mode 100644 configs/49M.yml delete mode 100644 configs/6-7B.yml delete mode 100644 configs/760M.yml delete mode 100644 configs/800M.yml delete mode 100644 configs/README.md delete mode 100644 configs/autotuning_configs/small_tune.json delete mode 100644 configs/autotuning_configs/tune.json delete mode 100644 configs/autotuning_configs/tune_1-3B.json delete mode 100644 configs/autotuning_configs/tune_6-7B.json delete mode 100644 configs/bf16_125M.yml delete mode 100644 configs/bnb_125M.yml delete mode 100644 configs/cpu_mock_config.yml create mode 100644 configs/data_mixture.yml delete mode 100644 configs/eleutherai_cluster.yml delete mode 100755 configs/finetuning_configs/6-9B.yml delete mode 100644 configs/gen_docs.py delete mode 100644 configs/gmlp_small.yml delete mode 100644 configs/llama/13B.yml delete mode 100644 configs/llama/30B.yml delete mode 100644 configs/llama/65B.yml delete mode 100644 configs/llama/7B.yml create mode 100644 configs/llemma_34b.yml create mode 100644 configs/llemma_7b.yml delete mode 100644 configs/local_setup.yml delete mode 100644 configs/neox_arguments.md delete mode 100755 configs/pythia/1-4B.yml delete mode 100755 configs/pythia/12B.yml delete mode 100755 configs/pythia/160M.yml delete mode 100755 configs/pythia/1B.yml delete mode 100755 configs/pythia/2-8B.yml delete mode 100755 configs/pythia/410M.yml delete mode 100755 configs/pythia/6-9B.yml delete mode 100755 configs/pythia/70M.yml delete mode 100644 configs/slurm_125M.yml delete mode 100644 configs/slurm_local.json delete mode 100644 configs/slurm_local.yml delete mode 100644 
configs/sparse.yml delete mode 100644 configs/text_generation.yml create mode 100644 requirements.txt delete mode 100644 requirements/requirements-dev.txt delete mode 100644 requirements/requirements-flashattention.txt delete mode 100644 requirements/requirements-onebitadam.txt delete mode 100644 requirements/requirements-sparseattention.txt delete mode 100644 requirements/requirements-tensorboard.txt delete mode 100644 requirements/requirements-wandb.txt delete mode 100644 requirements/requirements.txt diff --git a/34b_launch_script.sh b/34b_launch_script.sh new file mode 100644 index 000000000..3718c5862 --- /dev/null +++ b/34b_launch_script.sh @@ -0,0 +1,39 @@ +#!/bin/bash +#... your SLURM arguments here +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=8 +#SBATCH --cpus-per-task=12 +#SBATCH --gres=gpu:8 +#SBATCH --output=34b_replication_%j.out +#SBATCH --error=34b_replication_%j.out +#SBATCH --exclusive +#SBATCH --open-mode=append +#SBATCH --requeue + +# setup the environment using the script we created before +source /fsx/proj-mathlm/conda_setup_deeperspeed.sh +#source /fsx/quentin/setup.sh + +ds_report + +# set distributed env variable flags such as NCCL_DEBUG here + +export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"` +export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) +export MASTER_PORT=12802 +export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l` + +# Move to the gpt-neox install +TRAIN_PATH=/path/to/gpt-neox +cd $TRAIN_PATH + +# Write the hostfile for this job here +# Should write to a hostfile that contains lines of format ` slots=` +bash /helper/script/write_hostfile.sh +export DLTS_HOSTFILE=path/to/hostfile/hosts_$SLURM_JOBID + + +# launch distributed job. If using `"deepspeed_slurm": true` and `"launcher": "slurm"` on a SLURM cluster, +# then NeoX will handle the creation of a distributed run across 256 gpus. +python $TRAIN_PATH/deepy.py $TRAIN_PATH/train.py \ + --conf_dir /path/to/math-lm/pretraining llemma_34b.yml data_mixture.yml \ No newline at end of file diff --git a/README.md b/README.md index c96692c60..56c21b08c 100644 --- a/README.md +++ b/README.md @@ -1,416 +1,73 @@ -[![GitHub issues](https://img.shields.io/github/issues/EleutherAI/gpt-neox)](https://github.com/EleutherAI/gpt-neox/issues) -[Weights & Biases monitoring](https://wandb.ai/eleutherai/neox) +# LLeMA Pretraining -# GPT-NeoX +This is a modified version of the `EleutherAI/GPT-NeoX` repository used for the Llemma project. This branch diverged from `main` at commit `009018e`. This branch implements the following features that are not present in `009018e` (some of these features may have subsequently been merged into `main`): +- [FlashAttention-2](https://arxiv.org/abs/2307.08691) +- Grouped Query Attention +- A numerical precision fix for RoPE +- Saving checkpoints to Amazon S3. -This repository records [EleutherAI](https://www.eleuther.ai)'s library for training large-scale language models on GPUs. Our current framework is based on NVIDIA's [Megatron Language Model](https://github.com/NVIDIA/Megatron-LM) and has been augmented with techniques from [DeepSpeed](https://www.deepspeed.ai) as well as some novel optimizations. We aim to make this repo a centralized and accessible place to gather techniques for training large-scale autoregressive language models, and accelerate research into large-scale training. +The remaining portion of this `README` contains instructions to replicate pretraining of the LLeMA models. 
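As a quick reference, the features above are controlled through the usual NeoX YAML configs. Below is a minimal sketch of the new keys added on this branch (names come from `megatron/neox_arguments/neox_args.py`; the values are illustrative placeholders, not the Llemma settings — see `configs/llemma_7b.yml` and `configs/llemma_34b.yml` for the configurations actually used):

```yaml
{
  # grouped-query attention: query heads and key/value heads are set separately
  "attention_type": "groupedquery",
  "num_attention_heads": 32,
  "num_kv_heads": 8,

  # Chinchilla-style cosine decay to a fraction of the peak LR,
  # with warmup specified in iterations rather than as a fraction of training
  "lr_decay_style": "cosine",
  "decay_lr_to": 0.1,
  "warmup_iters": 500,

  # upload each saved checkpoint to S3 from rank 0
  "s3_path": "s3://your-bucket/path/to/checkpoints",
  "s3_chunk_size": 104857600,
}
```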
-For those looking for a TPU-centric codebase, we recommend [Mesh Transformer JAX](https://github.com/kingoflolz/mesh-transformer-jax). +Training was performed across 256 A100 GPUs. We include configuration files and sample SLURM job script for the library to replicate training on a SLURM-managed cluster. -**If you are not looking to train models with billions of parameters from scratch, this is likely the wrong library to use. For generic inference needs, we recommend you use the Hugging Face `transformers` library instead which supports GPT-NeoX models.** -## GPT-NeoX 2.0 +## Replicating Training -Prior to 3/9/2023, GPT-NeoX relied on [DeeperSpeed](https://github.com/EleutherAI/DeeperSpeed), which was based on an old version of DeepSpeed (0.3.15). In order to migrate to the latest upstream DeepSpeed version while allowing users to access the old versions of GPT-NeoX and DeeperSpeed, we have introduced two versioned releases for both libraries: -- Version 1.0 of [GPT-NeoX](https://github.com/EleutherAI/gpt-neox/releases/tag/v1.0) and [DeeperSpeed](https://github.com/EleutherAI/DeeperSpeed/releases/tag/v1.0) maintain snapshots of the old stable versions that [GPT-NeoX-20B](https://arxiv.org/abs/2204.06745) and the [Pythia Suite](https://github.com/EleutherAI/pythia) were trained on. -- Version 2.0 of [GPT-NeoX](https://github.com/EleutherAI/gpt-neox/releases/tag/v2.0) and [DeeperSpeed](https://github.com/EleutherAI/DeeperSpeed/releases/tag/v2.0) are the latest versions built on the latest DeepSpeed, and will be maintained going forward. +### Set up environment -# Contents - -* [Quick Start](#quick-start) - * [Environment and Dependencies](#environment-and-dependencies) - * [Usage](#usage) -* [Configuration](#configuration) -* [Datasets](#datasets) - * [Preconfigured Datasets](#preconfigured-datasets) - * [Using Custom Data](#using-custom-data) -* [Training and Finetuning](#training-and-finetuning) - * [Select Pretrained Models](#pretrained-models) - * [GPT-NeoX-20B](#gpt-neox-20b) - * [Pythia](#pythia) - * [Polyglot](#polyglot) -* [Inference](#inference) -* [Evaluation](#evaluation) -* [Exporting to Hugging Face](#exporting-to-hugging-face) -* [Monitoring](#monitoring) - * [Weights & Biases](#wandb) - * [TensorBoard](#tensorboard) -* [Administrative Notes](#administrative-notes) - * [Citing GPT-NeoX](#citing-gpt-neox) - * [Licensing](#licensing) - * [Publications](#publications) - * [Acknowledgements](#acknowledgements) - -# Quick Start - -## Environment and Dependencies - -### Host Setup - -First make sure you are in an environment with Python 3.8 with an appropriate version of PyTorch 1.8 or later installed. **Note:** Some of the libraries that GPT-NeoX depends on have not been updated to be compatible with Python 3.10+. Python 3.9 appears to work, but this codebase has been developed and tested for Python 3.8. - -To install the remaining basic dependencies, run: +We provide a file containing a dump of our training environment. +You can install all required packages via ```bash -pip install -r requirements/requirements.txt -pip install -r requirements/requirements-wandb.txt -pip install -r requirements/requirements-tensorboard.txt -python ./megatron/fused_kernels/setup.py install # optional if not using fused kernels -``` - -from the repository root. 
- - - -### TensorBoard -======= -### Flash Attention - -To use [Flash-Attention](https://github.com/HazyResearch/flash-attention), install the additional dependencies in `./requirements/requirements-flashattention.txt` and set the attention type in your configuration accordingly (see [configs](./configs/)). This can provide significant speed-ups over regular attention on certain GPU architectures, including Ampere GPUs (such as A100s); see the repository for more details. - - -### Containerized Setup - -We also provide a Dockerfile if you prefer to run NeoX in a container. To use this option, first build an image named `gpt-neox` from the repository root directory with `docker build -t gpt-neox -f Dockerfile .`. We also host pre-built images on [Docker Hub at `leogao2/gpt-neox`](https://hub.docker.com/r/leogao2/gpt-neox/tags). - -You can then run a container based on this image. For instance, the below snippet mounts the cloned repository (`gpt-neox`) directory to `/gpt-neox` in the container and uses [nvidia-docker](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) to make four GPUs (numbers 0-3) accessible to the container. [As noted by the NCCL documentation](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/troubleshooting.html#sharing-data), both `--shm-size=1g` and `--ulimit memlock=-1` are important to prevent Docker from allocating too little shared memory. -``` -nvidia-docker run --rm -it -e NVIDIA_VISIBLE_DEVICES=0,1,2,3 --shm-size=1g --ulimit memlock=-1 --mount type=bind,src=$PWD,dst=/gpt-neox gpt-neox +pip install -r requirements.txt ``` +Make sure you are installing https://github.com/EleutherAI/DeeperSpeed/tree/new-fix for your DeepSpeed version and install fused kernels for GPT-NeoX via `python ./megatron/fused_kernels/setup.py install` from within your GPT-NeoX install. -## Usage -All functionality (inference included), should be launched using `deepy.py`, a wrapper around the `deepspeed` launcher. +### Converting Llama 2 checkpoints into NeoX format -We currently offer three main functions: -1. `train.py` is used for training and finetuning models. -2. `evaluate.py` is used to evaluate a trained model using the [language model evaluation harness](https://github.com/EleutherAI/lm-evaluation-harness). -3. `generate.py` is used to sample text from a trained model. +First, download CodeLlama 7b or 34b from the Meta AI repo and rename the download folder to 7B or 34B within the CodeLlama repository. -which can be launched with: +Then, to convert either model into the format expected by GPT-NeoX for checkpoints: +Sample command for 7b Meta->NeoX format: ```bash -./deepy.py [script.py] [./path/to/config_1.yml] [./path/to/config_2.yml] ... [./path/to/config_n.yml] +python convert_raw_llama_weights_to_hf.py --input_dir /path/to/codellama/repo --config_file /path/to/this/repo/math-lm/pretraining/llemma_7b.yml --output_dir /path/to/save/into/ --num_output_shards {TP_DEGREE, we use 2} ``` -E.G To generate text unconditionally with the GPT-NeoX-20B model, you can use the following: +Sample command for 34b Meta->NeoX format: +(Requires large amounts of GPU VRAM or CPU RAM. Pass `CUDA_VISIBLE_DEVICES=""` to perform conversion on CPU. 
34b conversion may take a while) ```bash -./deepy.py generate.py ./configs/20B.yml +CUDA_VISIBLE_DEVICES="" python convert_raw_llama_weights_to_hf.py --input_dir /path/to/codellama/repo --config_file /path/to/this/repo/math-lm/pretraining/llemma_34b.yml --output_dir /path/to/save/into/ --num_output_shards {TP_DEGREE, we use 8} ``` -Or optionally pass in a text file (e.g `prompt.txt`) to use as the prompt, which should be a plain `.txt` file with each prompt separated by newline characters, also passing in the path to an output file. - -```bash -./deepy.py generate.py ./configs/20B.yml -i prompt.txt -o sample_outputs.txt -``` - -To reproduce our evaluation numbers on, for example, TriviaQA and PIQA use: - -```bash -./deepy.py evaluate.py ./configs/20B.yml --eval_tasks triviaqa piqa -``` - -You can add an arbitrary list of evaluation tasks here, for details of all tasks available, see [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness). - -For more details on each entry point, see the [Training and Finetuning](#training-and-finetuning), [Inference](#inference) and [Evaluation](#evaluation) -# Configuration - -GPT-NeoX parameters are defined in a YAML configuration file which is passed to the deepy.py launcher. We have provided some example .yaml files in [configs](./configs/), including one for GPT-NeoX-20B, and example configuration files for other model sizes. - -These files are generally complete, but non-optimal. For example, depending on your specific GPU configuration, you may need to change some settings such as `pipe-parallel-size`, `model-parallel-size` to increase or decrease the degree of parallelisation, `train_micro_batch_size_per_gpu` or `gradient-accumulation-steps` to modify batch size related settings, or the `zero_optimization` dict to modify how optimizer states are parallelised across workers. - -For a more detailed guide to all the features available and how to configure them, see [the configuration README](configs/README.md), and for documentation of every possible argument, see [configs/neox_arguments.md](configs/neox_arguments.md). - -# Datasets - -## Preconfigured Datasets - -Several preconfigured datasets are available, including most components from [the Pile](https://arxiv.org/abs/2101.00027), as well as the Pile train set itself, for straightforward tokenization using the `prepare_data.py` entry point. - -E.G, to download and tokenize the enwik8 dataset with the GPT2 Tokenizer, saving them to `./data` you can run: - -``` -python prepare_data.py -d ./data -``` - -or a single shard of the pile (`pile_subset`) with the GPT-NeoX-20B tokenizer (assuming you have it saved at `./20B_checkpoints/20B_tokenizer.json`): - -``` -python prepare_data.py -d ./data -t HFTokenizer --vocab-file ./20B_checkpoints/20B_tokenizer.json pile_subset -``` - -The tokenized data will be saved out to two files: `[data-dir]/[dataset-name]/[dataset-name]_text_document.bin`and `[data-dir]/[dataset-name]/[dataset-name]_text_document.idx`. You will need to add the prefix that both these files share to your training configuration file under the `data-path` field. E.G: - -```yaml - "data-path": "./data/enwik8/enwik8_text_document", -``` - -## Using Custom Data - -To prepare your own dataset for training with custom data, format it as one large [jsonl](https://jsonlines.org/)-formatted file with each item in the list of dictionaries being a separate document. The document text should be grouped under one JSON key, i.e `"text"`. 
Any auxiliary data stored in other fields will not be used. - -Next make sure to download the GPT2 tokenizer vocab, and merge files from the following links: - -- Vocab: https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json -- Merge: https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt - -Or use the 20B tokenizer (for which only a single Vocab file is needed): - -- Vocab: https://the-eye.eu/public/AI/models/GPT-NeoX-20B/slim_weights/20B_tokenizer.json - -(alternatively, you can provide any tokenizer file that can be loaded by Hugging Face's tokenizers library with the `Tokenizer.from_pretrained()` command) - -You can now pretokenize your data using `tools/preprocess_data.py`, the arguments for which are detailed below: - -``` -usage: preprocess_data.py [-h] --input INPUT [--jsonl-keys JSONL_KEYS [JSONL_KEYS ...]] [--num-docs NUM_DOCS] --tokenizer-type {HFGPT2Tokenizer,HFTokenizer,GPT2BPETokenizer,CharLevelTokenizer} [--vocab-file VOCAB_FILE] [--merge-file MERGE_FILE] [--append-eod] [--ftfy] --output-prefix OUTPUT_PREFIX - [--dataset-impl {lazy,cached,mmap}] [--workers WORKERS] [--log-interval LOG_INTERVAL] - -optional arguments: - -h, --help show this help message and exit - -input data: - --input INPUT Path to input jsonl files or lmd archive(s) - if using multiple archives, put them in a comma separated list - --jsonl-keys JSONL_KEYS [JSONL_KEYS ...] - space separate listed of keys to extract from jsonl. Defa - --num-docs NUM_DOCS Optional: Number of documents in the input data (if known) for an accurate progress bar. - -tokenizer: - --tokenizer-type {HFGPT2Tokenizer,HFTokenizer,GPT2BPETokenizer,CharLevelTokenizer} - What type of tokenizer to use. - --vocab-file VOCAB_FILE - Path to the vocab file - --merge-file MERGE_FILE - Path to the BPE merge file (if necessary). - --append-eod Append an token to the end of a document. - --ftfy Use ftfy to clean text - -output data: - --output-prefix OUTPUT_PREFIX - Path to binary output file without suffix - --dataset-impl {lazy,cached,mmap} - Dataset implementation to use. Default: mmap - -runtime: - --workers WORKERS Number of worker processes to launch - --log-interval LOG_INTERVAL - Interval between progress updates - -``` - -For example: - -```bash -python tools/preprocess_data.py \ - --input ./data/mydataset.jsonl.zst \ - --output-prefix ./data/mydataset \ - --vocab ./data/gpt2-vocab.json \ - --merge-file gpt2-merges.txt \ - --dataset-impl mmap \ - --tokenizer-type GPT2BPETokenizer \ - --append-eod -``` - -You would then run training with the following settings added to your configuration file: - -```yaml - "data-path": "data/mydataset/mydataset", -``` - -# Training and Finetuning - -Training is launched using `deepy.py`, a wrapper around DeepSpeed's launcher, which launches the same script in parallel across many GPUs / nodes. - -The general usage pattern is: - -```bash -python ./deepy.py train.py [path/to/config1.yml] [path/to/config2.yml] ... -``` - -You can pass in an arbitrary number of configs which will all be merged at runtime. - -You can also optionally pass in a config prefix, which will assume all your configs are in the same folder and append that prefix to their path. - -E.G: - -```bash -python ./deepy.py train.py -d configs 125M.yml local_setup.yml -``` - -This will deploy the `train.py` script on all nodes with one process per GPU. 
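To make the config merging and the `-d` prefix behaviour concrete, here is a rough conceptual sketch, assuming plain YAML files; it is not the launcher's actual implementation, which applies its own merge and validation rules:

```python
import os
import yaml  # PyYAML

def merge_configs(config_dir, config_files):
    """Conceptual sketch only: union the key/value pairs from several config files."""
    merged = {}
    for name in config_files:
        path = os.path.join(config_dir, name)  # mirrors the -d prefix behaviour
        with open(path) as f:
            merged.update(yaml.safe_load(f) or {})  # later files win here
    return merged

# Hypothetical usage mirroring `./deepy.py train.py -d configs 125M.yml local_setup.yml`
settings = merge_configs("configs", ["125M.yml", "local_setup.yml"])
print(settings.get("train_micro_batch_size_per_gpu"))
```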
The worker nodes and number of GPUs are specified in the `/job/hostfile` file (see [parameter documentation](configs/README.md)), or can simply be passed in as the `num_gpus` arg if running on a single node setup. - -Although this is not strictly necessary, we find it useful to define the model parameters in one config file (e.g `configs/125M.yml`) and the data path parameters in another (e.g `configs/local_setup.yml`). - - -## Pretrained Models - -### GPT-NeoX-20B - -GPT-NeoX-20B is a 20 billion parameter autoregressive language model trained on [the Pile](https://arxiv.org/abs/2101.00027). Technical details about GPT-NeoX-20B can be found in [the associated paper](https://arxiv.org/abs/2204.06745). The configuration file for this model is both available at [`./configs/20B.yml`](./configs/20B.yml) and included in the download links below. - -[Slim weights](https://the-eye.eu/public/AI/models/GPT-NeoX-20B/slim_weights/) - (No optimizer states, for inference or finetuning, 39GB) - -To download from the command line to a folder named `20B_checkpoints`, use the following command: - -```bash -wget --cut-dirs=5 -nH -r --no-parent --reject "index.html*" https://the-eye.eu/public/AI/models/GPT-NeoX-20B/slim_weights/ -P 20B_checkpoints -``` - -[Full weights](https://the-eye.eu/public/AI/models/GPT-NeoX-20B/full_weights/) - (Including optimizer states, 268GB) - -To download from the command line to a folder named `20B_checkpoints`, use the following command: - -```bash -wget --cut-dirs=5 -nH -r --no-parent --reject "index.html*" https://the-eye.eu/public/AI/models/GPT-NeoX-20B/full_weights/ -P 20B_checkpoints -``` - -Weights can be alternatively be downloaded using a BitTorrent client. Torrent files can be downloaded here: [slim weights](https://the-eye.eu/public/AI/models/GPT-NeoX-20B/slim_weights.torrent), [full weights](https://the-eye.eu/public/AI/models/GPT-NeoX-20B/full_weights.torrent). - -We additionally have 150 checkpoints saved throughout training, one every 1,000 steps. We are working on figuring out how to best serve these at scale, but in the meanwhile people interested in working with the partially trained checkpoints can email us at contact@eleuther.ai to arrange access. - -### Pythia - -The Pythia Scaling Suite is a suite of models ranging from 70M parameters to 12B parameters trained on [the Pile](https://pile.eleuther.ai) intended to promote research on interpretability and training dynamics of large language models. Further details about the project and links to the models can be found in the [in the paper](https://arxiv.org/abs/2304.01373) and [on the project's GitHub](https://github.com/EleutherAI/pythia). - -### Polyglot - -The Polyglot Project is an effort to train powerful non-English pretrained language models to promote the accessibility of this technology to researchers outside the dominant powerhouses of machine learning. EleutherAI has trained and released 1.3B, 3.8B, and 5.8B parameter Korean language models, the largest of which outpreforms all other publicly available language models on Korean language tasks. Further details about the project and links to the models can be found [here](https://github.com/EleutherAI/polyglot). - -# Inference - -**For most uses we recommend deploying models trained using the GPT-NeoX library via the Hugging Face Transformers library which is better optimized for inference.** - -We support three types of generation from a pretrained model: -1. Unconditional generation -2. Conditional generation based on an input read from a file -3. 
Interactive generation, which allows for multiple rounds of back-and-forth between a user and the language model via a command line interface - -All three types of text generation can be launched via `python ./deepy.py generate.py -d configs 125M.yml local_setup.yml text_generation.yml` with the appropriate values set in `configs/text_generation.yml`. - -# Evaluation - -GPT-NeoX supports evaluation on downstream tasks through the [language model evaluation harness](https://github.com/EleutherAI/lm-evaluation-harness). - -To evaluate a trained model on the evaluation harness, simply run: - -```bash -python ./deepy.py evaluate.py -d configs your_configs.yml --eval_tasks task1 task2 ... taskn -``` - -where `--eval_tasks` is a list of evaluation tasks followed by spaces, e.g `--eval_tasks lambada hellaswag piqa sciq`. For details of all tasks available, refer to the [lm-evaluation-harness repo](https://github.com/EleutherAI/lm-evaluation-harness). - -# Exporting to Hugging Face - -GPT-NeoX is optimized heavily for training only, and GPT-NeoX model checkpoints are not compatible out of the box with other deep learning libraries. To make models easily loadable and shareable with end users, and for further exporting to various other frameworks, GPT-NeoX supports checkpoint conversion to the [Hugging Face Transformers](https://arxiv.org/abs/1910.03771) GPTNeoXModel format. - -To convert a NeoX checkpoint (with pipeline-parallel-size>=1) to Hugging Face-loadable format, run: -```bash -python ./tools/convert_module_to_hf.py --input_dir /path/to/model/global_stepXXX --config_file your_config.yml --output_dir hf_model/save/location -``` - -To convert a sequential model to Hugging Face format, run: -```bash -python ./tools/convert_sequential_to_hf.py --input_dir /path/to/model/global_stepXXX --config_file your_config.yml --output_dir hf_model/save/location -``` -(Note: this script should be used for v2.0 checkpoints saved on a v2.0 commit prior to https://github.com/EleutherAI/gpt-neox/pull/866 and which used `pipe-parallel-size=1`. Using `pipe-parallel-size=0` will also save models in this format.) - -Then to upload a model to [the Hugging Face Hub](https://huggingface.co/), run: -```bash -huggingface-cli login -python ./tools/upload.py -``` -and input the requested information, including HF hub user token. - -Note, however, that this compatibility is not one-to-one, and only certain configurations from GPT-NeoX are supported in the Hugging Face GPTNeoXModel class. Advanced features such as alternative positional embeddings may require new Transformers modeling code and new conversion script tweaks. - -# Monitoring - -In addition to storing logs locally, we provide built-in support for two popular experiment monitoring frameworks: [Weights & Biases](https://wandb.ai/site) and [TensorBoard](https://www.tensorflow.org/tensorboard/) - -

-## Weights & Biases
- -EleutherAI is currently using [Weights & Biases to record our experiments](https://wandb.ai/eleutherai/neox). If you are logged into Weights & Biases on your machine—you can do this by executing `wandb login`—your runs will automatically be recorded. There are two optional fields associated with Weights & Biases: wandb_group allows you to name the run group and wandb_team allows you to assign your runs to an organization or team account. - -## TensorBoard - -We also support using TensorBoard via the tensorboard-dir field. Dependencies required for TensorBoard monitoring can be found in and installed from `./requirements/requirements-tensorboard.txt`. - -# Running on multi-node - -If you need to supply a hostfile for use with the MPI-based DeepSpeed launcher, you can set the environment variable `DLTS_HOSTFILE` to point to the hostfile. - -# Administrative Notes - -## Citing GPT-NeoX - -If you have found the GPT-NeoX library helpful in your work, you can cite this repository as - -```bibtex -@software{gpt-neox-library, - title = {{GPT-NeoX: Large Scale Autoregressive Language Modeling in PyTorch}}, - author = {Andonian, Alex and Anthony, Quentin and Biderman, Stella and Black, Sid and Gali, Preetham and Gao, Leo and Hallahan, Eric and Levy-Kramer, Josh and Leahy, Connor and Nestler, Lucas and Parker, Kip and Pieler, Michael and Purohit, Shivanshu and Songz, Tri and Phil, Wang and Weinbach, Samuel}, - url = {https://www.github.com/eleutherai/gpt-neox}, - doi = {10.5281/zenodo.5879544}, - month = {8}, - year = {2021}, - version = {0.0.1}, -} -``` - -To cite our 20 billion parameter model, please use - -```bibtex -@inproceedings{gpt-neox-20b, - title={{GPT-NeoX-20B}: An Open-Source Autoregressive Language Model}, - author={Black, Sid and Biderman, Stella and Hallahan, Eric and Anthony, Quentin and Gao, Leo and Golding, Laurence and He, Horace and Leahy, Connor and McDonell, Kyle and Phang, Jason and Pieler, Michael and Prashanth, USVSN Sai and Purohit, Shivanshu and Reynolds, Laria and Tow, Jonathan and Wang, Ben and Weinbach, Samuel}, - booktitle={Proceedings of the ACL Workshop on Challenges \& Perspectives in Creating Large Language Models}, - url={https://arxiv.org/abs/2204.06745}, - year={2022} -} -``` -Citation instructions for other pretrained models can be found [in the appropriate repository](#pretrained-models). +### Check Out Codebase -## Licensing +Next, check out the commit used to train the model you are replicating. -This repository hosts code that is part of EleutherAI's GPT-NeoX project. Copyright (c) 2021, EleutherAI. Licensed under the Apache License: +* 7b / 34b: https://github.com/EleutherAI/gpt-neox/commit/{this_commit_hash} - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +### Launching Training - http://www.apache.org/licenses/LICENSE-2.0 +Then, edit the provided YML files to set paths based on your own system's saved locations for checkpoints and data files, and edit the SLURM job script as specified (using ) or run the job across multiple nodes using your own system's orchestration. - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
+**Tip**: Note that the global batch size will be scaled by your number of nodes. Therefore, if running on a number of nodes different from 32 you should scale gradient accumulation steps accordingly. -This repository is based off code written by NVIDIA that is licensed under the Apache License, Version 2.0. In accordance with the Apache License, all files that are modifications of code originally written by NVIDIA maintain a NVIDIA copyright header. All files that do not contain such a header are the exclusive copyright of EleutherAI. When the NVIDIA code has been modified from its original version, that fact is noted in the copyright header. All derivative works of this repository must preserve these headers under the terms of the Apache License. +We used a batch size of 4M tokens. To calculate global batch size, you should compute `seq_len * num_gpus * ( train_microbatch_size_per_gpu * gradient_accumulation_steps) / (model_parallel_size * max(pipeline_parallel_size, 1))` . -This repository also contains code written by a number of other authors. Such contributions are marked and the relevant licensing is included where appropriate. -For full terms, see the `LICENSE` file. If you have any questions, comments, or concerns about licensing please email us at contact@eleuther.ai. +## Contents -## Publications +The files in this folder are as follows: -The following publications have come out of this project: +* `34b_launch_script.sh` contains a skeleton SLURM job script to launch training with NeoX across 32 nodes. - - Black, Biderman, Hallahan, Anthony, Gao, Golding, He, Leahy, McDonell, Phang, Pieler, Prashanth, Purohit, Reynolds, Tow, Wang, and Weinbach. "[GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745)." In *Proceedings of the ACL Workshop on Challenges \& Perspectives in Creating Large Language Models*. 2022. - - Biderman, Schoelkopf, Anthony, Bradley, O'Brien, Hallahan, Khan, Purohit, Prashanth, Raff, Skowron, Sutawika, and van der Wal. "[Pythia: A Suite for Analyzing Large Language Models Across Training and Scaling](https://arxiv.org/abs/2304.01373)." *arXiv preprint arXiv:2304.01373*. 2023. +* `configs/data_mixture.yml` contains a list of the domain weights for the final training run. -The following publications by other research groups use this library: -- Chi, Fan, Ramadge, and Rudnicky. "[KERPLE: Kernelized Relative Positional Embedding for Length Extrapolation](https://arxiv.org/abs/2205.09921)". _arXiv preprint arXiv:2205.09921_. 2022. -- Horawalavithana, Ayton, Sharma, Howland, Subramanian, Vasquez, Cosbey, Glenski, and Volkova. "[Foundation Models of Scientific Knowledge for Chemistry: Opportunities, Challenges and Lessons Learned](https://openreview.net/pdf?id=SLX-I2MHUZ9)." In *Proceedings of the ACL Workshop on Challenges \& Perspectives in Creating Large Language Models*. 2022. -- Kolak, Martins, Le Goues, and Hellendoorn. "[Patch Generation with Language Models: Feasibility and Scaling Behavior](https://openreview.net/forum?id=rHlzJh_b1-5)"." In *Proceedings of the Deep Learning for Code Workshop at ICLR*. 2022. -- Xu, Alon, Neubig, and Hellendoorn. "[A Systematic Evaluation of Large Language Models of Code](https://arxiv.org/abs/2202.13169)." In *Proceedings of the ICLR Workshop on Deep Learning For Code*. 2022. +* `configs/llemma_7b.yml` is a cleaned-up version of the config file used to train Llemma-7b. -## Acknowledgements +* `configs/llemma_34b.yml` is a cleaned-up version of the config file used to train Llemma-34b. 
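As a worked example of the global batch size formula above, the sketch below plugs in purely hypothetical values (illustrative only, not the exact Llemma settings) and arrives at roughly 4M tokens per step:

```python
# Worked example of the global-batch-size formula; all values are hypothetical placeholders.
seq_len = 4096
num_gpus = 256                       # e.g. 32 nodes x 8 GPUs
train_micro_batch_size_per_gpu = 4
gradient_accumulation_steps = 2
model_parallel_size = 2
pipeline_parallel_size = 0

tokens_per_step = (
    seq_len
    * num_gpus
    * (train_micro_batch_size_per_gpu * gradient_accumulation_steps)
    / (model_parallel_size * max(pipeline_parallel_size, 1))
)
print(f"{tokens_per_step:,.0f} tokens per step")  # 4,194,304, i.e. ~4M
```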
-We run our experiments on a Kubernetes cluster generously provided by [CoreWeave](https://coreweave.com/) and a SLURM cluster provided by [Stability AI](https://stability.ai). +* `requirements.txt` is a dump of the virtual environmment used in training, created via `pip freeze`. diff --git a/configs/1-3B.yml b/configs/1-3B.yml deleted file mode 100644 index 3e80ae7fc..000000000 --- a/configs/1-3B.yml +++ /dev/null @@ -1,91 +0,0 @@ -# GPT-2 pretraining setup -{ - # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages - # across the node boundaries ) - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - # model settings - "num_layers": 24, - "hidden_size": 2048, - "num_attention_heads": 16, - "seq_length": 2048, - "max_position_embeddings": 2048, - "norm": "layernorm", - "pos_emb": "rotary", - "no_weight_tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - - # these should provide some speedup but takes a while to build, set to true if desired - "scaled_upper_triang_masked_softmax_fusion": false, - "bias_gelu_fusion": false, - - # init methods - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - # optimizer settings - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0002, - "betas": [0.9, 0.95], - "eps": 1.0e-8, - } - }, - "min_lr": 0.00002, - - # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training - "zero_optimization": { - "stage": 1, - "allgather_partitions": True, - "allgather_bucket_size": 500000000, - "overlap_comm": True, - "reduce_scatter": True, - "reduce_bucket_size": 500000000, - "contiguous_gradients": True, - }, - - # batch / data settings - "train_micro_batch_size_per_gpu": 4, - "data_impl": "mmap", - - # activation checkpointing - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - # regularization - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - # precision settings - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - # misc. 
training settings - "train_iters": 320000, - "lr_decay_iters": 320000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 10000, - "eval_interval": 1000, - "eval_iters": 10, - - # logging - "log_interval": 100, - "steps_per_print": 10, - "keep_last_n_checkpoints": 4, - "wall_clock_breakdown": true, -} diff --git a/configs/125M-json.yml b/configs/125M-json.yml deleted file mode 100644 index 95a76ebfc..000000000 --- a/configs/125M-json.yml +++ /dev/null @@ -1,78 +0,0 @@ -{ - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - "num_layers": 12, - "hidden_size": 768, - "num_attention_heads": 12, - "seq_length": 2048, - "max_position_embeddings": 2048, - "norm": "layernorm", - "pos_emb": "rotary", - "no_weight_tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - - "scaled_upper_triang_masked_softmax_fusion": false, - "bias_gelu_fusion": false, - - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0006, - "betas": [0.9, 0.95], - "eps": 1.0e-8 - } - }, - "min_lr": 0.00006, - - "zero_optimization": { - "stage": 1, - "allgather_partitions": true, - "allgather_bucket_size": 500000000, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 500000000, - "contiguous_gradients": true - }, - - "train_micro_batch_size_per_gpu": 4, - "data_impl": "mmap", - - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0.0, - "attention_dropout": 0.0, - - "fp16": { - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - "train_iters": 320000, - "lr_decay_iters": 320000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 10000, - "eval_interval": 1000, - "eval_iters": 10, - - "log_interval": 100, - "steps_per_print": 10, - "keep_last_n_checkpoints": 4, - "wall_clock_breakdown": true, - - "hostfile": "/mock_path" -} diff --git a/configs/125M.yml b/configs/125M.yml deleted file mode 100644 index 15a4b3b01..000000000 --- a/configs/125M.yml +++ /dev/null @@ -1,94 +0,0 @@ -# GPT-2 pretraining setup -{ - # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages - # across the node boundaries ) - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - # model settings - "num_layers": 12, - "hidden_size": 768, - "num_attention_heads": 12, - "seq_length": 2048, - "max_position_embeddings": 2048, - "norm": "layernorm", - "pos_emb": "rotary", - "no_weight_tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - - # these should provide some speedup but takes a while to build, set to true if desired - "scaled_upper_triang_masked_softmax_fusion": false, - "bias_gelu_fusion": false, - - # init methods - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - - # optimizer settings - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0006, - "betas": [0.9, 0.95], - "eps": 1.0e-8, - } - }, - "min_lr": 0.00006, - - # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training - "zero_optimization": { - "stage": 1, - "allgather_partitions": True, - "allgather_bucket_size": 500000000, - "overlap_comm": True, - 
"reduce_scatter": True, - "reduce_bucket_size": 500000000, - "contiguous_gradients": True, - }, - - # batch / data settings - "train_micro_batch_size_per_gpu": 4, - "data_impl": "mmap", - - # activation checkpointing - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - # regularization - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0.0, - "attention_dropout": 0.0, - - # precision settings - "fp16": { - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - # misc. training settings - "train_iters": 320000, - "lr_decay_iters": 320000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 10000, - "eval_interval": 1000, - "eval_iters": 10, - - # logging - "log_interval": 100, - "steps_per_print": 10, - "keep_last_n_checkpoints": 4, - "wall_clock_breakdown": true, - - # networking - "hostfile": "/mock_path" -} diff --git a/configs/13B.yml b/configs/13B.yml deleted file mode 100644 index 7af3208ef..000000000 --- a/configs/13B.yml +++ /dev/null @@ -1,92 +0,0 @@ -# GPT-2 pretraining setup -{ - # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages - # across the node boundaries ) - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - # model settings - "num_layers": 40, - "hidden_size": 5120, - "num_attention_heads": 40, - "seq_length": 2048, - "max_position_embeddings": 2048, - "norm": "layernorm", - "pos_emb": "rotary", - "no_weight_tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - - # these should provide some speedup but takes a while to build, set to true if desired - "scaled_upper_triang_masked_softmax_fusion": false, - "bias_gelu_fusion": false, - - # init methods - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - - # optimizer settings - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0001, - "betas": [0.9, 0.95], - "eps": 1.0e-8, - } - }, - - # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training - "zero_optimization": { - "stage": 1, - "allgather_partitions": True, - "allgather_bucket_size": 500000000, - "overlap_comm": True, - "reduce_scatter": True, - "reduce_bucket_size": 500000000, - "contiguous_gradients": True, - }, - "min_lr": 0.00001, - - # batch / data settings - "train_micro_batch_size_per_gpu": 4, - "data_impl": "mmap", - - # activation checkpointing - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - # regularization - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - # precision settings - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - # misc. 
training settings - "train_iters": 320000, - "lr_decay_iters": 320000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 10000, - "eval_interval": 1000, - "eval_iters": 10, - - # logging - "log_interval": 100, - "steps_per_print": 10, - "keep_last_n_checkpoints": 4, - "wall_clock_breakdown": true, -} diff --git a/configs/175B.yml b/configs/175B.yml deleted file mode 100644 index cc5c5c23f..000000000 --- a/configs/175B.yml +++ /dev/null @@ -1,90 +0,0 @@ -# GPT-2 pretraining setup -{ - # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages - # across the node boundaries ) - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - # model settings - "num_layers": 96, - "hidden_size": 12288, - "num_attention_heads": 96, - "seq_length": 2048, - "max_position_embeddings": 2048, - "norm": "layernorm", - "pos_emb": "rotary", - "no_weight_tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - - # these should provide some speedup but takes a while to build, set to true if desired - "scaled_upper_triang_masked_softmax_fusion": false, - "bias_gelu_fusion": false, - - # init methods - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - # optimizer settings - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00006, - "betas": [0.9, 0.95], - "eps": 1.0e-8, - } - }, - "min_lr": 0.000006, - # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training - "zero_optimization": { - "stage": 1, - "allgather_partitions": True, - "allgather_bucket_size": 500000000, - "overlap_comm": True, - "reduce_scatter": True, - "reduce_bucket_size": 500000000, - "contiguous_gradients": True, - }, - - # batch / data settings - "train_micro_batch_size_per_gpu": 4, - "data_impl": "mmap", - - # activation checkpointing - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - # regularization - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - # precision settings - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - # misc. 
training settings - "train_iters": 320000, - "lr_decay_iters": 320000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 10000, - "eval_interval": 1000, - "eval_iters": 10, - - # logging - "log_interval": 100, - "steps_per_print": 10, - "keep_last_n_checkpoints": 4, - "wall_clock_breakdown": true, -} diff --git a/configs/19M.yml b/configs/19M.yml deleted file mode 100644 index 83e5c594a..000000000 --- a/configs/19M.yml +++ /dev/null @@ -1,95 +0,0 @@ -{ - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - # model settings - "num_layers": 6, - "hidden_size": 512, - "num_attention_heads": 8, - "seq_length": 2048, - "max_position_embeddings": 2048, - "pos_emb": "rotary", - "no_weight_tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - - "scaled_upper_triang_masked_softmax_fusion": false, - "bias_gelu_fusion": false, - - # init methods - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.001, - "betas": [0.9, 0.95], - "eps": 1.0e-8, - } - }, - "min_lr": 0.0001, - - # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training - "zero_optimization": { - "stage": 1, - "allgather_partitions": True, - "allgather_bucket_size": 500000000, - "overlap_comm": True, - "reduce_scatter": True, - "reduce_bucket_size": 500000000, - "contiguous_gradients": True, - }, - - "train_micro_batch_size_per_gpu": 4, #32, - "gas": 1, - "data_impl": "mmap", - "num_workers": 1, - - # activation checkpointing - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - # regularization - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - # precision settings - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 12, - "hysteresis": 2, - "min_loss_scale": 1, - }, - - "train_iters": 143000, - "lr_decay_iters": 143000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 1000, - "eval_interval": 100000, - "eval_iters": 10, - - "log_interval": 10, - "steps_per_print": 10, - "wall_clock_breakdown": true, - - # additional deepspeed args not specified above - "deepspeed_extra_args": { - "comms_logger": { - "enabled": true, - "verbose": true, - "prof_all": true, - "debug": false - }, - } - -} diff --git a/configs/2-7B.yml b/configs/2-7B.yml deleted file mode 100644 index 2bddda0ed..000000000 --- a/configs/2-7B.yml +++ /dev/null @@ -1,91 +0,0 @@ -# GPT-2 pretraining setup -{ - # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages - # across the node boundaries ) - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - # model settings - "num_layers": 32, - "hidden_size": 2560, - "num_attention_heads": 32, - "seq_length": 2048, - "max_position_embeddings": 2048, - "norm": "layernorm", - "pos_emb": "rotary", - "no_weight_tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - - # these should provide some speedup but takes a while to build, set to true if desired - "scaled_upper_triang_masked_softmax_fusion": false, - "bias_gelu_fusion": false, - - # init methods - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - # optimizer settings - "optimizer": { 
- "type": "Adam", - "params": { - "lr": 0.00016, - "betas": [0.9, 0.95], - "eps": 1.0e-8, - } - }, - "min_lr": 0.000016, - - # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training - "zero_optimization": { - "stage": 1, - "allgather_partitions": True, - "allgather_bucket_size": 500000000, - "overlap_comm": True, - "reduce_scatter": True, - "reduce_bucket_size": 500000000, - "contiguous_gradients": True, - }, - - # batch / data settings - "train_micro_batch_size_per_gpu": 4, - "data_impl": "mmap", - - # activation checkpointing - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - # regularization - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - # precision settings - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - # misc. training settings - "train_iters": 320000, - "lr_decay_iters": 320000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 10000, - "eval_interval": 1000, - "eval_iters": 10, - - # logging - "log_interval": 100, - "steps_per_print": 10, - "keep_last_n_checkpoints": 4, - "wall_clock_breakdown": true, -} diff --git a/configs/20B.yml b/configs/20B.yml deleted file mode 100644 index 243f794d0..000000000 --- a/configs/20B.yml +++ /dev/null @@ -1,111 +0,0 @@ -# DISCLAIMER: This is the configuration file for the GPT-NeoX-20B model as it was trained on 96x 40GB A100 -# GPUs. Depending on your system configuration, you may need to change some parameters in order to fit -# the model in memory. - -{ - # Tokenizer / checkpoint settings - you will need to change these to the location you have them saved in - "vocab_file": "./20B_checkpoints/20B_tokenizer.json", - "save": "./20B_checkpoints", - "load": "./20B_checkpoints", - - # If finetuning, edit the following to the location of your finetuning dataset: - "data_path": "./data/pile_20B_tokenizer/pile_20B_tokenizer_text_document", - - # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages - # across the node boundaries ) - "pipe_parallel_size": 4, - "model_parallel_size": 2, - - # model settings - "num_layers": 44, - "hidden_size": 6144, - "num_attention_heads": 64, - "seq_length": 2048, - "max_position_embeddings": 2048, - "norm": "layernorm", - "pos_emb": "rotary", - "rotary_pct": 0.25, - "no_weight_tying": true, - "gpt_j_residual": true, - "output_layer_parallelism": "column", - "scaled_upper_triang_masked_softmax_fusion": true, - "bias_gelu_fusion": true, - - # init methods - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - # optimizer settings - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.97e-4, - "betas": [0.9, 0.95], - "eps": 1.0e-8, - } - }, - - "min_lr": 0.97e-5, - - # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training - "zero_optimization": { - "stage": 1, - "allgather_partitions": True, - "allgather_bucket_size": 1260000000, - "overlap_comm": True, - "reduce_scatter": True, - "reduce_bucket_size": 1260000000, - "contiguous_gradients": True, - }, - - # batch / data settings (assuming 96 GPUs) - "train_micro_batch_size_per_gpu": 4, - "gradient_accumulation_steps": 32, - "data_impl": "mmap", - "split": "995,4,1", - - 
# activation checkpointing - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": false, - "synchronize_each_layer": true, - - # regularization - "gradient_clipping": 1.0, - "weight_decay": 0.01, - "hidden_dropout": 0, - "attention_dropout": 0, - - # precision settings - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 12, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - # misc. training settings - "train_iters": 150000, - "lr_decay_iters": 150000, - - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 500, # this variable previously called `save-interval` - "eval_interval": 1000, - "eval_iters": 10, - - # logging - "log_interval": 2, - "steps_per_print": 2, - "wall_clock_breakdown": false, - - ### NEW DATA: #### - "tokenizer_type": "HFTokenizer", - "tensorboard-dir": "./tensorboard", - "log_dir": "./logs", - -} diff --git a/configs/350M.yml b/configs/350M.yml deleted file mode 100644 index 1247ea88d..000000000 --- a/configs/350M.yml +++ /dev/null @@ -1,90 +0,0 @@ -# GPT-2 pretraining setup -{ - # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages - # across the node boundaries ) - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - # model settings - "num_layers": 24, - "hidden_size": 1024, - "num_attention_heads": 16, - "seq_length": 2048, - "max_position_embeddings": 2048, - "norm": "layernorm", - "pos_emb": "rotary", - "no_weight_tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - - # these should provide some speedup but takes a while to build, set to true if desired - "scaled_upper_triang_masked_softmax_fusion": false, - "bias_gelu_fusion": false, - - # init methods - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - # optimizer settings - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0003, - "betas": [0.9, 0.95], - "eps": 1.0e-8, - } - }, - "min_lr": 0.00003, - - # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training - "zero_optimization": { - "stage": 1, - "allgather_partitions": True, - "allgather_bucket_size": 500000000, - "overlap_comm": True, - "reduce_scatter": True, - "reduce_bucket_size": 500000000, - "contiguous_gradients": True, - }, - # batch / data settings - "train_micro_batch_size_per_gpu": 4, - "data_impl": "mmap", - - # activation checkpointing - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - # regularization - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - # precision settings - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - # misc. 
training settings - "train_iters": 320000, - "lr_decay_iters": 320000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 10000, - "eval_interval": 1000, - "eval_iters": 10, - - # logging - "log_interval": 100, - "steps_per_print": 10, - "keep_last_n_checkpoints": 4, - "wall_clock_breakdown": true, -} diff --git a/configs/49M.yml b/configs/49M.yml deleted file mode 100644 index 9852320b0..000000000 --- a/configs/49M.yml +++ /dev/null @@ -1,91 +0,0 @@ -{ - # parallelism settings - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - # model settings - "num_layers": 10, - "hidden_size": 640, - "num_attention_heads": 10, - "seq_length": 2048, - "max_position_embeddings": 2048, - "pos_emb": "rotary", - "rotary_pct": 0.25, - "no_weight_tying": true, - "gpt_j_residual": true, - "output_layer_parallelism": "column", - - # these should provide some speedup but takes a while to build, set to true if desired - "scaled_upper_triang_masked_softmax_fusion": false, - "bias_gelu_fusion": false, - - # init methods - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - # optimizer settings - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0008, - "betas": [0.9, 0.95], - "eps": 1.0e-8, - } - }, - "min_lr": 0.00008, - - # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training - "zero_optimization": { - "stage": 1, - "allgather_partitions": True, - "allgather_bucket_size": 500000000, - "overlap_comm": True, - "reduce_scatter": True, - "reduce_bucket_size": 500000000, - "contiguous_gradients": True, - }, - - # batch / data settings - "train_micro_batch_size_per_gpu": 32, - "gas": 1, - "data_impl": "mmap", - "num_workers": 1, - - # activation checkpointing - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - # regularization - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - # precision settings - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 12, - "hysteresis": 2, - "min_loss_scale": 1, - }, - - # misc. 
training settings - "train_iters": 143000, - "lr_decay_iters": 143000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 1000, - "eval_interval": 100000, - "eval_iters": 10, - - # logging - "log_interval": 10, - "steps_per_print": 10, - "wall_clock_breakdown": true, -} diff --git a/configs/6-7B.yml b/configs/6-7B.yml deleted file mode 100644 index 8054c3ff2..000000000 --- a/configs/6-7B.yml +++ /dev/null @@ -1,91 +0,0 @@ -# GPT-2 pretraining setup -{ - # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages - # across the node boundaries ) - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - # model settings - "num_layers": 32, - "hidden_size": 4096, - "num_attention_heads": 32, - "seq_length": 2048, - "max_position_embeddings": 2048, - "norm": "layernorm", - "pos_emb": "rotary", - "no_weight_tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - - # these should provide some speedup but takes a while to build, set to true if desired - "scaled_upper_triang_masked_softmax_fusion": false, - "bias_gelu_fusion": false, - - # init methods - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - # optimizer settings - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00012, - "betas": [0.9, 0.95], - "eps": 1.0e-8, - } - }, - - # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training - "zero_optimization": { - "stage": 1, - "allgather_partitions": True, - "allgather_bucket_size": 500000000, - "overlap_comm": True, - "reduce_scatter": True, - "reduce_bucket_size": 500000000, - "contiguous_gradients": True, - }, - "min_lr": 0.000012, - - # batch / data settings - "train_micro_batch_size_per_gpu": 4, - "data_impl": "mmap", - - # activation checkpointing - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - # regularization - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - # precision settings - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - # misc. 
training settings - "train_iters": 320000, - "lr_decay_iters": 320000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 10000, - "eval_interval": 1000, - "eval_iters": 10, - - # logging - "log_interval": 100, - "steps_per_print": 10, - "keep_last_n_checkpoints": 4, - "wall_clock_breakdown": true, -} diff --git a/configs/760M.yml b/configs/760M.yml deleted file mode 100644 index 5cbb39559..000000000 --- a/configs/760M.yml +++ /dev/null @@ -1,91 +0,0 @@ -# GPT-2 pretraining setup -{ - # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages - # across the node boundaries ) - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - # model settings - "num_layers": 24, - "hidden_size": 1536, - "num_attention_heads": 16, - "seq_length": 2048, - "max_position_embeddings": 2048, - "norm": "layernorm", - "pos_emb": "rotary", - "no_weight_tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - - # these should provide some speedup but takes a while to build, set to true if desired - "scaled_upper_triang_masked_softmax_fusion": false, - "bias_gelu_fusion": false, - - # init methods - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - # optimizer settings - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00025, - "betas": [0.9, 0.999], - "eps": 1.0e-8, - } - }, - "min_lr": 0.000025, - - # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training - "zero_optimization": { - "stage": 1, - "allgather_partitions": True, - "allgather_bucket_size": 500000000, - "overlap_comm": True, - "reduce_scatter": True, - "reduce_bucket_size": 500000000, - "contiguous_gradients": True, - }, - - # batch / data settings - "train_micro_batch_size_per_gpu": 4, - "data_impl": "mmap", - - # activation checkpointing - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - # regularization - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - # precision settings - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - # misc. 
training settings - "train_iters": 320000, - "lr_decay_iters": 320000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 10000, - "eval_interval": 1000, - "eval_iters": 10, - - # logging - "log_interval": 100, - "steps_per_print": 10, - "keep_last_n_checkpoints": 4, - "wall_clock_breakdown": true, -} diff --git a/configs/800M.yml b/configs/800M.yml deleted file mode 100644 index f522b40c7..000000000 --- a/configs/800M.yml +++ /dev/null @@ -1,84 +0,0 @@ -{ - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - # model settings - "num_layers": 16, - "hidden_size": 2048, - "num_attention_heads": 8, - "seq_length": 2048, - "max_position_embeddings": 2048, - "pos_emb": "rotary", - "no_weight_tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - - "scaled_upper_triang_masked_softmax_fusion": false, - "bias_gelu_fusion": false, - - # init methods - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00025, - "betas": [0.9, 0.95], - "eps": 1.0e-8, - } - }, - "min_lr": 0.000025, - - # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training - "zero_optimization": { - "stage": 1, - "allgather_partitions": True, - "allgather_bucket_size": 500000000, - "overlap_comm": True, - "reduce_scatter": True, - "reduce_bucket_size": 500000000, - "contiguous_gradients": True, - }, - - "train_micro_batch_size_per_gpu": 16, - "gas": 1, - "data_impl": "mmap", - "num_workers": 1, - - # activation checkpointing - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - # regularization - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - # precision settings - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 12, - "hysteresis": 2, - "min_loss_scale": 1, - }, - - "train_iters": 143000, - "lr_decay_iters": 143000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 1000, - "eval_interval": 40000, - "eval_iters": 10, - - "log_interval": 10, - "steps_per_print": 10, - "wall_clock_breakdown": true, -} diff --git a/configs/README.md b/configs/README.md deleted file mode 100644 index 5b065d2cd..000000000 --- a/configs/README.md +++ /dev/null @@ -1,314 +0,0 @@ -# Configuration and parameters - -GPT-NeoX parameters are defined in a YAML configuration file which is passed to the `deepy.py` launcher - for examples see the files contained in this folder. -Parameters originate from either the [DeepSpeed runner CLI (DSL)](https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/launcher/runner.py#L33), [DeepSpeed configuration file (DSC)](https://www.deepspeed.ai/docs/config-json/), [Megatron-LM CLI (Meg)](https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/arguments.py#L224) or are GPT-NeoX (NeoX) modifications. - -## Example Configuration (GPT3 Small): - -Below is an example configuration `.yaml` to train a ~160M parameter GPT model. This readme will go through each section in the configuration and the options available. - -For a detailed list of all the arguments available for neox, see [neox_arguments.md](neox_arguments.md) - -Note: yaml arguments may be formatted with either '-' or '_'. 
The standard separator used is a '_' as shown in the example configurations below. However, the use of '-' as a separator may be deprecated in the future. -```yaml -# GPT-3 pretraining setup -{ - # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages - # across the node boundaries ) - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - # model settings - "num_layers": 12, - "hidden_size": 768, - "num_attention_heads": 12, - "seq_length": 2048, - "max_position_embeddings": 2048, - "norm": "rmsnorm", - "pos_emb": "none", - "no_weight_tying": true, - # this should provide some speedup but takes a while to build, set to true if desired - "scaled_upper_triang_masked_softmax_fusion": false, - "train_iters": 320000, - - # optimizer settings - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0006, - "max_grad_norm": 1.0, - "betas": [0.9, 0.95] - } - }, - # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training - "zero_optimization": { - "stage": 0, - "allgather_partitions": True, - "allgather_bucket_size": 500000000, - "overlap_comm": True, - "reduce_scatter": True, - "reduce_bucket_size": 500000000, - "contiguous_gradients": True, - }, - - # batch / data settings - "train_micro_batch_size_per_gpu": 4, - "gradient_accumulation_steps": 1, - "data_impl": "mmap", - "split": "949,50,1", - - # activation checkpointing - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - # regularization - "gradient_clipping": 1.0, - "weight_decay": 0, - "hidden_dropout": 0, - "attention_dropout": 0, - - # precision settings - "fp16": { - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - # lr decay settings - "lr_decay_iters": 320000, - "lr_decay_style": "cosine", - "warmup": 0.01, - - # misc. training settings - "distributed_backend": "nccl", - "save_interval": 10000, - "eval_interval": 1000, - "eval_iters": 10, - - # logging - "log_interval": 100, - "steps_per_print": 10, - "keep_last_n_checkpoints": 4, - "wall_clock_breakdown": true, -} -``` - -### Parallelism Settings: - -The parallelism settings are left at 1 in all configs, as the settings you want will be highly dependent on your compute setup and network topology. -We have found it best to do model parallelism within a node, and schedule pipeline stages across node boundaries. - -```yaml - "pipe_parallel_size": 1, - "model_parallel_size": 1, -``` - -These can be set to any integer between `0` and `num_gpus`, and `num_gpus` must be divisible by `pipe_parallel_size` * `model_parallel_size`. - - -### Model Settings: -```yaml - # model settings - "num_layers": 12, - "hidden_size": 768, - "num_attention_heads": 12, - "seq_length": 2048, - "max_position_embeddings": 2048, - "norm": "rmsnorm", - "pos_emb": "none", - "no_weight_tying": true, - # this should provide some speedup but takes a while to build, set to true if desired - "scaled_upper_triang_masked_softmax_fusion": false, - "train_iters": 320000, -``` -An example of some basic settings used to configure your model's architecture and number of training steps. - -### Optimizer Settings: - -Our optimizer configuration has a similar syntax to deepspeed's. Different optimizers will have different arguments for "params". -Learning rate should be configured from here using the `"lr"` field of `optimizer["params"]`. 
- -```yaml - # optimizer settings - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0006, - "max_grad_norm": 1.0, - "betas": [0.9, 0.95] - } - } - ``` -Available optimizer types are: - -- `"Adam"`: regular Adam optimizer -- `"OneBitAdam"`: Deepspeed's [OneBitAdam optimizer](https://www.deepspeed.ai/docs/config-json/#optimizer-parameters). To use 1-bit adam, you'll also need to add the `freeze_step`, `cuda_aware`, and `comm_backend_name` fields, like so: -```yaml - "optimizer": { - "type": "OneBitAdam", - "params": { - "lr": 0.0001, - "freeze_step": 23000, - "betas": [0.9, 0.95], - "cuda_aware": false, - "comm_backend_name": "nccl" - } -``` - -- `"CPU_Adam"`/`"CPU_torch_adam"`: Adam optimizer on CPU. Either megatron's version ("CPU_Adam") or torch's ("CPU_torch_adam") -- `"SM3"`: SM3 or [Memory adaptive efficient optimization optimizer](https://arxiv.org/pdf/1901.11150.pdf). We have found this doesn't work well with fp16 training. -- `"madgrad_wd"`: MADGRAD or [A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic - Optimizer] weight decay has been implemented AdamW style instead of the original madgrad Adam style. https://arxiv.org/abs/2101.11075 - -### ZeRO Optimization: - -```yaml -# for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training - "zero_optimization": { - "stage": 0, - "allgather_partitions": True, - "allgather_bucket_size": 500000000, - "overlap_comm": True, - "reduce_scatter": True, - "reduce_bucket_size": 500000000, - "contiguous_gradients": True, - }, - "zero_allow_untested_optimizer": false, - -``` - -ZeRO optimization in NeoX is currently configured identically to how deepspeed configures it, please see [the deepspeed docs](https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training) for more information. - -If you want to combine an optimizer untested by DeepSpeed with ZeRO (i.e, not ADAM or LAMB), you must pass `"zero_allow_untested_optimizer": true` *outside* of the `"zero_optimization"` dictionary (see above). - -N.B - ZeRO stages 2+ are incompatible with pipeline parallelism. Please set `"pipe-parallel-size"` to 0 if you want to use ZeRO stage 2 or more. - -### Batch Size Settings: - -```yaml - # batch / data settings - "train_micro_batch_size_per_gpu": 4, - "gradient_accumulation_steps": 1, -``` -Our global batch size configuration follows deepspeed's and can be configured in a number of ways. At least any one of `"train_batch_size"` and `"train_micro_batch_size_per_gpu"`. -- `"train_batch_size"`: The effective training batch size. This is the amount of data samples that leads to one step of model update. train_batch_size is aggregated by the batch size that a single GPU processes in one forward/backward pass (a.k.a., train_step_batch_size), the gradient accumulation steps (a.k.a., gradient_accumulation_steps), and the number of GPUs. -- `"train_micro_batch_size_per_gpu""`: Batch size to be processed by one GPU in one step (without gradient accumulation). When specified, `gradient_accumulation_steps` is automatically calculated using train_batch_size and number of GPUs. -- `"gradient_accumulation_steps"`: Number of training steps to accumulate gradients before averaging and applying them. This feature is sometimes useful to improve scalability since it results in less frequent communication of gradients between steps. Another impact of this feature is the ability to train with larger batch sizes per GPU. 
When specified, train_step_batch_size is automatically calculated using train_batch_size and number of GPUs. - -### Extra DeepSpeed Settings - -```yaml -# additional deepspeed args not specified above -"deepspeed_extra_args": { - "comms_logger": { - "enabled": true, - "verbose": true, - "prof_all": true, - "debug": false - }, -} -``` -Additional DeepSpeed settings besides those mentioned above should be wrapped in the `"deepspeed_extra_args` argument, as in the example above. This functionality is designed to allow arguments not specified by existing dataclasses to be passed to DeepSpeed (e.g. when new functionalities are implemented). If any settings are duplicated here from elsewhere in the YAML, the system will throw an exception and notify the user. - -### Dataset / Tokenizer / Checkpoint / Logging Settings: - -```yaml - "data_impl": "mmap", - "split": "949,50,1", - # Suggested data paths when using GPT-NeoX locally - "data_path": "data/enwik8/enwik8_text_document", - #"train_data_path": "data/enwik8/enwik8_text_document", - #"test_data_path": "data/enwik8/enwik8_text_document", - #"valid_data_path": "data/enwik8/enwik8_text_document", - "vocab_file": "data/gpt2-vocab.json", - "merge_file": "data/gpt2-merges.txt", - "save": "checkpoints", - "load": "checkpoints", - "tensorboard_dir": "tensorboard", - "log_dir": "logs", - "save_interval": 10000, - "eval_interval": 1000, - "eval_iters": 10, -``` - -### LR Scheduler settings - -```yaml - "lr_decay_iters": 320000, - "lr_decay_style": "cosine", - "warmup": 0.01, -``` - -Settings used to modify the learning rate over time. - -N.B - `OneBitAdam` requires you to use deepspeed's internal lr scheduler because reasons. Currently the lr decay style defaults to deepspeed's `WarmupDecay - -### Activation Checkpointing Settings: - -```yaml - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, -``` - -Checkpointing works by trading compute for memory. Rather than storing all intermediate activations of the entire computation graph for computing backward, the checkpointed part does not save intermediate activations, and instead recomputes them in backward pass. - -### Mixed Precision Training Settings: -gpt-neox's fp16 training is configured identically to DeepSpeed's, please see [their documentation](https://www.deepspeed.ai/docs/config-json/#fp16-training-options) for more information. -An example config for fp16 training: - -```yaml - "fp16": { - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, -``` - -Alternatively you can use the `precision` config which can be set to `fp16`, `bfloat16`, or `fp32`. If you set `"precision": "fp16"` without adding a `"fp16": {...}` dict, then it will simply use DeepSpeed's defaults for fp16 training. - - -### SLURM Settings - -If you are running GPT-NeoX on a SLURM cluster and wish to use SLURM to coordinate nodes, then you must set the following variables in your config: - -```yaml - "launcher": "slurm", - "deepspeed_slurm": true -``` - -Additionally, you need to modify _all_ of your configs to conform to the JSON. When launching a GPT-NeoX job you can specify multiple YAML config files. Internally, all of these files are merged into one config and then passed as a single long command line argument to Deep(er)Speed. When using SLURM and its internal command `srun`, python fails to parse this long command line argument unless it is in the more restrictive JSON format. 
In practice, the example NeoX configs are already very close to JSON. As an example, this is a snippet of a YAML-compatible config, N.B. the comment the capital-F `False`: - -```yaml - # optimizer settings - "optimizer": { - "type": "OneBitAdam", - "params": { - "lr": 0.0001, - "freeze_step": 23000, - "betas": [0.9, 0.95], - "cuda_aware": False, - "comm_backend_name": "nccl" - } -``` - -To make this JSON just remove the comment and use all lowercase for the boolean: - -```yaml - "optimizer": { - "type": "OneBitAdam", - "params": { - "lr": 0.0001, - "freeze_step": 23000, - "betas": [0.9, 0.95], - "cuda_aware": false, - "comm_backend_name": "nccl" - } -``` diff --git a/configs/autotuning_configs/small_tune.json b/configs/autotuning_configs/small_tune.json deleted file mode 100644 index 52c99449b..000000000 --- a/configs/autotuning_configs/small_tune.json +++ /dev/null @@ -1,78 +0,0 @@ -{ - "pipe-parallel-size": 1, - "model-parallel-size": 1, - - "num-layers": 12, - "hidden-size": 768, - "num-attention-heads": 12, - "seq-length": 2048, - "max-position-embeddings": 2048, - "norm": "layernorm", - "pos-emb": "rotary", - "no-weight-tying": true, - - "scaled-upper-triang-masked-softmax-fusion": false, - "bias-gelu-fusion": false, - - - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0006, - "betas": [0.9, 0.999], - "eps": 1.0e-8 - } - }, - - "train_micro_batch_size_per_gpu": 1, - "data-impl": "mmap", - "split": "949,50,1", - - "checkpoint-activations": true, - "checkpoint-num-layers": 1, - "partition-activations": true, - "synchronize-each-layer": true, - - "gradient_clipping": 1.0, - "weight-decay": 0.0, - "hidden-dropout": 0.0, - "attention-dropout": 0.0, - - "fp16": { - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - "train-iters": 320000, - "lr-decay-iters": 320000, - "distributed-backend": "nccl", - "lr-decay-style": "cosine", - "warmup": 0.01, - "save-interval": 10000, - "eval-interval": 1000, - "eval-iters": 10, - - "log-interval": 100, - "steps_per_print": 10, - "keep-last-n-checkpoints": 4, - "wall_clock_breakdown": true, - "launcher": "slurm", - "deepspeed_slurm": true, - "comment": "neox", - "autotuning": { - "enabled": true, - "arg_mappings": { - "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu", - "gradient_accumulation_steps ": "--gradient_accumulation_steps" - } - }, - "zero_optimization": { - "stage": [0, 1, 2, 3] - }, - "train-data-paths": ["/fsx/pile_deduped/pile_0.87_deduped_text_document"], - "valid-data-paths": ["/fsx/pile_deduped/pile_0.87_deduped_text_document"], - "test-data-paths": ["/fsx/pile_deduped/pile_0.87_deduped_text_document"] -} diff --git a/configs/autotuning_configs/tune.json b/configs/autotuning_configs/tune.json deleted file mode 100644 index b2f114539..000000000 --- a/configs/autotuning_configs/tune.json +++ /dev/null @@ -1,72 +0,0 @@ -{ - "pipe-parallel-size": 1, - "model-parallel-size": 1, - "num-layers": 12, - "hidden-size": 768, - "num-attention-heads": 12, - "seq-length": 2048, - "max-position-embeddings": 2048, - "norm": "layernorm", - "pos-emb": "rotary", - "no-weight-tying": true, - "scaled-upper-triang-masked-softmax-fusion": true, - "bias-gelu-fusion": true, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0006, - "betas": [0.9, 0.999], - "eps": 1.0e-8 - } - }, - "zero_optimization": { - "stage": 0, - "allgather_partitions": true, - "allgather_bucket_size": 500000000, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 
500000000, - "contiguous_gradients": true, - "cpu_offload": false - }, - "train_micro_batch_size_per_gpu": 1, - "autotuning_config": { - "enabled": true, - "arg_mappings": { - "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu", - "gradient_accumulation_steps ": "--gradient_accumulation_steps" - } - }, - "data-impl": "mmap", - "split": "949,50,1", - "checkpoint-activations": true, - "checkpoint-num-layers": 1, - "partition-activations": true, - "synchronize-each-layer": true, - "gradient_clipping": 1.0, - "weight-decay": 0.0, - "hidden-dropout": 0.0, - "attention-dropout": 0.0, - "fp16": { - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - "train-iters": 200, - "lr-decay-iters": 320000, - "distributed-backend": "nccl", - "lr-decay-style": "cosine", - "warmup": 0.01, - "save-interval": 10000, - "eval-interval": 1000, - "eval-iters": 10, - "log-interval": 100, - "steps_per_print": 10, - "keep-last-n-checkpoints": 4, - "wall_clock_breakdown": true, - "launcher": "slurm", - "deepspeed_slurm": true, - "comment": "neox" -} diff --git a/configs/autotuning_configs/tune_1-3B.json b/configs/autotuning_configs/tune_1-3B.json deleted file mode 100644 index 8207d0cfc..000000000 --- a/configs/autotuning_configs/tune_1-3B.json +++ /dev/null @@ -1,86 +0,0 @@ -{ - "pipe-parallel-size": 1, - "model-parallel-size": 1, - - "num-layers": 24, - "hidden-size": 2048, - "num-attention-heads": 16, - "seq-length": 2048, - "max-position-embeddings": 2048, - "norm": "layernorm", - "pos-emb": "rotary", - "no-weight-tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - "attention_config": [[["flash"], 24]], - "scaled-upper-triang-masked-softmax-fusion": false, - "bias-gelu-fusion": false, - - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0002, - "betas": [0.9, 0.95], - "eps": 1.0e-8 - } - }, - "min_lr": 0.00002, - - "zero_optimization": { - "stage": 1, - "allgather_partitions": true, - "allgather_bucket_size": 500000000, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 500000000, - "contiguous_gradients": true - }, - "train_micro_batch_size_per_gpu": 1, - "autotuning": { - "enabled": true, - "arg_mappings": { - "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu", - "gradient_accumulation_steps ": "--gradient_accumulation_steps" - } - }, - "data-impl": "mmap", - - "checkpoint-activations": false, - "checkpoint-num-layers": 1, - "partition-activations": true, - "synchronize-each-layer": true, - - "gradient_clipping": 1.0, - "weight-decay": 0.1, - "hidden-dropout": 0, - "attention-dropout": 0, - - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - "train-iters": 320000, - "lr-decay-iters": 320000, - "distributed-backend": "nccl", - "lr-decay-style": "cosine", - "warmup": 0.01, - "checkpoint-factor": 10000, - "eval-interval": 1000, - "eval-iters": 10, - "launcher": "slurm", - "deepspeed_slurm": true, - "no_ssh_check": true, - - "log-interval": 10, - "steps_per_print": 10, - "keep-last-n-checkpoints": 1, - "wall_clock_breakdown": true -} diff --git a/configs/autotuning_configs/tune_6-7B.json b/configs/autotuning_configs/tune_6-7B.json deleted file mode 100644 index 3d7aadf68..000000000 --- a/configs/autotuning_configs/tune_6-7B.json +++ /dev/null @@ -1,77 +0,0 @@ -{ - "pipe-parallel-size": 1, - 
"model-parallel-size": 8, - - "num-layers": 32, - "hidden-size": 4096, - "num-attention-heads": 32, - "seq-length": 2048, - "max-position-embeddings": 2048, - "norm": "layernorm", - "pos-emb": "rotary", - "no-weight-tying": true, - - "scaled-upper-triang-masked-softmax-fusion": false, - "bias-gelu-fusion": false, - - - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00012, - "betas": [0.9, 0.999], - "eps": 1.0e-8 - } - }, - - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": [0, 1, 2, 3] - }, - "data-impl": "mmap", - "split": "949,50,1", - - "checkpoint-activations": true, - "checkpoint-num-layers": 1, - "partition-activations": true, - "synchronize-each-layer": true, - - "gradient_clipping": 1.0, - "weight-decay": 0, - "hidden-dropout": 0, - "attention-dropout": 0, - - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - "train-iters": 100, - "lr-decay-iters": 320000, - "distributed-backend": "nccl", - "lr-decay-style": "cosine", - "warmup": 0.01, - "checkpoint-factor": 10000, - "eval-interval": 1000, - "eval-iters": 10, - "log-interval": 100, - "steps_per_print": 10, - "keep-last-n-checkpoints": 4, - "wall_clock_breakdown": true, - "launcher": "slurm", - "deepspeed_slurm": true, - "no_ssh_check": true, - "comment": "neox", - "autotuning": { - "enabled": true, - "mp_size": 8, - "arg_mappings": { - "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu", - "gradient_accumulation_steps ": "--gradient_accumulation_steps" - } - } -} diff --git a/configs/bf16_125M.yml b/configs/bf16_125M.yml deleted file mode 100644 index 93f808d35..000000000 --- a/configs/bf16_125M.yml +++ /dev/null @@ -1,78 +0,0 @@ -# GPT-2 pretraining setup -{ - # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages - # across the node boundaries ) - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - # model settings - "num_layers": 12, - "hidden_size": 768, - "num_attention_heads": 12, - "seq_length": 2048, - "max_position_embeddings": 2048, - "norm": "layernorm", - "pos_emb": "rotary", - "no_weight_tying": true, - - # these should provide some speedup but takes a while to build, set to true if desired - "scaled_upper_triang_masked_softmax_fusion": false, - "bias_gelu_fusion": false, - - - # optimizer settings - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0006, - "betas": [0.9, 0.999], - "eps": 1.0e-8, - } - }, - # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training - "zero_optimization": { - "stage": 0, - "allgather_partitions": True, - "allgather_bucket_size": 500000000, - "overlap_comm": True, - "reduce_scatter": True, - "reduce_bucket_size": 500000000, - "contiguous_gradients": True, - }, - - # batch / data settings - "train_micro_batch_size_per_gpu": 4, - "data_impl": "mmap", - "split": "949,50,1", - - # activation checkpointing - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - # regularization - "gradient_clipping": 1.0, - "weight_decay": 0.0, - "hidden_dropout": 0.0, - "attention_dropout": 0.0, - - "precision": "bfloat16", - - "fp32_allreduce": True, # without a patch to torch, bf16 models have to do the allreduce in fp32 - # misc. 
training settings - "train_iters": 320000, - "lr_decay_iters": 320000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 10000, - "eval_interval": 1000, - "eval_iters": 10, - - # logging - "log_interval": 100, - "steps_per_print": 10, - "keep_last_n_checkpoints": 4, - "wall_clock_breakdown": true, -} diff --git a/configs/bnb_125M.yml b/configs/bnb_125M.yml deleted file mode 100644 index 66f733803..000000000 --- a/configs/bnb_125M.yml +++ /dev/null @@ -1,85 +0,0 @@ -# GPT-2 pretraining setup -{ - # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages - # across the node boundaries ) - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - # model settings - "num_layers": 12, - "hidden_size": 768, - "num_attention_heads": 12, - "seq_length": 2048, - "max_position_embeddings": 2048, - "norm": "layernorm", - "pos_emb": "rotary", - "no_weight_tying": true, - "use_bnb_optimizer": true, - - # these should provide some speedup but takes a while to build, set to true if desired - "scaled_upper_triang_masked_softmax_fusion": false, - "bias_gelu_fusion": false, - - - # optimizer settings - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0006, - "betas": [0.9, 0.999], - "eps": 1.0e-8, - } - }, - # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training - "zero_optimization": { - "stage": 0, - "allgather_partitions": True, - "allgather_bucket_size": 500000000, - "overlap_comm": True, - "reduce_scatter": True, - "reduce_bucket_size": 500000000, - "contiguous_gradients": True, - }, - - # batch / data settings - "train_micro_batch_size_per_gpu": 4, - "data_impl": "mmap", - "split": "949,50,1", - - # activation checkpointing - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - # regularization - "gradient_clipping": 1.0, - "weight_decay": 0.0, - "hidden_dropout": 0.0, - "attention_dropout": 0.0, - - # precision settings - "fp16": { - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - # misc. training settings - "train_iters": 320000, - "lr_decay_iters": 320000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 10000, - "eval_interval": 1000, - "eval_iters": 10, - - # logging - "log_interval": 100, - "steps_per_print": 10, - "keep_last_n_checkpoints": 4, - "wall_clock_breakdown": true, -} diff --git a/configs/cpu_mock_config.yml b/configs/cpu_mock_config.yml deleted file mode 100644 index 653aa21d8..000000000 --- a/configs/cpu_mock_config.yml +++ /dev/null @@ -1,5 +0,0 @@ -# CPU unit tests should be independent of the presence of GPUs on the test server -# host. This configuration mocks these GPU resources and other dependencies. 
-{ - "global_num_gpus": 1 -} diff --git a/configs/data_mixture.yml b/configs/data_mixture.yml new file mode 100644 index 000000000..7c6a1b9f1 --- /dev/null +++ b/configs/data_mixture.yml @@ -0,0 +1,6 @@ +{ + "train-data-paths": ["/fsx/proj-mathlm/proof-pile_llama/train/arxiv-rp/arxiv-rp_text_document", "/fsx/proj-mathlm/open-web-math-v1.2_llama/train/open-web-math/open-web-math_text_document", "/fsx/proj-mathlm/code-with-proofsteps_llama/train/code-with-proofsteps/code-with-proofsteps_text_document", "/fsx/proj-mathlm/proof-pile_llama/train/pile-sample/pile-sample_text_document", "/fsx/proj-mathlm/code-rp_llama/train/code-rp/code-rp_text_document"], + "train-data-weights": [2, 4, 1, 0.147368, 0.221053], + "valid-data-paths": ["/fsx/proj-mathlm/proof-pile_llama/validation/arxiv-rp/arxiv-rp_text_document", "/fsx/proj-mathlm/open-web-math-v1.2_llama/validation/open-web-math/open-web-math_text_document", "/fsx/proj-mathlm/code-with-proofsteps_llama/validation/code-with-proofsteps/code-with-proofsteps_text_document"], + "test-data-paths": ["/fsx/proj-mathlm/proof-pile_llama/test/arxiv-rp/arxiv-rp_text_document", "/fsx/proj-mathlm/open-web-math-v1.2_llama/test/open-web-math/open-web-math_text_document", "/fsx/proj-mathlm/code-with-proofsteps_llama/test/code-with-proofsteps/code-with-proofsteps_text_document"], +} \ No newline at end of file diff --git a/configs/eleutherai_cluster.yml b/configs/eleutherai_cluster.yml deleted file mode 100644 index 36e75d8b3..000000000 --- a/configs/eleutherai_cluster.yml +++ /dev/null @@ -1,29 +0,0 @@ -# Data paths and options when using EleutherAI cluster -{ - # you may include multiple distinct datasets if desired - "train_data_paths": ["/mnt/ssd-1/data/enwik8/enwik8_text_document"], - "valid_data_paths": ["/mnt/ssd-1/data/enwik8/enwik8_val_text_document"], - "test_data_paths": ["/mnt/ssd-1/data/enwik8/enwik8_test_text_document"], - - # if using multiple datasets, provide weights for them to be sampled with - # "train-data-weights": [1., 2.], - # "test-data-weights": [2., 1.], - # "valid-data-weights": [0.5, 0.4], - - - # If you would like the code to create val and test datasets from your training set use the following instead - # "split" determines the relative size of train, val, and test - - # "split" 995,4,1 - # "data_path": "/mnt/ssd-1/data/enwik8/enwik8_text_document", - - "vocab_file": "/mnt/ssd-1/data/gpt2-vocab.json", - "merge_file": "/mnt/ssd-1/data/gpt2-merges.txt", - "save": "/mnt/ssd-1/checkpoints", - "load": "/mnt/ssd-1/checkpoints", - "tensorboard_dir": "/mnt/ssd-1/tensorboard", - "log_dir": "/mnt/ssd-1/logs", - "wandb_team": "eleutherai", - "wandb_project": "neox", - "wandb_group": "example" -} diff --git a/configs/finetuning_configs/6-9B.yml b/configs/finetuning_configs/6-9B.yml deleted file mode 100755 index c6120d6d5..000000000 --- a/configs/finetuning_configs/6-9B.yml +++ /dev/null @@ -1,89 +0,0 @@ -{ - # finetuning option - "load": "/path/to/checkpoint", - "finetune": true, - - "pipe-parallel-size": 1, - "model-parallel-size": 2, - - "num-layers": 32, - "hidden-size": 4096, - "num-attention-heads": 32, - "seq-length": 2048, - "max-position-embeddings": 2048, - "norm": "layernorm", - "pos-emb": "rotary", - "rotary_pct": 0.25, - "no-weight-tying": true, - "gpt_j_residual": true, - "output_layer_parallelism": "column", - - "attention-config": [[["flash"], 32]], - - "scaled-upper-triang-masked-softmax-fusion": true, - "bias-gelu-fusion": true, - - - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00012, - "betas": [0.9, 0.95], - 
"eps": 1.0e-8 - } - }, - - "min_lr": 0.000012, - - "zero_optimization": { - "stage": 1, - "allgather_partitions": true, - "allgather_bucket_size": 1260000000, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 1260000000, - "contiguous_gradients": true, - "cpu_offload": false - "load_from_fp32_weights": False, # if checkpoint has fp16/bf16 params - }, - - "train_micro_batch_size_per_gpu": 8, - "gradient_accumulation_steps": 2, - "data-impl": "mmap", - - "checkpoint-activations": true, - "checkpoint-num-layers": 1, - "partition-activations": true, - "synchronize-each-layer": true, - - "gradient_clipping": 1.0, - "weight-decay": 0.1, - "hidden-dropout": 0, - "attention-dropout": 0, - - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 12, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - "train-iters": 143000, - "lr-decay-iters": 143000, - "distributed-backend": "nccl", - "lr-decay-style": "cosine", - "warmup": 0.01, - "checkpoint-factor": 1000, - "extra-save-iters": [0,1,2,4,8,16,32,64,128,256,512], - "eval-interval": 143000, - "eval-iters": 10, - - "log-interval": 10, - "steps_per_print": 10, - "wall_clock_breakdown": true, - - "tokenizer_type": "HFTokenizer" -} diff --git a/configs/gen_docs.py b/configs/gen_docs.py deleted file mode 100644 index 08431e6c6..000000000 --- a/configs/gen_docs.py +++ /dev/null @@ -1,93 +0,0 @@ -import sys -import os - -sys.path.append( - os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) -) -from megatron.neox_arguments import neox_args, deepspeed_args -from inspect import getmembers, getsource -from dataclasses import field, is_dataclass -from itertools import tee, zip_longest -import pathlib - - -def pairwise(iterable): - "s -> (s0,s1), (s1,s2), (s2, s3), ..." 
- a, b = tee(iterable) - next(b, None) - return zip_longest(a, b) - - -def get_docs(module): - ARGS_CLASSES = getmembers(module, is_dataclass) - results = {} - for name, dcls in ARGS_CLASSES: - assert is_dataclass(dcls) - src = getsource(dcls) - d = dcls() - loc = 0 - results[name] = {"doc": d.__doc__.strip(), "attributes": {}} - for cur, _next in pairwise(d.__dataclass_fields__.items()): - field_name, field_def = cur - field_type = field_def.type - if hasattr(field_type, "__name__"): - field_type = field_type.__name__ - else: - field_type = str(field_type) - - field_default = field_def.default - - # try to find the field definition - loc = src.find(f" {field_name}:", loc + len(field_name) + 1) - - if _next is not None: - next_field_name, _ = _next - # try to find the next field definition - next_loc = src.find(f"{next_field_name}:", loc + len(field_name)) - else: - next_loc = len(src) - - # try to get the docstring - _src = src[loc:next_loc].strip() - if '"""' in _src: - doc = _src.split('"""')[1].strip() - elif "'''" in _src: - doc = _src.split("'''")[1].strip() - else: - doc = "" - results[name]["attributes"][field_name] = { - "name": field_name, - "type": field_type, - "default": field_default, - "doc": doc, - } - return results - - -def to_md(docs, intro_str=""): - """ - Writes the docs dictionary to markdown format - """ - lines = [] - lines.append(intro_str) - for name, doc in docs.items(): - lines.append(f"## {name}") - lines.append(f"{doc['doc']}") - lines.append("") - for field_name, field_def in doc["attributes"].items(): - # attribute name and type - lines.append(f"- **{field_name}**: {field_def['type']}") - # default value - lines.append(f" Default = {str(field_def['default'])}") - lines.append(f" {field_def['doc']}") - lines.append("") - return "\n\n".join(lines) - - -if __name__ == "__main__": - docs = get_docs(neox_args) - docs.update(get_docs(deepspeed_args)) - intro_str = """Arguments for gpt-neox. All of the following can be specified in your .yml config file(s):\n""" - md = to_md(docs, intro_str=intro_str) - with open(f"{pathlib.Path(__file__).parent.resolve()}/neox_arguments.md", "w") as f: - f.write(md) diff --git a/configs/gmlp_small.yml b/configs/gmlp_small.yml deleted file mode 100644 index 2a5b02d60..000000000 --- a/configs/gmlp_small.yml +++ /dev/null @@ -1,72 +0,0 @@ -# GPT-2 pretraining setup -{ - # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages - # across the node boundaries ) - "pipe_parallel_size": 1, - "model_parallel_size": 1, - "attention_config": [[["gmlp"], "all"]], - - - # model settings - "num_layers": 12, - "hidden_size": 768, # gmlp d_ff defaults to hidden_size * 4 - "gmlp_attn_dim": 64, - "num_attention_heads": 12, # this has no effect with gmlp - and amlp defaults to single head attention. 
- "seq_length": 2048, - "max_position_embeddings": 2048, - "norm": "layernorm", - "pos_emb": "none", - "no_weight_tying": true, - - # optimizer settings - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0006, - "betas": [0.9, 0.999], - "eps": 1.0e_8, - } - }, - - # batch / data settings - "train_micro_batch_size_per_gpu": 4, - "data_impl": "mmap", - "split": "949,50,1", - - # activation checkpointing - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": false, - "synchronize_each_layer": true, - - # regularization - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0.0, - "attention_dropout": 0.0, - - # precision settings - "fp16": { - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - # misc. training settings - "train_iters": 320000, - "lr_decay_iters": 320000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 10000, - "eval_interval": 1000, - "eval_iters": 10, - - # logging - "log_interval": 100, - "steps_per_print": 10, - "keep_last_n_checkpoints": 4, - "wall_clock_breakdown": true, -} diff --git a/configs/llama/13B.yml b/configs/llama/13B.yml deleted file mode 100644 index 305567be1..000000000 --- a/configs/llama/13B.yml +++ /dev/null @@ -1,26 +0,0 @@ -{ - "pipe_parallel_size": 1, - "model_parallel_size": 2, - "make_vocab_size_divisible_by": 1, - - # model settings - "num_layers": 40, - "hidden_size": 5120, - "num_attention_heads": 40, - "seq_length": 2048, - "max_position_embeddings": 2048, - "pos_emb": "rotary", - "rotary_pct": 1, - "no_weight_tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - "norm": "rmsnorm", - "rms_norm_epsilon": 1.0e-6, - - "scaled_upper_triang_masked_softmax_fusion": true, - "bias_gelu_fusion": false, - "use_bias_in_norms": false, - "use_bias_in_attn_linear": false, - "mlp_type": "llama", - "activation": "silu", -} diff --git a/configs/llama/30B.yml b/configs/llama/30B.yml deleted file mode 100644 index 450f8da38..000000000 --- a/configs/llama/30B.yml +++ /dev/null @@ -1,26 +0,0 @@ -{ - "pipe_parallel_size": 1, - "model_parallel_size": 4, - "make_vocab_size_divisible_by": 1, - - # model settings - "num_layers": 60, - "hidden_size": 6656, - "num_attention_heads": 52, - "seq_length": 2048, - "max_position_embeddings": 2048, - "pos_emb": "rotary", - "rotary_pct": 1, - "no_weight_tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - "norm": "rmsnorm", - "rms_norm_epsilon": 1.0e-6, - - "scaled_upper_triang_masked_softmax_fusion": true, - "bias_gelu_fusion": false, - "use_bias_in_norms": false, - "use_bias_in_attn_linear": false, - "mlp_type": "llama", - "activation": "silu", -} diff --git a/configs/llama/65B.yml b/configs/llama/65B.yml deleted file mode 100644 index 85f199ce2..000000000 --- a/configs/llama/65B.yml +++ /dev/null @@ -1,26 +0,0 @@ -{ - "pipe_parallel_size": 1, - "model_parallel_size": 8, - "make_vocab_size_divisible_by": 1, - - # model settings - "num_layers": 80, - "hidden_size": 8192, - "num_attention_heads": 64, - "seq_length": 2048, - "max_position_embeddings": 2048, - "pos_emb": "rotary", - "rotary_pct": 1, - "no_weight_tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - "norm": "rmsnorm", - "rms_norm_epsilon": 1.0e-6, - - "scaled_upper_triang_masked_softmax_fusion": true, - "bias_gelu_fusion": false, - "use_bias_in_norms": false, - "use_bias_in_attn_linear": false, - 
"mlp_type": "llama", - "activation": "silu", -} diff --git a/configs/llama/7B.yml b/configs/llama/7B.yml deleted file mode 100644 index ecbf187a8..000000000 --- a/configs/llama/7B.yml +++ /dev/null @@ -1,26 +0,0 @@ -{ - "pipe_parallel_size": 1, - "model_parallel_size": 1, - "make_vocab_size_divisible_by": 1, - - # model settings - "num_layers": 32, - "hidden_size": 4096, - "num_attention_heads": 32, - "seq_length": 2048, - "max_position_embeddings": 2048, - "pos_emb": "rotary", - "rotary_pct": 1, - "no_weight_tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - "norm": "rmsnorm", - "rms_norm_epsilon": 1.0e-6, - - "scaled_upper_triang_masked_softmax_fusion": true, - "bias_gelu_fusion": false, - "use_bias_in_norms": false, - "use_bias_in_attn_linear": false, - "mlp_type": "llama", - "activation": "silu", -} diff --git a/configs/llemma_34b.yml b/configs/llemma_34b.yml new file mode 100644 index 000000000..88a714575 --- /dev/null +++ b/configs/llemma_34b.yml @@ -0,0 +1,108 @@ +{ + "pipe_parallel_size": 0, + "model_parallel_size": 8, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 48, + "hidden_size": 8192, + "num_attention_heads": 64, + "attention_type": "groupedquery", + "num_kv_heads": 8, + "seq_length": 4096, + "max_position_embeddings": 4096, + "pos_emb": "rotary", + "rotary_pct": 1, + "rotary_emb_base": 1000000, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-5, + + "attention_config": [[["flash"], 48]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "mlp_type": "llama", + "activation": "silu", + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00005, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1260000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1260000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 16, + "data_impl": "mmap", + + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + "precision": "bfloat16", + "fp32_allreduce": true, + "bf16": { + "enabled": true + }, + "data_types": { + "grad_accum_dtype": "fp32" + }, + + "train_iters": 12000, + "lr_decay_iters": 12000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "decay_lr_to": 0.033, + "warmup_iters": 500, + "checkpoint_factor": 250, + "eval_interval": 250, + "eval_iters": 25, + + "log_interval": 1, + "steps_per_print": 1, + "wall_clock_breakdown": true, + + "tokenizer_type": "SPMTokenizer", + "vocab-file": "codellama/tokenizer.model", # use tokenizer.model from Meta CodeLlama download + + "save": "/fsx/proj-mathlm/saved-weights/34b_1epoch", + # "load": "" # set to same as "save" to resume from intermediate finetuning step + "load": "/path/to/converted/codellama_34b_weights_with_mp8", + + "finetune": true, # set to false once resuming from intermediate finetuning step + "checkpoint_validation_with_forward_pass": true, + + + "use_wandb": true, + "wandb_group": "34b-codellama-5e-5lr", + "wandb_project": "math-lm", + "wandb_team": "your-teamname-here", + 
"wandb_host": "https://api.wandb.ai", + + "launcher": "slurm", + "deepspeed_slurm": true +} \ No newline at end of file diff --git a/configs/llemma_7b.yml b/configs/llemma_7b.yml new file mode 100644 index 000000000..c77c1c841 --- /dev/null +++ b/configs/llemma_7b.yml @@ -0,0 +1,105 @@ +{ + "pipe_parallel_size": 0, + "model_parallel_size": 2, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 32, + "hidden_size": 4096, + "num_attention_heads": 32, + "seq_length": 4096, + "max_position_embeddings": 4096, + "pos_emb": "rotary", + "rotary_pct": 1, + "rotary_emb_base": 10000, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-5, + + "attention_config": [[["flash"], 32]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "mlp_type": "llama", + "activation": "silu", + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0001, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1260000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1260000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + "train_micro_batch_size_per_gpu": 4, + "gradient_accumulation_steps": 2, + "data_impl": "mmap", + + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + "precision": "bfloat16", + "fp32_allreduce": true, + "bf16": { + "enabled": true + }, + "data_types": { + "grad_accum_dtype": "fp32" + }, + + "train_iters": 48000, + "lr_decay_iters": 48000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "decay_lr_to": 0.033, + "warmup_iters": 500, + "checkpoint_factor": 500, + "eval_interval": 250, + "eval_iters": 50, + + "log_interval": 1, + "steps_per_print": 1, + "wall_clock_breakdown": true, + + "tokenizer_type": "SPMTokenizer", + "vocab-file": "codellama/tokenizer.model", # use tokenizer.model from Meta CodeLlama download + + "save": "/path/to/save/llema-replication", + #"load": "", # once run is started, to restart from intermediate ckpt use "load" = "save" + "load": "/path/to/converted/codellama_7b_weights_with_mp2", + + "finetune": true, # set to false once resuming from intermediate finetuning step + "checkpoint_validation_with_forward_pass": true, + + "use_wandb": true, + "wandb_group": "codellama_200btok_mp2_32node_2gas_deeperspeed", + "wandb_project": "math-lm", + "wandb_team": "your-teamname-here", + "wandb_host": "https://api.wandb.ai", + + "launcher": "slurm", + "deepspeed_slurm": true +} \ No newline at end of file diff --git a/configs/local_setup.yml b/configs/local_setup.yml deleted file mode 100644 index d031a2ad8..000000000 --- a/configs/local_setup.yml +++ /dev/null @@ -1,30 +0,0 @@ -# Suggested data paths when using GPT-NeoX locally -{ - "data_path": "data/enwik8/enwik8_text_document", - - # or for weighted datasets: - # "train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], - # "test-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], - # "valid-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], - # "train-data-weights": [1., 2.], - # 
"test-data-weights": [2., 1.], - # "valid-data-weights": [0.5, 0.4], - - # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. - # WARNING: setting this to True will override any user provided weights - # "weight_by_num_documents": false, - # "weighted_sampler_alpha": 0.3, - - "vocab_file": "data/gpt2-vocab.json", - "merge_file": "data/gpt2-merges.txt", - - "save": "checkpoints", - "load": "checkpoints", - "checkpoint_validation_with_forward_pass": False, - - "tensorboard_dir": "tensorboard", - "log_dir": "logs", - "use_wandb": True, - "wandb_host": "https://api.wandb.ai", - "wandb_project": "neox" -} diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md deleted file mode 100644 index c50e7ff01..000000000 --- a/configs/neox_arguments.md +++ /dev/null @@ -1,2023 +0,0 @@ -Arguments for gpt-neox. All of the following can be specified in your .yml config file(s): - - -## NeoXArgsLRScheduler - -LR Scheduler Arguments - - - -- **lr_decay_style**: typing.Literal['constant', 'linear', 'cosine', 'exponential'] - - Default = linear - - Learning rate decay function. Choose from 'constant', 'linear', 'cosine', 'exponential'. - - - -- **lr_decay_iters**: int - - Default = None - - Number of iterations to decay learning rate over, If None defaults to --train-iters - - - -- **min_lr**: float - - Default = 0.0 - - Minimum value for learning rate. The scheduler clips values below this threshold. - - - -- **warmup**: float - - Default = 0.01 - - Percentage of total iterations to warmup on (.01 = 1 percent of all training iters). - - - -- **override_lr_scheduler**: bool - - Default = False - - Reset the values of the scheduler (learning rate,warmup iterations, minimum learning rate, maximum number of iterations, and decay style from input arguments and ignore values from checkpoints. Note that all the above values will be reset. - - - -- **use_checkpoint_lr_scheduler**: bool - - Default = False - - Use checkpoint to set the values of the scheduler (learning rate, warmup iterations, minimum learning rate, maximum number of iterations, and decay style from checkpoint and ignore input arguments. - - - -## NeoXArgsLogging - -Logging Arguments - - - -- **use_wandb**: bool - - Default = None - - Flag indicating if wandb is to be used. - - - -- **wandb_group**: str - - Default = None - - Weights and Biases group name - used to group together "runs". - - - -- **wandb_team**: str - - Default = None - - Team name for Weights and Biases. - - - -- **wandb_project**: str - - Default = neox - - wandb project name - - - -- **wandb_host**: str - - Default = https://api.wandb.ai - - url of the wandb host - - - -- **wandb_init_all_ranks**: bool - - Default = False - - Initialize wandb on all ranks. - - - -- **git_hash**: str - - Default = d3e481c - - current git hash of repository - - - -- **log_dir**: str - - Default = None - - Directory to save logs to. - - - -- **tensorboard_dir**: str - - Default = None - - Write TensorBoard logs to this directory. - - - -- **log_interval**: int - - Default = 100 - - Interval between logging. - - - -- **log_grad_pct_zeros**: bool - - Default = False - - Log the percentage of zeros for the gradient of each parameter to wandb / tensorboard (useful for debugging). Needs wandb_init_all_ranks set to True if using pipeline parallelism to log all ranks. 
- - - -- **log_param_norm**: bool - - Default = False - - Log the frob norm of the parameters to wandb / tensorboard (useful for debugging). Needs wandb_init_all_ranks set to True if using pipeline parallelism to log all ranks. - - - -- **log_grad_norm**: bool - - Default = False - - Log the frob norm of the gradients to wandb / tensorboard (useful for debugging). - (N.B - this will only work with pp = 0 for now, as we don't have access to the gradients of the model because - deepspeed.) - - - -- **log_optimizer_states**: bool - - Default = False - - Log the frob norm of the optimizer states to wandb / tensorboard (useful for debugging). - - - -- **log_gradient_noise_scale**: bool - - Default = False - - Whether to log the gradient noise scale when training (cf. https://arxiv.org/abs/1812.06162 for explanation) - - - -- **gradient_noise_scale_n_batches**: int - - Default = 5 - - Number of batches to accumulate gradients for in the gradient noise scale logger. - - - -- **gradient_noise_scale_cpu_offload**: bool - - Default = False - - Whether to offload the buffered gradients to cpu when measuring gradient noise scale. - - - -## NeoXArgsModel - -Model Arguments - - - -- **precision**: typing.Literal['fp16', 'fp32', 'bfloat16'] - - Default = None - - description of the used precision, either one of fp16 or fp32 (and in the future bf16). - - - -- **num_layers**: int - - Default = None - - Number of transformer layers. - - - -- **hidden_size**: int - - Default = None - - Transformer hidden size. - - - -- **num_attention_heads**: int - - Default = None - - Number of transformer attention heads. - - - -- **seq_length**: int - - Default = None - - Maximum sequence length to process. - - - -- **max_position_embeddings**: int - - Default = None - - Maximum number of position embeddings to use. This is the size of position embedding. - - - -- **norm**: typing.Literal['layernorm', 'rmsnorm', 'scalenorm'] - - Default = layernorm - - Normalization layer to use. Choose from "layernorm", "rmsnorm", "scalenorm". - - - -- **layernorm_epsilon**: float - - Default = 1e-05 - - Layer norm epsilon. - - - -- **rms_norm_epsilon**: float - - Default = 1e-08 - - Root mean squared norm epsilon - - - -- **scalenorm_epsilon**: float - - Default = 1e-08 - - Scalenorm epsilon - - - -- **pos_emb**: typing.Literal['learned', 'rotary', 'sinusoidal', 'rpe', 'alibi', 'none'] - - Default = learned - - Type of positional embedding to use - choose from 'learned', 'rotary', 'sinusoidal', 'rpe', 'none' - - - -- **rpe_num_buckets**: int - - Default = 32 - - T5 relative positional encoding number of buckets, default 32. - - - -- **rpe_max_distance**: int - - Default = 128 - - T5 relative positional encoding max distance, default 128. - - - -- **opt_pos_emb_offset**: int - - Default = 0 - - Learned position embedding offset (only used by OPT, where it should be set to 2). - - - -- **no_weight_tying**: bool - - Default = False - - Disables weight tying between embedding weights and final Linear layer - - - -- **attention_config**: list - - Default = None - - Attention configuration for gpt-neox - - The first item in the list specifies the attention type(s), and should be a list of strings. The second item - specifies the number of times to repeat those attention types in the full list. 
- - attention type choices: [global, local, sparse_fixed, sparse_variable, bslongformer, bigbird] - - So a 12 layer network with only global attention could be specified like: - [[[`global`], 12]] - - or a 12 layer network with alternating global / local like: - [[[`global`, `local`], 6]] - - If none is specified, this defaults to - [[[`global`], n_layers]] - - - -- **sparsity_config**: dict - - Default = None - - Sparsity configuration dict as defined in https://www.deepspeed.ai/docs/config-json/#sparse-attention - - Note that since neox is autoregressive, attention is always "unidirectional" and `horizontal_global_attention` is - always false. - - The main difference between our sparsity config and deepspeed's is that `mode` is ignored - since it is instead - specified in attention_config defining each layer. - - An example config is given below: - "sparse_attention": { - "block": 16, - "different_layout_per_head": true, - "num_local_blocks": 4, - "num_global_blocks": 1, - "num_different_global_patterns": 4, - "num_random_blocks": 0, - "local_window_blocks": [4], - "global_block_indices": [0], - "global_block_end_indices": None, - "num_sliding_window_blocks": 3 - } - - - -- **num_unique_layers**: int - - Default = None - - Number of unique transformer layers. num-layers should be divisible by this value. Currently only has an effect when pipe_parallel_size=0. - - - -- **param_sharing_style**: str - - Default = grouped - - Ordering of the shared parameters. For example, for a num-layers=4 and --num-unique-layers=2, we will have the following ordering for two unique layers 1 and 2-: grouped: [1, 2, 1, 2] and spaced: [1, 1, 2, 2]. - - - -- **make_vocab_size_divisible_by**: int - - Default = 128 - - Pad the vocab size to be divisible by this value. This is added for computational efficiency reasons. - - - -- **activation**: typing.Literal['gelu', 'geglu', 'relu', 'softsign', 'swish', 'mish', 'silu'] - - Default = gelu - - Activation function to use - choose from ["gelu", "geglu", "relu", "softsign", "swish", "mish", "silu"] - - - -- **scaled_upper_triang_masked_softmax_fusion**: bool - - Default = False - - Enable fusion of query_key_value_scaling time (upper diagonal) masking and softmax. - - - -- **scaled_masked_softmax_fusion**: bool - - Default = False - - Enable fusion of query_key_value_scaling general masking and softmax. - - - -- **bias_gelu_fusion**: bool - - Default = False - - Enable bias and gelu fusion. - - - -- **bias_dropout_fusion**: bool - - Default = False - - Enable bias and dropout fusion. - - - -- **fp16_lm_cross_entropy**: bool - - Default = False - - Move the cross entropy unreduced loss calculation for lm head to fp16. - - - -- **init_method_std**: float - - Default = 0.02 - - Standard deviation of the zero mean normal distribution used for weight initialization. - - - -- **apply_query_key_layer_scaling**: bool - - Default = False - - Scale Q * K^T by 1 / layer-number. If this flag is set, then it will automatically set attention-softmax-in-fp32 to true - - - -- **use_cpu_initialization**: bool - - Default = False - - If set, affine parallel weights initialization uses CPU - - - -- **attention_softmax_in_fp32**: bool - - Default = False - - Run attention masking and softmax in fp32. 
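For concreteness, several of the attention and fusion options documented above are combined in the `llemma_7b.yml` config added elsewhere in this patch series roughly as follows (condensed; unrelated keys omitted):

```yaml
  # condensed from llemma_7b.yml in this patch series
  "attention_config": [[["flash"], 32]],   # flash attention on all 32 layers
  "scaled_upper_triang_masked_softmax_fusion": true,
  "bias_gelu_fusion": false,
  "activation": "silu",
  "make_vocab_size_divisible_by": 1,
```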
- - - -- **rotary_pct**: float - - Default = 1.0 - - pct of hidden dims to apply rotary positional embedding to - - - -- **rotary_emb_base**: int - - Default = 10000 - - Base for rotary positional embedding - - - -- **init_method**: typing.Literal['normal', 'scaled_normal', 'orthogonal', 'scaled_orthogonal', 'xavier_uniform', 'xavier_normal', 'wang_init', 'small_init'] - - Default = normal - - Init function used on all layers except ff residual outputs - choose from - ["normal", "scaled_normal", "orthogonal", "scaled_orthogonal", "xavier_uniform", "xavier_normal", "wang_init", "small_init"] - - - -- **output_layer_init_method**: typing.Literal['normal', 'scaled_normal', 'orthogonal', 'scaled_orthogonal', 'xavier_uniform', 'xavier_normal', 'wang_init', 'small_init'] - - Default = scaled_normal - - Init function used for ff residual outputs - choose from - ["normal", "scaled_normal", "orthogonal", "scaled_orthogonal", "xavier_uniform", "xavier_normal", "wang_init", "small_init"] - - - -- **gmlp_attn_dim**: int - - Default = 64 - - the dimension of the single head self attention in gmlp model (not used in gpt models). - If None - gmlp model doesn't use attention. - - - -- **gpt_j_residual**: bool - - Default = False - - If false, we use the conventional residual path: - x = x + attn(ln1(x)) - x = x + mlp(ln2(x)) - Otherwise, we use the residual path from GPT-J, which offers a slight speedup: - x = ln(x) - x = x + attn(x) + mlp(x) - - - -- **gpt_j_tied**: bool - - Default = False - - If false, we use - x = x + attn(ln1(x)) + mlp(ln2(x)) - Otherwise, we tie the layer norms - y = ln(x) - x = x + attn(y) + mlp(y) - - - -- **use_bias_in_norms**: bool - - Default = True - - If false, norms (e.g. LayerNorm) will not have bias terms - - - -- **use_bias_in_attn_linear**: bool - - Default = True - - If false, attn_linear (e.g. QKVO) will not have bias terms - - - -- **mlp_type**: str - - Default = regular - - Types: - regular: Megatron implementation - llama: LLaMA MLP (SiLU-gated MLP) - - - -- **soft_prompt_tuning**: dict - - Default = None - - Dictionary configuring the soft prompt tuning parameters. - If enabled, will train *only* the soft prompt, and freezes the rest of the model. - parameters in the dict are: - 'enabled': bool = True # enables soft prompting - 'num_tokens': int = 10 # length of the soft prompt in tokens - 'init_string': str = '' # if provided, initialize the soft prompt with the word embeddings of this string - 'init_range': float = 0.5 # if no init string is provided, initialize the soft prompt with a uniform distribution between -init_range and init_rang - - - -- **output_layer_parallelism**: typing.Literal['column'] - - Default = column - - Parameter controlling whether the output layer is parallelized over the hidden dim (row) or the vocab dim (column) - - - -## NeoXArgsOptimizer - -Optimizer Arguments - - - -- **optimizer_type**: typing.Literal['adam', 'onebitadam', 'cpu_adam', 'cpu_torch_adam', 'sm3', 'madgrad_wd', 'sgd'] - - Default = adam - - Type of optimizer to use. Choose from ['adam', 'onebitadam', 'cpu_adam', 'cpu_torch_adam', 'sm3', 'madgrad_wd', 'sgd'] - NOTE: sgd will use MuSGD from Mup. Mup must be enabled for this optimizer. 
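For reference, the LLaMA-style model options documented in the section above combine as follows in `configs/llama/7B.yml` (removed elsewhere in this patch series; condensed to the relevant keys):

```yaml
  # condensed from configs/llama/7B.yml
  "pos_emb": "rotary",
  "rotary_pct": 1,
  "norm": "rmsnorm",
  "rms_norm_epsilon": 1.0e-6,
  "gpt_j_residual": false,
  "use_bias_in_norms": false,
  "use_bias_in_attn_linear": false,
  "mlp_type": "llama",
  "activation": "silu",
  "output_layer_parallelism": "column",
```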
- - - -- **use_bnb_optimizer**: bool - - Default = False - - Whether to enable the bitsandbytes optimizers - - - -- **zero_stage**: typing.Union[int, typing.List[int], typing.Literal['all']] - - Default = None - - Zero Optimizer stage - - - -- **zero_reduce_scatter**: bool - - Default = None - - Zero: Uses reduce or reduce scatter instead of allreduce to average gradients - - - -- **zero_contiguous_gradients**: bool - - Default = None - - Zero: Copies the gradients to a contiguous buffer as they are produced. Avoids memory fragmentation during backward pass. Only useful when running very large models. - - - -- **zero_reduce_bucket_size**: int - - Default = None - - Zero: Number of elements reduced/allreduced at a time. Limits the memory required for the allgather for large model sizes - - - -- **zero_allgather_bucket_size**: int - - Default = None - - Zero: Number of elements allgathered at a time. Limits the memory required for the allgather for large model sizes - - - -- **lr**: float - - Default = None - - Max Learning rate during training - - - -## NeoXArgsOther - -Misc. Arguments - - - -- **distributed_backend**: str - - Default = nccl - - Which backend to use for distributed training. - - - -- **local_rank**: int - - Default = None - - local rank passed from distributed launcher. - - - -- **rank**: int - - Default = None - - global rank of process being run (passed in via distributed launcher) - - - -- **lazy_mpu_init**: bool - - Default = False - - If set to True, initialize_megatron() skips DDP initialization and returns function to complete it instead. Also turns on use-cpu-initialization flag. This is for external DDP manager. - - - -- **short_seq_prob**: float - - Default = 0.1 - - Probability of producing a short sequence. - - - -- **eod_mask_loss**: bool - - Default = False - - Mask loss for the end of document tokens. - - - -- **adlr_autoresume**: bool - - Default = False - - Enable auto-resume on adlr cluster. - - - -- **adlr_autoresume_interval**: int - - Default = 1000 - - Intervals over which check for auto-resume termination signal - - - -- **seed**: int - - Default = 1234 - - Random seed used for python, numpy, pytorch, and cuda. - - - -- **onnx_safe**: bool - - Default = False - - Use workarounds for known problems with Torch ONNX exporter - - - -- **deepscale**: bool - - Default = False - - (Deprecated) enable DeepSpeed (helper flag for user code, no impact on DeepSpeed backend)' - - - -- **deepscale_config**: str - - Default = None - - (Deprecated) deepscale json configuration file. 
- - - -- **deepspeed_mpi**: bool - - Default = False - - Run via MPI, this will attempt to discover the necessary variables to initialize torch distributed from the MPI environment - - - -- **deepspeed_slurm**: bool - - Default = False - - Run via SLURM, this will attempt to discover the necessary variables to initialize torch distributed from the SLURM environment - - - -- **user_script**: str - - Default = None - - user script to be run - - - -- **iteration**: int - - Default = None - - Set during training - - - -- **do_train**: int - - Default = None - - Set during training - - - -- **do_valid**: int - - Default = None - - Set during training - - - -- **do_test**: int - - Default = None - - Set during training - - - -- **save_iters**: list - - Default = None - - Set during training - - - -- **global_num_gpus**: int - - Default = None - - Set during launching - - - -## NeoXArgsParallelism - -Parallelism Arguments - - - -- **pipe_parallel_size**: int - - Default = 0 - - Number of pipeline parallel stages. Disable with 0. - - - -- **model_parallel_size**: int - - Default = 1 - - Size of the model parallelism. - - - -- **pipe_partition_method**: str - - Default = type:transformer|mlp - - method used to distribute model layers across pipeline stages. Choose from "parameters", which balances the number - of parameters on each pipeline stage, "uniform", which naively balances the number of layers per stage, or - "type:[regex]", which balances layers whose class names match [regex] - - - -- **world_size**: int - - Default = None - - Total world size (i.e number of gpus in cluster). Configured post-launch using distributed launcher - - - -- **is_pipe_parallel**: bool - - Default = False - - flag to determine whether pipeline parallelism is on - shouldn't be set by user, is automatically determined - according to pipeline parallel size. - - - -## NeoXArgsTemplate - -NeoXArgsTemplate() - - - -## NeoXArgsTextgen - -Text Generation arguments - - - -- **text_gen_type**: str - - Default = None - - How to generate text/sample the model. - Options: `unconditional`, `input-file`, `interactive` - - - -- **temperature**: float - - Default = 0.0 - - exponential scaling output distribution ("higher == more risk") - - - -- **top_p**: float - - Default = 0.0 - - Top-p (nucleus) sampling chooses from the smallest possible set of tokens whose cumulative probability exceeds the probability top_p. - - - -- **top_k**: int - - Default = 0 - - integer between 0 and the models vocab size. Filters out any logits with a probability less than that of the top_kth token. - - - -- **return_logits**: bool - - Default = False - - Boolean for whether to return the logits for generated tokens - - - -- **maximum_tokens**: int - - Default = 64 - - maximum number of tokens to be generated - - - -- **prompt_end**: str - - Default = - - - a single prompt's end. Defaults to newline - - - -- **sample_input_file**: str - - Default = None - - Get input from file instead of interactive mode, each line is an input. - - - -- **sample_output_file**: str - - Default = samples.txt - - Output file - - - -- **num_samples**: int - - Default = 1 - - Number of samples to generate unconditionally, defaults to 1 and interactive conditional sampling - - - -- **recompute**: bool - - Default = False - - During generation recompute all attention instead of using previously computed keys/values. 
- Should be set to true for sparse attention models - - - -- **eval_results_prefix**: str - - Default = - - prefix to which to save evaluation results - final fp will be {eval_results_prefix}_eval_results_yy-mm-dd-HH-MM.json - - - -- **eval_tasks**: list - - Default = None - - Tasks to evaluate on using lm_eval_harness - - - -## NeoXArgsTokenizer - -Tokenizer Arguments - - - -- **tokenizer_type**: typing.Literal['GPT2BPETokenizer', 'HFTokenizer', 'HFGPT2Tokenizer', 'SPMTokenizer', 'CharLevelTokenizer', 'TiktokenTokenizer'] - - Default = GPT2BPETokenizer - - Type of tokenizer to use - should be one of ["GPT2BPETokenizer", "HFTokenizer", "HFGPT2Tokenizer", "SPMTokenizer", "CharLevelTokenizer", "TiktokenTokenizer"] - - - -- **padded_vocab_size**: int - - Default = None - - Total (padded) vocabulary size of tokenizer. Configured after launching of training, - as it's dependent on the parallelism size. - - - -## NeoXArgsTraining - -Training Arguments - - - -- **data_path**: str - - Default = None - - Path to combined dataset to split. - - - -- **use_shared_fs**: bool - - Default = True - - Whether to use a shared filesystem for data loading. If False, local rank 0 on all nodes will preprocess the data, - otherwise only global rank 0 will preprocess the data. This is implemented in megatron/data/gpt2_dataset.py::_build_index_mappings. - - - -- **train_data_paths**: list - - Default = None - - List of paths to train datasets. - - - -- **label_data_paths**: list - - Default = None - - List of paths to label datasets (not shifted by 1 yet!). - - - -- **test_data_paths**: list - - Default = None - - List of paths to test datasets. - - - -- **valid_data_paths**: list - - Default = None - - List of paths to validation datasets. - - - -- **train_data_weights**: list - - Default = None - - List of 'weights' that decide how often to sample from each training dataset when blending datasets. If None, defaults to equal weighting. - Should be a list the same length as `train_data_paths` - - - -- **valid_data_weights**: list - - Default = None - - List of 'weights' that decide how often to sample from each validation dataset when blending datasets. If None, defaults to equal weighting. - Should be a list the same length as `valid_data_paths` - - - -- **test_data_weights**: list - - Default = None - - List of 'weights' that decide how often to sample from each test dataset when blending datasets. If None, defaults to equal weighting. - Should be a list the same length as `test_data_paths` - - - -- **weight_by_num_documents**: bool - - Default = False - - If True, Builds dataset weights from a multinomial distribution over groups of data according to the number of - documents in each group. - - WARNING: setting this to True will override any user provided weights - - We sample from a group according to the probability p(L) ∝ |L| ** α, - where p(L) is the probability of sampling from a given group, - |L| is the number of examples in that datapoint, - and α is a coefficient that acts to upsample data from underrepresented groups - - Hence α (`alpha`) allows us to control how much to 'boost' the probability of training on low-resource groups. - - See https://arxiv.org/abs/1911.02116 for more details - - - -- **weighted_sampler_alpha**: float - - Default = 0.3 - - Alpha value for `weight_by_num_documents`. Only has an effect if `weight_by_num_documents` = True. 
- - when alpha = 1, the probability of sampling from a given group = n_samples / total_samples - as alpha -> 0, the probability of sampling from all groups becomes equal, and number of documents has no effect - as alpha -> inf, the probability of sampling from the groups with *the most samples* -> 1 - - - -- **data_impl**: str - - Default = infer - - Implementation of indexed datasets. - - - -- **mmap_warmup**: bool - - Default = False - - Warm up mmap files. - - - -- **save**: str - - Default = None - - Output directory to save checkpoints to. - - - -- **config_files**: dict - - Default = None - - Store of original config files mapping config filename to file contents - - - -- **load**: str - - Default = None - - Directory containing a model checkpoint. - - - -- **checkpoint_validation_with_forward_pass**: bool - - Default = False - - save input and output of a forward pass with the checkpoint and validate after load - - - -- **checkpoint_scale**: typing.Literal['linear', 'log'] - - Default = linear - - How step at which checkpoints are saved should scale. "linear" implies 1 checkpoint will be saved at every multiple of `checkpoint-factor`, - while "log" implies that the number of steps between each checkpoint will be multiplied by `checkpoint-factor` at each step, starting from step 1. - - - -- **checkpoint_factor**: int - - Default = None - - Acts as a multiplier on either the "log" or "linear" checkpoint spacing. - - With `checkpoint-scale="linear"`, `checkpoint-factor=20`, and `train-iters=100`, checkpoints will be saved at - steps [20, 40, 60, 80, 100]. - - With `checkpoint-scale="log"`, `checkpoint-factor=2`, and `train-iters=100`, checkpoints will be saved at - steps [1, 2, 4, 8, 16, 32, 64, 100]. - - Note that the last checkpoint step is always saved. - - - -- **extra_save_iters**: list - - Default = None - - Additional iterations when a checkpoint should be saved. - Must be a list of ints or `None`. - - - -- **no_save_optim**: bool - - Default = False - - Do not save current optimizer. - - - -- **no_save_rng**: bool - - Default = False - - Do not save current rng state. - - - -- **no_load_optim**: bool - - Default = False - - Do not load optimizer when loading checkpoint. - - - -- **no_load_rng**: bool - - Default = False - - Do not load rng state when loading checkpoint. - - - -- **finetune**: bool - - Default = False - - Load model for finetuning. Do not load optimizer or rng state from checkpoint and set iteration to 0. Assumed when loading a release checkpoint. - - - -- **batch_size**: int - - Default = None - - training microbatch size per gpu - - - -- **train_iters**: int - - Default = None - - Number of iterations to run for training. - - - -- **eval_iters**: int - - Default = 100 - - Number of iterations to run for evaluation validation/test for. - - - -- **keep_last_n_checkpoints**: int - - Default = None - - Number of last checkpoints to keep - - - -- **eval_interval**: int - - Default = 1000 - - Interval between running evaluation on validation set. - - - -- **split**: str - - Default = 969, 30, 1 - - Comma_separated list of proportions for training, validation, and test split. For example the split 90,5,5 will use 90% of data for training, 5% for validation and 5% for test. - - - -- **vocab_file**: str - - Default = None - - Path to the vocab file. - - - -- **merge_file**: str - - Default = None - - Path to the BPE merge file. - - - -- **num_workers**: int - - Default = 2 - - Dataloader number of workers. 
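Taken together, a minimal sketch of the checkpoint / evaluation / data arguments documented above, loosely following `llemma_7b.yml` and the finetuning config in this patch series (paths and values are illustrative):

```yaml
  # checkpointing / evaluation / data (paths and values illustrative)
  "save": "checkpoints",
  "load": "/path/to/checkpoint",
  "finetune": true,                      # load weights only, start at iteration 0
  "checkpoint_factor": 500,              # every 500 steps with the default linear checkpoint_scale
  "extra_save_iters": [0, 1, 2, 4, 8],   # optional extra early checkpoints
  "train_iters": 48000,
  "eval_interval": 250,
  "eval_iters": 50,
  "split": "949,50,1",
  "vocab_file": "data/gpt2-vocab.json",
  "merge_file": "data/gpt2-merges.txt",
  "num_workers": 2,
```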
- - - -- **exit_interval**: int - - Default = None - - Exit the program after the iteration is divisible by this value. - - - -- **attention_dropout**: float - - Default = 0.1 - - Post attention dropout probability. - - - -- **hidden_dropout**: float - - Default = 0.1 - - Dropout probability for hidden state transformer. - - - -- **weight_decay**: float - - Default = 0.01 - - Weight decay coefficient for L2 regularization. - - - -- **checkpoint_activations**: bool - - Default = False - - Checkpoint activation to allow for training with larger models, sequences, and batch sizes. - - - -- **checkpoint_num_layers**: int - - Default = 1 - - Chunk size (number of layers) for checkpointing. - - - -- **deepspeed_activation_checkpointing**: bool - - Default = True - - DEPRECATED - TODO: remove - Uses activation checkpointing from deepspeed - - - -- **contiguous_checkpointing**: bool - - Default = False - - Contiguous memory checkpointing for activations. - - - -- **checkpoint_in_cpu**: bool - - Default = False - - Move the activation checkpoints to CPU. - - - -- **synchronize_each_layer**: bool - - Default = False - - does a synchronize at the beginning and end of each checkpointed layer. - - - -- **profile_backward**: bool - - Default = False - - Enables backward pass profiling for checkpointed layers. - - - -- **partition_activations**: bool - - Default = False - - Partition Activations across GPUs before checkpointing. - - - -- **gas**: int - - Default = None - - gradient_accumulation_steps - - - -- **clip_grad**: float - - Default = None - - Gradient clipping based on global L2 norm. - - - -- **hysteresis**: int - - Default = 2 - - hysteresis for dynamic loss scaling - - - -- **dynamic_loss_scale**: bool - - Default = None - - flag indicating whether dynamic loss scale is used - - - -- **loss_scale**: float - - Default = None - - Static loss scaling, positive power of 2 - values can improve fp16 convergence. If None, dynamic loss scaling is used. - - - -- **loss_scale_window**: float - - Default = 1000.0 - - Window over which to raise/lower dynamic scale. - - - -- **min_scale**: float - - Default = 1.0 - - Minimum loss scale for dynamic loss scale. - - - -- **char_level_ppl**: bool - - Default = False - - Whether to calculate character level perplexity as well as token level perplexity. (may incur a time cost) - - - -- **use_mup**: bool - - Default = False - - Whether to use Microsoft's Mup https://github.com/microsoft/mup - - - -- **coord_check**: bool - - Default = False - - Whether to generate a "coord check" plot to verify mup's implementation in neox - - - -- **save_base_shapes**: bool - - Default = False - - Whether to save base shapes for mup. This will save the shapes to the path specified in base-shapes-file. - - - -- **base_shapes_file**: str - - Default = None - - Path to the base shapes to save to/load from - - - -- **mup_init_scale**: float - - Default = 1.0 - - Initialization scale: All the parameters are multiplied by this value - - - -- **mup_attn_temp**: float - - Default = 1.0 - - Attention temperature: Reciprocal of the multiplier applied to the input to attention softmax - - - -- **mup_output_temp**: float - - Default = 1.0 - - Output temperature: Reciprocal of the multiplier applied to the input to softmax that - produces the distribution over output tokens. 
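Stepping back to the dynamic loss-scaling arguments documented above (`loss_scale`, `loss_scale_window`, `hysteresis`, `min_scale`): their interaction is easiest to see in a simplified sketch. The following is illustrative pseudocode for a typical dynamic loss scaler, not the exact DeepSpeed/Apex implementation:

    # Simplified, illustrative dynamic loss scaler (not the DeepSpeed/Apex code).
    class ToyDynamicLossScaler:
        def __init__(self, init_scale=2.0 ** 16, loss_scale_window=1000,
                     hysteresis=2, min_scale=1.0):
            self.scale = init_scale
            self.window = loss_scale_window   # clean steps required before raising
            self.hysteresis = hysteresis      # overflows tolerated before lowering
            self.min_scale = min_scale
            self._good_steps = 0
            self._overflows_left = hysteresis

        def update(self, overflow: bool):
            if overflow:                      # step is skipped; consider lowering
                self._good_steps = 0
                self._overflows_left -= 1
                if self._overflows_left <= 0:
                    self.scale = max(self.scale / 2.0, self.min_scale)
                    self._overflows_left = self.hysteresis
            else:                             # raise only after a clean window
                self._good_steps += 1
                if self._good_steps >= self.window:
                    self.scale *= 2.0
                    self._good_steps = 0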
- - - -- **mup_embedding_mult**: float - - Default = 1.0 - - Scalar by which we multiply the output of the embedding layer - - - -- **mup_rp_embedding_mult**: float - - Default = 1.0 - - Scalar by which we multiply vectors representing relative position - - - -- **mup_width_scale**: int - - Default = 2 - - What to scale width by when creating the delta model for mup - - - -## NeoXArgsDeepspeedConfig - -Args for deepspeed config - Every argument included here will be included in deepspeed config json - As of Mar 8 2023, up to date compared to https://www.deepspeed.ai/docs/config-json/ - - - -- **deepspeed**: bool - - Default = True - - boolean flag to enable DeepSpeed (Always True) - - - -- **train_batch_size**: int - - Default = None - - The effective training batch size. This is the amount of data samples that leads to one step of model update. train_batch_size is aggregated by the batch size that a single GPU processes in one forward/backward pass (a.k.a., train_step_batch_size), the gradient accumulation steps (a.k.a., gradient_accumulation_steps), and the number of GPUs. - - - -- **train_micro_batch_size_per_gpu**: int - - Default = None - - Batch size to be processed by one GPU in one step (without gradient accumulation). When specified, gradient_accumulation_steps is automatically calculated using train_batch_size and number of GPUs. Should not be concurrently specified with gradient_accumulation_steps in the configuration JSON. - - - -- **gradient_accumulation_steps**: int - - Default = 1 - - Number of training steps to accumulate gradients before averaging and applying them. This feature is sometimes useful to improve scalability since it results in less frequent communication of gradients between steps. Another impact of this feature is the ability to train with larger batch sizes per GPU. When specified, train_step_batch_size is automatically calculated using train_batch_size and number of GPUs. Should not be concurrently specified with train_step_batch_size in the configuration JSON. - - - -- **optimizer**: dict - - Default = None - - dict containing the keys type and params - - type: The optimizer name. DeepSpeed natively supports Adam, AdamW, OneBitAdam, Lamb, and OneBitLamb optimizers (See here for details) and will import other optimizers from torch. - - params: Dictionary of parameters to instantiate optimizer. The parameter names must match the optimizer constructor signature (e.g., for Adam). - - - -- **scheduler**: dict - - Default = None - - dict containing the keys type and params - - type: The scheduler name. See here (https://deepspeed.readthedocs.io/en/latest/schedulers.html) for list of support schedulers. - - params: Dictionary of parameters to instantiate scheduler. The parameter names should match scheduler constructor signature. - - - -- **fp32_allreduce**: bool - - Default = False - - During gradient averaging perform allreduce with 32 bit values - - - -- **prescale_gradients**: bool - - Default = False - - Scale gradients before doing allreduce - - - -- **gradient_predivide_factor**: float - - Default = 1.0 - - Before gradient averaging predivide gradients by a specified factor, can sometimes help with fp16 stability when scaling to large numbers of GPUs - - - -- **sparse_gradients**: bool - - Default = False - - Enable sparse compression of torch.nn.Embedding gradients. - - - -- **fp16**: dict - - Default = None - - Configuration for using mixed precision/FP16 training that leverages NVIDIA’s Apex package. 
- - Dictionary options as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#fp16-training-options - - - -- **bf16**: dict - - Default = None - - Configuration for using bfloat16 floating-point format as an alternative to FP16. BFLOAT16 requires hardware support (e.g., NVIDIA A100). Dictionary options as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#bfloat16-training-options - - - -- **amp**: dict - - Default = None - - Configuration for using automatic mixed precision (AMP) training that leverages NVIDIA’s Apex AMP package. - - Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#automatic-mixed-precision-amp-training-options - - - -- **gradient_clipping**: float - - Default = 1.0 - - Enable gradient clipping with provided value - - - -- **zero_optimization**: dict - - Default = None - - Configuration for using ZeRO optimization. - - Multi-level dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#zero-optimization-options - - - -- **curriculum_learning**: dict - - Default = None - - - - - -- **curriculum_seqlen**: int - - Default = 0 - - Internal var for tracking the current seqlen - - - -- **steps_per_print**: int - - Default = 10 - - Print train loss every N steps. - - - -- **wall_clock_breakdown**: bool - - Default = False - - Enable timing of the latency of forward/backward/update training phases. - - - -- **dump_state**: bool - - Default = False - - Print out state information of DeepSpeed object after initialization. - - - -- **flops_profiler**: dict - - Default = None - - Configuration for using FLOPS profiler. - - Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#flops-profiler - - - -- **communication_data_type**: bool - - Default = None - - During gradient averaging, perform communication with selected data type. By default it will be determined by selected regime - - - -- **autotuning**: dict - - Default = None - - Configuration for using autotuning. - - Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#autotuning - - - -- **activation_checkpointing**: dict - - Default = None - - Configuration for using activation checkpointing. - - Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#activation-checkpointing - - - -- **sparse_attention**: dict - - Default = None - - Configuration for using sparse attention. - - Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#sparse-attention - - - -- **data_efficiency**: dict - - Default = None - - Configuration for using data efficiency. - - Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#data-efficiency - - - -- **tensorboard**: dict - - Default = None - - Configuration for using tensorboard. - - Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#monitoring-module-tensorboard-wandb-csv - - - -- **wandb**: dict - - Default = None - - Configuration for using wandb. - - - -- **csv_monitor**: dict - - Default = None - - Configuration for using csv_monitor. - - - -- **elasticity**: dict - - Default = None - - Configuration for using elastic training. 
- - Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#elastic-training-config-v01-and-v02 - - - -- **comms_logger**: dict - - Default = None - - Configuration for using communication logger. - - Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#communication-logging - - - -- **compression_training**: dict - - Default = None - - Configuration for using compression training. - - Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#compression - - - -- **checkpoint**: dict - - Default = None - - Configuration for using checkpointing. - - Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#checkpoint-options - - - -- **data_types**: dict - - Default = None - - Configuration for using data types. - - Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#data-type-options - - - -- **deepspeed_extra_args**: dict - - Default = None - - Dictionary of extra arguments to be included in the yaml config file. This can be used for any argument not included in the above list. - - - -## NeoXArgsDeepspeedRunner - -Args for deepspeed runner (deepspeed.launcher.runner). - Every argument included here will be passed as command line argument to deepspeed.launcher.runner - - - -- **hostfile**: str - - Default = None - - list of hostnames / ssh aliases and the number of GPUs per host - - example file contents: - worker-1 slots=4 - worker-2 slots=4 - 127.0.0 slots=4 - 127.0.1 slots=4 - - - -- **include**: str - - Default = None - - Specify hardware resources to use during execution. String format is `NODE_SPEC[@NODE_SPEC ...]` where `NODE_SPEC=NAME[:SLOT[,SLOT ...]]`. If `:SLOT` is omitted, include all slots on that host. Example: `"worker-0@worker-1:0,2"` will use all slots. on `worker-0` and slots `[0, 2]` on `worker-1`. - - - -- **exclude**: str - - Default = None - - Specify hardware resources to NOT use during execution. Same format as include - - - -- **num_nodes**: int - - Default = -1 - - Total number of worker nodes to run on, this will use the top N hosts from the given hostfile. -1 will use all. - - - -- **num_gpus**: int - - Default = None - - Max number of GPUs to use on each node, will use [0:N) GPU ids on each node. None / not specifying a value will use all. - - - -- **master_port**: int - - Default = 29500 - - Port used by PyTorch distributed for communication during training. - - - -- **master_addr**: str - - Default = None - - IP address of node 0, will be inferred via 'hostname -I' if not specified. - - - -- **launcher**: typing.Literal['pdsh', 'openmpi', 'mvapich', 'slurm'] - - Default = pdsh - - Launcher backend for multi-node training. Options currently include PDSH, OpenMPI, MVAPICH. - - - -- **force_multi**: bool - - Default = False - - Force multi-node training even if only one node is specified. - - - -- **detect_nvlink_pairs**: bool - - Default = False - - If true, autodetects nvlink pairs and remaps cuda visible devices to place them next to each other. This is an Eleuther addition to deepspeed, and should speed up model parallel training on setups with nvlink pairs when mp=2. - - - -- **autotuning_run**: str - - Default = None - - Either "tune", "run", or `None`. - - - -- **no_ssh_check**: bool - - Default = False - - If true, overrides the default check where DeepSpeed confirms that the headnode is accessible via ssh. 
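To illustrate the `include`/`exclude` NODE_SPEC format described above, here is a hypothetical helper (not part of DeepSpeed or NeoX) that parses the documented syntax:

    def parse_node_specs(spec: str):
        """'worker-0@worker-1:0,2' -> {'worker-0': None, 'worker-1': [0, 2]};
        None means "use all slots on that host"."""
        nodes = {}
        for node in spec.split("@"):
            name, _, slots = node.partition(":")
            nodes[name] = [int(s) for s in slots.split(",")] if slots else None
        return nodes

    print(parse_node_specs("worker-0@worker-1:0,2"))
    # {'worker-0': None, 'worker-1': [0, 2]}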
- - - -- **comment**: str - - Default = None - - Adds a `--comment` to the DeepSpeed launch command. In DeeperSpeed this is passed on to the SlurmLauncher as well. Sometime necessary for cluster rules, or so I've heard. - diff --git a/configs/pythia/1-4B.yml b/configs/pythia/1-4B.yml deleted file mode 100755 index bfb8d4bc7..000000000 --- a/configs/pythia/1-4B.yml +++ /dev/null @@ -1,85 +0,0 @@ -{ - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - "num_layers": 24, - "hidden_size": 2048, - "num_attention_heads": 16, - "seq_length": 2048, - "max_position_embeddings": 2048, - "pos_emb": "rotary", - "rotary_pct": 0.25, - "no_weight_tying": true, - "gpt_j_residual": true, - "output_layer_parallelism": "column", - - "attention_config": [[["flash"], 24]], - - "scaled_upper_triang_masked_softmax_fusion": true, - "bias_gelu_fusion": true, - - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0002, - "betas": [0.9, 0.95], - "eps": 1.0e-8 - } - }, - "min_lr": 0.00002, - - "zero_optimization": { - "stage": 1, - "allgather_partitions": true, - "allgather_bucket_size": 500000000, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 500000000, - "contiguous_gradients": true, - "cpu_offload": false - }, - - "train_micro_batch_size_per_gpu": 16, - "gas": 1, - "data_impl": "mmap", - "num_workers": 1, - - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 12, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - "train_iters": 143000, - "lr_decay_iters": 143000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 1000, - "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], - "eval_interval": 143000, - "eval_iters": 10, - - - "log_interval": 10, - "steps_per_print": 10, - "wall_clock_breakdown": true, - "tokenizer_type": "HFTokenizer" - } diff --git a/configs/pythia/12B.yml b/configs/pythia/12B.yml deleted file mode 100755 index 21b67521d..000000000 --- a/configs/pythia/12B.yml +++ /dev/null @@ -1,84 +0,0 @@ -{ - "pipe_parallel_size": 1, - "model_parallel_size": 4, - - "num_layers": 36, - "hidden_size": 5120, - "num_attention_heads": 40, - "seq_length": 2048, - "max_position_embeddings": 2048, - "norm": "layernorm", - "pos_emb": "rotary", - "rotary_pct": 0.25, - "no_weight_tying": true, - "gpt_j_residual": true, - "output_layer_parallelism": "column", - - "attention_config": [[["flash"], 36]], - - "scaled_upper_triang_masked_softmax_fusion": true, - "bias_gelu_fusion": true, - - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00012, - "betas": [0.9, 0.95], - "eps": 1.0e-8 - } - }, - "min_lr": 0.000012, - - "zero_optimization": { - "stage": 1, - "allgather_partitions": true, - "allgather_bucket_size": 1260000000, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 1260000000, - "contiguous_gradients": true, - "cpu_offload": false - }, - - "train_micro_batch_size_per_gpu": 8, - "gradient_accumulation_steps": 2, - "data_impl": "mmap", - - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - "gradient_clipping": 1.0, - "weight_decay": 0.1, - 
"hidden_dropout": 0, - "attention_dropout": 0, - - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 12, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - "train_iters": 143000, - "lr_decay_iters": 143000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 1000, - "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], - "eval_interval": 143000, - "eval_iters": 10, - - "log_interval": 10, - "steps_per_print": 10, - "wall_clock_breakdown": true, - - "log_grad_norm": true, - - "tokenizer_type": "HFTokenizer" -} diff --git a/configs/pythia/160M.yml b/configs/pythia/160M.yml deleted file mode 100755 index 2f6abdef5..000000000 --- a/configs/pythia/160M.yml +++ /dev/null @@ -1,85 +0,0 @@ -{ - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - "num_layers": 12, - "hidden_size": 768, - "num_attention_heads": 12, - "seq_length": 2048, - "max_position_embeddings": 2048, - "pos_emb": "rotary", - "rotary_pct": 0.25, - "no_weight_tying": true, - "gpt_j_residual": true, - "output_layer_parallelism": "column", - - "attention_config": [[["flash"], 12]], - - "scaled_upper_triang_masked_softmax_fusion": true, - "bias_gelu_fusion": true, - - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0006, - "betas": [0.9, 0.95], - "eps": 1.0e-8 - } - }, - "min_lr": 0.00006, - - "zero_optimization": { - "stage": 1, - "allgather_partitions": true, - "allgather_bucket_size": 500000000, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 500000000, - "contiguous_gradients": true, - "cpu_offload": false - }, - - "train_micro_batch_size_per_gpu": 32, - "gas": 1, - "data_impl": "mmap", - "num_workers": 1, - - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 12, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - "train_iters": 143000, - "lr_decay_iters": 143000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 1000, - "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], - "eval_interval": 143000, - "eval_iters": 10, - - "log_interval": 10, - "steps_per_print": 10, - "wall_clock_breakdown": true, - - "tokenizer_type": "HFTokenizer" -} diff --git a/configs/pythia/1B.yml b/configs/pythia/1B.yml deleted file mode 100755 index 78fc28946..000000000 --- a/configs/pythia/1B.yml +++ /dev/null @@ -1,86 +0,0 @@ -{ - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - "num_layers": 16, - "hidden_size": 2048, - "num_attention_heads": 8, - "seq_length": 2048, - "max_position_embeddings": 2048, - "pos_emb": "rotary", - "rotary_pct": 0.25, - "no_weight_tying": true, - "gpt_j_residual": true, - "output_layer_parallelism": "column", - - "scaled_upper_triang_masked_softmax_fusion": true, - "bias_gelu_fusion": true, - - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00025, - "betas": [0.9, 0.95], - "eps": 1.0e-8 - } - }, - "min_lr": 0.000025, - - "zero_optimization": { - "stage": 0, - "allgather_partitions": true, - "allgather_bucket_size": 500000000, - "overlap_comm": true, - 
"reduce_scatter": true, - "reduce_bucket_size": 500000000, - "contiguous_gradients": true, - "cpu_offload": false - }, - - "fp16": { - "enabled": true, - "type": "bfloat16", - "auto_cast": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 12, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - "fp32_allreduce": true, - - "train_micro_batch_size_per_gpu": 4, - "gradient_accumulation_steps": 4, - "data_impl": "mmap", - "num_workers": 1, - - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - "train_iters": 143000, - "lr_decay_iters": 143000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 1000, - "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], - "eval_interval": 143000, - "eval_iters": 10, - - "log_interval": 10, - "steps_per_print": 10, - "wall_clock_breakdown": true, - - "tokenizer_type": "HFTokenizer" -} diff --git a/configs/pythia/2-8B.yml b/configs/pythia/2-8B.yml deleted file mode 100755 index 04427e9fd..000000000 --- a/configs/pythia/2-8B.yml +++ /dev/null @@ -1,87 +0,0 @@ -{ - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - "num_layers": 32, - "hidden_size": 2560, - "num_attention_heads": 32, - "seq_length": 2048, - "max_position_embeddings": 2048, - "pos_emb": "rotary", - "rotary_pct": 0.25, - "no_weight_tying": true, - "gpt_j_residual": true, - "output_layer_parallelism": "column", - - "attention_config": [[["flash"], 32]], - - "scaled_upper_triang_masked_softmax_fusion": true, - "bias_gelu_fusion": true, - - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00016, - "betas": [0.9, 0.95], - "eps": 1.0e-8 - } - }, - "min_lr": 0.000016, - - "zero_optimization": { - "stage": 1, - "allgather_partitions": true, - "allgather_bucket_size": 500000000, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 500000000, - "contiguous_gradients": true, - "cpu_offload": false - }, - - "train_micro_batch_size_per_gpu": 8, - "gradient_accumulation_steps": 2, - "data_impl": "mmap", - "num_workers": 1, - - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 12, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - "train_iters": 143000, - "lr_decay_iters": 143000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 1000, - "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], - "eval_interval": 40000, - "eval_iters": 10, - - "log_grad_norm": true, - - "log_interval": 10, - "steps_per_print": 10, - "wall_clock_breakdown": true, - - "tokenizer_type": "HFTokenizer" -} diff --git a/configs/pythia/410M.yml b/configs/pythia/410M.yml deleted file mode 100755 index 95afe9cd9..000000000 --- a/configs/pythia/410M.yml +++ /dev/null @@ -1,85 +0,0 @@ -{ - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - "num_layers": 24, - "hidden_size": 1024, - "num_attention_heads": 16, - "seq_length": 2048, - "max_position_embeddings": 2048, - "pos_emb": "rotary", - "rotary_pct": 0.25, - 
"no_weight_tying": true, - "gpt_j_residual": true, - "output_layer_parallelism": "column", - - "attention_config": [[["flash"], 24]], - - "scaled_upper_triang_masked_softmax_fusion": true, - "bias_gelu_fusion": true, - - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0003, - "betas": [0.9, 0.95], - "eps": 1.0e-8 - } - }, - "min_lr": 0.00003, - - "zero_optimization": { - "stage": 1, - "allgather_partitions": true, - "allgather_bucket_size": 500000000, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 500000000, - "contiguous_gradients": true, - "cpu_offload": false - }, - - "train_micro_batch_size_per_gpu": 32, - "gas": 1, - "data_impl": "mmap", - "num_workers": 1, - - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 12, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - "train_iters": 143000, - "lr_decay_iters": 143000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 1000, - "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], - "eval_interval": 143000, - "eval_iters": 10, - - "log_interval": 10, - "steps_per_print": 10, - "wall_clock_breakdown": true, - - "tokenizer_type": "HFTokenizer" -} diff --git a/configs/pythia/6-9B.yml b/configs/pythia/6-9B.yml deleted file mode 100755 index 869129f30..000000000 --- a/configs/pythia/6-9B.yml +++ /dev/null @@ -1,84 +0,0 @@ -{ - "pipe_parallel_size": 1, - "model_parallel_size": 2, - - "num_layers": 32, - "hidden_size": 4096, - "num_attention_heads": 32, - "seq_length": 2048, - "max_position_embeddings": 2048, - "norm": "layernorm", - "pos_emb": "rotary", - "rotary_pct": 0.25, - "no_weight_tying": true, - "gpt_j_residual": true, - "output_layer_parallelism": "column", - - "attention_config": [[["flash"], 32]], - - "scaled_upper_triang_masked_softmax_fusion": true, - "bias_gelu_fusion": true, - - - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00012, - "betas": [0.9, 0.95], - "eps": 1.0e-8 - } - }, - - "min_lr": 0.000012, - - "zero_optimization": { - "stage": 1, - "allgather_partitions": true, - "allgather_bucket_size": 1260000000, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 1260000000, - "contiguous_gradients": true, - "cpu_offload": false - }, - - "train_micro_batch_size_per_gpu": 8, - "gradient_accumulation_steps": 2, - "data_impl": "mmap", - - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 12, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - "train_iters": 143000, - "lr_decay_iters": 143000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 1000, - "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], - "eval_interval": 143000, - "eval_iters": 10, - - "log_interval": 10, - "steps_per_print": 10, - "wall_clock_breakdown": true, - - "tokenizer_type": "HFTokenizer" -} diff --git 
a/configs/pythia/70M.yml b/configs/pythia/70M.yml deleted file mode 100755 index a58553b4e..000000000 --- a/configs/pythia/70M.yml +++ /dev/null @@ -1,85 +0,0 @@ -{ - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - "num_layers": 6, - "hidden_size": 512, - "num_attention_heads": 8, - "seq_length": 2048, - "max_position_embeddings": 2048, - "pos_emb": "rotary", - "rotary_pct": 0.25, - "no_weight_tying": true, - "gpt_j_residual": true, - "output_layer_parallelism": "column", - - "attention_config": [[["flash"], 6]], - - "scaled_upper_triang_masked_softmax_fusion": true, - "bias_gelu_fusion": true, - - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.001, - "betas": [0.9, 0.95], - "eps": 1.0e-8 - } - }, - "min_lr": 0.0001, - - "zero_optimization": { - "stage": 1, - "allgather_partitions": true, - "allgather_bucket_size": 500000000, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 500000000, - "contiguous_gradients": true, - "cpu_offload": false - }, - - "train_micro_batch_size_per_gpu": 32, - "gas": 1, - "data_impl": "mmap", - "num_workers": 1, - - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 12, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - "train_iters": 143000, - "lr_decay_iters": 143000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 1000, - "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], - "eval_interval": 100000, - "eval_iters": 10, - - "log_interval": 10, - "steps_per_print": 10, - "wall_clock_breakdown": true, - - "tokenizer_type": "HFTokenizer" -} diff --git a/configs/slurm_125M.yml b/configs/slurm_125M.yml deleted file mode 100644 index c6f388b9d..000000000 --- a/configs/slurm_125M.yml +++ /dev/null @@ -1,64 +0,0 @@ -{ - "pipe_parallel_size": 1, - "model_parallel_size": 1, - "num_layers": 12, - "hidden_size": 768, - "num_attention_heads": 12, - "seq_length": 2048, - "max_position_embeddings": 2048, - "norm": "layernorm", - "pos_emb": "rotary", - "no_weight_tying": true, - "scaled_upper_triang_masked_softmax_fusion": true, - "bias_gelu_fusion": true, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0006, - "betas": [0.9, 0.999], - "eps": 1.0e-8 - } - }, - "zero_optimization": { - "stage": 0, - "allgather_partitions": true, - "allgather_bucket_size": 500000000, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 500000000, - "contiguous_gradients": true - }, - "train_micro_batch_size_per_gpu": 4, - "data_impl": "mmap", - "split": "949,50,1", - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - "gradient_clipping": 1.0, - "weight_decay": 0.0, - "hidden_dropout": 0.0, - "attention_dropout": 0.0, - "fp16": { - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - "train_iters": 320000, - "lr_decay_iters": 320000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 10000, - "eval_interval": 1000, - "eval_iters": 10, - "log_interval": 100, - "steps_per_print": 10, - 
"keep_last_n_checkpoints": 4, - "wall_clock_breakdown": true, - "launcher": "slurm", - "deepspeed_slurm": true, - "comment": "neox" -} diff --git a/configs/slurm_local.json b/configs/slurm_local.json deleted file mode 100644 index 36e16089b..000000000 --- a/configs/slurm_local.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "vocab-file": "data/gpt2-vocab.json", - "merge-file": "data/gpt2-merges.txt", - "save": "checkpoints", - "checkpoint_validation_with_forward_pass": false, - "tensorboard-dir": "tensorboard", - "log-dir": "logs", - "use_wandb": true, - "wandb_host": "https://api.wandb.ai", - "wandb_project": "neox" -} diff --git a/configs/slurm_local.yml b/configs/slurm_local.yml deleted file mode 100644 index 1a2b73aba..000000000 --- a/configs/slurm_local.yml +++ /dev/null @@ -1,12 +0,0 @@ -{ - "data_path": "data/enwik8/enwik8_text_document", - "vocab_file": "data/gpt2-vocab.json", - "merge_file": "data/gpt2-merges.txt", - "save": "checkpoints", - "checkpoint_validation_with_forward_pass": false, - "tensorboard_dir": "tensorboard", - "log_dir": "logs", - "use_wandb": true, - "wandb_host": "https://api.wandb.ai", - "wandb_project": "neox" -} diff --git a/configs/sparse.yml b/configs/sparse.yml deleted file mode 100644 index 7251c88b7..000000000 --- a/configs/sparse.yml +++ /dev/null @@ -1,15 +0,0 @@ -# Add this to your config for sparse attention every other layer -{ - "attention_config": [[["local", "global"], "all"]], - - # sparsity config: - # (these are the defaults for local sliding window sparsity, training will work without this here, but it's left in for - # illustrative purposes) - # see https://www.deepspeed.ai/tutorials/sparse-attention/#how-to-config-sparsity-structures for - # more detailed config instructions and available parameters - - "sparsity_config": { - "block": 16, # block size - "num_local_blocks": 32, - } -} diff --git a/configs/text_generation.yml b/configs/text_generation.yml deleted file mode 100644 index 5a49d61e5..000000000 --- a/configs/text_generation.yml +++ /dev/null @@ -1,21 +0,0 @@ -# Parameters used for text generation -# Make sure `load` is specified somewhere else -{ - # Text gen type: `input-file`, `unconditional` or `interactive` - "text_gen_type": "unconditional", - - # Params for all - "maximum_tokens": 102, - "prompt_end": "\n", - "temperature": 1.0, - "top_p": 0.0, - "top_k": 0, - "recompute": false, - - # `unconditional`: samples - "num_samples": 10, - - # input/output file - "sample_input_file": "sample_input.txt", - "sample_output_file": "sample_output.txt", -} diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 000000000..e9d301ca3 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,116 @@ +absl-py==1.4.0 +aiohttp==3.8.4 +aiosignal==1.3.1 +appdirs==1.4.4 +async-timeout==4.0.2 +attrs==23.1.0 +best-download==0.0.9 +boto3==1.28.22 +botocore==1.31.22 +certifi==2023.5.7 +chardet==5.1.0 +charset-normalizer==3.1.0 +click==8.1.4 +cmake==3.26.4 +colorama==0.4.6 +CPCargo @ git+https://github.com/samikama/CPCargo@efbf0a5f2ad893c0eee4caae6098001b74be62d8 +DataProperty==1.0.0 +datasets==2.13.1 +DeepSpeed @ git+https://github.com/EleutherAI/DeeperSpeed.git@new-fix#egg=deepspeed +dill==0.3.6 +docker-pycreds==0.4.0 +einops==0.6.1 +filelock==3.12.2 +flash-attn==2.0.0.post1 +frozenlist==1.3.3 +fsspec==2023.6.0 +ftfy==6.1.1 +fused-kernels @ file:///fsx/hailey/math-lm/gpt-neox/megatron/fused_kernels +gitdb==4.0.10 +GitPython==3.1.32 +hf_transfer==0.1.3 +hjson==3.1.0 +huggingface-hub==0.16.4 +idna==3.4 +Jinja2==3.1.2 +jmespath==1.0.1 +joblib==1.3.1 
+jsonlines==3.1.0 +lit==16.0.6 +lm-dataformat @ git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836 +lm-eval==0.3.0 +MarkupSafe==2.1.3 +mbstrdecoder==1.1.3 +mpmath==1.3.0 +multidict==6.0.4 +multiprocess==0.70.14 +networkx==3.1 +ninja==1.11.1 +nltk==3.8.1 +numexpr==2.8.4 +numpy==1.25.0 +nvidia-cublas-cu11==11.10.3.66 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +nvidia-cuda-runtime-cu11==11.7.99 +nvidia-cudnn-cu11==8.5.0.96 +nvidia-cufft-cu11==10.9.0.58 +nvidia-curand-cu11==10.2.10.91 +nvidia-cusolver-cu11==11.4.0.1 +nvidia-cusparse-cu11==11.7.4.91 +nvidia-nccl-cu11==2.14.3 +nvidia-nvtx-cu11==11.7.91 +openai==0.27.8 +packaging==23.1 +pandas==2.0.3 +pathtools==0.1.2 +pathvalidate==3.0.0 +portalocker==2.7.0 +protobuf==4.23.4 +psutil==5.9.5 +py-cpuinfo==9.0.0 +pyarrow==12.0.1 +pybind11==2.10.4 +pycountry==22.3.5 +pydantic==1.10.11 +pytablewriter==1.0.0 +python-dateutil==2.8.2 +pytz==2023.3 +PyYAML==6.0 +regex==2023.6.3 +rehash==1.0.1 +requests==2.31.0 +rouge-score==0.1.2 +s3transfer==0.6.1 +sacrebleu==1.5.0 +safetensors==0.3.1 +scikit-learn==1.3.0 +scipy==1.11.1 +sentencepiece==0.1.99 +sentry-sdk==1.28.1 +setproctitle==1.3.2 +six==1.16.0 +smmap==5.0.0 +sqlitedict==2.1.0 +sympy==1.12 +tabledata==1.3.1 +tcolorpy==0.1.3 +threadpoolctl==3.1.0 +tiktoken==0.4.0 +tokenizers==0.13.3 +torch==2.0.1 +tqdm==4.65.0 +tqdm-multiprocess==0.0.11 +transformers==4.31.0 +triton==2.0.0 +typepy==1.3.1 +typing_extensions==4.7.1 +tzdata==2023.3 +ujson==5.8.0 +urllib3==1.26.16 +wandb==0.15.5 +watchdog==3.0.0 +wcwidth==0.2.6 +xxhash==3.2.0 +yarl==1.9.2 +zstandard==0.21.0 \ No newline at end of file diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt deleted file mode 100644 index 6c58478f3..000000000 --- a/requirements/requirements-dev.txt +++ /dev/null @@ -1,7 +0,0 @@ -autopep8>=1.5.6 -clang-format>=13.0.1 -pre-commit>=2.17.0 -pytest>=6.2.3 -pytest-cov>=2.11.1 -pytest-forked>=1.3.0 -pytest-xdist diff --git a/requirements/requirements-flashattention.txt b/requirements/requirements-flashattention.txt deleted file mode 100644 index 8397ebe75..000000000 --- a/requirements/requirements-flashattention.txt +++ /dev/null @@ -1 +0,0 @@ -flash-attn==2.0.0.post1 diff --git a/requirements/requirements-onebitadam.txt b/requirements/requirements-onebitadam.txt deleted file mode 100644 index 349e3b39a..000000000 --- a/requirements/requirements-onebitadam.txt +++ /dev/null @@ -1 +0,0 @@ -cupy-cuda111>=8.6.0 diff --git a/requirements/requirements-sparseattention.txt b/requirements/requirements-sparseattention.txt deleted file mode 100644 index 09386fdcb..000000000 --- a/requirements/requirements-sparseattention.txt +++ /dev/null @@ -1 +0,0 @@ -triton==2.0.0.dev20221202 diff --git a/requirements/requirements-tensorboard.txt b/requirements/requirements-tensorboard.txt deleted file mode 100644 index 6d5967364..000000000 --- a/requirements/requirements-tensorboard.txt +++ /dev/null @@ -1 +0,0 @@ -tensorboard==2.13.0 diff --git a/requirements/requirements-wandb.txt b/requirements/requirements-wandb.txt deleted file mode 100644 index 1df18b051..000000000 --- a/requirements/requirements-wandb.txt +++ /dev/null @@ -1 +0,0 @@ -wandb>=0.10.28 diff --git a/requirements/requirements.txt b/requirements/requirements.txt deleted file mode 100644 index 443f162e6..000000000 --- a/requirements/requirements.txt +++ /dev/null @@ -1,17 +0,0 @@ -best_download -git+https://github.com/EleutherAI/DeeperSpeed.git#egg=deepspeed -ftfy>=6.0.1 
-git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836 -huggingface_hub>=0.11.0 -hf-transfer>=0.1.3 -lm_eval>=0.3.0 -mpi4py>=3.0.3 -numpy>=1.22.0 -pybind11>=2.6.2 -regex -sentencepiece -six -tiktoken>=0.1.2 -tokenizers>=0.12.1 -transformers>=4.24.0 -git+https://github.com/samikama/CPCargo@main From 6bc724b90895fb7de7d324a385e5ca6992d54e9e Mon Sep 17 00:00:00 2001 From: zhangir-azerbayev Date: Thu, 5 Oct 2023 17:51:48 -0400 Subject: [PATCH 17/64] update with recent PR --- 34b_launch_script.sh | 7 +++---- configs/data_mixture.yml | 8 ++++---- configs/llemma_34b.yml | 6 +++--- configs/llemma_7b.yml | 6 +++--- 4 files changed, 13 insertions(+), 14 deletions(-) diff --git a/34b_launch_script.sh b/34b_launch_script.sh index 3718c5862..2dd0166b8 100644 --- a/34b_launch_script.sh +++ b/34b_launch_script.sh @@ -10,9 +10,8 @@ #SBATCH --open-mode=append #SBATCH --requeue -# setup the environment using the script we created before -source /fsx/proj-mathlm/conda_setup_deeperspeed.sh -#source /fsx/quentin/setup.sh +# setup the conda environment and paths to NCCL install. +source /path/to/conda_setup_script.sh ds_report @@ -36,4 +35,4 @@ export DLTS_HOSTFILE=path/to/hostfile/hosts_$SLURM_JOBID # launch distributed job. If using `"deepspeed_slurm": true` and `"launcher": "slurm"` on a SLURM cluster, # then NeoX will handle the creation of a distributed run across 256 gpus. python $TRAIN_PATH/deepy.py $TRAIN_PATH/train.py \ - --conf_dir /path/to/math-lm/pretraining llemma_34b.yml data_mixture.yml \ No newline at end of file + --conf_dir /path/to/math-lm/pretraining llemma_34b.yml data_mixture.yml diff --git a/configs/data_mixture.yml b/configs/data_mixture.yml index 7c6a1b9f1..637ece59c 100644 --- a/configs/data_mixture.yml +++ b/configs/data_mixture.yml @@ -1,6 +1,6 @@ { - "train-data-paths": ["/fsx/proj-mathlm/proof-pile_llama/train/arxiv-rp/arxiv-rp_text_document", "/fsx/proj-mathlm/open-web-math-v1.2_llama/train/open-web-math/open-web-math_text_document", "/fsx/proj-mathlm/code-with-proofsteps_llama/train/code-with-proofsteps/code-with-proofsteps_text_document", "/fsx/proj-mathlm/proof-pile_llama/train/pile-sample/pile-sample_text_document", "/fsx/proj-mathlm/code-rp_llama/train/code-rp/code-rp_text_document"], + "train-data-paths": ["/path/to/proof-pile_llama_tokenizer/train/arxiv-rp/arxiv-rp_text_document", "/path/to/open-web-math-v1.2_llama/train/open-web-math/open-web-math_text_document", "/path/to/proof-pile_llama_tokenizer/code-with-proofsteps_llama/train/code-with-proofsteps/code-with-proofsteps_text_document", "/path/to/proof-pile_llama_tokenizer/train/pile-sample/pile-sample_text_document", "/path/to/code-rp_llama_tokenizer/train/code-rp/code-rp_text_document"], "train-data-weights": [2, 4, 1, 0.147368, 0.221053], - "valid-data-paths": ["/fsx/proj-mathlm/proof-pile_llama/validation/arxiv-rp/arxiv-rp_text_document", "/fsx/proj-mathlm/open-web-math-v1.2_llama/validation/open-web-math/open-web-math_text_document", "/fsx/proj-mathlm/code-with-proofsteps_llama/validation/code-with-proofsteps/code-with-proofsteps_text_document"], - "test-data-paths": ["/fsx/proj-mathlm/proof-pile_llama/test/arxiv-rp/arxiv-rp_text_document", "/fsx/proj-mathlm/open-web-math-v1.2_llama/test/open-web-math/open-web-math_text_document", "/fsx/proj-mathlm/code-with-proofsteps_llama/test/code-with-proofsteps/code-with-proofsteps_text_document"], -} \ No newline at end of file + "valid-data-paths": ["/path/to/proof-pile_llama_tokenizer/validation/arxiv-rp/arxiv-rp_text_document", 
"/path/to/open-web-math-v1.2_llama/validation/open-web-math/open-web-math_text_document", "/path/to/proof-pile_llama_tokenizer/validation/code-with-proofsteps/code-with-proofsteps_text_document"], + "test-data-paths": ["/path/to/proof-pile_llama_tokenizer/test/arxiv-rp/arxiv-rp_text_document", "/path/to/open-web-math-v1.2_llama/test/open-web-math/open-web-math_text_document", "/path/to/proof-pile_llama_tokenizer/code-with-proofsteps_llama/test/code-with-proofsteps/code-with-proofsteps_text_document"], +} diff --git a/configs/llemma_34b.yml b/configs/llemma_34b.yml index 88a714575..46e1396cc 100644 --- a/configs/llemma_34b.yml +++ b/configs/llemma_34b.yml @@ -98,11 +98,11 @@ "use_wandb": true, - "wandb_group": "34b-codellama-5e-5lr", - "wandb_project": "math-lm", + "wandb_group": "llemma_34b_replication", + "wandb_project": "your-project-name", "wandb_team": "your-teamname-here", "wandb_host": "https://api.wandb.ai", "launcher": "slurm", "deepspeed_slurm": true -} \ No newline at end of file +} diff --git a/configs/llemma_7b.yml b/configs/llemma_7b.yml index c77c1c841..fa61af90f 100644 --- a/configs/llemma_7b.yml +++ b/configs/llemma_7b.yml @@ -95,11 +95,11 @@ "checkpoint_validation_with_forward_pass": true, "use_wandb": true, - "wandb_group": "codellama_200btok_mp2_32node_2gas_deeperspeed", - "wandb_project": "math-lm", + "wandb_group": "llemma_7b_replication", + "wandb_project": "your-project-name", "wandb_team": "your-teamname-here", "wandb_host": "https://api.wandb.ai", "launcher": "slurm", "deepspeed_slurm": true -} \ No newline at end of file +} From 48d394e057cb9c7716cfe5a059fce570418af1b7 Mon Sep 17 00:00:00 2001 From: Adam Ibrahim Date: Wed, 25 Oct 2023 01:48:47 +0000 Subject: [PATCH 18/64] README and requirements updated --- README.md | 5 +++++ requirements.txt | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 56c21b08c..62b9ab82c 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,8 @@ +# Work in Progress: adding support for Mistral + +This branch is work in progress to add Mistral 7B support. This branch is currently undergoing development, and is not meant to be stable. + + # LLeMA Pretraining This is a modified version of the `EleutherAI/GPT-NeoX` repository used for the Llemma project. This branch diverged from `main` at commit `009018e`. 
This branch implements the following features that are not present in `009018e` (some of these features may have subsequently been merged into `main`): diff --git a/requirements.txt b/requirements.txt index e9d301ca3..d00b0a8e1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,7 +21,7 @@ dill==0.3.6 docker-pycreds==0.4.0 einops==0.6.1 filelock==3.12.2 -flash-attn==2.0.0.post1 +flash-attn==2.3.3 frozenlist==1.3.3 fsspec==2023.6.0 ftfy==6.1.1 From 694bc7fd975672ba7b61b1e55c4a8be021cdf8fc Mon Sep 17 00:00:00 2001 From: Adam Ibrahim Date: Wed, 25 Oct 2023 01:55:00 +0000 Subject: [PATCH 19/64] Added Mistral config --- configs/mistral_7b.yml | 111 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 configs/mistral_7b.yml diff --git a/configs/mistral_7b.yml b/configs/mistral_7b.yml new file mode 100644 index 000000000..9418402e9 --- /dev/null +++ b/configs/mistral_7b.yml @@ -0,0 +1,111 @@ +{ + "pipe_parallel_size": 0, + "model_parallel_size": 8, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 32, + "hidden_size": 4096, + "intermediate_size": 14336, + "num_attention_heads": 32, + "attention_type": "groupedquery", + "num_kv_heads": 8, + "seq_length": 4096, + "sliding_window_width": 4096, + "max_position_embeddings": 131072, + "pos_emb": "rotary", + "rotary_pct": 1, + "rotary_emb_base": 10000, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-6, + + "attention_config": [[["flash"]]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "mlp_type": "llama", + "activation": "silu", + "init_method_std": 0.02, + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00005, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1260000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1260000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 16, + "data_impl": "mmap", + + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + "precision": "bfloat16", + "fp32_allreduce": true, + "bf16": { + "enabled": true + }, + "data_types": { + "grad_accum_dtype": "fp32" + }, + + "train_iters": 12000, + "lr_decay_iters": 12000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "decay_lr_to": 0.033, + "warmup_iters": 500, + "checkpoint_factor": 250, + "eval_interval": 250, + "eval_iters": 25, + + "log_interval": 1, + "steps_per_print": 1, + "wall_clock_breakdown": true, + + "tokenizer_type": "SPMTokenizer", + "vocab-file": "codellama/tokenizer.model", # use tokenizer.model from Meta CodeLlama download + + "save": "/fsx/proj-mathlm/saved-weights/34b_1epoch", + # "load": "" # set to same as "save" to resume from intermediate finetuning step + "load": "/path/to/converted/codellama_34b_weights_with_mp8", + + "finetune": true, # set to false once resuming from intermediate finetuning step + "checkpoint_validation_with_forward_pass": true, + + + "use_wandb": true, + "wandb_group": "llemma_34b_replication", + "wandb_project": "your-project-name", + 
"wandb_team": "your-teamname-here", + "wandb_host": "https://api.wandb.ai", + + "launcher": "slurm", + "deepspeed_slurm": true +} From 612de2998dbbc02018a9efdbce591051efc3c36e Mon Sep 17 00:00:00 2001 From: Adam Ibrahim Date: Wed, 25 Oct 2023 01:57:16 +0000 Subject: [PATCH 20/64] Added sliding window through flash attention 2 --- megatron/model/transformer.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index aaeefb5ec..841e38152 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -160,8 +160,13 @@ def __init__( self.multiple_of = multiple_of - ff_dim = int(2 * neox_args.hidden_size * 4 / 3) - ff_dim = self.multiple_of * ((ff_dim + multiple_of - 1) // multiple_of) + # Allow custom intermediate size, e.g. for Mistral + if neox_args.intermediate_size is not None: + ff_dim = neox_args.intermediate_size + else: + ff_dim = int(2 * neox_args.hidden_size * 4 / 3) + ff_dim = self.multiple_of * ((ff_dim + multiple_of - 1) // multiple_of) + self.w1 = mpu.ColumnParallelLinear( neox_args=neox_args, input_size=neox_args.hidden_size, @@ -283,6 +288,7 @@ def __init__( neox_args.num_attention_heads, world_size ) self.pos_emb = neox_args.pos_emb + self.sliding_window_width = neox_args.sliding_window_width self.attention_type = neox_args.attention_type if self.attention_type != "multihead": @@ -560,6 +566,7 @@ def flash_attention(self, query_layer, key_layer, value_layer): max_seqlen_q, max_seqlen_k, softmax_scale=None, causal=True, + window_size=(self.sliding_window_width, -1) if self.sliding_window_width is not None else (-1, -1), ) output = output.reshape(q_shape) else: @@ -568,6 +575,7 @@ def flash_attention(self, query_layer, key_layer, value_layer): self.dropout_p if self.training else 0.0, softmax_scale=None, causal=True, + window_size=(self.sliding_window_width, -1) if self.sliding_window_width is not None else (-1, -1), ) matmul_result = output From 9bd58f16192419154a6a7a70db86b4d9d67bbf34 Mon Sep 17 00:00:00 2001 From: Adam Ibrahim Date: Wed, 25 Oct 2023 01:57:46 +0000 Subject: [PATCH 21/64] Added sliding window --- megatron/neox_arguments/neox_args.py | 9 +++++++++ megatron/training.py | 1 + megatron/utils.py | 7 ++++++- 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index b707724e8..1d412cfab 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -105,6 +105,10 @@ class NeoXArgsModel(NeoXArgsTemplate): Transformer hidden size. """ + intermediate_size: int = None + """ + Transformer intermediate size. + """ num_attention_heads: int = None """ Number of transformer attention heads. Used for number of query heads only, if num_kv_heads is set. @@ -125,6 +129,11 @@ class NeoXArgsModel(NeoXArgsTemplate): Maximum sequence length to process. """ + sliding_window_width: int = None + """ + Width of the attention sliding window. Only supported with Flash Attention 2. + """ + max_position_embeddings: int = None """ Maximum number of position embeddings to use. This is the size of position embedding. 
diff --git a/megatron/training.py b/megatron/training.py index 0f1294c35..b27912f8c 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -293,6 +293,7 @@ def _get_batch(neox_args, tokenizer, keys, data, datatype): data=tokens, eod_token=neox_args.tokenizer.eod, eod_mask_loss=neox_args.eod_mask_loss, + sliding_window_width=neox_args.sliding_window_width, ) # If `label` is present, any token < 0 (e.g., -100, the default for torch) skips the loss computation if "label" in data_b: diff --git a/megatron/utils.py b/megatron/utils.py index 0071ef872..21d830819 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -63,7 +63,7 @@ def report_memory(name): print_rank_0(string) -def get_attn_mask(seq_length, device): +def get_attn_mask(seq_length, device, sliding_window_width): """ Get triangular attention mask for a given sequence length / device. """ @@ -71,6 +71,9 @@ def get_attn_mask(seq_length, device): mask = torch.tril(torch.ones((1, seq_length, seq_length), device=device)).view( 1, 1, seq_length, seq_length ) + # get rid of lower diagonals than the sliding window width, if a value was provided + if sliding_window_width is not None: + mask = torch.triu(mask, diagonal=-sliding_window_width) # convert to binary return mask < 0.5 @@ -80,6 +83,7 @@ def get_ltor_masks_and_position_ids( data, eod_token, eod_mask_loss=False, + sliding_window_width=None, ): """Build masks and position id for left to right model.""" @@ -90,6 +94,7 @@ def get_ltor_masks_and_position_ids( attention_mask = get_attn_mask( seq_length=seq_length, device=data.device, + sliding_window_width=sliding_window_width, ) # Loss mask. From d5d90dc10de9a883cf4a803f5e70193127095ec7 Mon Sep 17 00:00:00 2001 From: Adam Ibrahim Date: Fri, 27 Oct 2023 04:21:29 +0000 Subject: [PATCH 22/64] Mistral should likely use mp=2 like llama2 --- configs/mistral_7b.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/mistral_7b.yml b/configs/mistral_7b.yml index 9418402e9..73a56567f 100644 --- a/configs/mistral_7b.yml +++ b/configs/mistral_7b.yml @@ -1,6 +1,6 @@ { "pipe_parallel_size": 0, - "model_parallel_size": 8, + "model_parallel_size": 2, "make_vocab_size_divisible_by": 1, # model settings From 67638e15597e1b5ede8fb647012a388ce11c0f5a Mon Sep 17 00:00:00 2001 From: Adam Ibrahim Date: Wed, 1 Nov 2023 06:48:49 +0000 Subject: [PATCH 23/64] Update gitignore --- .gitignore | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.gitignore b/.gitignore index 34de4e774..9630bcfee 100644 --- a/.gitignore +++ b/.gitignore @@ -150,3 +150,14 @@ test_logs/ logs/ tensorboard/ src/ +*.out + +# extensions +extensions/fused_adam/.ninja_deps +extensions/fused_adam/.ninja_log +extensions/fused_adam/build.ninja +extensions/fused_adam/multi_tensor_adam.cuda.o +extensions/utils/.ninja_deps +extensions/utils/.ninja_log +extensions/utils/build.ninja +extensions/utils/flatten_unflatten.o From b521408109587bc899b7f7acc3476b98f8fa0727 Mon Sep 17 00:00:00 2001 From: Adam Ibrahim Date: Wed, 1 Nov 2023 06:49:41 +0000 Subject: [PATCH 24/64] Removed unused CPCargo import --- megatron/training.py | 1 - 1 file changed, 1 deletion(-) diff --git a/megatron/training.py b/megatron/training.py index b27912f8c..b28b24e7e 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -30,7 +30,6 @@ from deepspeed.runtime.data_pipeline.curriculum_scheduler import CurriculumScheduler import numpy as np -from CPCargo import Heartbeat from megatron.utils import ( Timers, init_wandb, From c842ea9a99d810a9fabdef7e481f9c6db26a2cf7 Mon Sep 17 
00:00:00 2001 From: Adam Ibrahim Date: Wed, 1 Nov 2023 06:50:09 +0000 Subject: [PATCH 25/64] Conversion script (WIP) --- configs/mistral_7b.yml | 9 +- convert_hf_to_sequential_mistral.py | 644 ++++++++++++++++++++++++++++ mistral_7b_convert.sh | 76 ++++ 3 files changed, 725 insertions(+), 4 deletions(-) create mode 100644 convert_hf_to_sequential_mistral.py create mode 100644 mistral_7b_convert.sh diff --git a/configs/mistral_7b.yml b/configs/mistral_7b.yml index 73a56567f..107bf2efe 100644 --- a/configs/mistral_7b.yml +++ b/configs/mistral_7b.yml @@ -1,6 +1,6 @@ { "pipe_parallel_size": 0, - "model_parallel_size": 2, + "model_parallel_size": 4, "make_vocab_size_divisible_by": 1, # model settings @@ -22,7 +22,7 @@ "norm": "rmsnorm", "rms_norm_epsilon": 1.0e-6, - "attention_config": [[["flash"]]], + "attention_config": [[["flash"], 32]], "scaled_upper_triang_masked_softmax_fusion": true, "bias_gelu_fusion": false, @@ -52,6 +52,7 @@ "cpu_offload": false }, + # "train_batch_size": 4, "train_micro_batch_size_per_gpu": 2, "gradient_accumulation_steps": 16, "data_impl": "mmap", @@ -90,7 +91,7 @@ "wall_clock_breakdown": true, "tokenizer_type": "SPMTokenizer", - "vocab-file": "codellama/tokenizer.model", # use tokenizer.model from Meta CodeLlama download + "vocab-file": "/fsx/adami/raw_mistral/mistral-v0.1/7B/tokenizer.model", # use tokenizer.model from Meta CodeLlama download "save": "/fsx/proj-mathlm/saved-weights/34b_1epoch", # "load": "" # set to same as "save" to resume from intermediate finetuning step @@ -100,7 +101,7 @@ "checkpoint_validation_with_forward_pass": true, - "use_wandb": true, + "use_wandb": false, "wandb_group": "llemma_34b_replication", "wandb_project": "your-project-name", "wandb_team": "your-teamname-here", diff --git a/convert_hf_to_sequential_mistral.py b/convert_hf_to_sequential_mistral.py new file mode 100644 index 000000000..cce5d60b9 --- /dev/null +++ b/convert_hf_to_sequential_mistral.py @@ -0,0 +1,644 @@ +import sys +import os +import copy +import deepspeed + +# import time + +import argparse +import torch + +import numpy as np + +from functools import reduce +from transformers import AutoModelForCausalLM, AutoConfig + +sys.path.append( + os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)) +) +from megatron.neox_arguments import NeoXArgs +from megatron.training import get_model, get_optimizer, get_learning_rate_scheduler +from megatron.initialize import initialize_megatron +from megatron import mpu +from megatron.checkpointing import load_checkpoint, save_checkpoint + +# from megatron.utils import ( +# Timers, +# init_wandb, +# ) + +""" +A script for converting publicly available Huggingface (HF) checkpoints NeoX format. + +Note that this script requires access to corresponding config files for equivalent NeoX models to those found in Hugging face. + +Example usage: (Converts the 70M Pythia model to NeoX format) +================================================================ +OMPI_COMM_WORLD_RANK=0 CUDA_VISIBLE_DEVICES=0 python tools/ckpts/convert_hf_to_sequential.py \ + --hf-model-name pythia-70m-v0 \ + --revision 143000 \ + --output-dir checkpoints/neox_converted/pythia/70m \ + --cache-dir checkpoints/HF \ + --config configs/pythia/70M.yml configs/local_setup.yml \ + --test + + +For multi-gpu support we must initialize deepspeed: +NOTE: This requires manually changing the arguments below. 
+================================================================ +CUDA_VISIBLE_DEVICES=0,1,2,3 python ./deepy.py tools/ckpts/convert_hf_to_sequential.py \ + -d configs pythia/70M.yml local_setup.yml +""" + +MULTI_GPU_ARGS = " ".join( + [ + "--hf-model-name mistralai/Mistral-7B-v0.1", + "--output-dir ../neox-converted/", + "--cache-dir checkpoints/HF", + "--config configs/mistral_7b.yml", + "--test", + ] +) + + +def convert_hf_to_sequential(hf_model, seq_state_dict): + """Converts the weights of a HuggingFace model to neox 2.0 format. + + :param hf_model: the huggingface model + :param seq_state_dict: the state dict of the equivalent neox model + + returns the updated sequential state dict + """ + num_layers = hf_model.config.num_hidden_layers + # Embedding is layer idx 0 + seq_state_dict[ + "sequential.0.word_embeddings.weight" + ] = hf_model.gpt_neox.embed_in.state_dict()["weight"] + + for layer_hf in range(num_layers): + # offset by 2 + layer_seq = layer_hf + 2 + + # get layer from hf model + hf_layer = hf_model.gpt_neox.layers[layer_hf] + hf_layer_sd = hf_layer.state_dict() + + for key in hf_model.gpt_neox.layers[0].state_dict().keys(): + + if key in ["attention.bias", "attention.masked_bias"]: + continue + seq_state_dict[f"sequential.{layer_seq}.{key}"] = hf_layer_sd[key] + + # Load final layer norm + layer_seq = num_layers + 3 + seq_state_dict[ + f"sequential.{layer_seq}.norm.weight" + ] = hf_model.gpt_neox.final_layer_norm.state_dict()["weight"] + seq_state_dict[ + f"sequential.{layer_seq}.norm.bias" + ] = hf_model.gpt_neox.final_layer_norm.state_dict()["bias"] + + # output embedding / LM head + layer_seq += 1 + seq_state_dict[ + f"sequential.{layer_seq}.final_linear.weight" + ] = hf_model.embed_out.state_dict()["weight"] + + +def shard_sequential_mp(num_mp_ranks, sequential): + """Shards the sequential model into model parallel ranks. 
+ + :param num_mp_ranks: the number of model parallel ranks + :param sequential: the state dict of the sequential model at mp=1 + + returns a dict of state dicts for each mp rank + """ + ranks = {x: dict() for x in range(num_mp_ranks)} + for k, v in sequential.items(): + if reduce( + np.logical_or, + [ + x in k + for x in [ + "layernorm", + "rotary_emb", + "dense_4h_to_h.bias", + "norm.weight", + "norm.bias", + "attention.dense.bias", + ] + ], + ): + # no splitting + for x in range(num_mp_ranks): + ranks[x][k] = v + else: + if len(v.shape) == 1: + size_per_rank = v.shape[0] / num_mp_ranks + if size_per_rank % 128 != 0.0: + padded_size = (128 - (size_per_rank % 128)) + size_per_rank + size_diff = int((padded_size * 4) - v.shape[max_]) + zero_pad = torch.zeros((size_diff)) + v = torch.cat([v, zero_pad], dim=max_) + else: + padded_size = size_per_rank + + assert size_per_rank % 1.0 == 0.0 + assert padded_size % 1.0 == 0.0 + + padded_size = int(padded_size) + size_per_rank = int(size_per_rank) + + for x in range(num_mp_ranks): + if size_per_rank != padded_size: + # need to pad + ranks[x][k] = v[padded_size * x : padded_size * (x + 1)] + else: + ranks[x][k] = v[size_per_rank * x : size_per_rank * (x + 1)] + + elif len(v.shape) == 2: + + if reduce( + np.logical_or, + [ + x in k + for x in [ + "attention.dense.weight", + "mlp.dense_4h_to_h.weight", + ] + ], + ): # column parallel + max_, min_ = 1, 0 + elif reduce( + np.logical_or, + [ + x in k + for x in [ + "mlp.dense_h_to_4h.weight", + "mlp.dense_h_to_4h.bias", + "attention.query_key_value.weight", + "attention.query_key_value.bias", + "word_embeddings.weight", + "final_linear.weight", + ] + ], + ): + # row parallel + max_, min_ = 0, 1 + else: + raise Exception("Unknown weight to shard: {}".format(k)) + + size_per_rank = v.shape[max_] / num_mp_ranks + if size_per_rank % 128 != 0.0: + padded_size = (128 - (size_per_rank % 128)) + size_per_rank + size_diff = int((padded_size * num_mp_ranks) - v.shape[max_]) + + assert ( + size_diff > 0 + ), "[ERROR] size diff is negative: {} for size_per_rank: {}, k:{}, shape:{}, padded_size:{}".format( + size_diff, size_per_rank, k, v.shape, padded_size + ) + + zero_pad = ( + torch.zeros((size_diff, v.shape[min_])) + if max_ == 0 + else torch.zeros((v.shape[min_], size_diff)) + ) + + v = torch.cat([v, zero_pad], dim=max_) + else: + padded_size = size_per_rank + + assert size_per_rank % 1.0 == 0.0 + assert padded_size % 1.0 == 0.0 + + padded_size = int(padded_size) + size_per_rank = int(size_per_rank) + + for x in range(num_mp_ranks): + if size_per_rank != padded_size: + # need to pad + ranks[x][k] = ( + v[padded_size * x : padded_size * (x + 1), :] + if max_ == 0 + else v[:, padded_size * x : padded_size * (x + 1)] + ) + else: + ranks[x][k] = ( + v[size_per_rank * x : size_per_rank * (x + 1), ...] 
+ if max_ == 0 + else v[:, size_per_rank * x : size_per_rank * (x + 1)] + ) + + else: + raise NotImplementedError() + + return ranks + + +def replace_sharded_seq(mp_checkpoints, mp_sharded_seq): + """replaces the values within checkpointed configs with those + from the sharded sequential object.""" + + for mp_idx, shard in mp_sharded_seq.items(): + mp_key = f"mp_rank_{mp_idx:02}_model_states.pt" + + # use for loop instead of direct assignment + # to check for compatibility + for k, v in mp_checkpoints[mp_key]["module"].items(): + try: + mp_checkpoints[mp_key]["module"][k] = shard[k] + except KeyError: + print("ERROR key:{} not found in shard.".format(k)) + + +def shard_pp(sequential, mp_rank, num_layers): + """Shards the model into layers. + + :param sequential: the state dict of the sequential model at mp=1 + :param mp_rank: the model parallel rank of the layers + + returns a dict of state dicts for each layer + """ + suffix = f"-model_{mp_rank:02}-model_states.pt" + + layers_seq = dict() + layers_seq[f"layer_00" + suffix] = { + "word_embeddings.weight": sequential[f"sequential.0.word_embeddings.weight"] + } + layers_seq[f"layer_{num_layers+3:02}" + suffix] = { + "norm.weight": sequential[f"sequential.{num_layers+3}.norm.weight"], + "norm.bias": sequential[f"sequential.{num_layers+3}.norm.bias"], + } + + layers_seq[f"layer_{num_layers+4:02}" + suffix] = { + "final_linear.weight": sequential[ + f"sequential.{num_layers+4}.final_linear.weight" + ] + } + + for layer in range(2, num_layers + 2): + layer_keys = [x for x in sequential if ".{}.".format(layer) in x] + layers_seq[f"layer_{layer:02}" + suffix] = { + k.split(".{}.".format(layer))[1]: sequential[k] for k in layer_keys + } + + return layers_seq + + +def shard_pp_mp(num_mp_ranks, sequential, num_layers): + """Shards the model into layers and model parallel ranks. + + :param num_mp_ranks: the number of model parallel ranks + :param sequential: the state dict of the sequential model at mp=1 + :param num_layers: the number of layers in the model + + returns a dict of state dicts for each layer for each model parallel rank + """ + mp_sharded = shard_sequential_mp(num_mp_ranks=num_mp_ranks, sequential=sequential) + + layers_pp_mp = {} + for mp_rank, d in mp_sharded.items(): + layers_pp_mp.update( + shard_pp(sequential=d, mp_rank=mp_rank, num_layers=num_layers) + ) + return layers_pp_mp + + +def convert(hf_model, ckpt_dir, output_dir): + """Converts a huggingface model to a NeoX checkpoint for different + model parallel and pipeline parallel settings degrees. 
+ + :param hf_model: the huggingface model + :param ckpt_dir: the directory containing the NeoX checkpoint + :param output_dir: the directory to save the converted checkpoint + returns None + """ + + os.listdir(ckpt_dir) + + ckpts, layers = {}, {} + for x in os.listdir(ckpt_dir): + if x.startswith("mp_rank"): + ckpts[x] = torch.load(os.path.join(ckpt_dir, x)) + elif x.startswith("layer"): + layers[x] = torch.load(os.path.join(ckpt_dir, x)) + + assert len(layers) + len(ckpts) > 0, "No checkpoints found in {}".format(ckpt_dir) + + os.makedirs(output_dir, exist_ok=True) + seq_state_dict = dict() + convert_hf_to_sequential(hf_model, seq_state_dict) + + if len(ckpts) == 1 and len(layers) == 0: + # pp=0, mp=1 + key = list(ckpts.keys())[0] + ckpts[key]["module"] = seq_state_dict + to_save = ckpts + + elif len(ckpts) > 1 and len(layers) == 0: + # pp=0, mp>1 + sharded_seq = shard_sequential_mp( + num_mp_ranks=len(ckpts), sequential=seq_state_dict + ) + replace_sharded_seq(mp_checkpoints=ckpts, mp_sharded_seq=sharded_seq) + to_save = ckpts + + elif len(ckpts) == 1 and len(layers) > 1: + # pp>0, mp==1 + to_save = shard_pp( + sequential=seq_state_dict, + mp_rank=0, + num_layers=hf_model.config.num_hidden_layers, + ) + + elif len(ckpts) > 1 and len(layers) > 1: + # pp>0, mp>1 + to_save = shard_pp_mp( + num_mp_ranks=len(ckpts), + sequential=seq_state_dict, + num_layers=hf_model.config.num_hidden_layers, + ) + + else: + raise NotImplementedError( + "Not implemented for len(ckpts)={} and len(layers)={}".format( + len(ckpts), len(layers) + ) + ) + + for k, v in to_save.items(): + print("saving {}...".format(os.path.join(output_dir, k))) + torch.save(v, os.path.join(ckpt_dir, k)) + + # copy the checkpoint to the output_dir + print("rm {}/*".format(output_dir)) + os.system("rm {}/*".format(output_dir)) + os.makedirs(output_dir, exist_ok=True) + print("cp {} {}".format(os.path.join(ckpt_dir, "*"), output_dir)) + os.system("cp {} {}".format(os.path.join(ckpt_dir, "*"), output_dir)) + + # set latest file within the output_dir + latest_file = os.path.join("/".join(output_dir.split("/")[:-1]), "latest") + os.system("rm " + latest_file) + with open(latest_file, "w") as f: + f.write(output_dir.split("/")[-1]) + + +def consume_neox_args2(args_parsed, overwrite_values=None): + """ + Deepspeed launcher needs to pass the arguments for `pretrain_gpt2.py` across to all machines. + + In order not to have any problems with different configs being mismatched across machines, we instead read the .yaml configuration file from the main rank, + then serialize the arguments to a dictionary, which the deepspeed launcher broadcasts to all machines (`--megatron_config`). + + We then instantiate a new NeoXArgs from the dictionary (`.from_dict`). This should ensure args are never inconsistent across machines. + """ + + with open(args_parsed.megatron_config) as jsonfile: + megatron_config = json.load(jsonfile) + if args_parsed.deepspeed_config is not None: + overwrite_values = NeoXArgs.set_up_autotuning( + args_parsed.deepspeed_config, overwrite_values + ) + if overwrite_values is not None: + megatron_config.update(overwrite_values) + return NeoXArgs.from_dict(args_dict=megatron_config) + + +def get_non_existing_dir(tmp_dir): + while os.path.exists(tmp_dir): + tmp_dir = os.path.join(tmp_dir, "tmp_dir") + return tmp_dir + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Convert a Hugging Face GPT-NeoX model back to a sequential model compatible with GPT-NeoX training." 
+ ) + parser.add_argument( + "--revision", + type=int, + default=143000, + help="Revision or step of the Pythia model to convert.", + ) + parser.add_argument( + "--output-dir", + type=str, + help="Path to save the converted GPT-NeoX model checkpoint.", + ) + parser.add_argument( + "--config", + nargs="*", + default=[], + help="Path to the config file for the equivalent NeoX model." + ) + parser.add_argument( + "--test", + action="store_true", + help="If set, will run a test to ensure the conversion was successful." + ) + parser.add_argument( + "--download-only", + action="store_true", + help="If set, script will only download the model and not convert it." + ) + + parser.add_argument( + "--ckpt-tmp-dir", + default="/tmp/ckpt_tmp_dir", + help="Directory to store cached hugging face checkpoints. [WARNING: MUST BE VISIBLE TO ALL RANKS]" + ) + parser.add_argument( + "--hf-model-name", + type=str, + help="Name of the hugging face model to download from EleutherAI/{hf-model-name}.}" + ) + + parser.add_argument( + "--cache-dir", + default="/gpfs/alpine/csc499/proj-shared/hf_checkpoints", + help="Directory to store cached hugging face checkpoints." + ) + try: + if int(os.environ['WORLD_SIZE']) > 1: + args = parser.parse_args(MULTI_GPU_ARGS.split(" ")) + else: + args = parser.parse_args() + except KeyError: + args = parser.parse_args() + + + tmp_cache_dir = get_non_existing_dir(args.ckpt_tmp_dir) + + if args.download_only: + hf_model = AutoModelForCausalLM.from_pretrained( + f"{args.hf_model_name}", + cache_dir=os.path.join( + args.cache_dir, f"{args.hf_model_name}" + ), + ).half() + exit(0) + else: + print("======================================================================") + print( + "Warning the following script will delete files within {}".format( + args.output_dir + ) + ) + print( + "Warning the following script will delete this directory {}".format( + tmp_cache_dir + ) + ) + print("======================================================================") + # time.sleep(5) + + if int(os.environ.get('OMPI_COMM_WORLD_SIZE',1)) > 1: + neox_args = consume_neox_args2(args2) + else: + neox_args = NeoXArgs.from_ymls(args.config) + neox_args.configure_distributed_args() + neox_args.build_tokenizer() + neox_args.initialize_tensorboard_writer() + + # setup logging and timers + # init_wandb(neox_args=neox_args) + # timers = Timers( + # use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer + # ) + initialize_megatron(neox_args=neox_args) + + torch.distributed.barrier() + + model = get_model(neox_args=neox_args, use_cache=True) + optimizer, param_groups = get_optimizer(model=model, neox_args=neox_args) + lr_scheduler = get_learning_rate_scheduler(optimizer=optimizer, neox_args=neox_args) + + model, optimizer, _, lr_scheduler = deepspeed.initialize( + model=model, + optimizer=optimizer, + # args=neox_args, + lr_scheduler=lr_scheduler, + dist_init_required=False, + model_parameters=None, + config_params=neox_args.deepspeed_config, + mpu=mpu if not neox_args.is_pipe_parallel else None, + ) + + if os.environ.get("OMPI_COMM_WORLD_RANK", "1") == "0": + os.makedirs(f"{tmp_cache_dir}", exist_ok=True) + + torch.distributed.barrier() + neox_args.save = tmp_cache_dir + + save_checkpoint( + neox_args=neox_args, + iteration=0, + model=model, + optimizer=optimizer, + lr_scheduler=lr_scheduler, + ) + print(os.listdir(f"{tmp_cache_dir}")) + ckpt_dir = os.path.join(tmp_cache_dir, "global_step0") + + + if torch.distributed.get_rank() == 0: + config = AutoConfig.from_pretrained( + 
f"{args.hf_model_name}", + cache_dir=os.path.join( + args.cache_dir,f"{args.hf_model_name}" + ), + ) + # does not change the weights, but is needed to align logits + # config.update({'hidden_act':'gelu_fast'}) + hf_model = AutoModelForCausalLM.from_pretrained( + f"{args.hf_model_name}", + config=config, + cache_dir=os.path.join( + args.cache_dir,f"{args.hf_model_name}" + ), + ).half() + print("==========================================") + print("Loaded Hugging Face model successfully!") + print("==========================================") + convert(hf_model, ckpt_dir=ckpt_dir, output_dir=args.output_dir) + + if os.environ['OMPI_COMM_WORLD_RANK'] == '0': + # cleanup temp dir + os.system(f"rm -r {tmp_cache_dir}") + + torch.distributed.barrier() + + #verify the conversion can be loaded + neox_args.load = "/".join(args.output_dir.split("/")[:-1]) + print(neox_args.load) + neox_args.finetune=True + load_checkpoint( + neox_args=neox_args, + model=model, + optimizer=optimizer, + lr_scheduler=lr_scheduler, + iteration=None, + ) + print("==========================================") + print("Converted checkpoint successfully loaded!") + print("==========================================") + + + if args.test and torch.distributed.get_world_size() == 1: + # only implemented for world size 1 + + with torch.no_grad(): + # torch.backends.cudnn.benchmark = False + # torch.use_deterministic_algorithms(True) #setting the CUBLAS_WORKSPACE_CONFIG=:4096:8 environment variable is required for this to work (tested for A6000) + model.eval() + hf_model.eval() + + b = 10 + seq_len = 32 + inputs = torch.randint(0, 50304, (b, seq_len), dtype=torch.long).cuda() + mask = ( + (torch.triu(torch.ones(seq_len, seq_len)) != 1).transpose(0, 1).cuda() + ) + pos_ids = torch.arange(0, seq_len).unsqueeze(0).cuda() + + torch.manual_seed(0) + outputs_neox = model.cuda()( + (inputs, pos_ids, mask.unsqueeze(0).unsqueeze(0)), neox_args=neox_args + ) + + torch.manual_seed(0) + outputs = hf_model.cuda()(input_ids=inputs) + + print("HF logits .sum(): ", outputs.logits.to(torch.float32).sum()) + print("NeoX logits .sum(): ", outputs_neox.to(torch.float32).sum()) + + print( + "\nLogit comparison summary for {} sequences of length {}:".format( + b, seq_len + ) + ) + print("=============================================================") + for i in range(b): + abs_diff = ( + outputs.logits[i, ...].to(torch.float32) + - outputs_neox[i, ...].to(torch.float32) + ).abs() + print( + "[Random sequence {}] (hflogits - neoxlogits).abs() -- mean: {:.5f}\tmax: {:.5f}\tmin: {:.5f}\tmedian: {:.5f}".format( + i, + abs_diff.mean(), + abs_diff.max(), + abs_diff.min(), + abs_diff.median(), + ) + ) + + elif args.test: + print( + "[INFO] Checkpoint conversion logit test not implemented for distributed world_size > 1. Current world_size: {}".format( + torch.distributed.get_world_size() + ) + ) + diff --git a/mistral_7b_convert.sh b/mistral_7b_convert.sh new file mode 100644 index 000000000..61643a371 --- /dev/null +++ b/mistral_7b_convert.sh @@ -0,0 +1,76 @@ +#!/bin/bash +#SBATCH --job-name=7b_mistral +#SBATCH --partition=g40x +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=8 # Crucial - only 1 task per dist per node! 
+####SBATCH --ntasks=1 +#SBATCH --cpus-per-task=12 # Number of cores per tasks +#SBATCH --gres=gpu:8 # Number of gpus +#SBATCH --output=mistral_7b_%x_%j.out # Set this dir where you want slurm outs to go +#SBATCH --error=mistral_7b_%x_%j.out # Set this dir where you want slurm outs to go +#SBATCH --exclusive # Turn off node sharing +#SBATCH --account=neox +#SBATCH --open-mode=append +#SBATCH --requeue +#SBATCH --time=0-00:20:00 + +# setup the environment using the script we created before +source ~/setup.sh +#source /fsx/quentin/setup.sh + +ds_report + +export NCCL_DEBUG=INFO +export NCCL_TREE_THRESHOLD=0 +export NCCL_PROTO=simple +# Network issues without the following two NCCL vars set; See https://github.com/NVIDIA/nccl/issues/676 +export NCCL_IBEXT_DISABLE=1 +export NCCL_SOCKET_IFNAME=^docker0,lo + +export FI_EFA_FORK_SAFE=1 +export FI_EFA_USE_DEVICE_RDMA=1 # use for p4dn +export FI_EFA_ENABLE_SHM_TRANSFER=0 +export FI_PROVIDER=efa +export FI_EFA_TX_MIN_CREDITS=64 + +export PYTHONFAULTHANDLER=1 + +export OMPI_MCA_mtl_base_verbose=1 +export OMPI_MCA_btl="^openib" + +export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"` +export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) +export MASTER_PORT=12802 +export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l` + +# Hide duplicated errors using this hack - will be properly fixed in pt-1.12 +export TORCHELASTIC_ERROR_FILE=$TRAIN_PATH/tmp/torch-elastic-error.json +export TORCH_EXTENSIONS_DIR=./extensions/ + +# Move to the gpt-neox install +TRAIN_PATH=/fsx/$(whoami)/gpt-neox #/fsx/hailey/math-lm/gpt-neox +cd $TRAIN_PATH + +# Write the hostfile for this job +export MASTER_ADDR=$(echo $MASTER_ADDR | cut -d '-' -f 2- | tr '-' '.') +bash ~/write_ip_hostfile.sh +export DLTS_HOSTFILE=/fsx/$(whoami)/hostfiles/hosts_$SLURM_JOBID + + +# python tools/convert_raw_llama_weights_to_neox.py \ +# --input_dir ../raw_mistral/mistral-v0.1/ \ +# --output_dir ../neox-converted/ \ +# --model_size "7B" \ +# --num_output_shards 2 +# # --config_file configs/mistral_7b.yml \ + +# python convert_hf_to_sequential_mistral.py \ +# --hf-model-name mistralai/Mistral-7B-v0.1 \ +# --config configs/mistral_7b.yml \ +# --output-dir ../neox-converted/ + +python ./deepy.py convert_hf_to_sequential_mistral.py \ + -d configs mistral_7b.yml + +# python $TRAIN_PATH/deepy.py $TRAIN_PATH/train.py \ +# --conf_dir configs/ mistral_7b.yml \ No newline at end of file From aa50fd131d34de0f2ae41135c1f22192d1d31529 Mon Sep 17 00:00:00 2001 From: Adam Ibrahim Date: Wed, 1 Nov 2023 06:50:34 +0000 Subject: [PATCH 26/64] Fixed missing slurm environ vars --- megatron/neox_arguments/arguments.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py index 7a43371e4..2aadd9670 100644 --- a/megatron/neox_arguments/arguments.py +++ b/megatron/neox_arguments/arguments.py @@ -732,7 +732,8 @@ def configure_distributed_args(self): if self.deepspeed_slurm: os.environ["LOCAL_RANK"] = os.environ["SLURM_LOCALID"] os.environ["RANK"] = os.environ["SLURM_PROCID"] - os.environ["WORLD_SIZE"] = os.environ["SLURM_NTASKS"] + os.environ["WORLD_SIZE"] = os.environ["SLURM_NTASKS"] if os.environ.get("SLURM_NTASKS") is not None \ + else str(int(os.environ["SLURM_NNODES"]) * int(os.environ["SLURM_NTASKS_PER_NODE"])) self.update_value("local_rank", int(os.getenv("LOCAL_RANK", "0"))) self.update_value("rank", int(os.getenv("RANK", "0"))) From 6a8631008be24e428be45a15fa03a7e222a4b088 Mon 
Sep 17 00:00:00 2001 From: Adam Ibrahim Date: Wed, 8 Nov 2023 21:18:16 +0000 Subject: [PATCH 27/64] updated mistral config --- configs/mistral_7b.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/mistral_7b.yml b/configs/mistral_7b.yml index 107bf2efe..bd41adc01 100644 --- a/configs/mistral_7b.yml +++ b/configs/mistral_7b.yml @@ -1,7 +1,7 @@ { "pipe_parallel_size": 0, "model_parallel_size": 4, - "make_vocab_size_divisible_by": 1, + # "make_vocab_size_divisible_by": 1, # model settings "num_layers": 32, From b5f2c6aea253185475c574bfd64f8163914653d6 Mon Sep 17 00:00:00 2001 From: Adam Ibrahim Date: Wed, 8 Nov 2023 21:18:47 +0000 Subject: [PATCH 28/64] updated job script --- mistral_7b_convert.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mistral_7b_convert.sh b/mistral_7b_convert.sh index 61643a371..3ca98912d 100644 --- a/mistral_7b_convert.sh +++ b/mistral_7b_convert.sh @@ -12,7 +12,9 @@ #SBATCH --account=neox #SBATCH --open-mode=append #SBATCH --requeue -#SBATCH --time=0-00:20:00 +#SBATCH --time=0-00:12:00 +#SBATCH --exclude=ip-26-0-157-217 + # setup the environment using the script we created before source ~/setup.sh From 44e0397af15fba3ea5d37a73798d650cb190e7b2 Mon Sep 17 00:00:00 2001 From: Adam Ibrahim Date: Wed, 8 Nov 2023 21:19:57 +0000 Subject: [PATCH 29/64] initial commit conversion mistral hf to sequential --- .gitignore | 1 + convert_hf_to_sequential_mistral.py | 89 +++++++++++++++++++---------- 2 files changed, 60 insertions(+), 30 deletions(-) diff --git a/.gitignore b/.gitignore index 9630bcfee..dfdd5d44b 100644 --- a/.gitignore +++ b/.gitignore @@ -161,3 +161,4 @@ extensions/utils/.ninja_deps extensions/utils/.ninja_log extensions/utils/build.ninja extensions/utils/flatten_unflatten.o +testbed_gptneox.py diff --git a/convert_hf_to_sequential_mistral.py b/convert_hf_to_sequential_mistral.py index cce5d60b9..7b03ffd7a 100644 --- a/convert_hf_to_sequential_mistral.py +++ b/convert_hf_to_sequential_mistral.py @@ -53,7 +53,7 @@ MULTI_GPU_ARGS = " ".join( [ "--hf-model-name mistralai/Mistral-7B-v0.1", - "--output-dir ../neox-converted/", + "--output-dir checkpoints/neox-converted/", "--cache-dir checkpoints/HF", "--config configs/mistral_7b.yml", "--test", @@ -70,39 +70,60 @@ def convert_hf_to_sequential(hf_model, seq_state_dict): returns the updated sequential state dict """ num_layers = hf_model.config.num_hidden_layers + + # Embedding is layer idx 0 seq_state_dict[ "sequential.0.word_embeddings.weight" - ] = hf_model.gpt_neox.embed_in.state_dict()["weight"] + ] = hf_model.model.embed_tokens.state_dict()["weight"] for layer_hf in range(num_layers): # offset by 2 layer_seq = layer_hf + 2 # get layer from hf model - hf_layer = hf_model.gpt_neox.layers[layer_hf] + hf_layer = hf_model.model.layers[layer_hf] hf_layer_sd = hf_layer.state_dict() - for key in hf_model.gpt_neox.layers[0].state_dict().keys(): + seq_state_dict[f"sequential.{layer_seq}.attention.query_key_value.weight"] = torch.cat((hf_layer_sd["self_attn.q_proj.weight"], + hf_layer_sd["self_attn.k_proj.weight"], + hf_layer_sd["self_attn.v_proj.weight"]), + dim=0) + seq_state_dict[f"sequential.{layer_seq}.attention.dense.weight"] = hf_layer_sd["self_attn.o_proj.weight"] + seq_state_dict[f"sequential.{layer_seq}.mlp.w1.weight"] = hf_layer_sd["mlp.gate_proj.weight"] + seq_state_dict[f"sequential.{layer_seq}.mlp.w3.weight"] = hf_layer_sd["mlp.up_proj.weight"] + seq_state_dict[f"sequential.{layer_seq}.mlp.w2.weight"] = hf_layer_sd["mlp.down_proj.weight"] + 
seq_state_dict[f"sequential.{layer_seq}.input_layernorm.scale"] = hf_layer_sd["input_layernorm.weight"] + seq_state_dict[f"sequential.{layer_seq}.post_attention_layernorm.scale"] = hf_layer_sd["post_attention_layernorm.weight"] + +# Conversion table for Mistral 7b 0.1 +# FOR PARTS LOADED THROUGH hf_model.model +# 'embed_tokens.weight' -> '0.word_embeddings.weight' +# 'layers.0.self_attn.q_proj.weight' -> '2.attention.query_key_value.weight' +# 'layers.0.self_attn.k_proj.weight' -> +# 'layers.0.self_attn.v_proj.weight' -> +# 'layers.0.self_attn.o_proj.weight' -> '2.attention.dense.weight' +# 'layers.0.mlp.gate_proj.weight' -> '2.mlp.w1.weight' +# 'layers.0.mlp.up_proj.weight' -> '2.mlp.w3.weight' +# 'layers.0.mlp.down_proj.weight' -> '2.mlp.w2.weight' +# 'layers.0.input_layernorm.weight' -> '2.input_layernorm.scale' +# 'layers.0.post_attention_layernorm.weight' -> '2.post_attention_layernorm.scale' +# 'norm.weight' -> '.norm.scale' +# FOR PARTS LOADED THROUGH hf_model.lm_head +# -> '.final_linear.weight' - if key in ["attention.bias", "attention.masked_bias"]: - continue - seq_state_dict[f"sequential.{layer_seq}.{key}"] = hf_layer_sd[key] # Load final layer norm layer_seq = num_layers + 3 seq_state_dict[ - f"sequential.{layer_seq}.norm.weight" - ] = hf_model.gpt_neox.final_layer_norm.state_dict()["weight"] - seq_state_dict[ - f"sequential.{layer_seq}.norm.bias" - ] = hf_model.gpt_neox.final_layer_norm.state_dict()["bias"] + f"sequential.{layer_seq}.norm.scale" + ] = hf_model.model.norm.state_dict()["weight"] # output embedding / LM head layer_seq += 1 seq_state_dict[ f"sequential.{layer_seq}.final_linear.weight" - ] = hf_model.embed_out.state_dict()["weight"] + ] = hf_model.lm_head.state_dict()["weight"] def shard_sequential_mp(num_mp_ranks, sequential): @@ -122,9 +143,8 @@ def shard_sequential_mp(num_mp_ranks, sequential): for x in [ "layernorm", "rotary_emb", - "dense_4h_to_h.bias", - "norm.weight", - "norm.bias", + "mlp.w2.bias", + "norm.scale", "attention.dense.bias", ] ], @@ -164,7 +184,7 @@ def shard_sequential_mp(num_mp_ranks, sequential): x in k for x in [ "attention.dense.weight", - "mlp.dense_4h_to_h.weight", + "mlp.w2.weight", ] ], ): # column parallel @@ -174,12 +194,12 @@ def shard_sequential_mp(num_mp_ranks, sequential): [ x in k for x in [ - "mlp.dense_h_to_4h.weight", - "mlp.dense_h_to_4h.bias", "attention.query_key_value.weight", "attention.query_key_value.bias", "word_embeddings.weight", "final_linear.weight", + "mlp.w1", + "mlp.w3", ] ], ): @@ -267,8 +287,7 @@ def shard_pp(sequential, mp_rank, num_layers): "word_embeddings.weight": sequential[f"sequential.0.word_embeddings.weight"] } layers_seq[f"layer_{num_layers+3:02}" + suffix] = { - "norm.weight": sequential[f"sequential.{num_layers+3}.norm.weight"], - "norm.bias": sequential[f"sequential.{num_layers+3}.norm.bias"], + "norm.scale": sequential[f"sequential.{num_layers+3}.norm.scale"], } layers_seq[f"layer_{num_layers+4:02}" + suffix] = { @@ -372,15 +391,15 @@ def convert(hf_model, ckpt_dir, output_dir): torch.save(v, os.path.join(ckpt_dir, k)) # copy the checkpoint to the output_dir - print("rm {}/*".format(output_dir)) - os.system("rm {}/*".format(output_dir)) + print("rm -r {}/*".format(output_dir)) + os.system("rm -r {}/*".format(output_dir)) os.makedirs(output_dir, exist_ok=True) - print("cp {} {}".format(os.path.join(ckpt_dir, "*"), output_dir)) - os.system("cp {} {}".format(os.path.join(ckpt_dir, "*"), output_dir)) + print("cp -r {} {}".format(os.path.join(ckpt_dir, "*"), output_dir)) + os.system("cp -r {} 
{}".format(os.path.join(ckpt_dir, "*"), output_dir)) # set latest file within the output_dir latest_file = os.path.join("/".join(output_dir.split("/")[:-1]), "latest") - os.system("rm " + latest_file) + os.system("rm -r" + latest_file) with open(latest_file, "w") as f: f.write(output_dir.split("/")[-1]) @@ -446,7 +465,7 @@ def get_non_existing_dir(tmp_dir): parser.add_argument( "--ckpt-tmp-dir", - default="/tmp/ckpt_tmp_dir", + default="/tmp/ckpt_tmp_dir/" + os.environ["USER"], help="Directory to store cached hugging face checkpoints. [WARNING: MUST BE VISIBLE TO ALL RANKS]" ) parser.add_argument( @@ -526,7 +545,8 @@ def get_non_existing_dir(tmp_dir): mpu=mpu if not neox_args.is_pipe_parallel else None, ) - if os.environ.get("OMPI_COMM_WORLD_RANK", "1") == "0": + + if os.environ.get("RANK", "1") == "0": os.makedirs(f"{tmp_cache_dir}", exist_ok=True) torch.distributed.barrier() @@ -562,9 +582,18 @@ def get_non_existing_dir(tmp_dir): print("==========================================") print("Loaded Hugging Face model successfully!") print("==========================================") + + # if os.environ.get("RANK", "1") == '0': + # hf_param_names_and_sizes = [[param_name, param_weight.size()] for param_name, param_weight in hf_model.model.named_parameters()] + # neox_param_names_and_sizes = [[param_name, param_weight.size()] for param_name, param_weight in model.sequential.named_parameters()] + # print("\n\n\n\n\n\n HF param names ", hf_param_names_and_sizes) + # print("Neox param names ", neox_param_names_and_sizes, "\n\n\n\n\n\n") + # torch.distributed.barrier() + # exit(0) + convert(hf_model, ckpt_dir=ckpt_dir, output_dir=args.output_dir) - if os.environ['OMPI_COMM_WORLD_RANK'] == '0': + if os.environ.get("RANK", "1") == '0': # cleanup temp dir os.system(f"rm -r {tmp_cache_dir}") @@ -572,7 +601,7 @@ def get_non_existing_dir(tmp_dir): #verify the conversion can be loaded neox_args.load = "/".join(args.output_dir.split("/")[:-1]) - print(neox_args.load) + print(os.environ["RANK"], neox_args.load) neox_args.finetune=True load_checkpoint( neox_args=neox_args, From fa71c63cff7a43aa4110e371157b2ee517867627 Mon Sep 17 00:00:00 2001 From: Adam Ibrahim Date: Sun, 12 Nov 2023 07:06:32 +0000 Subject: [PATCH 30/64] Added stacking q, k, v appropriately for mp ranks --- convert_hf_to_sequential_mistral.py | 23 +++++++++++----- megatron/mpu/utils.py | 41 +++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 7 deletions(-) diff --git a/convert_hf_to_sequential_mistral.py b/convert_hf_to_sequential_mistral.py index 7b03ffd7a..5f4108667 100644 --- a/convert_hf_to_sequential_mistral.py +++ b/convert_hf_to_sequential_mistral.py @@ -21,6 +21,7 @@ from megatron.initialize import initialize_megatron from megatron import mpu from megatron.checkpointing import load_checkpoint, save_checkpoint +from megatron.mpu.utils import split_reorder_and_stack_separate_qkv # from megatron.utils import ( # Timers, @@ -61,7 +62,7 @@ ) -def convert_hf_to_sequential(hf_model, seq_state_dict): +def convert_hf_to_sequential(hf_model, seq_state_dict, num_mp_ranks): """Converts the weights of a HuggingFace model to neox 2.0 format. 
:param hf_model: the huggingface model @@ -85,10 +86,13 @@ def convert_hf_to_sequential(hf_model, seq_state_dict): hf_layer = hf_model.model.layers[layer_hf] hf_layer_sd = hf_layer.state_dict() - seq_state_dict[f"sequential.{layer_seq}.attention.query_key_value.weight"] = torch.cat((hf_layer_sd["self_attn.q_proj.weight"], - hf_layer_sd["self_attn.k_proj.weight"], - hf_layer_sd["self_attn.v_proj.weight"]), - dim=0) + + seq_state_dict[f"sequential.{layer_seq}.attention.query_key_value.weight"] \ + = split_reorder_and_stack_separate_qkv(hf_layer_sd["self_attn.q_proj.weight"], + hf_layer_sd["self_attn.k_proj.weight"], + hf_layer_sd["self_attn.v_proj.weight"], + num_mp_ranks=num_mp_ranks, + dim=0) seq_state_dict[f"sequential.{layer_seq}.attention.dense.weight"] = hf_layer_sd["self_attn.o_proj.weight"] seq_state_dict[f"sequential.{layer_seq}.mlp.w1.weight"] = hf_layer_sd["mlp.gate_proj.weight"] seq_state_dict[f"sequential.{layer_seq}.mlp.w3.weight"] = hf_layer_sd["mlp.up_proj.weight"] @@ -169,6 +173,8 @@ def shard_sequential_mp(num_mp_ranks, sequential): padded_size = int(padded_size) size_per_rank = int(size_per_rank) + print("size_per_rank 1", size_per_rank) + print("padded_size 1", padded_size) for x in range(num_mp_ranks): if size_per_rank != padded_size: # need to pad @@ -235,6 +241,8 @@ def shard_sequential_mp(num_mp_ranks, sequential): padded_size = int(padded_size) size_per_rank = int(size_per_rank) + print("size_per_rank 2", size_per_rank) + print("padded_size 2", padded_size) for x in range(num_mp_ranks): if size_per_rank != padded_size: # need to pad @@ -347,7 +355,7 @@ def convert(hf_model, ckpt_dir, output_dir): os.makedirs(output_dir, exist_ok=True) seq_state_dict = dict() - convert_hf_to_sequential(hf_model, seq_state_dict) + convert_hf_to_sequential(hf_model, seq_state_dict, num_mp_ranks=len(ckpts)) if len(ckpts) == 1 and len(layers) == 0: # pp=0, mp=1 @@ -476,7 +484,7 @@ def get_non_existing_dir(tmp_dir): parser.add_argument( "--cache-dir", - default="/gpfs/alpine/csc499/proj-shared/hf_checkpoints", + default="checkpoints/HF", help="Directory to store cached hugging face checkpoints." ) try: @@ -490,6 +498,7 @@ def get_non_existing_dir(tmp_dir): tmp_cache_dir = get_non_existing_dir(args.ckpt_tmp_dir) + if args.download_only: hf_model = AutoModelForCausalLM.from_pretrained( f"{args.hf_model_name}", diff --git a/megatron/mpu/utils.py b/megatron/mpu/utils.py index cb12f2569..cec84eccd 100644 --- a/megatron/mpu/utils.py +++ b/megatron/mpu/utils.py @@ -52,6 +52,47 @@ def split_tensor_along_last_dim(tensor, num_partitions, contiguous_split_chunks= return tensor_list +def split_reorder_and_stack_separate_qkv(q, k, v, num_mp_ranks, dim=0, contiguous_qkv_chunks=False): + """ + Splits separate q, k, v matrices e.g. from huggingface into chunks of size=mp config, then interleaves them so that each worker + gets its packed qkv matrices appropriately before stacking them. + Idea (example for GQA): + q = [q1, q2, q3, q4] + k = [k1, k2] + v = [v1, v2] + 1) Split: First split into mp chunks, assuming mp=2 we get [[q1, q2], [q3, q4]], [[k1], [k2]], [[v1], [v2]] + 2) Reorder: Then group relevant qkv for each mp rank: [q1, q2, k1, v1], [q3, q4, k2, v2] + 3) Stack: Consolidate into single qkv: [q1, q2, k1, v1, q3, q4, k2, v2] + That way when the qkv gets loaded on each rank we avoid [q1, q2, q3, q4] on one rank, [k1, k2, v1, v2] on the other, which would + be misinterpreted in transformer.py as q3 being a key tensor, q4 being a value tensor, etc. 
+ + Relying on the assert happening when mpu.divide gets called when initialising the neox transformer; note this will need to be updated + if the q, k, v behaviour of transformers.py is changed. + + To perform a simple test on the case num_mp_ranks=2: + m = 2 + A = torch.ones((8,2)) + B = torch.ones((8,2))*2 + C = torch.ones((8,2))*3 + D = torch.cat([torch.cat((x, y, z), dim=0) for x, y, z in zip(torch.chunk(A, chunks=m, dim=0), + torch.chunk(B, chunks=m, dim=0), + torch.chunk(C, chunks=m, dim=0))], + dim=0) + """ + def conditional_contiguous(tensor, contiguous_qkv_chunks): + if contiguous_qkv_chunks: + return tensor.contiguous() + else: + return tensor + return torch.cat( + [ + conditional_contiguous(torch.cat((x, y, z), dim=dim), contiguous_qkv_chunks) + for x, y, z in zip(torch.chunk(q, chunks=num_mp_ranks, dim=dim), + torch.chunk(k, chunks=num_mp_ranks, dim=dim), + torch.chunk(v, chunks=num_mp_ranks, dim=dim)) + ], + dim=dim + ) class VocabUtility: """Split the vocabulary into `world_size` chunks amd return the From e263367b797bee0bf1bf4cfea4483584982df37e Mon Sep 17 00:00:00 2001 From: Adam Ibrahim Date: Sat, 20 Jan 2024 07:48:17 +0000 Subject: [PATCH 31/64] pp=0 support from end of 2023 --- convert_hf_to_sequential_mistral.py | 102 +++++++++++++++++++++++++--- 1 file changed, 93 insertions(+), 9 deletions(-) diff --git a/convert_hf_to_sequential_mistral.py b/convert_hf_to_sequential_mistral.py index 5f4108667..0aa6b0327 100644 --- a/convert_hf_to_sequential_mistral.py +++ b/convert_hf_to_sequential_mistral.py @@ -61,6 +61,24 @@ ] ) +def reshape_qkv_helper(heads, num_heads, hidden_size, num_mp_ranks, num_q_heads): + assert hidden_size >= num_q_heads + assert num_q_heads >= num_mp_ranks + assert int(hidden_size//num_q_heads * (num_heads/num_q_heads)) > 0 + heads = heads.reshape(num_heads, hidden_size//num_q_heads, hidden_size) + heads = heads.reshape(num_mp_ranks, num_q_heads//num_mp_ranks, int(hidden_size//num_q_heads * (num_heads/num_q_heads)), hidden_size) + return heads + +def handle_qkv(hf_layer_sd, num_mp_ranks, hf_config): + hidden_size = hf_config.hidden_size + num_attention_heads = hf_config.num_attention_heads + num_key_value_heads = hf_config.num_key_value_heads + q = reshape_qkv_helper(hf_layer_sd["self_attn.q_proj.weight"], num_attention_heads, hidden_size, num_mp_ranks, num_attention_heads) + k = reshape_qkv_helper(hf_layer_sd["self_attn.k_proj.weight"], num_key_value_heads, hidden_size, num_mp_ranks, num_attention_heads) + v = reshape_qkv_helper(hf_layer_sd["self_attn.v_proj.weight"], num_key_value_heads, hidden_size, num_mp_ranks, num_attention_heads) + + return torch.cat([q, k, v], dim=2).reshape(num_mp_ranks, (hidden_size + int(2 * (hidden_size * num_key_value_heads/num_attention_heads)))//num_mp_ranks, hidden_size) + # return torch.cat([hf_layer_sd["self_attn.q_proj.weight"], hf_layer_sd["self_attn.k_proj.weight"], hf_layer_sd["self_attn.v_proj.weight"]], dim=0) def convert_hf_to_sequential(hf_model, seq_state_dict, num_mp_ranks): """Converts the weights of a HuggingFace model to neox 2.0 format. 
@@ -70,8 +88,9 @@ def convert_hf_to_sequential(hf_model, seq_state_dict, num_mp_ranks): returns the updated sequential state dict """ - num_layers = hf_model.config.num_hidden_layers + num_layers = hf_model.config.num_hidden_layers + hf_config = hf_model.config # Embedding is layer idx 0 seq_state_dict[ @@ -88,17 +107,21 @@ def convert_hf_to_sequential(hf_model, seq_state_dict, num_mp_ranks): seq_state_dict[f"sequential.{layer_seq}.attention.query_key_value.weight"] \ - = split_reorder_and_stack_separate_qkv(hf_layer_sd["self_attn.q_proj.weight"], - hf_layer_sd["self_attn.k_proj.weight"], - hf_layer_sd["self_attn.v_proj.weight"], - num_mp_ranks=num_mp_ranks, - dim=0) + = handle_qkv(hf_layer_sd, num_mp_ranks, hf_config) + # = split_reorder_and_stack_separate_qkv(hf_layer_sd["self_attn.q_proj.weight"], + # hf_layer_sd["self_attn.k_proj.weight"], + # hf_layer_sd["self_attn.v_proj.weight"], + # num_mp_ranks=num_mp_ranks, + # dim=0) seq_state_dict[f"sequential.{layer_seq}.attention.dense.weight"] = hf_layer_sd["self_attn.o_proj.weight"] seq_state_dict[f"sequential.{layer_seq}.mlp.w1.weight"] = hf_layer_sd["mlp.gate_proj.weight"] seq_state_dict[f"sequential.{layer_seq}.mlp.w3.weight"] = hf_layer_sd["mlp.up_proj.weight"] seq_state_dict[f"sequential.{layer_seq}.mlp.w2.weight"] = hf_layer_sd["mlp.down_proj.weight"] seq_state_dict[f"sequential.{layer_seq}.input_layernorm.scale"] = hf_layer_sd["input_layernorm.weight"] seq_state_dict[f"sequential.{layer_seq}.post_attention_layernorm.scale"] = hf_layer_sd["post_attention_layernorm.weight"] +# + + # Conversion table for Mistral 7b 0.1 # FOR PARTS LOADED THROUGH hf_model.model @@ -157,6 +180,8 @@ def shard_sequential_mp(num_mp_ranks, sequential): for x in range(num_mp_ranks): ranks[x][k] = v else: + print(k) + print(v.shape) if len(v.shape) == 1: size_per_rank = v.shape[0] / num_mp_ranks if size_per_rank % 128 != 0.0: @@ -258,6 +283,52 @@ def shard_sequential_mp(num_mp_ranks, sequential): else v[:, size_per_rank * x : size_per_rank * (x + 1)] ) + elif len(v.shape) == 3: + if reduce( + np.logical_or, + [ + x in k + for x in [ + "attention.query_key_value.weight", + ] + ], + ): + # row parallel (indices start at 1 since idx 0 is mp rank) + max_, min_ = 1, 2 + else: + raise Exception("Unknown weight to shard: {}".format(k)) + + # size_per_rank = v.shape[max_] + # if size_per_rank % 128 != 0.0: + # padded_size = (128 - (size_per_rank % 128)) + size_per_rank + # size_diff = int((padded_size * num_mp_ranks) - v.shape[max_]) + + # assert ( + # size_diff > 0 + # ), "[ERROR] size diff is negative: {} for size_per_rank: {}, k:{}, shape:{}, padded_size:{}".format( + # size_diff, size_per_rank, k, v.shape, padded_size + # ) + + # zero_pad = ( + # torch.zeros((v.shape[0], size_diff, v.shape[min_])) + # if max_ == 0 + # else torch.zeros((v.shape[min_], size_diff)) + # ) + + # v = torch.cat([v, zero_pad], dim=max_) + # else: + # padded_size = size_per_rank + + # assert size_per_rank % 1.0 == 0.0 + # assert padded_size % 1.0 == 0.0 + + # padded_size = int(padded_size) + # size_per_rank = int(size_per_rank) + + # print("size_per_rank 2", size_per_rank) + # print("padded_size 2", padded_size) + for x in range(num_mp_ranks): + ranks[x][k] = v[x, :, :] else: raise NotImplementedError() @@ -277,6 +348,16 @@ def replace_sharded_seq(mp_checkpoints, mp_sharded_seq): try: mp_checkpoints[mp_key]["module"][k] = shard[k] except KeyError: + if reduce( + np.logical_or, + [ + x in k + for x in [ + "rotary_emb", + ] + ], + ): + continue print("ERROR key:{} not found in 
shard.".format(k)) @@ -407,7 +488,7 @@ def convert(hf_model, ckpt_dir, output_dir): # set latest file within the output_dir latest_file = os.path.join("/".join(output_dir.split("/")[:-1]), "latest") - os.system("rm -r" + latest_file) + os.system("rm -r " + latest_file) with open(latest_file, "w") as f: f.write(output_dir.split("/")[-1]) @@ -473,7 +554,7 @@ def get_non_existing_dir(tmp_dir): parser.add_argument( "--ckpt-tmp-dir", - default="/tmp/ckpt_tmp_dir/" + os.environ["USER"], + default="tmp/convert_ckpt_tmp_dir/JOBID_" + os.environ["SLURM_JOB_ID"], help="Directory to store cached hugging face checkpoints. [WARNING: MUST BE VISIBLE TO ALL RANKS]" ) parser.add_argument( @@ -540,6 +621,9 @@ def get_non_existing_dir(tmp_dir): torch.distributed.barrier() model = get_model(neox_args=neox_args, use_cache=True) + + torch.distributed.barrier() + optimizer, param_groups = get_optimizer(model=model, neox_args=neox_args) lr_scheduler = get_learning_rate_scheduler(optimizer=optimizer, neox_args=neox_args) @@ -553,7 +637,7 @@ def get_non_existing_dir(tmp_dir): config_params=neox_args.deepspeed_config, mpu=mpu if not neox_args.is_pipe_parallel else None, ) - + if os.environ.get("RANK", "1") == "0": os.makedirs(f"{tmp_cache_dir}", exist_ok=True) From bcfb2792806bd500861f6e142966496a8465fc6e Mon Sep 17 00:00:00 2001 From: Adam Ibrahim Date: Thu, 25 Jan 2024 08:00:45 +0000 Subject: [PATCH 32/64] Cleaning up config and removing Autoconfig in conversion script --- configs/mistral_7b.yml | 23 ++++++++++++++++------- convert_hf_to_sequential_mistral.py | 14 +++++++------- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/configs/mistral_7b.yml b/configs/mistral_7b.yml index bd41adc01..c2dbf409e 100644 --- a/configs/mistral_7b.yml +++ b/configs/mistral_7b.yml @@ -42,7 +42,7 @@ }, "zero_optimization": { - "stage": 1, + "stage": 0, "allgather_partitions": true, "allgather_bucket_size": 1260000000, "overlap_comm": true, @@ -91,14 +91,15 @@ "wall_clock_breakdown": true, "tokenizer_type": "SPMTokenizer", - "vocab-file": "/fsx/adami/raw_mistral/mistral-v0.1/7B/tokenizer.model", # use tokenizer.model from Meta CodeLlama download + "vocab-file": "/path/to/mistral-7B-v0.1/tokenizer.model", # use tokenizer.model from Meta CodeLlama download - "save": "/fsx/proj-mathlm/saved-weights/34b_1epoch", + "save": "checkpoints/test", # "load": "" # set to same as "save" to resume from intermediate finetuning step - "load": "/path/to/converted/codellama_34b_weights_with_mp8", + "load": "checkpoints/neox-converted", - "finetune": true, # set to false once resuming from intermediate finetuning step - "checkpoint_validation_with_forward_pass": true, +# Set these 2 (finetune and validation with fw pass) to false during conversion + "finetune": false, # set to false once resuming from intermediate finetuning step + "checkpoint_validation_with_forward_pass": false, "use_wandb": false, @@ -108,5 +109,13 @@ "wandb_host": "https://api.wandb.ai", "launcher": "slurm", - "deepspeed_slurm": true + "deepspeed_slurm": true, + + # Special stability ? 
+ "no_ssh_check": true, + "comment": "neox", + + "train-data-paths": ["data/enwik8/enwik8_text_document"], + "valid-data-paths": ["data/enwik8/enwik8_text_document"], + "test-data-paths": ["data/enwik8/enwik8_text_document"], } diff --git a/convert_hf_to_sequential_mistral.py b/convert_hf_to_sequential_mistral.py index 0aa6b0327..032738654 100644 --- a/convert_hf_to_sequential_mistral.py +++ b/convert_hf_to_sequential_mistral.py @@ -657,17 +657,17 @@ def get_non_existing_dir(tmp_dir): if torch.distributed.get_rank() == 0: - config = AutoConfig.from_pretrained( - f"{args.hf_model_name}", - cache_dir=os.path.join( - args.cache_dir,f"{args.hf_model_name}" - ), - ) + # config = AutoConfig.from_pretrained( + # f"{args.hf_model_name}", + # cache_dir=os.path.join( + # args.cache_dir,f"{args.hf_model_name}" + # ), + # ) # does not change the weights, but is needed to align logits # config.update({'hidden_act':'gelu_fast'}) hf_model = AutoModelForCausalLM.from_pretrained( f"{args.hf_model_name}", - config=config, + #config=config, cache_dir=os.path.join( args.cache_dir,f"{args.hf_model_name}" ), From 753ef0f2f501e6a1cbcf58dbfbea5a21c72ec6c5 Mon Sep 17 00:00:00 2001 From: Adam Ibrahim Date: Thu, 25 Jan 2024 08:09:46 +0000 Subject: [PATCH 33/64] Cleaned up conversion example script --- mistral_7b_convert.sh | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/mistral_7b_convert.sh b/mistral_7b_convert.sh index 3ca98912d..3e4d20568 100644 --- a/mistral_7b_convert.sh +++ b/mistral_7b_convert.sh @@ -1,24 +1,21 @@ #!/bin/bash -#SBATCH --job-name=7b_mistral -#SBATCH --partition=g40x +#SBATCH --job-name=convert_mistral_7B +#SBATCH --partition=a40x # Make sure you need this #SBATCH --nodes=1 -#SBATCH --ntasks-per-node=8 # Crucial - only 1 task per dist per node! 
-####SBATCH --ntasks=1 +#SBATCH --ntasks-per-node=8 #SBATCH --cpus-per-task=12 # Number of cores per tasks #SBATCH --gres=gpu:8 # Number of gpus -#SBATCH --output=mistral_7b_%x_%j.out # Set this dir where you want slurm outs to go -#SBATCH --error=mistral_7b_%x_%j.out # Set this dir where you want slurm outs to go +#SBATCH --output=convert_mistral_7B_%x_%j.out # Set this dir where you want slurm outs to go +#SBATCH --error=convert_mistral_7B_%x_%j.out # Set this dir where you want slurm outs to go #SBATCH --exclusive # Turn off node sharing #SBATCH --account=neox #SBATCH --open-mode=append #SBATCH --requeue #SBATCH --time=0-00:12:00 -#SBATCH --exclude=ip-26-0-157-217 -# setup the environment using the script we created before +# set up the environment using a setup script source ~/setup.sh -#source /fsx/quentin/setup.sh ds_report @@ -50,26 +47,15 @@ export TORCHELASTIC_ERROR_FILE=$TRAIN_PATH/tmp/torch-elastic-error.json export TORCH_EXTENSIONS_DIR=./extensions/ # Move to the gpt-neox install -TRAIN_PATH=/fsx/$(whoami)/gpt-neox #/fsx/hailey/math-lm/gpt-neox +TRAIN_PATH=/weka/$(whoami)/gpt-neox cd $TRAIN_PATH # Write the hostfile for this job export MASTER_ADDR=$(echo $MASTER_ADDR | cut -d '-' -f 2- | tr '-' '.') bash ~/write_ip_hostfile.sh -export DLTS_HOSTFILE=/fsx/$(whoami)/hostfiles/hosts_$SLURM_JOBID +export DLTS_HOSTFILE=/weka/$(whoami)/hostfiles/hosts_$SLURM_JOBID -# python tools/convert_raw_llama_weights_to_neox.py \ -# --input_dir ../raw_mistral/mistral-v0.1/ \ -# --output_dir ../neox-converted/ \ -# --model_size "7B" \ -# --num_output_shards 2 -# # --config_file configs/mistral_7b.yml \ - -# python convert_hf_to_sequential_mistral.py \ -# --hf-model-name mistralai/Mistral-7B-v0.1 \ -# --config configs/mistral_7b.yml \ -# --output-dir ../neox-converted/ python ./deepy.py convert_hf_to_sequential_mistral.py \ -d configs mistral_7b.yml From 3488daeafc9f6625e7378d3e2c69c3e7160a5ab0 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Tue, 13 Feb 2024 15:59:22 +0000 Subject: [PATCH 34/64] cleanup: add back configs folder, discard Llemma readme --- .gitignore | 12 - README.md | 690 ++++++- configs/1-3B.yml | 93 + configs/125M-json.yml | 80 + configs/125M.yml | 96 + configs/13B.yml | 94 + configs/175B.yml | 92 + configs/19M.yml | 97 + configs/2-7B.yml | 93 + configs/20B.yml | 113 ++ configs/350M.yml | 92 + configs/49M.yml | 93 + configs/6-7B.yml | 93 + configs/760M.yml | 93 + configs/800M.yml | 86 + configs/README.md | 314 +++ configs/autotuning_configs/small_tune.json | 78 + configs/autotuning_configs/tune.json | 72 + configs/autotuning_configs/tune_1-3B.json | 86 + configs/autotuning_configs/tune_6-7B.json | 77 + configs/bf16_125M.yml | 80 + configs/bnb_125M.yml | 87 + configs/cpu_mock_config.yml | 5 + configs/docker/paths.yml | 12 + configs/eleutherai_cluster.yml | 29 + configs/finetuning_configs/6-9B.yml | 89 + configs/gen_docs.py | 93 + configs/gmlp_small.yml | 72 + configs/llama/13B.yml | 26 + configs/llama/30B.yml | 26 + configs/llama/65B.yml | 26 + configs/llama/7B.yml | 26 + configs/llama/README.md | 23 + configs/llama/train_config.yml | 73 + configs/llemma-34b-config.yml | 108 + configs/llemma-7b-config.yml | 105 + configs/local_setup.yml | 30 + configs/mistral/7B.yml | 39 + configs/neox_arguments.md | 2065 ++++++++++++++++++++ configs/pythia/1-4B.yml | 85 + configs/pythia/12B.yml | 84 + configs/pythia/14M.yml | 98 + configs/pythia/160M.yml | 85 + configs/pythia/1B.yml | 86 + configs/pythia/2-8B.yml | 87 + configs/pythia/31M.yml | 97 + configs/pythia/410M.yml | 85 + 
configs/pythia/6-9B.yml | 84 + configs/pythia/70M.yml | 85 + configs/slurm_125M.yml | 66 + configs/slurm_local.json | 11 + configs/slurm_local.yml | 12 + configs/sparse.yml | 15 + configs/text_generation.yml | 21 + 54 files changed, 6509 insertions(+), 50 deletions(-) create mode 100644 configs/1-3B.yml create mode 100644 configs/125M-json.yml create mode 100644 configs/125M.yml create mode 100644 configs/13B.yml create mode 100644 configs/175B.yml create mode 100644 configs/19M.yml create mode 100644 configs/2-7B.yml create mode 100644 configs/20B.yml create mode 100644 configs/350M.yml create mode 100644 configs/49M.yml create mode 100644 configs/6-7B.yml create mode 100644 configs/760M.yml create mode 100644 configs/800M.yml create mode 100644 configs/README.md create mode 100644 configs/autotuning_configs/small_tune.json create mode 100644 configs/autotuning_configs/tune.json create mode 100644 configs/autotuning_configs/tune_1-3B.json create mode 100644 configs/autotuning_configs/tune_6-7B.json create mode 100644 configs/bf16_125M.yml create mode 100644 configs/bnb_125M.yml create mode 100644 configs/cpu_mock_config.yml create mode 100644 configs/docker/paths.yml create mode 100644 configs/eleutherai_cluster.yml create mode 100755 configs/finetuning_configs/6-9B.yml create mode 100644 configs/gen_docs.py create mode 100644 configs/gmlp_small.yml create mode 100644 configs/llama/13B.yml create mode 100644 configs/llama/30B.yml create mode 100644 configs/llama/65B.yml create mode 100644 configs/llama/7B.yml create mode 100644 configs/llama/README.md create mode 100644 configs/llama/train_config.yml create mode 100644 configs/llemma-34b-config.yml create mode 100644 configs/llemma-7b-config.yml create mode 100644 configs/local_setup.yml create mode 100644 configs/mistral/7B.yml create mode 100644 configs/neox_arguments.md create mode 100755 configs/pythia/1-4B.yml create mode 100755 configs/pythia/12B.yml create mode 100644 configs/pythia/14M.yml create mode 100755 configs/pythia/160M.yml create mode 100755 configs/pythia/1B.yml create mode 100755 configs/pythia/2-8B.yml create mode 100644 configs/pythia/31M.yml create mode 100755 configs/pythia/410M.yml create mode 100755 configs/pythia/6-9B.yml create mode 100755 configs/pythia/70M.yml create mode 100644 configs/slurm_125M.yml create mode 100644 configs/slurm_local.json create mode 100644 configs/slurm_local.yml create mode 100644 configs/sparse.yml create mode 100644 configs/text_generation.yml diff --git a/.gitignore b/.gitignore index dfdd5d44b..34de4e774 100644 --- a/.gitignore +++ b/.gitignore @@ -150,15 +150,3 @@ test_logs/ logs/ tensorboard/ src/ -*.out - -# extensions -extensions/fused_adam/.ninja_deps -extensions/fused_adam/.ninja_log -extensions/fused_adam/build.ninja -extensions/fused_adam/multi_tensor_adam.cuda.o -extensions/utils/.ninja_deps -extensions/utils/.ninja_log -extensions/utils/build.ninja -extensions/utils/flatten_unflatten.o -testbed_gptneox.py diff --git a/README.md b/README.md index 62b9ab82c..2a06ac9bf 100644 --- a/README.md +++ b/README.md @@ -1,78 +1,692 @@ -# Work in Progress: adding support for Mistral +[![GitHub issues](https://img.shields.io/github/issues/EleutherAI/gpt-neox)](https://github.com/EleutherAI/gpt-neox/issues) +[Weights & Biases monitoring](https://wandb.ai/eleutherai/neox) + +# GPT-NeoX + +This repository records [EleutherAI](https://www.eleuther.ai)'s library for training large-scale language models on GPUs. 
Our current framework is based on NVIDIA's [Megatron Language Model](https://github.com/NVIDIA/Megatron-LM) and has been augmented with techniques from [DeepSpeed](https://www.deepspeed.ai) as well as some novel optimizations. We aim to make this repo a centralized and accessible place to gather techniques for training large-scale autoregressive language models, and accelerate research into large-scale training. This library is in widespread use in [academic, industry, and government labs](https://github.com/EleutherAI/gpt-neox#adoption-and-publications), including by researchers at Oak Ridge National Lab, CarperAI, Stability AI, Together.ai, Korea University, Carnegie Mellon University, and the University of Tokyo among others. Uniquely among similar libraries GPT-NeoX supports a wide variety of systems and hardwares, including launching via Slurm, MPI, and the IBM Job Step Manager, and has been run at scale on [AWS](https://aws.amazon.com/), [CoreWeave](https://www.coreweave.com/), [ORNL Summit](https://www.olcf.ornl.gov/summit/), [ORNL Frontier](https://www.olcf.ornl.gov/frontier/), [LUMI](https://www.lumi-supercomputer.eu/), and others. + +**If you are not looking to train models with billions of parameters from scratch, this is likely the wrong library to use. For generic inference needs, we recommend you use the Hugging Face `transformers` library instead which supports GPT-NeoX models.** + +## Why GPT-NeoX? + +GPT-NeoX leverages many of the same features and technologies as the popular Megatron-DeepSpeed library but with substantially increased usability and novel optimizations. Major features include: +* Distributed training with ZeRO and 3D parallelism +* A wide variety of systems and hardwares, including launching via Slurm, MPI, and the IBM Job Step Manager, and has been run at scale on [AWS](https://aws.amazon.com/), [CoreWeave](https://www.coreweave.com/), [ORNL Summit](https://www.olcf.ornl.gov/summit/), [ORNL Frontier](https://www.olcf.ornl.gov/frontier/), [LUMI](https://www.lumi-supercomputer.eu/), and others. +* Cutting edge architectural innovations including rotary and alibi positional embeddings, parallel feedforward attention layers, and flash attention. +* Predefined configurations for popular architectures including Pythia, PaLM, Falcon, and LLaMA 1 \& 2 +* Curriculum Learning +* Easy connections with the open source ecosystem, including Hugging Face's [tokenizers](https://github.com/huggingface/tokenizers) and [transformers](https://github.com/huggingface/transformers/) libraries, logging via [WandB](https://wandb.ai/site), and evaluation via our [Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness). + +## News +**[8/10/2023]** We now support checkpointing with AWS S3! Activate with the `s3_path` config option (for more detail, see [the PR](https://github.com/EleutherAI/gpt-neox/pull/1010)) + +**[9/20/2023]** As of https://github.com/EleutherAI/gpt-neox/pull/1035, we have deprecated Flash Attention 0.x and 1.x, and migrated support to Flash Attention 2.x. We don't believe this will cause problems, but if you have a specific use-case that requires old flash support using the latest GPT-NeoX, please raise an issue. + +**[8/10/2023]** We have experimental support for LLaMA 2 and Flash Attention v2 supported in our [math-lm](https://github.com/EleutherAI/math-lm) project that will be upstreamed later this month. + +**[5/17/2023]** After fixing some miscellaneous bugs we now fully support bf16. 
+ +**[4/11/2023]** We have upgraded our Flash Attention implementation to now support Alibi positional embeddings. + +**[3/9/2023]** We have released GPT-NeoX 2.0.0, an upgraded version built on the latest DeepSpeed which will be regularly synced with going forward. + +## Versions + +Prior to 3/9/2023, GPT-NeoX relied on [DeeperSpeed](https://github.com/EleutherAI/DeeperSpeed), which was based on an old version of DeepSpeed (0.3.15). In order to migrate to the latest upstream DeepSpeed version while allowing users to access the old versions of GPT-NeoX and DeeperSpeed, we have introduced two versioned releases for both libraries: + +- Version 2.0 of [GPT-NeoX](https://github.com/EleutherAI/gpt-neox/releases/tag/v2.0) and [DeeperSpeed](https://github.com/EleutherAI/DeeperSpeed/releases/tag/v2.0) are the latest versions built on the latest DeepSpeed, and will be maintained going forward. +- Version 1.0 of [GPT-NeoX](https://github.com/EleutherAI/gpt-neox/releases/tag/v1.0) and [DeeperSpeed](https://github.com/EleutherAI/DeeperSpeed/releases/tag/v1.0) maintain snapshots of the old stable versions that [GPT-NeoX-20B](https://arxiv.org/abs/2204.06745) and the [Pythia Suite](https://github.com/EleutherAI/pythia) were trained on. + +# Contents + +- [GPT-NeoX](#gpt-neox) + * [Why GPT-NeoX?](#why-gpt-neox) + * [News](#news) + * [Versions](#versions) +- [Contents](#contents) +- [Quick Start](#quick-start) + * [Environment and Dependencies](#environment-and-dependencies) + + [Host Setup](#host-setup) + + [Flash Attention](#flash-attention) + + [Multi-Node Launching](#multi-node-launching) + + [Containerized Setup](#containerized-setup) + * [Usage](#usage) +- [Configuration](#configuration) +- [Datasets](#datasets) + * [Preconfigured Datasets](#preconfigured-datasets) + * [Using Custom Data](#using-custom-data) +- [Training and Finetuning](#training-and-finetuning) + * [Pretrained Models](#pretrained-models) + + [GPT-NeoX-20B](#gpt-neox-20b) + + [Pythia](#pythia) + + [Polyglot](#polyglot) +- [Inference](#inference) +- [Evaluation](#evaluation) +- [Exporting to Hugging Face](#exporting-to-hugging-face) +- [Monitoring](#monitoring) + * [Weights and Biases](#weights-and-biases) + * [TensorBoard](#tensorboard) +- [Running on multi-node](#running-on-multi-node) +- [Adoption and Publications](#adoption-and-publications) + * [Publications](#publications) + * [Models](#models) + + [English LLMs](#english-llms) + + [Non-English LLMs](#non-english-llms) + + [Code Models](#code-models) + + [Other Modalities](#other-modalities) +- [Administrative Notes](#administrative-notes) + * [Citing GPT-NeoX](#citing-gpt-neox) + * [Contributing](#contributing) + * [Licensing](#licensing) + * [Acknowledgements](#acknowledgements) + +# Quick Start + +## Environment and Dependencies + +### Host Setup + +First make sure you are in an environment with Python 3.8 with an appropriate version of PyTorch 1.8 or later installed. **Note:** Some of the libraries that GPT-NeoX depends on have not been updated to be compatible with Python 3.10+. Python 3.9 appears to work, but this codebase has been developed and tested for Python 3.8. + +To install the remaining basic dependencies, run: -This branch is work in progress to add Mistral 7B support. This branch is currently undergoing development, and is not meant to be stable. 
+```bash +pip install -r requirements/requirements.txt +pip install -r requirements/requirements-wandb.txt # optional, if logging using WandB +pip install -r requirements/requirements-tensorboard.txt # optional, if logging via tensorboard +python ./megatron/fused_kernels/setup.py install # optional, if using fused kernels +``` + +from the repository root. + +> [!Warning] +> Our codebase relies on [DeeperSpeed](https://github.com/EleutherAI/DeeperSpeed), our fork of the [DeepSpeed](https://github.com/microsoft/DeepSpeed) library with some added changes. We strongly recommend using Anaconda, a virtual machine, or some other form of environment isolation before continuing. Failure to do so may cause other repositories that rely on DeepSpeed to break. + + + +### Flash Attention + +To use [Flash-Attention](https://github.com/HazyResearch/flash-attention), install the additional dependencies in `./requirements/requirements-flashattention.txt` and set the attention type in your configuration accordingly (see [configs](./configs/)). This can provide significant speed-ups over regular attention on certain GPU architectures, including Ampere GPUs (such as A100s); see the repository for more details. -# LLeMA Pretraining +### Multi-Node Launching + +NeoX and Deep(er)Speed support training on multiple different nodes and you have the option of using a variety of different launchers to orchestrate multi-node jobs. + +In general there needs to be a "hostfile" somewhere accessible with the format: + +```bash +node1_ip slots=8 +node2_ip slots=8 +``` + +where the first column contains the IP address for each node in your setup and the number of slots is the number of GPUs that node has access to. In your config you must pass in the path to the hostfile with `"hostfile": "/path/to/hostfile"`. Alternatively the path to the hostfile can be in the environment variable `DLTS_HOSTFILE`. + +#### pdsh + +`pdsh` is the default launcher, and if you're using `pdsh` then all you must do (besides ensuring that pdsh is installed in your environment) is set `{"launcher": "pdsh"}` in your config files. + +#### MPI + +If using MPI then you must specify the MPI library (DeepSpeed/GPT-NeoX currently supports `mvapich`, `openmpi`, `mpich`, and `impi`, though `openmpi` is the most commonly used and tested) as well as pass the `deepspeed_mpi` flag in your config file: + +```json +{ + "launcher": "openmpi", + "deepspeed_mpi": true +} +``` + +With your environment properly set up and the correct configuration files you can use `deepy.py` like a normal python script and start (for example) a training job with: + +`python3 deepy.py train.py /path/to/configs/my_model.yml` + +#### Slurm + +Using Slurm can be slightly more involved. Like with MPI, you must add the following to your config: + +```json +{ + "launcher": "slurm", + "deepspeed_slurm": true +} +``` +If you do not have ssh access to the compute nodes in your Slurm cluster you need to add `{"no_ssh_check": true}` -This is a modified version of the `EleutherAI/GPT-NeoX` repository used for the Llemma project. This branch diverged from `main` at commit `009018e`. This branch implements the following features that are not present in `009018e` (some of these features may have subsequently been merged into `main`): -- [FlashAttention-2](https://arxiv.org/abs/2307.08691) -- Grouped Query Attention -- A numerical precision fix for RoPE -- Saving checkpoints to Amazon S3. 
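+
+Putting these together, a minimal Slurm launch configuration might look like the following (a sketch; merge these keys into one of your existing config files alongside your other settings, and omit `no_ssh_check` if you do have ssh access to the compute nodes):
+
+```json
+{
+  "launcher": "slurm",
+  "deepspeed_slurm": true,
+  "no_ssh_check": true
+}
+```
+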
+#### (Advanced) Custom Launching

-The remaining portion of this `README` contains instructions to replicate pretraining of the LLeMA models.

+There are many cases where the above default launching options are not sufficient:

-Training was performed across 256 A100 GPUs. We include configuration files and sample SLURM job script for the library to replicate training on a SLURM-managed cluster.

+- Many clusters have their own unique job scheduler or specific MPI/Slurm arguments necessary for launching jobs, such as [Summit JSRun](https://docs.olcf.ornl.gov/systems/summit_user_guide.html#job-launcher-jsrun) or [LLNL Flux](https://computing.llnl.gov/projects/flux-building-framework-resource-management)
+- While the above Slurm/MPI/pdsh default options are enough for most job runs, advanced users may want to add arguments for optimization or debugging purposes
+
+In these cases, you will need to modify the DeepSpeed [multinode runner](https://github.com/microsoft/DeepSpeed/blob/17957728c0362bf8ae70feca308e491e55ef9feb/deepspeed/launcher/multinode_runner.py) utility to support your use case. Broadly, these enhancements fall under two categories:

-## Replicating Training

+##### 1. Adding a Launcher (e.g. [JSRun](https://docs.olcf.ornl.gov/systems/summit_user_guide.html#job-launcher-jsrun), [Flux](https://computing.llnl.gov/projects/flux-building-framework-resource-management), etc)
+In this case, you must add a new multinode runner class to `deepspeed/launcher/multinode_runner.py` and expose it as a configuration option in GPT-NeoX. Examples of how we did this for [Summit JSRun](https://docs.olcf.ornl.gov/systems/summit_user_guide.html#job-launcher-jsrun) are in [this DeeperSpeed commit](https://github.com/EleutherAI/DeeperSpeed/commit/9aed6c8500d7c492d85c5c88687322dbda70e370) and [this GPT-NeoX commit](https://github.com/EleutherAI/gpt-neox/commit/3782c7ae60f8624e566e3879b89bb09e8b59b869), respectively.

-### Set up environment

+##### 2. Modifying Run Command or Environment Variables

-We provide a file containing a dump of our training environment.

+We have encountered many cases where we wish to modify the MPI/Slurm run command for an optimization or to debug (e.g. to modify the [Slurm srun CPU binding](https://slurm.schedmd.com/srun.html#OPT_cpu-bind) or to tag MPI logs with the rank). In this case, you must modify the multinode runner class' run command under its `get_cmd` method (e.g. [mpirun_cmd](https://github.com/microsoft/DeepSpeed/blob/17957728c0362bf8ae70feca308e491e55ef9feb/deepspeed/launcher/multinode_runner.py#L135-L147) for OpenMPI). Examples of how we did this to provide optimized and rank-tagged run commands using Slurm and OpenMPI for the Stability cluster are in [this DeeperSpeed branch](https://github.com/microsoft/DeepSpeed/compare/master...EleutherAI:DeeperSpeed:v2.0-stability).
+
+
+#### Hostfile Generation
+
+In general you will not be able to have a single fixed hostfile, so you need to have a script to generate one dynamically when your job starts. 
An example script to dynamically generate a hostfile using [Slurm](https://slurm.schedmd.com/documentation.html) and 8 GPUs per node is: + +```bash +#!/bin/bash +GPUS_PER_NODE=8 +mkdir -p /sample/path/to/hostfiles +# need to add the current slurm jobid to hostfile name so that we don't add to previous hostfile +hostfile=/sample/path/to/hostfiles/hosts_$SLURM_JOBID +# be extra sure we aren't appending to a previous hostfile +rm $hostfile &> /dev/null +# loop over the node names +for i in `scontrol show hostnames $SLURM_NODELIST` +do + # add a line to the hostfile + echo $i slots=$GPUS_PER_NODE >>$hostfile +done +``` + +`$SLURM_JOBID` and `$SLURM_NODELIST` being environment variables Slurm will create for you. See the [sbatch documentation](https://slurm.schedmd.com/sbatch.html#SECTION_OUTPUT-ENVIRONMENT-VARIABLES) for a full list of available Slurm environment variables set at job creation time. + +#### Job Launching + +Then you can create an [sbatch](https://slurm.schedmd.com/sbatch.html) script from which to kick off your GPT-NeoX job. A bare-bones sbatch script on a Slurm-based cluster with 8 GPUs per node would look like this: -You can install all required packages via ```bash -pip install -r requirements.txt +#!/bin/bash +#SBATCH --job-name="neox" +#SBATCH --partition=your-partition +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=8 +#SBATCH --gres=gpu:8 + +# Some potentially useful distributed environment variables +export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"` +export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) +export MASTER_PORT=12802 +export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l` + +# Your hostfile creation script from above +./write_hostfile.sh +# Tell DeepSpeed where to find our generated hostfile via DLTS_HOSTFILE +export DLTS_HOSTFILE=/sample/path/to/hostfiles/hosts_$SLURM_JOBID + +# Launch training +python3 deepy.py train.py /sample/path/to/your/configs/my_model.yml + +``` + +You can then kick off a training run with `sbatch my_sbatch_script.sh` + + +### Containerized Setup + +We also provide a Dockerfile and docker-compose configuration if you prefer to run NeoX in a container. + +Requirements to run the container are to have appropriate GPU drivers, an up-to-date installation of Docker, and [nvidia-container-toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) installed. To test if your installation is good you can use their "sample workload", which is: + +``` +docker run --rm --runtime=nvidia --gpus all ubuntu nvidia-smi +``` + +Provided that will run, you need to export NEOX_DATA_PATH and NEOX_CHECKPOINT_PATH in your environment to specify your data directory and directory for storing and loading checkpoints: + +``` +export NEOX_DATA_PATH=/mnt/sda/data/enwiki8 #or wherever your data is stored on your system +export NEOX_CHECKPOINT_PATH=/mnt/sda/checkpoints +``` + +And then, from the gpt-neox directory, you can build the image and run a shell in a container with + +``` +docker compose run gpt-neox bash ``` -Make sure you are installing https://github.com/EleutherAI/DeeperSpeed/tree/new-fix for your DeepSpeed version and install fused kernels for GPT-NeoX via `python ./megatron/fused_kernels/setup.py install` from within your GPT-NeoX install. +After the build, you should be able to do this: +``` +mchorse@537851ed67de:~$ echo $(pwd) +/home/mchorse +mchorse@537851ed67de:~$ ls -al +total 48 +drwxr-xr-x 1 mchorse mchorse 4096 Jan 8 05:33 . 
+drwxr-xr-x 1 root root 4096 Jan 8 04:09 .. +-rw-r--r-- 1 mchorse mchorse 220 Feb 25 2020 .bash_logout +-rw-r--r-- 1 mchorse mchorse 3972 Jan 8 04:09 .bashrc +drwxr-xr-x 4 mchorse mchorse 4096 Jan 8 05:35 .cache +drwx------ 3 mchorse mchorse 4096 Jan 8 05:33 .nv +-rw-r--r-- 1 mchorse mchorse 807 Feb 25 2020 .profile +drwxr-xr-x 2 root root 4096 Jan 8 04:09 .ssh +drwxrwxr-x 8 mchorse mchorse 4096 Jan 8 05:35 chk +drwxrwxrwx 6 root root 4096 Jan 7 17:02 data +drwxr-xr-x 11 mchorse mchorse 4096 Jan 8 03:52 gpt-neox +``` -### Converting Llama 2 checkpoints into NeoX format +For a long-running job, you should run -First, download CodeLlama 7b or 34b from the Meta AI repo and rename the download folder to 7B or 34B within the CodeLlama repository. +``` +docker compose up -d +``` -Then, to convert either model into the format expected by GPT-NeoX for checkpoints: +to run the container in detached mode, and then, in a separate terminal session, run + +``` +docker compose exec gpt-neox bash +``` + +You can then run any job you want from inside the container. + +Concerns when running for a long time or in detached mode include + - You will have to terminate the container manually when you are no longer using it + - If you want processes to continue running when your shell session ends, you will need to background them. + - If you then want logging, you will have to make sure to pipe logs to disk or set up wandb. + +If you prefer to run the prebuilt container image from dockerhub, you can run the docker compose commands with ```-f docker-compose-dockerhub.yml``` instead, e.g., + +``` +docker compose run -f docker-compose-dockerhub.yml gpt-neox bash +``` + +## Usage + +All functionality should be launched using `deepy.py`, a wrapper around the `deepspeed` launcher. + +We currently offer three main functions: +1. `train.py` is used for training and finetuning models. +2. `eval.py` is used to evaluate a trained model using the [language model evaluation harness](https://github.com/EleutherAI/lm-evaluation-harness). +3. `generate.py` is used to sample text from a trained model. + +which can be launched with: -Sample command for 7b Meta->NeoX format: ```bash -python convert_raw_llama_weights_to_hf.py --input_dir /path/to/codellama/repo --config_file /path/to/this/repo/math-lm/pretraining/llemma_7b.yml --output_dir /path/to/save/into/ --num_output_shards {TP_DEGREE, we use 2} +./deepy.py [script.py] [./path/to/config_1.yml] [./path/to/config_2.yml] ... [./path/to/config_n.yml] ``` -Sample command for 34b Meta->NeoX format: -(Requires large amounts of GPU VRAM or CPU RAM. Pass `CUDA_VISIBLE_DEVICES=""` to perform conversion on CPU. 34b conversion may take a while) +For example, to launch training you can run ```bash -CUDA_VISIBLE_DEVICES="" python convert_raw_llama_weights_to_hf.py --input_dir /path/to/codellama/repo --config_file /path/to/this/repo/math-lm/pretraining/llemma_34b.yml --output_dir /path/to/save/into/ --num_output_shards {TP_DEGREE, we use 8} +./deepy.py train.py ./configs/20B.yml ./configs/local_cluster.yml ``` +For more details on each entry point, see the [Training and Finetuning](#training-and-finetuning), [Inference](#inference) and [Evaluation](#evaluation) respectively. -### Check Out Codebase +# Configuration -Next, check out the commit used to train the model you are replicating. +GPT-NeoX parameters are defined in a YAML configuration file which is passed to the deepy.py launcher. 
We have provided some example .yml files in [configs](./configs/), showing a diverse array of features and model sizes. -* 7b / 34b: https://github.com/EleutherAI/gpt-neox/commit/{this_commit_hash} +These files are generally complete, but non-optimal. For example, depending on your specific GPU configuration, you may need to change some settings such as `pipe-parallel-size`, `model-parallel-size` to increase or decrease the degree of parallelisation, `train_micro_batch_size_per_gpu` or `gradient-accumulation-steps` to modify batch size related settings, or the `zero_optimization` dict to modify how optimizer states are parallelised across workers. -### Launching Training +For a more detailed guide to the features available and how to configure them, see [the configuration README](configs/README.md), and for documentation of every possible argument, see [configs/neox_arguments.md](configs/neox_arguments.md). + +# Datasets + +## Preconfigured Datasets + +Several preconfigured datasets are available, including most components from [the Pile](https://arxiv.org/abs/2101.00027), as well as the Pile train set itself, for straightforward tokenization using the `prepare_data.py` entry point. + +E.G, to download and tokenize the enwik8 dataset with the GPT2 Tokenizer, saving them to `./data` you can run: + +``` +python prepare_data.py -d ./data +``` + +or a single shard of the pile (`pile_subset`) with the GPT-NeoX-20B tokenizer (assuming you have it saved at `./20B_checkpoints/20B_tokenizer.json`): + +``` +python prepare_data.py -d ./data -t HFTokenizer --vocab-file ./20B_checkpoints/20B_tokenizer.json pile_subset +``` + +The tokenized data will be saved out to two files: `[data-dir]/[dataset-name]/[dataset-name]_text_document.bin`and `[data-dir]/[dataset-name]/[dataset-name]_text_document.idx`. You will need to add the prefix that both these files share to your training configuration file under the `data-path` field. E.G: + +```yaml + "data-path": "./data/enwik8/enwik8_text_document", +``` + +## Using Custom Data + +To prepare your own dataset for training with custom data, format it as one large [jsonl](https://jsonlines.org/)-formatted file with each item in the list of dictionaries being a separate document. The document text should be grouped under one JSON key, i.e `"text"`. Any auxiliary data stored in other fields will not be used. 
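+
+For example, a small `mydataset.jsonl` (a hypothetical file) might contain lines like the following, where only the `"text"` field is used and the extra `"source"` field is ignored:
+
+```json
+{"text": "The full text of the first document.", "source": "example"}
+{"text": "The full text of the second document, on its own line."}
+```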
+ +Next make sure to download the GPT2 tokenizer vocab, and merge files from the following links: + +- Vocab: https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json +- Merge: https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt + +Or use the 20B tokenizer (for which only a single Vocab file is needed): + +- Vocab: https://the-eye.eu/public/AI/models/GPT-NeoX-20B/slim_weights/20B_tokenizer.json + +(alternatively, you can provide any tokenizer file that can be loaded by Hugging Face's tokenizers library with the `Tokenizer.from_pretrained()` command) + +You can now pretokenize your data using `tools/datasets/preprocess_data.py`, the arguments for which are detailed below: + +``` +usage: preprocess_data.py [-h] --input INPUT [--jsonl-keys JSONL_KEYS [JSONL_KEYS ...]] [--num-docs NUM_DOCS] --tokenizer-type {HFGPT2Tokenizer,HFTokenizer,GPT2BPETokenizer,CharLevelTokenizer} [--vocab-file VOCAB_FILE] [--merge-file MERGE_FILE] [--append-eod] [--ftfy] --output-prefix OUTPUT_PREFIX + [--dataset-impl {lazy,cached,mmap}] [--workers WORKERS] [--log-interval LOG_INTERVAL] + +optional arguments: + -h, --help show this help message and exit + +input data: + --input INPUT Path to input jsonl files or lmd archive(s) - if using multiple archives, put them in a comma separated list + --jsonl-keys JSONL_KEYS [JSONL_KEYS ...] + space separate listed of keys to extract from jsonl. Defa + --num-docs NUM_DOCS Optional: Number of documents in the input data (if known) for an accurate progress bar. + +tokenizer: + --tokenizer-type {HFGPT2Tokenizer,HFTokenizer,GPT2BPETokenizer,CharLevelTokenizer} + What type of tokenizer to use. + --vocab-file VOCAB_FILE + Path to the vocab file + --merge-file MERGE_FILE + Path to the BPE merge file (if necessary). + --append-eod Append an token to the end of a document. + --ftfy Use ftfy to clean text + +output data: + --output-prefix OUTPUT_PREFIX + Path to binary output file without suffix + --dataset-impl {lazy,cached,mmap} + Dataset implementation to use. Default: mmap + +runtime: + --workers WORKERS Number of worker processes to launch + --log-interval LOG_INTERVAL + Interval between progress updates + +``` + +For example: + +```bash +python tools/datasets/preprocess_data.py \ + --input ./data/mydataset.jsonl.zst \ + --output-prefix ./data/mydataset \ + --vocab ./data/gpt2-vocab.json \ + --merge-file gpt2-merges.txt \ + --dataset-impl mmap \ + --tokenizer-type GPT2BPETokenizer \ + --append-eod +``` + +You would then run training with the following settings added to your configuration file: + +```yaml + "data-path": "data/mydataset/mydataset", +``` + +# Training and Finetuning + +Training is launched using `deepy.py`, a wrapper around DeepSpeed's launcher, which launches the same script in parallel across many GPUs / nodes. + +The general usage pattern is: + +```bash +python ./deepy.py train.py [path/to/config1.yml] [path/to/config2.yml] ... +``` + +You can pass in an arbitrary number of configs which will all be merged at runtime. + +You can also optionally pass in a config prefix, which will assume all your configs are in the same folder and append that prefix to their path. + +E.G: + +```bash +python ./deepy.py train.py -d configs 125M.yml local_setup.yml +``` + +This will deploy the `train.py` script on all nodes with one process per GPU. The worker nodes and number of GPUs are specified in the `/job/hostfile` file (see [parameter documentation](configs/README.md)), or can simply be passed in as the `num_gpus` arg if running on a single node setup. 
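+
+For instance, on a single node with 8 GPUs you could skip the hostfile entirely and add something like the following to one of your config files (a sketch; adjust the count to your machine):
+
+```json
+{
+  "num_gpus": 8
+}
+```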
+
+Although this is not strictly necessary, we find it useful to define the model parameters in one config file (e.g. `configs/125M.yml`) and the data path parameters in another (e.g. `configs/local_setup.yml`).
+
+
+## Pretrained Models
+
+### GPT-NeoX-20B
+
+GPT-NeoX-20B is a 20 billion parameter autoregressive language model trained on [the Pile](https://arxiv.org/abs/2101.00027). Technical details about GPT-NeoX-20B can be found in [the associated paper](https://arxiv.org/abs/2204.06745). The configuration file for this model is both available at [`./configs/20B.yml`](./configs/20B.yml) and included in the download links below.
+
+[Slim weights](https://the-eye.eu/public/AI/models/GPT-NeoX-20B/slim_weights/) - (No optimizer states, for inference or finetuning, 39GB)
+
+To download from the command line to a folder named `20B_checkpoints`, use the following command:
+
+```bash
+wget --cut-dirs=5 -nH -r --no-parent --reject "index.html*" https://the-eye.eu/public/AI/models/GPT-NeoX-20B/slim_weights/ -P 20B_checkpoints
+```
+
+[Full weights](https://the-eye.eu/public/AI/models/GPT-NeoX-20B/full_weights/) - (Including optimizer states, 268GB)
+
+To download from the command line to a folder named `20B_checkpoints`, use the following command:
+
+```bash
+wget --cut-dirs=5 -nH -r --no-parent --reject "index.html*" https://the-eye.eu/public/AI/models/GPT-NeoX-20B/full_weights/ -P 20B_checkpoints
+```
+
+Weights can alternatively be downloaded using a BitTorrent client. Torrent files can be downloaded here: [slim weights](https://the-eye.eu/public/AI/models/GPT-NeoX-20B/slim_weights.torrent), [full weights](https://the-eye.eu/public/AI/models/GPT-NeoX-20B/full_weights.torrent).
+
+We additionally have 150 checkpoints saved throughout training, one every 1,000 steps. We are working on figuring out how to best serve these at scale, but in the meantime, people interested in working with the partially trained checkpoints can email us at contact@eleuther.ai to arrange access.
+
+### Pythia
+
+The Pythia Scaling Suite is a suite of models ranging from 70M to 12B parameters trained on [the Pile](https://pile.eleuther.ai), intended to promote research on interpretability and training dynamics of large language models. Further details about the project and links to the models can be found in [the paper](https://arxiv.org/abs/2304.01373) and [on the project's GitHub](https://github.com/EleutherAI/pythia).
+
+### Polyglot
+
+The Polyglot Project is an effort to train powerful non-English pretrained language models to promote the accessibility of this technology to researchers outside the dominant powerhouses of machine learning. EleutherAI has trained and released 1.3B, 3.8B, and 5.8B parameter Korean language models, the largest of which outperforms all other publicly available language models on Korean language tasks. Further details about the project and links to the models can be found [here](https://github.com/EleutherAI/polyglot).
+
+# Inference
+
+**For most uses we recommend deploying models trained using the GPT-NeoX library via the Hugging Face Transformers library, which is better optimized for inference.**
+
+We support three types of generation from a pretrained model:
+1. Unconditional generation
+2. Conditional generation based on an input read from a file
+3. 
Interactive generation, which allows for multiple rounds of back-and-forth between a user and the language model via a command line interface + +All three types of text generation can be launched via `python ./deepy.py generate.py -d configs 125M.yml local_setup.yml text_generation.yml` with the appropriate values set in `configs/text_generation.yml`. + +# Evaluation + +GPT-NeoX supports evaluation on downstream tasks through the [language model evaluation harness](https://github.com/EleutherAI/lm-evaluation-harness). + +To evaluate a trained model on the evaluation harness, simply run: + +```bash +python ./deepy.py eval.py -d configs your_configs.yml --eval_tasks task1 task2 ... taskn +``` + +where `--eval_tasks` is a list of evaluation tasks followed by spaces, e.g `--eval_tasks lambada hellaswag piqa sciq`. For details of all tasks available, refer to the [lm-evaluation-harness repo](https://github.com/EleutherAI/lm-evaluation-harness). + +# Exporting to Hugging Face + +GPT-NeoX is optimized heavily for training only, and GPT-NeoX model checkpoints are not compatible out of the box with other deep learning libraries. To make models easily loadable and shareable with end users, and for further exporting to various other frameworks, GPT-NeoX supports checkpoint conversion to the [Hugging Face Transformers](https://arxiv.org/abs/1910.03771) format. + +Though NeoX supports a number of different architectural configurations, including AliBi positional embeddings, not all of these configurations map cleanly onto the supported configurations within Hugging Face Transformers. + +NeoX supports export of compatible models into the following architectures: +- GPTNeoXForCausalLM +- LlamaForCausalLM (GQA Support Coming Soon -- all Llama 1 models and Llama 2 / Codellama up to size 13B supported) + +Training a model which does not fit into one of these Hugging Face Transformers architectures cleanly will require writing custom modeling code for the exported model. + +To convert a GPT-NeoX library checkpoint to Hugging Face-loadable format, run: +```bash +python ./tools/ckpts/convert_neox_to_hf.py --input_dir /path/to/model/global_stepXXX --config_file your_config.yml --output_dir hf_model/save/location --precision {auto,fp16,bf16,fp32} --architecture {neox,llama} +``` + +Then to upload a model to [the Hugging Face Hub](https://huggingface.co/), run: +```bash +huggingface-cli login +python ./tools/ckpts/upload.py +``` +and input the requested information, including HF hub user token. + +### Importing Models Into GPT-NeoX + +NeoX supplies several utilities for converting a pretrained model checkpoint into a format that can be trained within the library. + +The following models can be loaded in GPT-NeoX: +- Llama 1 +- Llama 2 (Up to size 13B) +- CodeLlama (Up to size 13B) +- Mistral-7b-v0.1 (Coming Soon!) + +We provide two utilities for converting from two different checkpoint formats into a format compatible with GPT-NeoX. 
+
+To convert a Llama 1 or Llama 2 checkpoint distributed by Meta AI from its original file format (downloadable [here](https://github.com/facebookresearch/llama) or [here](https://huggingface.co/meta-llama/Llama-2-7b)) into the GPT-NeoX library, run
+
+```
+python tools/ckpts/convert_raw_llama_weights_to_neox.py --input_dir /path/to/model/parent/dir/7B --model_size 7B --output_dir /path/to/save/ckpt --num_output_shards  (--pipeline_parallel if pipeline-parallel-size >= 1)
+```
+
+
+To convert from a Hugging Face model into a NeoX-loadable format, run `tools/ckpts/convert_hf_to_sequential.py`. See documentation within that file for further options.
+
+
+# Monitoring
+
+In addition to storing logs locally, we provide built-in support for two popular experiment monitoring frameworks: [Weights & Biases](https://wandb.ai/site) and [TensorBoard](https://www.tensorflow.org/tensorboard/).
+
+## Weights and Biases
+
+EleutherAI is currently using [Weights & Biases to record our experiments](https://wandb.ai/eleutherai/neox). If you are logged into Weights & Biases on your machine (you can do this by executing `wandb login`), your runs will automatically be recorded. There are two optional fields associated with Weights & Biases: `wandb_group` allows you to name the run group, and `wandb_team` allows you to assign your runs to an organization or team account.
+
+## TensorBoard
+
+We also support using TensorBoard via the `tensorboard-dir` field. Dependencies required for TensorBoard monitoring can be installed from `./requirements/requirements-tensorboard.txt`.
+
+# Running on multi-node
+
+If you need to supply a hostfile for use with the MPI-based DeepSpeed launcher, you can set the environment variable `DLTS_HOSTFILE` to point to the hostfile.
+
+# Adoption and Publications
+
+The GPT-NeoX library has been widely adopted by academic and industry researchers and has been ported to many HPC systems.
+
+If you have found this library useful in your research, please reach out and let us know! We would love to add you to our lists.
+
+## Publications
+
+EleutherAI and our collaborators have used it in the following publications:
+ - **Sid Black**, **Stella Biderman**, **Eric Hallahan**, **Quentin Anthony**, **Leo Gao**, **Laurence Golding**, **Horace He**, **Connor Leahy**, **Kyle McDonell**, **Jason Phang**, **Michael Pieler**, **Shivanshu Purohit**, **Laria Reynolds**, **Jon Tow**, **Ben Wang**, and **Samuel Weinbach**. "[GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745)." In *Proceedings of the ACL Workshop on Challenges \& Perspectives in Creating Large Language Models*, 2022.
+ - **Stella Biderman**, **Hailey Schoelkopf**, **Quentin Anthony**, **Herbie Bradley**, **Kyle O'Brien**, **Eric Hallahan**, **Mohammad Aflah Khan**, **Shivanshu Purohit**, **USVSN Sai Prashanth**, Edward Raff, **Aviya Skowron**, **Lintang Sutawika**, **Oskar van der Wal**. "[Pythia: A suite for analyzing large language models across training and scaling](https://arxiv.org/abs/2304.01373)." In _International Conference on Machine Learning_, pp. 2397-2430. _PMLR_, 2023.
+ - Zhangir Azerbayev, Bartosz Piotrowski, **Hailey Schoelkopf**, Edward W. Ayers, Dragomir Radev, and Jeremy Avigad. "[Proofnet: Autoformalizing and formally proving undergraduate-level mathematics](https://arxiv.org/abs/2302.12433)." *arXiv preprint arXiv:2302.12433*, 2023. 
+ - **Stella Biderman**, **USVSN Sai Prashanth**, **Lintang Sutawika**, **Hailey Schoelkopf**, **Quentin Anthony**, **Shivanshu Purohit**, and Edward Raff. "[Emergent and predictable memorization in large language models.](https://arxiv.org/abs/2304.11158)" In _Neural Information Processing Systems_, 2023. + - **Hyunwoong Ko**, **Kichang Yang**, **Minho Ryu**, **Taekyoon Choi**, **Seungmu Yang,** and Sungho Park. "[A Technical Report for Polyglot-Ko: Open-Source Large-Scale Korean Language Models](https://arxiv.org/abs/2306.02254)." *arXiv preprint arXiv:2306.02254*, 2023. + - Kshitij Gupta, Benjamin Thérien, Adam Ibrahim, Mats Leon Richter, **Quentin Anthony**, Eugene Belilovsky, Irina Rish, and Timothée Lesort. "[Continual Pre-Training of Large Language Models: How to re-warm your model?](https://arxiv.org/abs/2308.04014)" In _Workshop on Efficient Systems for Foundation Models @ ICML_, 2023. + - **Zhangir Azerbayev**, **Hailey Schoelkopf**, Keiran Paster, Marco Dos Santos, Stephen McAleer, Albert Q Jiang, Jia Deng, **Stella Biderman**, and Sean Welleck. "[Llemma: An open language model for mathematics]([https://arxiv.org/abs/2308.04014](https://arxiv.org/abs/2310.10631))" In _Math-AI Workshop @ NeurIPS_, 2023. + - Alexander Havrilla, Maksym Zhuravinskyi, Duy Phung, Aman Tiwari, Jonathan Tow, **Stella Biderman**, **Quentin Anthony**, and **Louis Castricato**. "[trlX: A Framework for Large Scale Reinforcement Learning from Human Feedback](https://aclanthology.org/2023.emnlp-main.530/)." In _Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing_, 2023. + +The following publications by other research groups use this library: +- Ta-Chung Chi, Ting-Han Fan, Peter J. Ramadge, and Alexander Rudnicky. "[KERPLE: Kernelized Relative Positional Embedding for Length Extrapolation](https://arxiv.org/abs/2205.09921)." In *Advances in Neural Information Processing Systems* 35 (2022). +- Sameera Horawalavithana, Ellyn Ayton, Shivam Sharma, Scott Howland, Megha Subramanian, Scott Vasquez, Robin Cosbey, Maria Glenski, and Svitlana Volkova. "[Foundation Models of Scientific Knowledge for Chemistry: Opportunities, Challenges and Lessons Learned](https://aclanthology.org/2022.bigscience-1.12/)." In *Proceedings of the ACL Workshop on Challenges \& Perspectives in Creating Large Language Models*, 2022. +- Sophia Kolak, Ruben Martins, Claire Le Goues, and Vincent J. Hellendoorn. "[Patch Generation with Language Models: Feasibility and Scaling Behavior](https://par.nsf.gov/biblio/10340618)"." In *Proceedings of the Deep Learning for Code Workshop at ICLR*, 2022. +- Frank F. Xu, Uri Alon, Graham Neubig, and Vincent J. Hellendoorn. "[A Systematic Evaluation of Large Language Models of Code](https://arxiv.org/abs/2202.13169)." In *Proceedings of the ICLR Workshop on Deep Learning For Code*, 2022. +- Eghbal A. Hosseini, Martin A. Schrimpf, Yian Zhang, Samuel Bowman, Noga Zaslavsky, and Evelina Fedorenko. "[Artificial neural network language models align neurally and behaviorally with humans even after a developmentally realistic amount of training.](https://www.biorxiv.org/content/10.1101/2022.10.04.510681)" _BioRxiv_, 2022. +- Byung-Doh Oh and William Schuler. "[Transformer-Based LM Surprisal Predicts Human Reading Times Best with About Two Billion Training Tokens](https://arxiv.org/abs/2304.11389)." In *Findings of the Association for Computational Linguistics*, 2023. +- Ta-Chung Chi, Ting-Han Fan, Alexander Rudnicky, and Peter Ramadge. 
"[Dissecting Transformer Length Extrapolation via the Lens of Receptive Field Analysis](https://aclanthology.org/2023.acl-long.756/)." In _Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)_, pp. 13522-13537, 2023. +- Ta-Chung Chi, Ting-Han Fan, Li-Wei Chen, Alexander Rudnicky, and Peter Ramadge. "[Latent Positional Information is in the Self-Attention Variance of Transformer Language Models Without Positional Embeddings](https://aclanthology.org/2023.acl-short.102/)." In _Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)_, pp. 13522-13537 (2023). +- Xidong Feng, Yicheng Luo, Ziyan Wang, Hongrui Tang, Mengyue Yang, Kun Shao, David Mguni, Yali Du, and Jun Wang. "[ChessGPT: Bridging Policy Learning and Language Modeling.](https://arxiv.org/abs/2306.09200)" _arXiv preprint arXiv:2306.09200_, 2023. +- Orion Walker Dollar, Sameera Horawalavithana, Scott Vasquez, W. James Pfaendtner, and Svitlana Volkova. "[MolJET: Multimodal Joint Embedding Transformer for Conditional de novo Molecular Design and Multi-Property Optimization.](https://openreview.net/pdf?id=7UudBVsIrr)" _preprint under review_, 2023. +- Jean Kaddour and Qi Liu. "[Text Data Augmentation in Low-Resource Settings via Fine-Tuning of Large Language Models](https://arxiv.org/abs/2310.01119)." _arXiv:2310.01119_, 2023. +- Alon Albalak, Liangming Pan, Colin Raffel, and William Yang Wang. "[Efficient Online Data Mixing For Language Model Pre-Training](https://arxiv.org/abs/2312.02406)." In _NeurIPS Workshop on R0-FoMo: Robustness of Few-shot and Zero-shot Learning in Large Foundation Models_, 2023. +- Eghbal A. Hosseini and Evelina Fedorenko. "[Large language models implicitly learn to straighten neural sentence trajectories to construct a predictive representation of natural language](https://www.biorxiv.org/content/10.1101/2023.11.05.564832v1)." _bioRxiv_, 2023. +- Junqi Yin, Sajal Dash, Feiyi Wang, and Mallikarjun Shankar. "[FORGE: Pre-Training Open Foundation Models for Science](https://dl.acm.org/doi/abs/10.1145/3581784.3613215). _Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis_, 1-13, 2023. +- Jean Kaddour and Qi Liu. "[Text Data Augmentation in Low-Resource Settings via Fine-Tuning of Large Language Models](https://arxiv.org/abs/2310.01119)." _arXiv preprint arXiv:2310.01119_, 2023. +- Peng Di, Jianguo Li, Hang Yu, Wei Jiang, Wenting Cai, Yang Cao, Chaoyu Chen, Dajun Chen, Hongwei Chen, Liang Chen, Gang Fan, Jie Gong, Zi Gong, Wen Hu, Tingting Guo, Zhichao Lei, Ting Li, Zheng Li, Ming Liang, Cong Liao, Bingchang Liu, Jiachen Liu, Zhiwei Liu, Shaojun Lu, Min Shen, Guangpei Wang, Huan Wang, Zhi Wang, Zhaogui Xu, Jiawei Yang, Qing Ye, Gehao Zhang, Yu Zhang, Zelin Zhao, Xunjin Zheng, Hailian Zhou, Lifu Zhu, and Xianying Zhu. "[CodeFuse-13B: A Pretrained Multi-lingual Code Large Language Model](https://arxiv.org/abs/2310.06266)." _arXiv preprint arXiv:2310.06266_, 2023. +- Nikitha Rao, Kush Jain, Uri Alon, Claire Le Goues, and Vincent J Hellendoorn. "[CAT-LM Training Language Models on Aligned Code And Tests](https://arxiv.org/abs/2310.01602)." _38th IEEE/ACM International Conference on Automated Software Engineering (ASE)_, pp. 409-420. IEEE, 2023. 
+ +## Models +The following models were trained using this library: + +### English LLMs +- EleutherAI's [GPT-NeoX-20B](https://huggingface.co/EleutherAI/gpt-neox-20b), [Pythia (70M through 13B)](https://github.com/EleutherAI/pythia), and [LLeMMA (34B)](https://arxiv.org/abs/2310.10631) +- CarperAI's [FIM-NeoX-1.3B](https://huggingface.co/CarperAI/FIM-NeoX-1.3B) +- StabilityAI's [StableLM (3B and 7B)](https://github.com/Stability-AI/StableLM) +- Together.ai's [RedPajama-INCITE (3B and 7B)](https://together.ai/blog/redpajama-models-v1) +- Carnegie Mellon University's [proofGPT (1.3B and 6.7B)](https://huggingface.co/hoskinson-center/proofGPT-v0.1-6.7B) +- Dampish's [StellarX (2.8B and 4B)](https://huggingface.co/Dampish/StellarX-4B-V0.2) +- Oak Ridge National Lab's [FORGE (26B)](https://github.com/at-aaims/forge) + +### Non-English LLMs +- EleutherAI's [Polyglot-Ko (1.3B through 12.8B)](https://github.com/EleutherAI/polyglot) (Korean) +- Korea University's [KULLM-Polyglot (5.8B and 12.8B)](https://github.com/nlpai-lab/KULLM) (Korean) +- Stability AI's [Japanese Stable LM (7B)](https://huggingface.co/stabilityai/japanese-stablelm-base-alpha-7b) +- LearnItAnyway's [LLaVA-Polyglot-Ko (1.3B)](https://huggingface.co/LearnItAnyway/llava-polyglot-ko-1.3b-hf) (Korean) +- Rinna Co.'s [japanese-gpt-neox-3.6b](https://huggingface.co/rinna/japanese-gpt-neox-3.6b) (Japanese) and [bilingual-gpt-neox-4b](https://huggingface.co/rinna/bilingual-gpt-neox-4b) (English / Japanese) +- CyberAgent's [Open-CLM (125M through 7B)](https://huggingface.co/cyberagent/open-calm-7b) (Japanese) +- The Hungarian Research Centre for Linguistics's [PULI GPTrio (6.7B)](https://huggingface.co/NYTK/PULI-GPTrio) (Hungarian / English / Chinese) +- The University of Tokyo's [weblab-10b](https://huggingface.co/Kojima777/weblab-10b) and [weblab-10b-instruct](https://huggingface.co/Kojima777/weblab-10b-instruction-sft) (Japanese) +- nolando.ai's [Hi-NOLIN (9B)](https://blog.nolano.ai/Hi-NOLIN/) (English, Hindi) + +### Code Models +- Carnegie Mellon University's [PolyCoder (160M through 2.7B)](https://github.com/VHellendoorn/Code-LMs) and [CAT-LM (2.7B)](https://huggingface.co/nikitharao/catlm) +- StabilityAI's [StableCode (1.3B)](https://stability.ai/blog/stablecode-llm-generative-ai-coding) and [StableCode-Completion-Alpha (3B)](https://stability.ai/blog/stablecode-llm-generative-ai-coding) +- CodeFuse AI's [CodeFuse (13B)](https://huggingface.co/codefuse-ai/CodeFuse-13B) + +### Other Modalities +- University College London's [ChessGPT-3B](https://huggingface.co/Waterhorse/chessgpt-base-v1) +- Gretel's [Text-to-Table (3B)](https://huggingface.co/gretelai/text2table) + +# Administrative Notes + +## Citing GPT-NeoX + +If you have found the GPT-NeoX library helpful in your work, you can cite this repository as + +```bibtex +@software{gpt-neox-library, + title = {{GPT-NeoX: Large Scale Autoregressive Language Modeling in PyTorch}}, + author = {Andonian, Alex and Anthony, Quentin and Biderman, Stella and Black, Sid and Gali, Preetham and Gao, Leo and Hallahan, Eric and Levy-Kramer, Josh and Leahy, Connor and Nestler, Lucas and Parker, Kip and Pieler, Michael and Phang, Jason and Purohit, Shivanshu and Schoelkopf, Hailey and Stander, Dashiell and Songz, Tri and Tigges, Curt and Thérien, Benjamin and Wang, Phil and Weinbach, Samuel}, + url = {https://www.github.com/eleutherai/gpt-neox}, + doi = {10.5281/zenodo.5879544}, + month = {9}, + year = {2023}, + version = {2.0.0}, +} +``` + +To cite the 20 billion parameter model named 
`GPT-NeoX-20B`, please use + +```bibtex +@inproceedings{gpt-neox-20b, + title={{GPT-NeoX-20B}: An Open-Source Autoregressive Language Model}, + author={Black, Sid and Biderman, Stella and Hallahan, Eric and Anthony, Quentin and Gao, Leo and Golding, Laurence and He, Horace and Leahy, Connor and McDonell, Kyle and Phang, Jason and Pieler, Michael and Prashanth, USVSN Sai and Purohit, Shivanshu and Reynolds, Laria and Tow, Jonathan and Wang, Ben and Weinbach, Samuel}, + booktitle={Proceedings of the ACL Workshop on Challenges \& Perspectives in Creating Large Language Models}, + url={https://arxiv.org/abs/2204.06745}, + year={2022} +} +``` -Then, edit the provided YML files to set paths based on your own system's saved locations for checkpoints and data files, and edit the SLURM job script as specified (using ) or run the job across multiple nodes using your own system's orchestration. +## Contributing +GPT-NeoX is built by the open-source AI community, and relies on our amazing contributors! Please see our +[contributing](CONTRIBUTING.md) guide for more details on our CLA, code formatting, testing, +etc. -**Tip**: Note that the global batch size will be scaled by your number of nodes. Therefore, if running on a number of nodes different from 32 you should scale gradient accumulation steps accordingly. +## Licensing -We used a batch size of 4M tokens. To calculate global batch size, you should compute `seq_len * num_gpus * ( train_microbatch_size_per_gpu * gradient_accumulation_steps) / (model_parallel_size * max(pipeline_parallel_size, 1))` . +This repository hosts code that is part of EleutherAI's GPT-NeoX project. Copyright (c) 2024, EleutherAI. Licensed under the Apache License: + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at -## Contents + http://www.apache.org/licenses/LICENSE-2.0 -The files in this folder are as follows: + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. -* `34b_launch_script.sh` contains a skeleton SLURM job script to launch training with NeoX across 32 nodes. +This repository is based off code written by NVIDIA that is licensed under the Apache License, Version 2.0. In accordance with the Apache License, all files that are modifications of code originally written by NVIDIA maintain a NVIDIA copyright header. All files that do not contain such a header are the exclusive copyright of EleutherAI. When the NVIDIA code has been modified from its original version, that fact is noted in the copyright header. All derivative works of this repository must preserve these headers under the terms of the Apache License. -* `configs/data_mixture.yml` contains a list of the domain weights for the final training run. +This repository also contains code written by a number of other authors. Such contributions are marked and the relevant licensing is included where appropriate. -* `configs/llemma_7b.yml` is a cleaned-up version of the config file used to train Llemma-7b. +For full terms, see the `LICENSE` file. If you have any questions, comments, or concerns about licensing please email us at contact@eleuther.ai. 
-* `configs/llemma_34b.yml` is a cleaned-up version of the config file used to train Llemma-34b. +## Acknowledgements -* `requirements.txt` is a dump of the virtual environmment used in training, created via `pip freeze`. +We run our experiments on a Kubernetes cluster provided by [CoreWeave](https://coreweave.com/) and a Slurm cluster provided by [Stability AI](https://stability.ai). We are thankful to the DeepSpeed team for their advice and consultation. diff --git a/configs/1-3B.yml b/configs/1-3B.yml new file mode 100644 index 000000000..ea3fdb9bf --- /dev/null +++ b/configs/1-3B.yml @@ -0,0 +1,93 @@ +# GPT-2 pretraining setup +{ + # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages + # across the node boundaries ) + "pipe_parallel_size": 1, + "model_parallel_size": 1, + + # model settings + "num_layers": 24, + "hidden_size": 2048, + "num_attention_heads": 16, + "seq_length": 2048, + "max_position_embeddings": 2048, + "norm": "layernorm", + "pos_emb": "rotary", + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + + # these should provide some speedup but takes a while to build, set to true if desired + "scaled_upper_triang_masked_softmax_fusion": false, + "bias_gelu_fusion": false, + "rope_fusion": false, + "layernorm_fusion": false, + + # init methods + "init_method": "small_init", + "output_layer_init_method": "wang_init", + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0002, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + "min_lr": 0.00002, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + }, + + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data_impl": "mmap", + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + # precision settings + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. 
training settings + "train_iters": 320000, + "lr_decay_iters": 320000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, + + # logging + "log_interval": 100, + "steps_per_print": 10, + "keep_last_n_checkpoints": 4, + "wall_clock_breakdown": true, +} diff --git a/configs/125M-json.yml b/configs/125M-json.yml new file mode 100644 index 000000000..467318f24 --- /dev/null +++ b/configs/125M-json.yml @@ -0,0 +1,80 @@ +{ + "pipe_parallel_size": 1, + "model_parallel_size": 1, + + "num_layers": 12, + "hidden_size": 768, + "num_attention_heads": 12, + "seq_length": 2048, + "max_position_embeddings": 2048, + "norm": "layernorm", + "pos_emb": "rotary", + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + + "scaled_upper_triang_masked_softmax_fusion": false, + "bias_gelu_fusion": false, + "rope_fusion": false, + "layernorm_fusion": false, + + "init_method": "small_init", + "output_layer_init_method": "wang_init", + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0006, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + "min_lr": 0.00006, + + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 500000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 500000000, + "contiguous_gradients": true + }, + + "train_micro_batch_size_per_gpu": 4, + "data_impl": "mmap", + + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0.0, + "attention_dropout": 0.0, + + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + "train_iters": 320000, + "lr_decay_iters": 320000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, + + "log_interval": 100, + "steps_per_print": 10, + "keep_last_n_checkpoints": 4, + "wall_clock_breakdown": true, + + "hostfile": "/mock_path" +} diff --git a/configs/125M.yml b/configs/125M.yml new file mode 100644 index 000000000..466492f7b --- /dev/null +++ b/configs/125M.yml @@ -0,0 +1,96 @@ +# GPT-2 pretraining setup +{ + # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages + # across the node boundaries ) + "pipe_parallel_size": 1, + "model_parallel_size": 1, + + # model settings + "num_layers": 12, + "hidden_size": 768, + "num_attention_heads": 12, + "seq_length": 2048, + "max_position_embeddings": 2048, + "norm": "layernorm", + "pos_emb": "rotary", + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + + # these should provide some speedup but takes a while to build, set to true if desired + "scaled_upper_triang_masked_softmax_fusion": false, + "bias_gelu_fusion": false, + "rope_fusion": false, + "layernorm_fusion": false, + + # init methods + "init_method": "small_init", + "output_layer_init_method": "wang_init", + + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0006, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + "min_lr": 0.00006, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + 
"allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + }, + + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data_impl": "mmap", + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0.0, + "attention_dropout": 0.0, + + # precision settings + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. training settings + "train_iters": 320000, + "lr_decay_iters": 320000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, + + # logging + "log_interval": 100, + "steps_per_print": 10, + "keep_last_n_checkpoints": 4, + "wall_clock_breakdown": true, + + # networking + "hostfile": "/mock_path" +} diff --git a/configs/13B.yml b/configs/13B.yml new file mode 100644 index 000000000..99caab585 --- /dev/null +++ b/configs/13B.yml @@ -0,0 +1,94 @@ +# GPT-2 pretraining setup +{ + # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages + # across the node boundaries ) + "pipe_parallel_size": 1, + "model_parallel_size": 1, + + # model settings + "num_layers": 40, + "hidden_size": 5120, + "num_attention_heads": 40, + "seq_length": 2048, + "max_position_embeddings": 2048, + "norm": "layernorm", + "pos_emb": "rotary", + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + + # these should provide some speedup but takes a while to build, set to true if desired + "scaled_upper_triang_masked_softmax_fusion": false, + "bias_gelu_fusion": false, + "rope_fusion": false, + "layernorm_fusion": false, + + # init methods + "init_method": "small_init", + "output_layer_init_method": "wang_init", + + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0001, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + }, + "min_lr": 0.00001, + + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data_impl": "mmap", + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + # precision settings + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. 
training settings + "train_iters": 320000, + "lr_decay_iters": 320000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, + + # logging + "log_interval": 100, + "steps_per_print": 10, + "keep_last_n_checkpoints": 4, + "wall_clock_breakdown": true, +} diff --git a/configs/175B.yml b/configs/175B.yml new file mode 100644 index 000000000..4d011f1b4 --- /dev/null +++ b/configs/175B.yml @@ -0,0 +1,92 @@ +# GPT-2 pretraining setup +{ + # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages + # across the node boundaries ) + "pipe_parallel_size": 1, + "model_parallel_size": 1, + + # model settings + "num_layers": 96, + "hidden_size": 12288, + "num_attention_heads": 96, + "seq_length": 2048, + "max_position_embeddings": 2048, + "norm": "layernorm", + "pos_emb": "rotary", + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + + # these should provide some speedup but takes a while to build, set to true if desired + "scaled_upper_triang_masked_softmax_fusion": false, + "bias_gelu_fusion": false, + "rope_fusion": false, + "layernorm_fusion": false, + + # init methods + "init_method": "small_init", + "output_layer_init_method": "wang_init", + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00006, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + "min_lr": 0.000006, + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + }, + + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data_impl": "mmap", + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + # precision settings + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. 
training settings + "train_iters": 320000, + "lr_decay_iters": 320000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, + + # logging + "log_interval": 100, + "steps_per_print": 10, + "keep_last_n_checkpoints": 4, + "wall_clock_breakdown": true, +} diff --git a/configs/19M.yml b/configs/19M.yml new file mode 100644 index 000000000..c14ebe8ea --- /dev/null +++ b/configs/19M.yml @@ -0,0 +1,97 @@ +{ + "pipe_parallel_size": 1, + "model_parallel_size": 1, + + # model settings + "num_layers": 6, + "hidden_size": 512, + "num_attention_heads": 8, + "seq_length": 2048, + "max_position_embeddings": 2048, + "pos_emb": "rotary", + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + + "scaled_upper_triang_masked_softmax_fusion": false, + "bias_gelu_fusion": false, + "rope_fusion": false, + "layernorm_fusion": false, + + # init methods + "init_method": "small_init", + "output_layer_init_method": "wang_init", + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.001, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + "min_lr": 0.0001, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + }, + + "train_micro_batch_size_per_gpu": 4, #32, + "gradient_accumulation_steps": 1, + "data_impl": "mmap", + "num_workers": 1, + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + # precision settings + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 12, + "hysteresis": 2, + "min_loss_scale": 1, + }, + + "train_iters": 143000, + "lr_decay_iters": 143000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 1000, + "eval_interval": 100000, + "eval_iters": 10, + + "log_interval": 10, + "steps_per_print": 10, + "wall_clock_breakdown": true, + + # additional deepspeed args not specified above + "deepspeed_extra_args": { + "comms_logger": { + "enabled": true, + "verbose": true, + "prof_all": true, + "debug": false + }, + } + +} diff --git a/configs/2-7B.yml b/configs/2-7B.yml new file mode 100644 index 000000000..9e6a47e15 --- /dev/null +++ b/configs/2-7B.yml @@ -0,0 +1,93 @@ +# GPT-2 pretraining setup +{ + # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages + # across the node boundaries ) + "pipe_parallel_size": 1, + "model_parallel_size": 1, + + # model settings + "num_layers": 32, + "hidden_size": 2560, + "num_attention_heads": 32, + "seq_length": 2048, + "max_position_embeddings": 2048, + "norm": "layernorm", + "pos_emb": "rotary", + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + + # these should provide some speedup but takes a while to build, set to true if desired + "scaled_upper_triang_masked_softmax_fusion": false, + "bias_gelu_fusion": false, + "rope_fusion": false, + "layernorm_fusion": false, + + # init 
methods + "init_method": "small_init", + "output_layer_init_method": "wang_init", + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00016, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + "min_lr": 0.000016, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + }, + + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data_impl": "mmap", + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + # precision settings + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. training settings + "train_iters": 320000, + "lr_decay_iters": 320000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, + + # logging + "log_interval": 100, + "steps_per_print": 10, + "keep_last_n_checkpoints": 4, + "wall_clock_breakdown": true, +} diff --git a/configs/20B.yml b/configs/20B.yml new file mode 100644 index 000000000..0a4ce6335 --- /dev/null +++ b/configs/20B.yml @@ -0,0 +1,113 @@ +# DISCLAIMER: This is the configuration file for the GPT-NeoX-20B model as it was trained on 96x 40GB A100 +# GPUs. Depending on your system configuration, you may need to change some parameters in order to fit +# the model in memory. 
+ +{ + # Tokenizer / checkpoint settings - you will need to change these to the location you have them saved in + "vocab_file": "./20B_checkpoints/20B_tokenizer.json", + "save": "./20B_checkpoints", + "load": "./20B_checkpoints", + + # If finetuning, edit the following to the location of your finetuning dataset: + "data_path": "./data/pile_20B_tokenizer/pile_20B_tokenizer_text_document", + + # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages + # across the node boundaries ) + "pipe_parallel_size": 4, + "model_parallel_size": 2, + + # model settings + "num_layers": 44, + "hidden_size": 6144, + "num_attention_heads": 64, + "seq_length": 2048, + "max_position_embeddings": 2048, + "norm": "layernorm", + "pos_emb": "rotary", + "rotary_pct": 0.25, + "no_weight_tying": true, + "gpt_j_residual": true, + "output_layer_parallelism": "column", + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": true, + "rope_fusion": false, + "layernorm_fusion": false, + + # init methods + "init_method": "small_init", + "output_layer_init_method": "wang_init", + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.97e-4, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + + "min_lr": 0.97e-5, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": True, + "allgather_bucket_size": 1260000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 1260000000, + "contiguous_gradients": True, + }, + + # batch / data settings (assuming 96 GPUs) + "train_micro_batch_size_per_gpu": 4, + "gradient_accumulation_steps": 32, + "data_impl": "mmap", + "split": "995,4,1", + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": false, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.01, + "hidden_dropout": 0, + "attention_dropout": 0, + + # precision settings + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 12, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. 
training settings + "train_iters": 150000, + "lr_decay_iters": 150000, + + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 500, # this variable previously called `save-interval` + "eval_interval": 1000, + "eval_iters": 10, + + # logging + "log_interval": 2, + "steps_per_print": 2, + "wall_clock_breakdown": false, + + ### NEW DATA: #### + "tokenizer_type": "HFTokenizer", + "tensorboard-dir": "./tensorboard", + "log_dir": "./logs", + +} diff --git a/configs/350M.yml b/configs/350M.yml new file mode 100644 index 000000000..00a174433 --- /dev/null +++ b/configs/350M.yml @@ -0,0 +1,92 @@ +# GPT-2 pretraining setup +{ + # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages + # across the node boundaries ) + "pipe_parallel_size": 1, + "model_parallel_size": 1, + + # model settings + "num_layers": 24, + "hidden_size": 1024, + "num_attention_heads": 16, + "seq_length": 2048, + "max_position_embeddings": 2048, + "norm": "layernorm", + "pos_emb": "rotary", + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + + # these should provide some speedup but takes a while to build, set to true if desired + "scaled_upper_triang_masked_softmax_fusion": false, + "bias_gelu_fusion": false, + "rope_fusion": false, + "layernorm_fusion": false, + + # init methods + "init_method": "small_init", + "output_layer_init_method": "wang_init", + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0003, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + "min_lr": 0.00003, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + }, + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data_impl": "mmap", + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + # precision settings + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. 
training settings + "train_iters": 320000, + "lr_decay_iters": 320000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, + + # logging + "log_interval": 100, + "steps_per_print": 10, + "keep_last_n_checkpoints": 4, + "wall_clock_breakdown": true, +} diff --git a/configs/49M.yml b/configs/49M.yml new file mode 100644 index 000000000..099af6a48 --- /dev/null +++ b/configs/49M.yml @@ -0,0 +1,93 @@ +{ + # parallelism settings + "pipe_parallel_size": 1, + "model_parallel_size": 1, + + # model settings + "num_layers": 10, + "hidden_size": 640, + "num_attention_heads": 10, + "seq_length": 2048, + "max_position_embeddings": 2048, + "pos_emb": "rotary", + "rotary_pct": 0.25, + "no_weight_tying": true, + "gpt_j_residual": true, + "output_layer_parallelism": "column", + + # these should provide some speedup but takes a while to build, set to true if desired + "scaled_upper_triang_masked_softmax_fusion": false, + "bias_gelu_fusion": false, + "rope_fusion": false, + "layernorm_fusion": false, + + # init methods + "init_method": "small_init", + "output_layer_init_method": "wang_init", + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0008, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + "min_lr": 0.00008, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + }, + + # batch / data settings + "train_micro_batch_size_per_gpu": 32, + "gradient_accumulation_steps": 1, + "data_impl": "mmap", + "num_workers": 1, + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + # precision settings + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 12, + "hysteresis": 2, + "min_loss_scale": 1, + }, + + # misc. 
training settings + "train_iters": 143000, + "lr_decay_iters": 143000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 1000, + "eval_interval": 100000, + "eval_iters": 10, + + # logging + "log_interval": 10, + "steps_per_print": 10, + "wall_clock_breakdown": true, +} diff --git a/configs/6-7B.yml b/configs/6-7B.yml new file mode 100644 index 000000000..087b7d763 --- /dev/null +++ b/configs/6-7B.yml @@ -0,0 +1,93 @@ +# GPT-2 pretraining setup +{ + # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages + # across the node boundaries ) + "pipe_parallel_size": 1, + "model_parallel_size": 1, + + # model settings + "num_layers": 32, + "hidden_size": 4096, + "num_attention_heads": 32, + "seq_length": 2048, + "max_position_embeddings": 2048, + "norm": "layernorm", + "pos_emb": "rotary", + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + + # these should provide some speedup but takes a while to build, set to true if desired + "scaled_upper_triang_masked_softmax_fusion": false, + "bias_gelu_fusion": false, + "rope_fusion": false, + "layernorm_fusion": false, + + # init methods + "init_method": "small_init", + "output_layer_init_method": "wang_init", + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00012, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + }, + "min_lr": 0.000012, + + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data_impl": "mmap", + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + # precision settings + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. 
training settings + "train_iters": 320000, + "lr_decay_iters": 320000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, + + # logging + "log_interval": 100, + "steps_per_print": 10, + "keep_last_n_checkpoints": 4, + "wall_clock_breakdown": true, +} diff --git a/configs/760M.yml b/configs/760M.yml new file mode 100644 index 000000000..6d62dc0f3 --- /dev/null +++ b/configs/760M.yml @@ -0,0 +1,93 @@ +# GPT-2 pretraining setup +{ + # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages + # across the node boundaries ) + "pipe_parallel_size": 1, + "model_parallel_size": 1, + + # model settings + "num_layers": 24, + "hidden_size": 1536, + "num_attention_heads": 16, + "seq_length": 2048, + "max_position_embeddings": 2048, + "norm": "layernorm", + "pos_emb": "rotary", + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + + # these should provide some speedup but takes a while to build, set to true if desired + "scaled_upper_triang_masked_softmax_fusion": false, + "bias_gelu_fusion": false, + "rope_fusion": false, + "layernorm_fusion": false, + + # init methods + "init_method": "small_init", + "output_layer_init_method": "wang_init", + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00025, + "betas": [0.9, 0.999], + "eps": 1.0e-8, + } + }, + "min_lr": 0.000025, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + }, + + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data_impl": "mmap", + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + # precision settings + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. 
training settings + "train_iters": 320000, + "lr_decay_iters": 320000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, + + # logging + "log_interval": 100, + "steps_per_print": 10, + "keep_last_n_checkpoints": 4, + "wall_clock_breakdown": true, +} diff --git a/configs/800M.yml b/configs/800M.yml new file mode 100644 index 000000000..4fb9559a8 --- /dev/null +++ b/configs/800M.yml @@ -0,0 +1,86 @@ +{ + "pipe_parallel_size": 1, + "model_parallel_size": 1, + + # model settings + "num_layers": 16, + "hidden_size": 2048, + "num_attention_heads": 8, + "seq_length": 2048, + "max_position_embeddings": 2048, + "pos_emb": "rotary", + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + + "scaled_upper_triang_masked_softmax_fusion": false, + "bias_gelu_fusion": false, + "rope_fusion": false, + "layernorm_fusion": false, + + # init methods + "init_method": "small_init", + "output_layer_init_method": "wang_init", + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00025, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + "min_lr": 0.000025, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + }, + + "train_micro_batch_size_per_gpu": 16, + "gradient_accumulation_steps": 1, + "data_impl": "mmap", + "num_workers": 1, + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + # precision settings + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 12, + "hysteresis": 2, + "min_loss_scale": 1, + }, + + "train_iters": 143000, + "lr_decay_iters": 143000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 1000, + "eval_interval": 40000, + "eval_iters": 10, + + "log_interval": 10, + "steps_per_print": 10, + "wall_clock_breakdown": true, +} diff --git a/configs/README.md b/configs/README.md new file mode 100644 index 000000000..d8ae81739 --- /dev/null +++ b/configs/README.md @@ -0,0 +1,314 @@ +# Configuration and parameters + +GPT-NeoX parameters are defined in a YAML configuration file which is passed to the `deepy.py` launcher - for examples see the files contained in this folder. +Parameters originate from either the [DeepSpeed runner CLI (DSL)](https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/launcher/runner.py#L33), [DeepSpeed configuration file (DSC)](https://www.deepspeed.ai/docs/config-json/), [Megatron-LM CLI (Meg)](https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/arguments.py#L224) or are GPT-NeoX (NeoX) modifications. + +## Example Configuration (GPT3 Small): + +Below is an example configuration `.yaml` to train a ~160M parameter GPT model. This readme will go through each section in the configuration and the options available. 
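+
+As a quick sketch of how these files are consumed (the file names here are illustrative), one or more YAML configs are passed to the `deepy.py` launcher, which merges them into a single set of arguments:
+
+```sh
+# pass a model config plus a local paths/setup config to the launcher;
+# keys from all supplied YAML files are merged into one configuration
+python ./deepy.py train.py -d configs 49M.yml local_setup.yml
+```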
+ +For a detailed list of all the arguments available for neox, see [neox_arguments.md](neox_arguments.md) + +Note: yaml arguments may be formatted with either '-' or '_'. The standard separator used is a '_' as shown in the example configurations below. However, the use of '-' as a separator may be deprecated in the future. +```yaml +# GPT-3 pretraining setup +{ + # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages + # across the node boundaries ) + "pipe_parallel_size": 1, + "model_parallel_size": 1, + + # model settings + "num_layers": 12, + "hidden_size": 768, + "num_attention_heads": 12, + "seq_length": 2048, + "max_position_embeddings": 2048, + "norm": "rmsnorm", + "pos_emb": "none", + "no_weight_tying": true, + # this should provide some speedup but takes a while to build, set to true if desired + "scaled_upper_triang_masked_softmax_fusion": false, + "train_iters": 320000, + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0006, + "max_grad_norm": 1.0, + "betas": [0.9, 0.95] + } + }, + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 0, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + }, + + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "gradient_accumulation_steps": 1, + "data_impl": "mmap", + "split": "949,50,1", + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0, + "hidden_dropout": 0, + "attention_dropout": 0, + + # precision settings + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # lr decay settings + "lr_decay_iters": 320000, + "lr_decay_style": "cosine", + "warmup": 0.01, + + # misc. training settings + "distributed_backend": "nccl", + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, + + # logging + "log_interval": 100, + "steps_per_print": 10, + "keep_last_n_checkpoints": 4, + "wall_clock_breakdown": true, +} +``` + +### Parallelism Settings: + +The parallelism settings are left at 1 in all configs, as the settings you want will be highly dependent on your compute setup and network topology. +We have found it best to do model parallelism within a node, and schedule pipeline stages across node boundaries. + +```yaml + "pipe_parallel_size": 1, + "model_parallel_size": 1, +``` + +These can be set to any integer between `0` and `num_gpus`, and `num_gpus` must be divisible by `pipe_parallel_size` * `model_parallel_size`. + + +### Model Settings: +```yaml + # model settings + "num_layers": 12, + "hidden_size": 768, + "num_attention_heads": 12, + "seq_length": 2048, + "max_position_embeddings": 2048, + "norm": "rmsnorm", + "pos_emb": "none", + "no_weight_tying": true, + # this should provide some speedup but takes a while to build, set to true if desired + "scaled_upper_triang_masked_softmax_fusion": false, + "train_iters": 320000, +``` +An example of some basic settings used to configure your model's architecture and number of training steps. + +### Optimizer Settings: + +Our optimizer configuration has a similar syntax to deepspeed's. 
Different optimizers will have different arguments for "params".
+Learning rate should be configured from here using the `"lr"` field of `optimizer["params"]`.
+
+```yaml
+  # optimizer settings
+  "optimizer": {
+    "type": "Adam",
+    "params": {
+      "lr": 0.0006,
+      "max_grad_norm": 1.0,
+      "betas": [0.9, 0.95]
+    }
+  }
+  ```
+Available optimizer types are:
+
+- `"Adam"`: regular Adam optimizer
+- `"OneBitAdam"`: DeepSpeed's [OneBitAdam optimizer](https://www.deepspeed.ai/docs/config-json/#optimizer-parameters). To use 1-bit Adam, you'll also need to add the `freeze_step`, `cuda_aware`, and `comm_backend_name` fields, like so:
+```yaml
+  "optimizer": {
+    "type": "OneBitAdam",
+    "params": {
+      "lr": 0.0001,
+      "freeze_step": 23000,
+      "betas": [0.9, 0.95],
+      "cuda_aware": false,
+      "comm_backend_name": "nccl"
+    }
+  }
+```
+
+- `"CPU_Adam"`/`"CPU_torch_adam"`: Adam optimizer on CPU. Either megatron's version ("CPU_Adam") or torch's ("CPU_torch_adam").
+- `"SM3"`: SM3 or [Memory-Efficient Adaptive Optimization](https://arxiv.org/pdf/1901.11150.pdf) optimizer. We have found this doesn't work well with fp16 training.
+- `"madgrad_wd"`: MADGRAD, or [A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic Optimization](https://arxiv.org/abs/2101.11075). Weight decay is implemented AdamW-style rather than the original MADGRAD Adam-style.
+
+### ZeRO Optimization:
+
+```yaml
+# for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
+  "zero_optimization": {
+        "stage": 0,
+        "allgather_partitions": True,
+        "allgather_bucket_size": 500000000,
+        "overlap_comm": True,
+        "reduce_scatter": True,
+        "reduce_bucket_size": 500000000,
+        "contiguous_gradients": True,
+  },
+  "zero_allow_untested_optimizer": false,
+
+```
+
+ZeRO optimization in NeoX is currently configured identically to how DeepSpeed configures it; please see [the DeepSpeed docs](https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training) for more information.
+
+If you want to combine an optimizer untested by DeepSpeed with ZeRO (i.e., not Adam or LAMB), you must pass `"zero_allow_untested_optimizer": true` *outside* of the `"zero_optimization"` dictionary (see above).
+
+N.B. - ZeRO stages 2+ are incompatible with pipeline parallelism. Please set `"pipe-parallel-size"` to 0 if you want to use ZeRO stage 2 or more.
+
+### Batch Size Settings:
+
+```yaml
+  # batch / data settings
+  "train_micro_batch_size_per_gpu": 4,
+  "gradient_accumulation_steps": 1,
+```
+Our global batch size configuration follows DeepSpeed's and can be configured in a number of ways. At least one of `"train_batch_size"` and `"train_micro_batch_size_per_gpu"` must be provided.
+- `"train_batch_size"`: The effective training batch size. This is the number of data samples that leads to one step of model update. `train_batch_size` is aggregated from the batch size that a single GPU processes in one forward/backward pass (a.k.a. `train_step_batch_size`), the gradient accumulation steps (a.k.a. `gradient_accumulation_steps`), and the number of GPUs.
+- `"train_micro_batch_size_per_gpu"`: Batch size to be processed by one GPU in one step (without gradient accumulation). When specified, `gradient_accumulation_steps` is automatically calculated using `train_batch_size` and the number of GPUs.
+- `"gradient_accumulation_steps"`: Number of training steps to accumulate gradients before averaging and applying them. This feature is sometimes useful to improve scalability since it results in less frequent communication of gradients between steps. It also makes it possible to train with a larger effective batch size than fits in GPU memory in a single pass. When specified, `train_step_batch_size` is automatically calculated using `train_batch_size` and the number of GPUs.
+
+### Extra DeepSpeed Settings
+
+```yaml
+# additional deepspeed args not specified above
+"deepspeed_extra_args": {
+  "comms_logger": {
+    "enabled": true,
+    "verbose": true,
+    "prof_all": true,
+    "debug": false
+  },
+}
+```
+Additional DeepSpeed settings besides those mentioned above should be wrapped in the `"deepspeed_extra_args"` argument, as in the example above. This functionality is designed to allow arguments not specified by existing dataclasses to be passed to DeepSpeed (e.g. when new functionalities are implemented). If any settings are duplicated here from elsewhere in the YAML, the system will throw an exception and notify the user.
+
+### Dataset / Tokenizer / Checkpoint / Logging Settings:
+
+```yaml
+  "data_impl": "mmap",
+  "split": "949,50,1",
+  # Suggested data paths when using GPT-NeoX locally
+  "data_path": "data/enwik8/enwik8_text_document",
+  #"train_data_path": "data/enwik8/enwik8_text_document",
+  #"test_data_path": "data/enwik8/enwik8_text_document",
+  #"valid_data_path": "data/enwik8/enwik8_text_document",
+  "vocab_file": "data/gpt2-vocab.json",
+  "merge_file": "data/gpt2-merges.txt",
+  "save": "checkpoints",
+  "load": "checkpoints",
+  "tensorboard_dir": "tensorboard",
+  "log_dir": "logs",
+  "checkpoint_factor": 10000,
+  "eval_interval": 1000,
+  "eval_iters": 10,
+```
+
+### LR Scheduler settings
+
+```yaml
+  "lr_decay_iters": 320000,
+  "lr_decay_style": "cosine",
+  "warmup": 0.01,
+```
+
+Settings used to modify the learning rate over time.
+
+N.B. - `OneBitAdam` requires you to use DeepSpeed's internal lr scheduler. In that case the lr decay style defaults to DeepSpeed's `WarmupDecayLR` schedule.
+
+### Activation Checkpointing Settings:
+
+```yaml
+  "checkpoint_activations": true,
+  "checkpoint_num_layers": 1,
+  "partition_activations": true,
+  "synchronize_each_layer": true,
+```
+
+Checkpointing works by trading compute for memory. Rather than storing all intermediate activations of the entire computation graph for the backward pass, the checkpointed part does not save intermediate activations and instead recomputes them during the backward pass.
+
+### Mixed Precision Training Settings:
+gpt-neox's fp16 training is configured identically to DeepSpeed's; please see [their documentation](https://www.deepspeed.ai/docs/config-json/#fp16-training-options) for more information.
+An example config for fp16 training:
+
+```yaml
+  "fp16": {
+    "enabled": true,
+    "loss_scale": 0,
+    "loss_scale_window": 1000,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+```
+
+Alternatively you can use the `precision` config, which can be set to `fp16`, `bfloat16`, or `fp32`. If you set `"precision": "fp16"` without adding a `"fp16": {...}` dict, then it will simply use DeepSpeed's defaults for fp16 training.
+
+
+### SLURM Settings
+
+If you are running GPT-NeoX on a SLURM cluster and wish to use SLURM to coordinate nodes, then you must set the following variables in your config:
+
+```yaml
+  "launcher": "slurm",
+  "deepspeed_slurm": true
+```
+
+Additionally, you need to modify _all_ of your configs to be valid JSON. When launching a GPT-NeoX job you can specify multiple YAML config files.
Internally, all of these files are merged into one config and then passed as a single long command line argument to Deep(er)Speed. When using SLURM and its internal command `srun`, python fails to parse this long command line argument unless it is in the more restrictive JSON format. In practice, the example NeoX configs are already very close to JSON. As an example, this is a snippet of a YAML-compatible config, N.B. the comment the capital-F `False`: + +```yaml + # optimizer settings + "optimizer": { + "type": "OneBitAdam", + "params": { + "lr": 0.0001, + "freeze_step": 23000, + "betas": [0.9, 0.95], + "cuda_aware": False, + "comm_backend_name": "nccl" + } +``` + +To make this JSON just remove the comment and use all lowercase for the boolean: + +```yaml + "optimizer": { + "type": "OneBitAdam", + "params": { + "lr": 0.0001, + "freeze_step": 23000, + "betas": [0.9, 0.95], + "cuda_aware": false, + "comm_backend_name": "nccl" + } +``` diff --git a/configs/autotuning_configs/small_tune.json b/configs/autotuning_configs/small_tune.json new file mode 100644 index 000000000..52c99449b --- /dev/null +++ b/configs/autotuning_configs/small_tune.json @@ -0,0 +1,78 @@ +{ + "pipe-parallel-size": 1, + "model-parallel-size": 1, + + "num-layers": 12, + "hidden-size": 768, + "num-attention-heads": 12, + "seq-length": 2048, + "max-position-embeddings": 2048, + "norm": "layernorm", + "pos-emb": "rotary", + "no-weight-tying": true, + + "scaled-upper-triang-masked-softmax-fusion": false, + "bias-gelu-fusion": false, + + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0006, + "betas": [0.9, 0.999], + "eps": 1.0e-8 + } + }, + + "train_micro_batch_size_per_gpu": 1, + "data-impl": "mmap", + "split": "949,50,1", + + "checkpoint-activations": true, + "checkpoint-num-layers": 1, + "partition-activations": true, + "synchronize-each-layer": true, + + "gradient_clipping": 1.0, + "weight-decay": 0.0, + "hidden-dropout": 0.0, + "attention-dropout": 0.0, + + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + "train-iters": 320000, + "lr-decay-iters": 320000, + "distributed-backend": "nccl", + "lr-decay-style": "cosine", + "warmup": 0.01, + "save-interval": 10000, + "eval-interval": 1000, + "eval-iters": 10, + + "log-interval": 100, + "steps_per_print": 10, + "keep-last-n-checkpoints": 4, + "wall_clock_breakdown": true, + "launcher": "slurm", + "deepspeed_slurm": true, + "comment": "neox", + "autotuning": { + "enabled": true, + "arg_mappings": { + "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu", + "gradient_accumulation_steps ": "--gradient_accumulation_steps" + } + }, + "zero_optimization": { + "stage": [0, 1, 2, 3] + }, + "train-data-paths": ["/fsx/pile_deduped/pile_0.87_deduped_text_document"], + "valid-data-paths": ["/fsx/pile_deduped/pile_0.87_deduped_text_document"], + "test-data-paths": ["/fsx/pile_deduped/pile_0.87_deduped_text_document"] +} diff --git a/configs/autotuning_configs/tune.json b/configs/autotuning_configs/tune.json new file mode 100644 index 000000000..b2f114539 --- /dev/null +++ b/configs/autotuning_configs/tune.json @@ -0,0 +1,72 @@ +{ + "pipe-parallel-size": 1, + "model-parallel-size": 1, + "num-layers": 12, + "hidden-size": 768, + "num-attention-heads": 12, + "seq-length": 2048, + "max-position-embeddings": 2048, + "norm": "layernorm", + "pos-emb": "rotary", + "no-weight-tying": true, + "scaled-upper-triang-masked-softmax-fusion": true, + "bias-gelu-fusion": true, + "optimizer": { + "type": 
"Adam", + "params": { + "lr": 0.0006, + "betas": [0.9, 0.999], + "eps": 1.0e-8 + } + }, + "zero_optimization": { + "stage": 0, + "allgather_partitions": true, + "allgather_bucket_size": 500000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 500000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + "train_micro_batch_size_per_gpu": 1, + "autotuning_config": { + "enabled": true, + "arg_mappings": { + "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu", + "gradient_accumulation_steps ": "--gradient_accumulation_steps" + } + }, + "data-impl": "mmap", + "split": "949,50,1", + "checkpoint-activations": true, + "checkpoint-num-layers": 1, + "partition-activations": true, + "synchronize-each-layer": true, + "gradient_clipping": 1.0, + "weight-decay": 0.0, + "hidden-dropout": 0.0, + "attention-dropout": 0.0, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "train-iters": 200, + "lr-decay-iters": 320000, + "distributed-backend": "nccl", + "lr-decay-style": "cosine", + "warmup": 0.01, + "save-interval": 10000, + "eval-interval": 1000, + "eval-iters": 10, + "log-interval": 100, + "steps_per_print": 10, + "keep-last-n-checkpoints": 4, + "wall_clock_breakdown": true, + "launcher": "slurm", + "deepspeed_slurm": true, + "comment": "neox" +} diff --git a/configs/autotuning_configs/tune_1-3B.json b/configs/autotuning_configs/tune_1-3B.json new file mode 100644 index 000000000..8207d0cfc --- /dev/null +++ b/configs/autotuning_configs/tune_1-3B.json @@ -0,0 +1,86 @@ +{ + "pipe-parallel-size": 1, + "model-parallel-size": 1, + + "num-layers": 24, + "hidden-size": 2048, + "num-attention-heads": 16, + "seq-length": 2048, + "max-position-embeddings": 2048, + "norm": "layernorm", + "pos-emb": "rotary", + "no-weight-tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "attention_config": [[["flash"], 24]], + "scaled-upper-triang-masked-softmax-fusion": false, + "bias-gelu-fusion": false, + + "init_method": "small_init", + "output_layer_init_method": "wang_init", + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0002, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + "min_lr": 0.00002, + + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 500000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 500000000, + "contiguous_gradients": true + }, + "train_micro_batch_size_per_gpu": 1, + "autotuning": { + "enabled": true, + "arg_mappings": { + "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu", + "gradient_accumulation_steps ": "--gradient_accumulation_steps" + } + }, + "data-impl": "mmap", + + "checkpoint-activations": false, + "checkpoint-num-layers": 1, + "partition-activations": true, + "synchronize-each-layer": true, + + "gradient_clipping": 1.0, + "weight-decay": 0.1, + "hidden-dropout": 0, + "attention-dropout": 0, + + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + "train-iters": 320000, + "lr-decay-iters": 320000, + "distributed-backend": "nccl", + "lr-decay-style": "cosine", + "warmup": 0.01, + "checkpoint-factor": 10000, + "eval-interval": 1000, + "eval-iters": 10, + "launcher": "slurm", + "deepspeed_slurm": true, + "no_ssh_check": true, + + "log-interval": 10, + "steps_per_print": 10, + "keep-last-n-checkpoints": 1, + "wall_clock_breakdown": true +} 
diff --git a/configs/autotuning_configs/tune_6-7B.json b/configs/autotuning_configs/tune_6-7B.json new file mode 100644 index 000000000..3d7aadf68 --- /dev/null +++ b/configs/autotuning_configs/tune_6-7B.json @@ -0,0 +1,77 @@ +{ + "pipe-parallel-size": 1, + "model-parallel-size": 8, + + "num-layers": 32, + "hidden-size": 4096, + "num-attention-heads": 32, + "seq-length": 2048, + "max-position-embeddings": 2048, + "norm": "layernorm", + "pos-emb": "rotary", + "no-weight-tying": true, + + "scaled-upper-triang-masked-softmax-fusion": false, + "bias-gelu-fusion": false, + + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00012, + "betas": [0.9, 0.999], + "eps": 1.0e-8 + } + }, + + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": [0, 1, 2, 3] + }, + "data-impl": "mmap", + "split": "949,50,1", + + "checkpoint-activations": true, + "checkpoint-num-layers": 1, + "partition-activations": true, + "synchronize-each-layer": true, + + "gradient_clipping": 1.0, + "weight-decay": 0, + "hidden-dropout": 0, + "attention-dropout": 0, + + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + "train-iters": 100, + "lr-decay-iters": 320000, + "distributed-backend": "nccl", + "lr-decay-style": "cosine", + "warmup": 0.01, + "checkpoint-factor": 10000, + "eval-interval": 1000, + "eval-iters": 10, + "log-interval": 100, + "steps_per_print": 10, + "keep-last-n-checkpoints": 4, + "wall_clock_breakdown": true, + "launcher": "slurm", + "deepspeed_slurm": true, + "no_ssh_check": true, + "comment": "neox", + "autotuning": { + "enabled": true, + "mp_size": 8, + "arg_mappings": { + "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu", + "gradient_accumulation_steps ": "--gradient_accumulation_steps" + } + } +} diff --git a/configs/bf16_125M.yml b/configs/bf16_125M.yml new file mode 100644 index 000000000..87e86e7fb --- /dev/null +++ b/configs/bf16_125M.yml @@ -0,0 +1,80 @@ +# GPT-2 pretraining setup +{ + # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages + # across the node boundaries ) + "pipe_parallel_size": 1, + "model_parallel_size": 1, + + # model settings + "num_layers": 12, + "hidden_size": 768, + "num_attention_heads": 12, + "seq_length": 2048, + "max_position_embeddings": 2048, + "norm": "layernorm", + "pos_emb": "rotary", + "no_weight_tying": true, + + # these should provide some speedup but takes a while to build, set to true if desired + "scaled_upper_triang_masked_softmax_fusion": false, + "bias_gelu_fusion": false, + "rope_fusion": false, + "layernorm_fusion": false, + + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0006, + "betas": [0.9, 0.999], + "eps": 1.0e-8, + } + }, + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 0, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + }, + + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data_impl": "mmap", + "split": "949,50,1", + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.0, + 
"hidden_dropout": 0.0, + "attention_dropout": 0.0, + + "precision": "bfloat16", + + "fp32_allreduce": True, # without a patch to torch, bf16 models have to do the allreduce in fp32 + # misc. training settings + "train_iters": 320000, + "lr_decay_iters": 320000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, + + # logging + "log_interval": 100, + "steps_per_print": 10, + "keep_last_n_checkpoints": 4, + "wall_clock_breakdown": true, +} diff --git a/configs/bnb_125M.yml b/configs/bnb_125M.yml new file mode 100644 index 000000000..523b10c39 --- /dev/null +++ b/configs/bnb_125M.yml @@ -0,0 +1,87 @@ +# GPT-2 pretraining setup +{ + # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages + # across the node boundaries ) + "pipe_parallel_size": 1, + "model_parallel_size": 1, + + # model settings + "num_layers": 12, + "hidden_size": 768, + "num_attention_heads": 12, + "seq_length": 2048, + "max_position_embeddings": 2048, + "norm": "layernorm", + "pos_emb": "rotary", + "no_weight_tying": true, + "use_bnb_optimizer": true, + + # these should provide some speedup but takes a while to build, set to true if desired + "scaled_upper_triang_masked_softmax_fusion": false, + "bias_gelu_fusion": false, + "rope_fusion": false, + "layernorm_fusion": false, + + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0006, + "betas": [0.9, 0.999], + "eps": 1.0e-8, + } + }, + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 0, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + }, + + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data_impl": "mmap", + "split": "949,50,1", + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.0, + "hidden_dropout": 0.0, + "attention_dropout": 0.0, + + # precision settings + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. training settings + "train_iters": 320000, + "lr_decay_iters": 320000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, + + # logging + "log_interval": 100, + "steps_per_print": 10, + "keep_last_n_checkpoints": 4, + "wall_clock_breakdown": true, +} diff --git a/configs/cpu_mock_config.yml b/configs/cpu_mock_config.yml new file mode 100644 index 000000000..653aa21d8 --- /dev/null +++ b/configs/cpu_mock_config.yml @@ -0,0 +1,5 @@ +# CPU unit tests should be independent of the presence of GPUs on the test server +# host. This configuration mocks these GPU resources and other dependencies. 
+{ + "global_num_gpus": 1 +} diff --git a/configs/docker/paths.yml b/configs/docker/paths.yml new file mode 100644 index 000000000..442d61cf3 --- /dev/null +++ b/configs/docker/paths.yml @@ -0,0 +1,12 @@ +{ + "train-data-paths": ["/home/mchorse/data/pile_deduped/pile_0.87_deduped_text_document"], + "valid-data-paths": ["/home/mchorse/data/pile_deduped/pile_0.87_deduped_text_document"], + "test-data-paths": ["/home/mchorse/data/pile_deduped/pile_0.87_deduped_text_document"], + + "tokenizer-type": "HFTokenizer", + "vocab-file": "/home/mchorse/data/tokenizers/20B_tokenizer.json", + + "save": "/home/mchorse/chk/", + "load": "/home/mchorse/chk/", + "checkpoint_validation_with_forward_pass": False +} diff --git a/configs/eleutherai_cluster.yml b/configs/eleutherai_cluster.yml new file mode 100644 index 000000000..36e75d8b3 --- /dev/null +++ b/configs/eleutherai_cluster.yml @@ -0,0 +1,29 @@ +# Data paths and options when using EleutherAI cluster +{ + # you may include multiple distinct datasets if desired + "train_data_paths": ["/mnt/ssd-1/data/enwik8/enwik8_text_document"], + "valid_data_paths": ["/mnt/ssd-1/data/enwik8/enwik8_val_text_document"], + "test_data_paths": ["/mnt/ssd-1/data/enwik8/enwik8_test_text_document"], + + # if using multiple datasets, provide weights for them to be sampled with + # "train-data-weights": [1., 2.], + # "test-data-weights": [2., 1.], + # "valid-data-weights": [0.5, 0.4], + + + # If you would like the code to create val and test datasets from your training set use the following instead + # "split" determines the relative size of train, val, and test + + # "split" 995,4,1 + # "data_path": "/mnt/ssd-1/data/enwik8/enwik8_text_document", + + "vocab_file": "/mnt/ssd-1/data/gpt2-vocab.json", + "merge_file": "/mnt/ssd-1/data/gpt2-merges.txt", + "save": "/mnt/ssd-1/checkpoints", + "load": "/mnt/ssd-1/checkpoints", + "tensorboard_dir": "/mnt/ssd-1/tensorboard", + "log_dir": "/mnt/ssd-1/logs", + "wandb_team": "eleutherai", + "wandb_project": "neox", + "wandb_group": "example" +} diff --git a/configs/finetuning_configs/6-9B.yml b/configs/finetuning_configs/6-9B.yml new file mode 100755 index 000000000..6598238cd --- /dev/null +++ b/configs/finetuning_configs/6-9B.yml @@ -0,0 +1,89 @@ +{ + # finetuning option + "load": "/path/to/checkpoint", + "finetune": true, + + "pipe-parallel-size": 1, + "model-parallel-size": 2, + + "num-layers": 32, + "hidden-size": 4096, + "num-attention-heads": 32, + "seq-length": 2048, + "max-position-embeddings": 2048, + "norm": "layernorm", + "pos-emb": "rotary", + "rotary_pct": 0.25, + "no-weight-tying": true, + "gpt_j_residual": true, + "output_layer_parallelism": "column", + + "attention-config": [[["flash"], 32]], + + "scaled-upper-triang-masked-softmax-fusion": true, + "bias-gelu-fusion": true, + + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00012, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + + "min_lr": 0.000012, + + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1260000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1260000000, + "contiguous_gradients": true, + "cpu_offload": false, + "load_from_fp32_weights": False, # if checkpoint has fp16/bf16 params + }, + + "train_micro_batch_size_per_gpu": 8, + "gradient_accumulation_steps": 2, + "data-impl": "mmap", + + "checkpoint-activations": true, + "checkpoint-num-layers": 1, + "partition-activations": true, + "synchronize-each-layer": true, + + "gradient_clipping": 1.0, + "weight-decay": 0.1, + 
"hidden-dropout": 0, + "attention-dropout": 0, + + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 12, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + "train-iters": 143000, + "lr-decay-iters": 143000, + "distributed-backend": "nccl", + "lr-decay-style": "cosine", + "warmup": 0.01, + "checkpoint-factor": 1000, + "extra-save-iters": [0,1,2,4,8,16,32,64,128,256,512], + "eval-interval": 143000, + "eval-iters": 10, + + "log-interval": 10, + "steps_per_print": 10, + "wall_clock_breakdown": true, + + "tokenizer_type": "HFTokenizer" +} diff --git a/configs/gen_docs.py b/configs/gen_docs.py new file mode 100644 index 000000000..08431e6c6 --- /dev/null +++ b/configs/gen_docs.py @@ -0,0 +1,93 @@ +import sys +import os + +sys.path.append( + os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) +) +from megatron.neox_arguments import neox_args, deepspeed_args +from inspect import getmembers, getsource +from dataclasses import field, is_dataclass +from itertools import tee, zip_longest +import pathlib + + +def pairwise(iterable): + "s -> (s0,s1), (s1,s2), (s2, s3), ..." + a, b = tee(iterable) + next(b, None) + return zip_longest(a, b) + + +def get_docs(module): + ARGS_CLASSES = getmembers(module, is_dataclass) + results = {} + for name, dcls in ARGS_CLASSES: + assert is_dataclass(dcls) + src = getsource(dcls) + d = dcls() + loc = 0 + results[name] = {"doc": d.__doc__.strip(), "attributes": {}} + for cur, _next in pairwise(d.__dataclass_fields__.items()): + field_name, field_def = cur + field_type = field_def.type + if hasattr(field_type, "__name__"): + field_type = field_type.__name__ + else: + field_type = str(field_type) + + field_default = field_def.default + + # try to find the field definition + loc = src.find(f" {field_name}:", loc + len(field_name) + 1) + + if _next is not None: + next_field_name, _ = _next + # try to find the next field definition + next_loc = src.find(f"{next_field_name}:", loc + len(field_name)) + else: + next_loc = len(src) + + # try to get the docstring + _src = src[loc:next_loc].strip() + if '"""' in _src: + doc = _src.split('"""')[1].strip() + elif "'''" in _src: + doc = _src.split("'''")[1].strip() + else: + doc = "" + results[name]["attributes"][field_name] = { + "name": field_name, + "type": field_type, + "default": field_default, + "doc": doc, + } + return results + + +def to_md(docs, intro_str=""): + """ + Writes the docs dictionary to markdown format + """ + lines = [] + lines.append(intro_str) + for name, doc in docs.items(): + lines.append(f"## {name}") + lines.append(f"{doc['doc']}") + lines.append("") + for field_name, field_def in doc["attributes"].items(): + # attribute name and type + lines.append(f"- **{field_name}**: {field_def['type']}") + # default value + lines.append(f" Default = {str(field_def['default'])}") + lines.append(f" {field_def['doc']}") + lines.append("") + return "\n\n".join(lines) + + +if __name__ == "__main__": + docs = get_docs(neox_args) + docs.update(get_docs(deepspeed_args)) + intro_str = """Arguments for gpt-neox. 
All of the following can be specified in your .yml config file(s):\n""" + md = to_md(docs, intro_str=intro_str) + with open(f"{pathlib.Path(__file__).parent.resolve()}/neox_arguments.md", "w") as f: + f.write(md) diff --git a/configs/gmlp_small.yml b/configs/gmlp_small.yml new file mode 100644 index 000000000..2a5b02d60 --- /dev/null +++ b/configs/gmlp_small.yml @@ -0,0 +1,72 @@ +# GPT-2 pretraining setup +{ + # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages + # across the node boundaries ) + "pipe_parallel_size": 1, + "model_parallel_size": 1, + "attention_config": [[["gmlp"], "all"]], + + + # model settings + "num_layers": 12, + "hidden_size": 768, # gmlp d_ff defaults to hidden_size * 4 + "gmlp_attn_dim": 64, + "num_attention_heads": 12, # this has no effect with gmlp - and amlp defaults to single head attention. + "seq_length": 2048, + "max_position_embeddings": 2048, + "norm": "layernorm", + "pos_emb": "none", + "no_weight_tying": true, + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0006, + "betas": [0.9, 0.999], + "eps": 1.0e_8, + } + }, + + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data_impl": "mmap", + "split": "949,50,1", + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": false, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0.0, + "attention_dropout": 0.0, + + # precision settings + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. training settings + "train_iters": 320000, + "lr_decay_iters": 320000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, + + # logging + "log_interval": 100, + "steps_per_print": 10, + "keep_last_n_checkpoints": 4, + "wall_clock_breakdown": true, +} diff --git a/configs/llama/13B.yml b/configs/llama/13B.yml new file mode 100644 index 000000000..305567be1 --- /dev/null +++ b/configs/llama/13B.yml @@ -0,0 +1,26 @@ +{ + "pipe_parallel_size": 1, + "model_parallel_size": 2, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 40, + "hidden_size": 5120, + "num_attention_heads": 40, + "seq_length": 2048, + "max_position_embeddings": 2048, + "pos_emb": "rotary", + "rotary_pct": 1, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-6, + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "mlp_type": "llama", + "activation": "silu", +} diff --git a/configs/llama/30B.yml b/configs/llama/30B.yml new file mode 100644 index 000000000..450f8da38 --- /dev/null +++ b/configs/llama/30B.yml @@ -0,0 +1,26 @@ +{ + "pipe_parallel_size": 1, + "model_parallel_size": 4, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 60, + "hidden_size": 6656, + "num_attention_heads": 52, + "seq_length": 2048, + "max_position_embeddings": 2048, + "pos_emb": "rotary", + "rotary_pct": 1, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-6, + + "scaled_upper_triang_masked_softmax_fusion": true, + 
"bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "mlp_type": "llama", + "activation": "silu", +} diff --git a/configs/llama/65B.yml b/configs/llama/65B.yml new file mode 100644 index 000000000..85f199ce2 --- /dev/null +++ b/configs/llama/65B.yml @@ -0,0 +1,26 @@ +{ + "pipe_parallel_size": 1, + "model_parallel_size": 8, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 80, + "hidden_size": 8192, + "num_attention_heads": 64, + "seq_length": 2048, + "max_position_embeddings": 2048, + "pos_emb": "rotary", + "rotary_pct": 1, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-6, + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "mlp_type": "llama", + "activation": "silu", +} diff --git a/configs/llama/7B.yml b/configs/llama/7B.yml new file mode 100644 index 000000000..ecbf187a8 --- /dev/null +++ b/configs/llama/7B.yml @@ -0,0 +1,26 @@ +{ + "pipe_parallel_size": 1, + "model_parallel_size": 1, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 32, + "hidden_size": 4096, + "num_attention_heads": 32, + "seq_length": 2048, + "max_position_embeddings": 2048, + "pos_emb": "rotary", + "rotary_pct": 1, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-6, + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "mlp_type": "llama", + "activation": "silu", +} diff --git a/configs/llama/README.md b/configs/llama/README.md new file mode 100644 index 000000000..48f5ae0d8 --- /dev/null +++ b/configs/llama/README.md @@ -0,0 +1,23 @@ +# LLaMA + +## Training and Finetuning + +These configs contain the architecture settings required to run inference/training/finetuning on the [LLaMA](https://huggingface.co/docs/transformers/main/model_doc/llama) model suite. + +LLaMA finetuning can be launched with +```sh +python ./deepy.py ./train.py -d configs llama/7B.yml llama/train_config.yml local_setup.yml +``` + +If training from scratch, set `finetune=False` in `./configs/llama/train_config.yml`. 
+ + +## Inference + + +LLaMA generation can be launched with +```sh +python ./deepy.py ./generate.py -d configs \ + llama/7B.yml llama/train_config.yml local_setup.yml text_generation.yml \ + -i input_prompt.txt -o prompt_out.txt +``` diff --git a/configs/llama/train_config.yml b/configs/llama/train_config.yml new file mode 100644 index 000000000..64d8ff422 --- /dev/null +++ b/configs/llama/train_config.yml @@ -0,0 +1,73 @@ +{ + # finetuning option + "finetune": true, + + # init methods + "init_method": "small_init", + "output_layer_init_method": "wang_init", + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0002, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + "min_lr": 0.00002, + "override_lr_scheduler": true, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + }, + + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data_impl": "mmap", + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + # precision settings + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. training settings + "train_iters": 320000, + "lr_decay_iters": 320000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, + + # logging + "log_interval": 100, + "steps_per_print": 10, + "keep_last_n_checkpoints": 4, + "wall_clock_breakdown": true, +} diff --git a/configs/llemma-34b-config.yml b/configs/llemma-34b-config.yml new file mode 100644 index 000000000..46e1396cc --- /dev/null +++ b/configs/llemma-34b-config.yml @@ -0,0 +1,108 @@ +{ + "pipe_parallel_size": 0, + "model_parallel_size": 8, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 48, + "hidden_size": 8192, + "num_attention_heads": 64, + "attention_type": "groupedquery", + "num_kv_heads": 8, + "seq_length": 4096, + "max_position_embeddings": 4096, + "pos_emb": "rotary", + "rotary_pct": 1, + "rotary_emb_base": 1000000, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-5, + + "attention_config": [[["flash"], 48]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "mlp_type": "llama", + "activation": "silu", + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00005, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1260000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1260000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 16, + "data_impl": "mmap", + + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + 
"synchronize_each_layer": true, + + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + "precision": "bfloat16", + "fp32_allreduce": true, + "bf16": { + "enabled": true + }, + "data_types": { + "grad_accum_dtype": "fp32" + }, + + "train_iters": 12000, + "lr_decay_iters": 12000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "decay_lr_to": 0.033, + "warmup_iters": 500, + "checkpoint_factor": 250, + "eval_interval": 250, + "eval_iters": 25, + + "log_interval": 1, + "steps_per_print": 1, + "wall_clock_breakdown": true, + + "tokenizer_type": "SPMTokenizer", + "vocab-file": "codellama/tokenizer.model", # use tokenizer.model from Meta CodeLlama download + + "save": "/fsx/proj-mathlm/saved-weights/34b_1epoch", + # "load": "" # set to same as "save" to resume from intermediate finetuning step + "load": "/path/to/converted/codellama_34b_weights_with_mp8", + + "finetune": true, # set to false once resuming from intermediate finetuning step + "checkpoint_validation_with_forward_pass": true, + + + "use_wandb": true, + "wandb_group": "llemma_34b_replication", + "wandb_project": "your-project-name", + "wandb_team": "your-teamname-here", + "wandb_host": "https://api.wandb.ai", + + "launcher": "slurm", + "deepspeed_slurm": true +} diff --git a/configs/llemma-7b-config.yml b/configs/llemma-7b-config.yml new file mode 100644 index 000000000..fa61af90f --- /dev/null +++ b/configs/llemma-7b-config.yml @@ -0,0 +1,105 @@ +{ + "pipe_parallel_size": 0, + "model_parallel_size": 2, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 32, + "hidden_size": 4096, + "num_attention_heads": 32, + "seq_length": 4096, + "max_position_embeddings": 4096, + "pos_emb": "rotary", + "rotary_pct": 1, + "rotary_emb_base": 10000, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-5, + + "attention_config": [[["flash"], 32]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "mlp_type": "llama", + "activation": "silu", + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0001, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1260000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1260000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + "train_micro_batch_size_per_gpu": 4, + "gradient_accumulation_steps": 2, + "data_impl": "mmap", + + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + "precision": "bfloat16", + "fp32_allreduce": true, + "bf16": { + "enabled": true + }, + "data_types": { + "grad_accum_dtype": "fp32" + }, + + "train_iters": 48000, + "lr_decay_iters": 48000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "decay_lr_to": 0.033, + "warmup_iters": 500, + "checkpoint_factor": 500, + "eval_interval": 250, + "eval_iters": 50, + + "log_interval": 1, + "steps_per_print": 1, + "wall_clock_breakdown": true, + + "tokenizer_type": "SPMTokenizer", + "vocab-file": "codellama/tokenizer.model", # use tokenizer.model from Meta CodeLlama download + + "save": "/path/to/save/llema-replication", + #"load": "", # 
once run is started, to restart from intermediate ckpt use "load" = "save" + "load": "/path/to/converted/codellama_7b_weights_with_mp2", + + "finetune": true, # set to false once resuming from intermediate finetuning step + "checkpoint_validation_with_forward_pass": true, + + "use_wandb": true, + "wandb_group": "llemma_7b_replication", + "wandb_project": "your-project-name", + "wandb_team": "your-teamname-here", + "wandb_host": "https://api.wandb.ai", + + "launcher": "slurm", + "deepspeed_slurm": true +} diff --git a/configs/local_setup.yml b/configs/local_setup.yml new file mode 100644 index 000000000..d031a2ad8 --- /dev/null +++ b/configs/local_setup.yml @@ -0,0 +1,30 @@ +# Suggested data paths when using GPT-NeoX locally +{ + "data_path": "data/enwik8/enwik8_text_document", + + # or for weighted datasets: + # "train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], + # "test-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], + # "valid-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], + # "train-data-weights": [1., 2.], + # "test-data-weights": [2., 1.], + # "valid-data-weights": [0.5, 0.4], + + # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. + # WARNING: setting this to True will override any user provided weights + # "weight_by_num_documents": false, + # "weighted_sampler_alpha": 0.3, + + "vocab_file": "data/gpt2-vocab.json", + "merge_file": "data/gpt2-merges.txt", + + "save": "checkpoints", + "load": "checkpoints", + "checkpoint_validation_with_forward_pass": False, + + "tensorboard_dir": "tensorboard", + "log_dir": "logs", + "use_wandb": True, + "wandb_host": "https://api.wandb.ai", + "wandb_project": "neox" +} diff --git a/configs/mistral/7B.yml b/configs/mistral/7B.yml new file mode 100644 index 000000000..a6e01c2d8 --- /dev/null +++ b/configs/mistral/7B.yml @@ -0,0 +1,39 @@ +{ + "pipe_parallel_size": 0, + "model_parallel_size": 4, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 32, + "hidden_size": 4096, + "intermediate_size": 14336, + "num_attention_heads": 32, + "attention_type": "groupedquery", + "num_kv_heads": 8, + "seq_length": 4096, + "sliding_window_width": 4096, + "max_position_embeddings": 131072, + "pos_emb": "rotary", + "rotary_pct": 1, + "rotary_emb_base": 10000, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-5, + + # Currently, we only support Grouped Query Attention + # When Flash attention is enabled. + "attention_config": [[["flash"], 32]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "mlp_type": "llama", + "activation": "silu", + + "tokenizer_type": "SPMTokenizer", + "vocab-file": "/weka/hailey/mistral-support-neox/mistral-7B-v0.1/tokenizer.model", # use tokenizer.model from Meta CodeLlama download + +} \ No newline at end of file diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md new file mode 100644 index 000000000..7df94988d --- /dev/null +++ b/configs/neox_arguments.md @@ -0,0 +1,2065 @@ +Arguments for gpt-neox. 
All of the following can be specified in your .yml config file(s): + + +## NeoXArgsLRScheduler + +LR Scheduler Arguments + + + +- **lr_decay_style**: typing.Literal['constant', 'linear', 'cosine', 'exponential'] + + Default = linear + + Learning rate decay function. Choose from 'constant', 'linear', 'cosine', 'exponential'. + + + +- **lr_decay_iters**: int + + Default = None + + Number of iterations to decay learning rate over, If None defaults to --train-iters + + + +- **min_lr**: float + + Default = 0.0 + + Minimum value for learning rate. The scheduler clips values below this threshold. + + + +- **warmup**: float + + Default = 0.01 + + Percentage of total iterations to warmup on (.01 = 1 percent of all training iters). + + + +- **override_lr_scheduler**: bool + + Default = False + + Reset the values of the scheduler (learning rate,warmup iterations, minimum learning rate, maximum number of iterations, and decay style from input arguments and ignore values from checkpoints. Note that all the above values will be reset. + + + +- **use_checkpoint_lr_scheduler**: bool + + Default = False + + Use checkpoint to set the values of the scheduler (learning rate, warmup iterations, minimum learning rate, maximum number of iterations, and decay style from checkpoint and ignore input arguments. + + + +## NeoXArgsLogging + +Logging Arguments + + + +- **use_wandb**: bool + + Default = None + + Flag indicating if wandb is to be used. + + + +- **wandb_group**: str + + Default = None + + Weights and Biases group name - used to group together "runs". + + + +- **wandb_team**: str + + Default = None + + Team name for Weights and Biases. + + + +- **wandb_project**: str + + Default = neox + + wandb project name + + + +- **wandb_host**: str + + Default = https://api.wandb.ai + + url of the wandb host + + + +- **wandb_init_all_ranks**: bool + + Default = False + + Initialize wandb on all ranks. + + + +- **git_hash**: str + + Default = 6a8a829 + + current git hash of repository + + + +- **log_dir**: str + + Default = None + + Directory to save logs to. + + + +- **tensorboard_dir**: str + + Default = None + + Write TensorBoard logs to this directory. + + + +- **log_interval**: int + + Default = 100 + + Interval between logging. + + + +- **log_grad_pct_zeros**: bool + + Default = False + + Log the percentage of zeros for the gradient of each parameter to wandb / tensorboard (useful for debugging). Needs wandb_init_all_ranks set to True if using pipeline parallelism to log all ranks. + + + +- **log_param_norm**: bool + + Default = False + + Log the frob norm of the parameters to wandb / tensorboard (useful for debugging). Needs wandb_init_all_ranks set to True if using pipeline parallelism to log all ranks. + + + +- **log_grad_norm**: bool + + Default = False + + Log the frob norm of the gradients to wandb / tensorboard (useful for debugging). + (N.B - this will only work with pp = 0 for now, as we don't have access to the gradients of the model because + deepspeed.) + + + +- **log_optimizer_states**: bool + + Default = False + + Log the frob norm of the optimizer states to wandb / tensorboard (useful for debugging). + + + +- **log_gradient_noise_scale**: bool + + Default = False + + Whether to log the gradient noise scale when training (cf. https://arxiv.org/abs/1812.06162 for explanation) + + + +- **gradient_noise_scale_n_batches**: int + + Default = 5 + + Number of batches to accumulate gradients for in the gradient noise scale logger. 
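+
+    The gradient noise scale logged by `log_gradient_noise_scale` above follows the paper cited there (https://arxiv.org/abs/1812.06162). The sketch below shows the "simple" two-batch-size estimator from that paper; it is illustrative only and is not the NeoX logger implementation, which may accumulate and average these quantities differently.
+
+```python
+# Sketch of the simple gradient noise scale estimator (B_simple = tr(Sigma) / |G|^2)
+# using squared gradient norms measured at two batch sizes; illustrative only.
+def simple_noise_scale(g2_small, g2_big, b_small, b_big):
+    """g2_small / g2_big: squared gradient norms measured with batch sizes
+    b_small / b_big, where b_big > b_small."""
+    g2_est = (b_big * g2_big - b_small * g2_small) / (b_big - b_small)
+    s_est = (g2_small - g2_big) / (1.0 / b_small - 1.0 / b_big)
+    return s_est / g2_est
+```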
+ + + +- **gradient_noise_scale_cpu_offload**: bool + + Default = False + + Whether to offload the buffered gradients to cpu when measuring gradient noise scale. + + + +## NeoXArgsModel + +Model Arguments + + + +- **precision**: typing.Literal['fp16', 'fp32', 'bfloat16'] + + Default = None + + description of the used precision, either one of fp16 or fp32 (and in the future bf16). + + + +- **num_layers**: int + + Default = None + + Number of transformer layers. + + + +- **hidden_size**: int + + Default = None + + Transformer hidden size. + + + +- **num_attention_heads**: int + + Default = None + + Number of transformer attention heads. + + + +- **seq_length**: int + + Default = None + + Maximum sequence length to process. + + + +- **max_position_embeddings**: int + + Default = None + + Maximum number of position embeddings to use. This is the size of position embedding. + + + +- **norm**: typing.Literal['layernorm', 'rmsnorm', 'scalenorm'] + + Default = layernorm + + Normalization layer to use. Choose from "layernorm", "rmsnorm", "scalenorm". + + + +- **layernorm_fusion**: bool + + Default = False + + Use fused layer norm kernel (if `norm` is `layernorm`). + + + +- **use_qk_layernorm**: bool + + Default = False + + Use QK Normalization + + + +- **layernorm_epsilon**: float + + Default = 1e-05 + + Layer norm epsilon. + + + +- **rms_norm_epsilon**: float + + Default = 1e-08 + + Root mean squared norm epsilon + + + +- **scalenorm_epsilon**: float + + Default = 1e-08 + + Scalenorm epsilon + + + +- **pos_emb**: typing.Literal['learned', 'rotary', 'sinusoidal', 'rpe', 'alibi', 'none'] + + Default = learned + + Type of positional embedding to use - choose from 'learned', 'rotary', 'sinusoidal', 'rpe', 'none' + + + +- **rpe_num_buckets**: int + + Default = 32 + + T5 relative positional encoding number of buckets, default 32. + + + +- **rpe_max_distance**: int + + Default = 128 + + T5 relative positional encoding max distance, default 128. + + + +- **opt_pos_emb_offset**: int + + Default = 0 + + Learned position embedding offset (only used by OPT, where it should be set to 2). + + + +- **no_weight_tying**: bool + + Default = False + + Disables weight tying between embedding weights and final Linear layer + + + +- **attention_config**: list + + Default = None + + Attention configuration for gpt-neox + + The first item in the list specifies the attention type(s), and should be a list of strings. The second item + specifies the number of times to repeat those attention types in the full list. + + attention type choices: [global, local, sparse_fixed, sparse_variable, bslongformer, bigbird, "gmlp", "amlp", "flash"] + + So a 12 layer network with only global attention could be specified like: + [[[`global`], 12]] + + or a 12 layer network with alternating global / local like: + [[[`global`, `local`], 6]] + + If none is specified, this defaults to + [[[`global`], n_layers]] + + + +- **sparsity_config**: dict + + Default = None + + Sparsity configuration dict as defined in https://www.deepspeed.ai/docs/config-json/#sparse-attention + + Note that since neox is autoregressive, attention is always "unidirectional" and `horizontal_global_attention` is + always false. + + The main difference between our sparsity config and deepspeed's is that `mode` is ignored - since it is instead + specified in attention_config defining each layer. 
+ + An example config is given below: + "sparse_attention": { + "block": 16, + "different_layout_per_head": true, + "num_local_blocks": 4, + "num_global_blocks": 1, + "num_different_global_patterns": 4, + "num_random_blocks": 0, + "local_window_blocks": [4], + "global_block_indices": [0], + "global_block_end_indices": None, + "num_sliding_window_blocks": 3 + } + + + +- **num_unique_layers**: int + + Default = None + + Number of unique transformer layers. num-layers should be divisible by this value. Currently only has an effect when pipe_parallel_size=0. + + + +- **param_sharing_style**: str + + Default = grouped + + Ordering of the shared parameters. For example, for a num-layers=4 and --num-unique-layers=2, we will have the following ordering for two unique layers 1 and 2-: grouped: [1, 2, 1, 2] and spaced: [1, 1, 2, 2]. + + + +- **make_vocab_size_divisible_by**: int + + Default = 128 + + Pad the vocab size to be divisible by this value. This is added for computational efficiency reasons. + + + +- **activation**: typing.Literal['gelu', 'geglu', 'relu', 'softsign', 'swish', 'mish', 'silu'] + + Default = gelu + + Activation function to use - choose from ["gelu", "geglu", "relu", "softsign", "swish", "mish", "silu"] + + + +- **scaled_upper_triang_masked_softmax_fusion**: bool + + Default = False + + Enable fusion of query_key_value_scaling time (upper diagonal) masking and softmax. + + + +- **scaled_masked_softmax_fusion**: bool + + Default = False + + Enable fusion of query_key_value_scaling general masking and softmax. + + + +- **bias_gelu_fusion**: bool + + Default = False + + Enable bias and gelu fusion. + + + +- **bias_dropout_fusion**: bool + + Default = False + + Enable bias and dropout fusion. + + + +- **rope_fusion**: bool + + Default = False + + Enable rotary embedding fusion. + + + +- **fp16_lm_cross_entropy**: bool + + Default = False + + Move the cross entropy unreduced loss calculation for lm head to fp16. + + + +- **init_method_std**: float + + Default = 0.02 + + Standard deviation of the zero mean normal distribution used for weight initialization. + + + +- **apply_query_key_layer_scaling**: bool + + Default = False + + Scale Q * K^T by 1 / layer-number. If this flag is set, then it will automatically set attention-softmax-in-fp32 to true + + + +- **use_cpu_initialization**: bool + + Default = False + + If set, affine parallel weights initialization uses CPU + + + +- **attention_softmax_in_fp32**: bool + + Default = False + + Run attention masking and softmax in fp32. 
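+
+    The interaction between `apply_query_key_layer_scaling` and `attention_softmax_in_fp32` above amounts to scaling the Q*K^T matmul down by the layer number so the fp16 scores stay in range, then undoing that scale inside an fp32 softmax. The sketch below illustrates that idea only; it is not the fused Megatron kernel.
+
+```python
+# Minimal sketch, not the fused Megatron kernel: scale Q*K^T by 1/layer_number,
+# then undo the scaling inside an fp32 softmax.
+import torch
+
+def attention_probs(q, k, layer_number, apply_query_key_layer_scaling=True,
+                    attention_softmax_in_fp32=True):
+    head_dim = q.shape[-1]
+    scale = 1.0 / head_dim ** 0.5
+    coeff = 1.0
+    if apply_query_key_layer_scaling:
+        coeff = float(layer_number)
+        scale /= coeff
+        attention_softmax_in_fp32 = True  # forced on, as documented above
+    scores = torch.matmul(q, k.transpose(-2, -1)) * scale
+    if attention_softmax_in_fp32:
+        probs = torch.softmax(scores.float() * coeff, dim=-1).to(q.dtype)
+    else:
+        probs = torch.softmax(scores, dim=-1)
+    return probs
+```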
+ + + +- **rotary_pct**: float + + Default = 1.0 + + pct of hidden dims to apply rotary positional embedding to + + + +- **rotary_emb_base**: int + + Default = 10000 + + Base for rotary positional embedding + + + +- **init_method**: typing.Literal['normal', 'scaled_normal', 'orthogonal', 'scaled_orthogonal', 'xavier_uniform', 'xavier_normal', 'wang_init', 'small_init'] + + Default = normal + + Init function used on all layers except ff residual outputs - choose from + ["normal", "scaled_normal", "orthogonal", "scaled_orthogonal", "xavier_uniform", "xavier_normal", "wang_init", "small_init"] + + + +- **output_layer_init_method**: typing.Literal['normal', 'scaled_normal', 'orthogonal', 'scaled_orthogonal', 'xavier_uniform', 'xavier_normal', 'wang_init', 'small_init'] + + Default = scaled_normal + + Init function used for ff residual outputs - choose from + ["normal", "scaled_normal", "orthogonal", "scaled_orthogonal", "xavier_uniform", "xavier_normal", "wang_init", "small_init"] + + + +- **gmlp_attn_dim**: int + + Default = 64 + + the dimension of the single head self attention in gmlp model (not used in gpt models). + If None - gmlp model doesn't use attention. + + + +- **gpt_j_residual**: bool + + Default = False + + If false, we use the conventional residual path: + x = x + attn(ln1(x)) + x = x + mlp(ln2(x)) + Otherwise, we use the residual path from GPT-J, which offers a slight speedup: + x = ln(x) + x = x + attn(x) + mlp(x) + + + +- **gpt_j_tied**: bool + + Default = False + + If false, we use + x = x + attn(ln1(x)) + mlp(ln2(x)) + Otherwise, we tie the layer norms + y = ln(x) + x = x + attn(y) + mlp(y) + + + +- **use_bias_in_norms**: bool + + Default = True + + If false, norms (e.g. LayerNorm) will not have bias terms + + + +- **use_bias_in_attn_linear**: bool + + Default = True + + If false, attn_linear (e.g. QKVO) will not have bias terms + + + +- **mlp_type**: str + + Default = regular + + Types: + regular: Megatron implementation + llama: LLaMA MLP (SiLU-gated MLP) + + + +- **soft_prompt_tuning**: dict + + Default = None + + Dictionary configuring the soft prompt tuning parameters. + If enabled, will train *only* the soft prompt, and freezes the rest of the model. + parameters in the dict are: + 'enabled': bool = True # enables soft prompting + 'num_tokens': int = 10 # length of the soft prompt in tokens + 'init_string': str = '' # if provided, initialize the soft prompt with the word embeddings of this string + 'init_range': float = 0.5 # if no init string is provided, initialize the soft prompt with a uniform distribution between -init_range and init_rang + + + +- **output_layer_parallelism**: typing.Literal['column'] + + Default = column + + Parameter controlling whether the output layer is parallelized over the hidden dim (row) or the vocab dim (column) + + + +## NeoXArgsOptimizer + +Optimizer Arguments + + + +- **optimizer_type**: typing.Literal['adam', 'onebitadam', 'cpu_adam', 'cpu_torch_adam', 'sm3', 'madgrad_wd', 'sgd', 'lion'] + + Default = adam + + Type of optimizer to use. Choose from ['adam', 'onebitadam', 'cpu_adam', 'cpu_torch_adam', 'sm3', 'madgrad_wd', 'sgd', 'lion'] + NOTE: sgd will use MuSGD from Mup. Mup must be enabled for this optimizer. 
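+
+    The residual wirings described in the `gpt_j_residual` and `gpt_j_tied` entries above can be written out as follows. The callables stand in for the attention, MLP, and norm modules; this is an illustrative sketch of the equations in those entries, not the NeoX transformer layer itself.
+
+```python
+# Sketch of the sequential vs. parallel (GPT-J style) residual paths.
+def conventional_block(x, attn, mlp, ln1, ln2):
+    x = x + attn(ln1(x))
+    x = x + mlp(ln2(x))
+    return x
+
+def gpt_j_block(x, attn, mlp, ln1, ln2, tied=False):
+    if tied:
+        y = ln1(x)                 # gpt_j_tied: one norm shared by both branches
+        return x + attn(y) + mlp(y)
+    return x + attn(ln1(x)) + mlp(ln2(x))   # parallel attention + MLP branches
+```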
+ + + +- **use_bnb_optimizer**: bool + + Default = False + + Whether to enable the bitsandbytes optimizers + + + +- **zero_stage**: typing.Union[int, typing.List[int], typing.Literal['all']] + + Default = None + + Zero Optimizer stage + + + +- **zero_reduce_scatter**: bool + + Default = None + + Zero: Uses reduce or reduce scatter instead of allreduce to average gradients + + + +- **zero_contiguous_gradients**: bool + + Default = None + + Zero: Copies the gradients to a contiguous buffer as they are produced. Avoids memory fragmentation during backward pass. Only useful when running very large models. + + + +- **zero_reduce_bucket_size**: int + + Default = None + + Zero: Number of elements reduced/allreduced at a time. Limits the memory required for the allgather for large model sizes + + + +- **zero_allgather_bucket_size**: int + + Default = None + + Zero: Number of elements allgathered at a time. Limits the memory required for the allgather for large model sizes + + + +- **lr**: float + + Default = None + + Max Learning rate during training + + + +## NeoXArgsOther + +Misc. Arguments + + + +- **distributed_backend**: str + + Default = nccl + + Which backend to use for distributed training. + + + +- **local_rank**: int + + Default = None + + local rank passed from distributed launcher. + + + +- **rank**: int + + Default = None + + global rank of process being run (passed in via distributed launcher) + + + +- **lazy_mpu_init**: bool + + Default = False + + If set to True, initialize_megatron() skips DDP initialization and returns function to complete it instead. Also turns on use-cpu-initialization flag. This is for external DDP manager. + + + +- **short_seq_prob**: float + + Default = 0.1 + + Probability of producing a short sequence. + + + +- **eod_mask_loss**: bool + + Default = False + + Mask loss for the end of document tokens. + + + +- **adlr_autoresume**: bool + + Default = False + + Enable auto-resume on adlr cluster. + + + +- **adlr_autoresume_interval**: int + + Default = 1000 + + Intervals over which check for auto-resume termination signal + + + +- **seed**: int + + Default = 1234 + + Random seed used for python, numpy, pytorch, and cuda. + + + +- **onnx_safe**: bool + + Default = False + + Use workarounds for known problems with Torch ONNX exporter + + + +- **deepscale**: bool + + Default = False + + (Deprecated) enable DeepSpeed (helper flag for user code, no impact on DeepSpeed backend)' + + + +- **deepscale_config**: str + + Default = None + + (Deprecated) deepscale json configuration file. 
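+
+    The single `seed` value above is documented as covering the python, numpy, pytorch, and cuda RNGs. A minimal sketch of that pattern is shown below; it is illustrative only and is not the megatron implementation.
+
+```python
+# Minimal sketch of seeding all four RNGs named in the `seed` entry above.
+import random
+import numpy as np
+import torch
+
+def set_seed(seed: int = 1234):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)           # CPU RNG
+    torch.cuda.manual_seed_all(seed)  # all CUDA device RNGs
+```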
+ + + +- **deepspeed_mpi**: bool + + Default = False + + Run via MPI, this will attempt to discover the necessary variables to initialize torch distributed from the MPI environment + + + +- **deepspeed_slurm**: bool + + Default = False + + Run via SLURM, this will attempt to discover the necessary variables to initialize torch distributed from the SLURM environment + + + +- **user_script**: str + + Default = None + + user script to be run + + + +- **iteration**: int + + Default = None + + Set during training + + + +- **do_train**: bool + + Default = None + + Set during training + + + +- **do_valid**: bool + + Default = None + + Set during training + + + +- **do_test**: bool + + Default = None + + Set during training + + + +- **save_iters**: list + + Default = None + + Set during training + + + +- **global_num_gpus**: int + + Default = None + + Set during launching + + + +## NeoXArgsParallelism + +Parallelism Arguments + + + +- **pipe_parallel_size**: int + + Default = 0 + + Number of pipeline parallel stages. Disable with 0. + + + +- **model_parallel_size**: int + + Default = 1 + + Size of the model parallelism. + + + +- **pipe_partition_method**: str + + Default = type:transformer|mlp + + method used to distribute model layers across pipeline stages. Choose from "parameters", which balances the number + of parameters on each pipeline stage, "uniform", which naively balances the number of layers per stage, or + "type:[regex]", which balances layers whose class names match [regex] + + + +- **world_size**: int + + Default = None + + Total world size (i.e number of gpus in cluster). Configured post-launch using distributed launcher + + + +- **is_pipe_parallel**: bool + + Default = False + + flag to determine whether pipeline parallelism is on - shouldn't be set by user, is automatically determined + according to pipeline parallel size. + + + +## NeoXArgsTemplate + +NeoXArgsTemplate() + + + +## NeoXArgsTextgen + +Text Generation arguments + + + +- **text_gen_type**: str + + Default = None + + How to generate text/sample the model. + Options: `unconditional`, `input-file`, `interactive` + + + +- **temperature**: float + + Default = 0.0 + + exponential scaling output distribution ("higher == more risk") + + + +- **top_p**: float + + Default = 0.0 + + Top-p (nucleus) sampling chooses from the smallest possible set of tokens whose cumulative probability exceeds the probability top_p. + + + +- **top_k**: int + + Default = 0 + + integer between 0 and the models vocab size. Filters out any logits with a probability less than that of the top_kth token. + + + +- **return_logits**: bool + + Default = False + + Boolean for whether to return the logits for generated tokens + + + +- **maximum_tokens**: int + + Default = 64 + + maximum number of tokens to be generated + + + +- **prompt_end**: str + + Default = + + + a single prompt's end. Defaults to newline + + + +- **sample_input_file**: str + + Default = None + + Get input from file instead of interactive mode, each line is an input. + + + +- **sample_output_file**: str + + Default = samples.txt + + Output file + + + +- **num_samples**: int + + Default = 1 + + Number of samples to generate unconditionally, defaults to 1 and interactive conditional sampling + + + +- **recompute**: bool + + Default = False + + During generation recompute all attention instead of using previously computed keys/values. 
+ Should be set to true for sparse attention models + + + +- **eval_results_prefix**: str + + Default = + + prefix to which to save evaluation results - final fp will be {eval_results_prefix}_eval_results_yy-mm-dd-HH-MM.json + + + +- **eval_tasks**: list + + Default = None + + Tasks to evaluate on using lm_eval_harness + + NOTE: Requires internet connection + + + +## NeoXArgsTokenizer + +Tokenizer Arguments + + + +- **tokenizer_type**: typing.Literal['GPT2BPETokenizer', 'HFTokenizer', 'HFGPT2Tokenizer', 'SPMTokenizer', 'CharLevelTokenizer', 'TiktokenTokenizer'] + + Default = GPT2BPETokenizer + + Type of tokenizer to use - should be one of ["GPT2BPETokenizer", "HFTokenizer", "HFGPT2Tokenizer", "SPMTokenizer", "CharLevelTokenizer", "TiktokenTokenizer"] + + + +- **padded_vocab_size**: int + + Default = None + + Total (padded) vocabulary size of tokenizer. Configured after launching of training, + as it's dependent on the parallelism size. + + + +## NeoXArgsTraining + +Training Arguments + + + +- **data_path**: str + + Default = None + + Path to combined dataset to split. + + + +- **use_shared_fs**: bool + + Default = True + + Whether to use a shared filesystem for data loading. If False, local rank 0 on all nodes will preprocess the data, + otherwise only global rank 0 will preprocess the data. This is implemented in megatron/data/gpt2_dataset.py::_build_index_mappings. + + + +- **train_data_paths**: list + + Default = None + + List of paths to train datasets. + + + +- **label_data_paths**: list + + Default = None + + List of paths to label datasets (not shifted by 1 yet!). + + + +- **test_data_paths**: list + + Default = None + + List of paths to test datasets. + + + +- **valid_data_paths**: list + + Default = None + + List of paths to validation datasets. + + + +- **train_data_weights**: list + + Default = None + + List of 'weights' that decide how often to sample from each training dataset when blending datasets. If None, defaults to equal weighting. + Should be a list the same length as `train_data_paths` + + + +- **valid_data_weights**: list + + Default = None + + List of 'weights' that decide how often to sample from each validation dataset when blending datasets. If None, defaults to equal weighting. + Should be a list the same length as `valid_data_paths` + + + +- **test_data_weights**: list + + Default = None + + List of 'weights' that decide how often to sample from each test dataset when blending datasets. If None, defaults to equal weighting. + Should be a list the same length as `test_data_paths` + + + +- **weight_by_num_documents**: bool + + Default = False + + If True, Builds dataset weights from a multinomial distribution over groups of data according to the number of + documents in each group. + + WARNING: setting this to True will override any user provided weights + + We sample from a group according to the probability p(L) ∝ |L| ** α, + where p(L) is the probability of sampling from a given group, + |L| is the number of examples in that datapoint, + and α is a coefficient that acts to upsample data from underrepresented groups + + Hence α (`alpha`) allows us to control how much to 'boost' the probability of training on low-resource groups. + + See https://arxiv.org/abs/1911.02116 for more details + + + +- **weighted_sampler_alpha**: float + + Default = 1.0 + + Alpha value for `weight_by_num_documents`. Only has an effect if `weight_by_num_documents` = True. 
+ + when alpha = 1, the probability of sampling from a given group = n_samples / total_samples + as alpha -> 0, the probability of sampling from all groups becomes equal, and number of documents has no effect + as alpha -> inf, the probability of sampling from the groups with *the most samples* -> 1 + + + +- **data_impl**: typing.Literal['infer', 'mmap', 'cached'] + + Default = infer + + Implementation of indexed datasets, can be one of "infer", "cached", or "mmap" + + + +- **mmap_warmup**: bool + + Default = False + + Warm up mmap files. + + + +- **save**: str + + Default = None + + Output directory to save checkpoints to. + + + +- **s3_path**: str + + Default = None + + Path to s3 bucket for saving checkpoints. + + + +- **s3_chunk_size**: int + + Default = 104857600 + + The number of bytes in each file chunk when uploading to s3. Defaults to 100MiB. + + + +- **config_files**: dict + + Default = None + + Store of original config files mapping config filename to file contents + + + +- **load**: str + + Default = None + + Directory containing a model checkpoint. + + + +- **checkpoint_validation_with_forward_pass**: bool + + Default = False + + save input and output of a forward pass with the checkpoint and validate after load + + + +- **checkpoint_scale**: typing.Literal['linear', 'log'] + + Default = linear + + How step at which checkpoints are saved should scale. "linear" implies 1 checkpoint will be saved at every multiple of `checkpoint-factor`, + while "log" implies that the number of steps between each checkpoint will be multiplied by `checkpoint-factor` at each step, starting from step 1. + + + +- **checkpoint_factor**: int + + Default = None + + Acts as a multiplier on either the "log" or "linear" checkpoint spacing. + + With `checkpoint-scale="linear"`, `checkpoint-factor=20`, and `train-iters=100`, checkpoints will be saved at + steps [20, 40, 60, 80, 100]. + + With `checkpoint-scale="log"`, `checkpoint-factor=2`, and `train-iters=100`, checkpoints will be saved at + steps [1, 2, 4, 8, 16, 32, 64, 100]. + + Note that the last checkpoint step is always saved. + + + +- **extra_save_iters**: list + + Default = None + + Additional iterations when a checkpoint should be saved. + Must be a list of ints or `None`. + + + +- **no_save_optim**: bool + + Default = False + + Do not save current optimizer. + + + +- **no_save_rng**: bool + + Default = False + + Do not save current rng state. + + + +- **no_load_optim**: bool + + Default = False + + Do not load optimizer when loading checkpoint. + + + +- **no_load_rng**: bool + + Default = False + + Do not load rng state when loading checkpoint. + + + +- **finetune**: bool + + Default = False + + Load model for finetuning. Do not load optimizer or rng state from checkpoint and set iteration to 0. Assumed when loading a release checkpoint. + + + +- **batch_size**: int + + Default = None + + training microbatch size per gpu + + + +- **train_iters**: int + + Default = None + + Number of iterations to run for training. + + + +- **eval_iters**: int + + Default = 100 + + Number of iterations to run for evaluation validation/test for. + + + +- **keep_last_n_checkpoints**: int + + Default = None + + Number of last checkpoints to keep + + + +- **eval_interval**: int + + Default = 1000 + + Interval between running evaluation on validation set. + + + +- **split**: str + + Default = 969, 30, 1 + + Comma_separated list of proportions for training, validation, and test split. 
For example the split 90,5,5 will use 90% of data for training, 5% for validation and 5% for test. + + + +- **vocab_file**: str + + Default = None + + Path to the vocab file. + + + +- **merge_file**: str + + Default = None + + Path to the BPE merge file. + + + +- **num_workers**: int + + Default = 2 + + Dataloader number of workers. + + + +- **exit_interval**: int + + Default = None + + Exit the program after the iteration is divisible by this value. + + + +- **attention_dropout**: float + + Default = 0.0 + + Post attention dropout probability. + + + +- **hidden_dropout**: float + + Default = 0.0 + + Dropout probability for hidden state transformer. + + + +- **weight_decay**: float + + Default = 0.1 + + Weight decay coefficient for L2 regularization. + + + +- **checkpoint_activations**: bool + + Default = False + + Checkpoint activation to allow for training with larger models, sequences, and batch sizes. + + + +- **checkpoint_num_layers**: int + + Default = 1 + + Chunk size (number of layers) for checkpointing. + + + +- **deepspeed_activation_checkpointing**: bool + + Default = True + + DEPRECATED - TODO: remove + Uses activation checkpointing from deepspeed + + + +- **contiguous_checkpointing**: bool + + Default = False + + Contiguous memory checkpointing for activations. + + + +- **checkpoint_in_cpu**: bool + + Default = False + + Move the activation checkpoints to CPU. + + + +- **synchronize_each_layer**: bool + + Default = False + + does a synchronize at the beginning and end of each checkpointed layer. + + + +- **profile_backward**: bool + + Default = False + + Enables backward pass profiling for checkpointed layers. + + + +- **partition_activations**: bool + + Default = False + + Partition Activations across GPUs before checkpointing. + + + +- **clip_grad**: float + + Default = 1.0 + + Gradient clipping based on global L2 norm. + + + +- **hysteresis**: int + + Default = 2 + + hysteresis for dynamic loss scaling + + + +- **dynamic_loss_scale**: bool + + Default = None + + flag indicating whether dynamic loss scale is used + + + +- **loss_scale**: float + + Default = None + + Static loss scaling, positive power of 2 + values can improve fp16 convergence. If None, dynamic loss scaling is used. + + + +- **loss_scale_window**: float + + Default = 1000.0 + + Window over which to raise/lower dynamic scale. + + + +- **min_scale**: float + + Default = 1.0 + + Minimum loss scale for dynamic loss scale. + + + +- **char_level_ppl**: bool + + Default = False + + Whether to calculate character level perplexity as well as token level perplexity. (may incur a time cost) + + + +- **use_mup**: bool + + Default = False + + Whether to use Microsoft's Mup https://github.com/microsoft/mup + + + +- **coord_check**: bool + + Default = False + + Whether to generate a "coord check" plot to verify mup's implementation in neox + + + +- **save_base_shapes**: bool + + Default = False + + Whether to save base shapes for mup. This will save the shapes to the path specified in base-shapes-file. 
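+
+    The loss-scaling arguments documented above (`dynamic_loss_scale`, `loss_scale`, `loss_scale_window`, `hysteresis`, `min_scale`) follow the usual dynamic loss-scaling recipe: halve the scale after repeated fp16 overflows, double it after a window of clean steps. The sketch below is illustrative only, not the DeepSpeed implementation.
+
+```python
+# Minimal sketch of dynamic loss scaling driven by the arguments above.
+class DynamicLossScaler:
+    def __init__(self, init_scale=2.0 ** 16, scale_window=1000,
+                 hysteresis=2, min_scale=1.0):
+        self.scale = init_scale
+        self.scale_window = scale_window
+        self.hysteresis = hysteresis
+        self.min_scale = min_scale
+        self._good_steps = 0
+        self._overflows_left = hysteresis
+
+    def update(self, found_overflow: bool):
+        if found_overflow:
+            self._good_steps = 0
+            self._overflows_left -= 1
+            if self._overflows_left <= 0:
+                self.scale = max(self.scale / 2.0, self.min_scale)
+                self._overflows_left = self.hysteresis
+        else:
+            self._good_steps += 1
+            if self._good_steps % self.scale_window == 0:
+                self.scale *= 2.0
+```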
+ + + +- **base_shapes_file**: str + + Default = None + + Path to the base shapes to save to/load from + + + +- **mup_init_scale**: float + + Default = 1.0 + + Initialization scale: All the parameters are multiplied by this value + + + +- **mup_attn_temp**: float + + Default = 1.0 + + Attention temperature: Reciprocal of the multiplier applied to the input to attention softmax + + + +- **mup_output_temp**: float + + Default = 1.0 + + Output temperature: Reciprocal of the multiplier applied to the input to softmax that + produces the distribution over output tokens. + + + +- **mup_embedding_mult**: float + + Default = 1.0 + + Scalar by which we multiply the output of the embedding layer + + + +- **mup_rp_embedding_mult**: float + + Default = 1.0 + + Scalar by which we multiply vectors representing relative position + + + +- **mup_width_scale**: int + + Default = 2 + + What to scale width by when creating the delta model for mup + + + +## NeoXArgsDeepspeedConfig + +Args for deepspeed config + Every argument included here will be included in deepspeed config json + As of Mar 8 2023, up to date compared to https://www.deepspeed.ai/docs/config-json/ + + + +- **deepspeed**: bool + + Default = True + + boolean flag to enable DeepSpeed (Always True) + + + +- **train_batch_size**: int + + Default = None + + The effective training batch size. This is the amount of data samples that leads to one step of model update. train_batch_size is aggregated by the batch size that a single GPU processes in one forward/backward pass (a.k.a., train_step_batch_size), the gradient accumulation steps (a.k.a., gradient_accumulation_steps), and the number of GPUs. + + + +- **train_micro_batch_size_per_gpu**: int + + Default = None + + Batch size to be processed by one GPU in one step (without gradient accumulation). When specified, gradient_accumulation_steps is automatically calculated using train_batch_size and number of GPUs. Should not be concurrently specified with gradient_accumulation_steps in the configuration JSON. + + + +- **gradient_accumulation_steps**: int + + Default = 1 + + Number of training steps to accumulate gradients before averaging and applying them. This feature is sometimes useful to improve scalability since it results in less frequent communication of gradients between steps. Another impact of this feature is the ability to train with larger batch sizes per GPU. When specified, train_step_batch_size is automatically calculated using train_batch_size and number of GPUs. Should not be concurrently specified with train_step_batch_size in the configuration JSON. + + + +- **optimizer**: dict + + Default = None + + dict containing the keys type and params + + type: The optimizer name. DeepSpeed natively supports Adam, AdamW, OneBitAdam, Lamb, and OneBitLamb optimizers (See here for details) and will import other optimizers from torch. + + params: Dictionary of parameters to instantiate optimizer. The parameter names must match the optimizer constructor signature (e.g., for Adam). + + + +- **scheduler**: dict + + Default = None + + dict containing the keys type and params + + type: The scheduler name. See here (https://deepspeed.readthedocs.io/en/latest/schedulers.html) for list of support schedulers. + + params: Dictionary of parameters to instantiate scheduler. The parameter names should match scheduler constructor signature. 
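+
+    The batch-size arguments documented above are tied together by a simple identity, shown in the sketch below. The GPU count here means the number of data-parallel replicas, which is an assumption worth keeping in mind when model or pipeline parallelism reduces the data-parallel degree.
+
+```python
+# How the DeepSpeed batch-size arguments documented above combine (illustrative).
+train_micro_batch_size_per_gpu = 4   # samples per GPU per forward/backward pass
+gradient_accumulation_steps = 2      # micro-batches accumulated before an update
+data_parallel_replicas = 8           # assumption: GPUs / (model * pipe parallel degree)
+
+train_batch_size = (train_micro_batch_size_per_gpu
+                    * gradient_accumulation_steps
+                    * data_parallel_replicas)
+print(train_batch_size)  # 64 samples contribute to each optimizer step
+```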
+ + + +- **fp32_allreduce**: bool + + Default = False + + During gradient averaging perform allreduce with 32 bit values + + + +- **prescale_gradients**: bool + + Default = False + + Scale gradients before doing allreduce + + + +- **gradient_predivide_factor**: float + + Default = 1.0 + + Before gradient averaging predivide gradients by a specified factor, can sometimes help with fp16 stability when scaling to large numbers of GPUs + + + +- **sparse_gradients**: bool + + Default = False + + Enable sparse compression of torch.nn.Embedding gradients. + + + +- **fp16**: dict + + Default = None + + Configuration for using mixed precision/FP16 training that leverages NVIDIA’s Apex package. + + Dictionary options as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#fp16-training-options + + + +- **bf16**: dict + + Default = None + + Configuration for using bfloat16 floating-point format as an alternative to FP16. BFLOAT16 requires hardware support (e.g., NVIDIA A100). Dictionary options as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#bfloat16-training-options + + + +- **amp**: dict + + Default = None + + Configuration for using automatic mixed precision (AMP) training that leverages NVIDIA’s Apex AMP package. + + Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#automatic-mixed-precision-amp-training-options + + + +- **gradient_clipping**: float + + Default = 1.0 + + Enable gradient clipping with provided value + + + +- **zero_optimization**: dict + + Default = None + + Configuration for using ZeRO optimization. + + Multi-level dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#zero-optimization-options + + + +- **curriculum_learning**: dict + + Default = None + + + + + +- **curriculum_seqlen**: int + + Default = 0 + + Internal var for tracking the current seqlen + + + +- **steps_per_print**: int + + Default = 10 + + Print train loss every N steps. + + + +- **wall_clock_breakdown**: bool + + Default = False + + Enable timing of the latency of forward/backward/update training phases. + + + +- **dump_state**: bool + + Default = False + + Print out state information of DeepSpeed object after initialization. + + + +- **flops_profiler**: dict + + Default = None + + Configuration for using FLOPS profiler. + + Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#flops-profiler + + + +- **communication_data_type**: bool + + Default = None + + During gradient averaging, perform communication with selected data type. By default it will be determined by selected regime + + + +- **autotuning**: dict + + Default = None + + Configuration for using autotuning. + + Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#autotuning + + + +- **activation_checkpointing**: dict + + Default = None + + Configuration for using activation checkpointing. + + Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#activation-checkpointing + + + +- **sparse_attention**: dict + + Default = None + + Configuration for using sparse attention. + + Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#sparse-attention + + + +- **data_efficiency**: dict + + Default = None + + Configuration for using data efficiency. 
+ + Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#data-efficiency + + + +- **tensorboard**: dict + + Default = None + + Configuration for using tensorboard. + + Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#monitoring-module-tensorboard-wandb-csv + + + +- **wandb**: dict + + Default = None + + Configuration for using wandb. + + + +- **csv_monitor**: dict + + Default = None + + Configuration for using csv_monitor. + + + +- **elasticity**: dict + + Default = None + + Configuration for using elastic training. + + Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#elastic-training-config-v01-and-v02 + + + +- **comms_logger**: dict + + Default = None + + Configuration for using communication logger. + + Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#communication-logging + + + +- **compression_training**: dict + + Default = None + + Configuration for using compression training. + + Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#compression + + + +- **checkpoint**: dict + + Default = None + + Configuration for using checkpointing. + + Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#checkpoint-options + + + +- **data_types**: dict + + Default = None + + Configuration for using data types. + + Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#data-type-options + + + +- **deepspeed_extra_args**: dict + + Default = None + + Dictionary of extra arguments to be included in the yaml config file. This can be used for any argument not included in the above list. + + + +## NeoXArgsDeepspeedRunner + +Args for deepspeed runner (deepspeed.launcher.runner). + Every argument included here will be passed as command line argument to deepspeed.launcher.runner + + + +- **hostfile**: str + + Default = None + + list of hostnames / ssh aliases and the number of GPUs per host + + example file contents: + worker-1 slots=4 + worker-2 slots=4 + 127.0.0 slots=4 + 127.0.1 slots=4 + + + +- **include**: str + + Default = None + + Specify hardware resources to use during execution. String format is `NODE_SPEC[@NODE_SPEC ...]` where `NODE_SPEC=NAME[:SLOT[,SLOT ...]]`. If `:SLOT` is omitted, include all slots on that host. Example: `"worker-0@worker-1:0,2"` will use all slots. on `worker-0` and slots `[0, 2]` on `worker-1`. + + + +- **exclude**: str + + Default = None + + Specify hardware resources to NOT use during execution. Same format as include + + + +- **num_nodes**: int + + Default = -1 + + Total number of worker nodes to run on, this will use the top N hosts from the given hostfile. -1 will use all. + + + +- **num_gpus**: int + + Default = None + + Max number of GPUs to use on each node, will use [0:N) GPU ids on each node. None / not specifying a value will use all. + + + +- **master_port**: int + + Default = 29500 + + Port used by PyTorch distributed for communication during training. + + + +- **master_addr**: str + + Default = None + + IP address of node 0, will be inferred via 'hostname -I' if not specified. + + + +- **launcher**: typing.Literal['pdsh', 'openmpi', 'mvapich', 'slurm'] + + Default = pdsh + + Launcher backend for multi-node training. Options currently include PDSH, OpenMPI, MVAPICH. 
+ + + +- **force_multi**: bool + + Default = False + + Force multi-node training even if only one node is specified. + + + +- **detect_nvlink_pairs**: bool + + Default = False + + If true, autodetects nvlink pairs and remaps cuda visible devices to place them next to each other. This is an Eleuther addition to deepspeed, and should speed up model parallel training on setups with nvlink pairs when mp=2. + + + +- **autotuning_run**: str + + Default = None + + Either "tune", "run", or `None`. + + + +- **no_ssh_check**: bool + + Default = False + + If true, overrides the default check where DeepSpeed confirms that the headnode is accessible via ssh. + + + +- **comment**: str + + Default = None + + Adds a `--comment` to the DeepSpeed launch command. In DeeperSpeed this is passed on to the SlurmLauncher as well. Sometimes necessary for cluster rules, or so I've heard. + + + +- **account**: str + + Default = None + + Adds a `--account` to the DeepSpeed launch command. In DeeperSpeed this is passed on to the SlurmLauncher as well. Sometimes necessary for cluster rules, or so I've heard. + diff --git a/configs/pythia/1-4B.yml b/configs/pythia/1-4B.yml new file mode 100755 index 000000000..bfb8d4bc7 --- /dev/null +++ b/configs/pythia/1-4B.yml @@ -0,0 +1,85 @@ +{ + "pipe_parallel_size": 1, + "model_parallel_size": 1, + + "num_layers": 24, + "hidden_size": 2048, + "num_attention_heads": 16, + "seq_length": 2048, + "max_position_embeddings": 2048, + "pos_emb": "rotary", + "rotary_pct": 0.25, + "no_weight_tying": true, + "gpt_j_residual": true, + "output_layer_parallelism": "column", + + "attention_config": [[["flash"], 24]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": true, + + "init_method": "small_init", + "output_layer_init_method": "wang_init", + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0002, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + "min_lr": 0.00002, + + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 500000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 500000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + "train_micro_batch_size_per_gpu": 16, + "gas": 1, + "data_impl": "mmap", + "num_workers": 1, + + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 12, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + "train_iters": 143000, + "lr_decay_iters": 143000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 1000, + "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], + "eval_interval": 143000, + "eval_iters": 10, + + + "log_interval": 10, + "steps_per_print": 10, + "wall_clock_breakdown": true, + "tokenizer_type": "HFTokenizer" + } diff --git a/configs/pythia/12B.yml b/configs/pythia/12B.yml new file mode 100755 index 000000000..21b67521d --- /dev/null +++ b/configs/pythia/12B.yml @@ -0,0 +1,84 @@ +{ + "pipe_parallel_size": 1, + "model_parallel_size": 4, + + "num_layers": 36, + "hidden_size": 5120, + "num_attention_heads": 40, + "seq_length": 2048, + "max_position_embeddings": 2048, + "norm": "layernorm", + "pos_emb": "rotary", + "rotary_pct": 0.25, + "no_weight_tying": true, + 
"gpt_j_residual": true, + "output_layer_parallelism": "column", + + "attention_config": [[["flash"], 36]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": true, + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00012, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + "min_lr": 0.000012, + + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1260000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1260000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + "train_micro_batch_size_per_gpu": 8, + "gradient_accumulation_steps": 2, + "data_impl": "mmap", + + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 12, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + "train_iters": 143000, + "lr_decay_iters": 143000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 1000, + "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], + "eval_interval": 143000, + "eval_iters": 10, + + "log_interval": 10, + "steps_per_print": 10, + "wall_clock_breakdown": true, + + "log_grad_norm": true, + + "tokenizer_type": "HFTokenizer" +} diff --git a/configs/pythia/14M.yml b/configs/pythia/14M.yml new file mode 100644 index 000000000..41b11fcca --- /dev/null +++ b/configs/pythia/14M.yml @@ -0,0 +1,98 @@ +{ + # parallelism settings + "pipe-parallel-size": 0, + "model-parallel-size": 1, + + # model settings + "num-layers": 6, + "hidden-size": 128, + "num-attention-heads": 4, + "seq-length": 2048, + "max-position-embeddings": 2048, + "pos-emb": "rotary", + "rotary-pct": 0.25, + "no-weight-tying": true, + "gpt-j-residual": true, + "output-layer-parallelism": "column", + + "attention-config": [[["flash"], 6]], + + "scaled-upper-triang-masked-softmax-fusion": true, + "bias-gelu-fusion": true, + + # init methods + "init_method": "small_init", + "output_layer_init_method": "wang_init", + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.001, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + "min_lr": 0.0001, + + "zero_optimization": { + "stage": 0, + "allgather_partitions": true, + "allgather_bucket_size": 50000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 50000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + # batch size (trained on 32 gpus) + "train_micro_batch_size_per_gpu": 32, + "gas": 1, + "data-impl": "mmap", + "num_workers": 4, + + # activation checkpointing + "checkpoint-activations": false, #true, + "checkpoint-num-layers": 1, + "partition-activations": false, #true, + "synchronize-each-layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight-decay": 0.1, + "hidden-dropout": 0, + "attention-dropout": 0, + + # precision settings + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 12, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + "train-iters": 143000, + "lr-decay-iters": 143000, + "distributed-backend": "nccl", + "lr-decay-style": "cosine", + "warmup": 0.01, + "checkpoint-factor": 1000, + "extra-save-iters": [0,1,2,4,8,16,32,64,128,256,512], + "eval-interval": 100000, + "eval-iters": 10, + + 
"log-interval": 10, + "steps_per_print": 10, + "wall_clock_breakdown": true, + + "train-data-paths": ["/mnt/ssd-2/pile_deduped/pile_20B_tokenizer_text_document"], + "valid-data-paths": ["/mnt/ssd-2/pile_deduped/pile_20B_tokenizer_text_document"], + "test-data-paths": ["/mnt/ssd-2/pile_deduped/pile_20B_tokenizer_text_document"], + + "tokenizer-type": "HFTokenizer", + "vocab-file": "/mnt/ssd-2/pile/20B_tokenizer.json" + +} diff --git a/configs/pythia/160M.yml b/configs/pythia/160M.yml new file mode 100755 index 000000000..2f6abdef5 --- /dev/null +++ b/configs/pythia/160M.yml @@ -0,0 +1,85 @@ +{ + "pipe_parallel_size": 1, + "model_parallel_size": 1, + + "num_layers": 12, + "hidden_size": 768, + "num_attention_heads": 12, + "seq_length": 2048, + "max_position_embeddings": 2048, + "pos_emb": "rotary", + "rotary_pct": 0.25, + "no_weight_tying": true, + "gpt_j_residual": true, + "output_layer_parallelism": "column", + + "attention_config": [[["flash"], 12]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": true, + + "init_method": "small_init", + "output_layer_init_method": "wang_init", + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0006, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + "min_lr": 0.00006, + + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 500000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 500000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + "train_micro_batch_size_per_gpu": 32, + "gas": 1, + "data_impl": "mmap", + "num_workers": 1, + + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 12, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + "train_iters": 143000, + "lr_decay_iters": 143000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 1000, + "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], + "eval_interval": 143000, + "eval_iters": 10, + + "log_interval": 10, + "steps_per_print": 10, + "wall_clock_breakdown": true, + + "tokenizer_type": "HFTokenizer" +} diff --git a/configs/pythia/1B.yml b/configs/pythia/1B.yml new file mode 100755 index 000000000..78fc28946 --- /dev/null +++ b/configs/pythia/1B.yml @@ -0,0 +1,86 @@ +{ + "pipe_parallel_size": 1, + "model_parallel_size": 1, + + "num_layers": 16, + "hidden_size": 2048, + "num_attention_heads": 8, + "seq_length": 2048, + "max_position_embeddings": 2048, + "pos_emb": "rotary", + "rotary_pct": 0.25, + "no_weight_tying": true, + "gpt_j_residual": true, + "output_layer_parallelism": "column", + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": true, + + "init_method": "small_init", + "output_layer_init_method": "wang_init", + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00025, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + "min_lr": 0.000025, + + "zero_optimization": { + "stage": 0, + "allgather_partitions": true, + "allgather_bucket_size": 500000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 500000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + "fp16": { + "enabled": true, + "type": "bfloat16", + "auto_cast": true, + "loss_scale": 0, + 
"loss_scale_window": 1000, + "initial_scale_power": 12, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + "fp32_allreduce": true, + + "train_micro_batch_size_per_gpu": 4, + "gradient_accumulation_steps": 4, + "data_impl": "mmap", + "num_workers": 1, + + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + "train_iters": 143000, + "lr_decay_iters": 143000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 1000, + "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], + "eval_interval": 143000, + "eval_iters": 10, + + "log_interval": 10, + "steps_per_print": 10, + "wall_clock_breakdown": true, + + "tokenizer_type": "HFTokenizer" +} diff --git a/configs/pythia/2-8B.yml b/configs/pythia/2-8B.yml new file mode 100755 index 000000000..04427e9fd --- /dev/null +++ b/configs/pythia/2-8B.yml @@ -0,0 +1,87 @@ +{ + "pipe_parallel_size": 1, + "model_parallel_size": 1, + + "num_layers": 32, + "hidden_size": 2560, + "num_attention_heads": 32, + "seq_length": 2048, + "max_position_embeddings": 2048, + "pos_emb": "rotary", + "rotary_pct": 0.25, + "no_weight_tying": true, + "gpt_j_residual": true, + "output_layer_parallelism": "column", + + "attention_config": [[["flash"], 32]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": true, + + "init_method": "small_init", + "output_layer_init_method": "wang_init", + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00016, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + "min_lr": 0.000016, + + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 500000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 500000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + "train_micro_batch_size_per_gpu": 8, + "gradient_accumulation_steps": 2, + "data_impl": "mmap", + "num_workers": 1, + + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 12, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + "train_iters": 143000, + "lr_decay_iters": 143000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 1000, + "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], + "eval_interval": 40000, + "eval_iters": 10, + + "log_grad_norm": true, + + "log_interval": 10, + "steps_per_print": 10, + "wall_clock_breakdown": true, + + "tokenizer_type": "HFTokenizer" +} diff --git a/configs/pythia/31M.yml b/configs/pythia/31M.yml new file mode 100644 index 000000000..82f8ed343 --- /dev/null +++ b/configs/pythia/31M.yml @@ -0,0 +1,97 @@ +{ + # parallelism settings + "pipe-parallel-size": 0, + "model-parallel-size": 1, + + # model settings + "num-layers": 6, + "hidden-size": 256, + "num-attention-heads": 8, + "seq-length": 2048, + "max-position-embeddings": 2048, + "pos-emb": "rotary", + "rotary-pct": 0.25, + "no-weight-tying": true, + "gpt-j-residual": true, + "output-layer-parallelism": "column", + + "attention-config": [[["flash"], 6]], + + "scaled-upper-triang-masked-softmax-fusion": true, + 
"bias-gelu-fusion": true, + + # init methods + "init_method": "small_init", + "output_layer_init_method": "wang_init", + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.001, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + "min_lr": 0.0001, + + "zero_optimization": { + "stage": 0, + "allgather_partitions": true, + "allgather_bucket_size": 500000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 500000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + # batch size (trained on 32 gpus) + "train_micro_batch_size_per_gpu": 32, + "gas": 1, + "data-impl": "mmap", + "num_workers": 2, + + # activation checkpointing + "checkpoint-activations": false, + "checkpoint-num-layers": 1, + "partition-activations": false, + "synchronize-each-layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight-decay": 0.1, + "hidden-dropout": 0, + "attention-dropout": 0, + + # precision settings + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 12, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + "train-iters": 143000, + "lr-decay-iters": 143000, + "distributed-backend": "nccl", + "lr-decay-style": "cosine", + "warmup": 0.01, + "checkpoint-factor": 1000, + "extra-save-iters": [0,1,2,4,8,16,32,64,128,256,512], + "eval-interval": 100000, + "eval-iters": 10, + "log-interval": 10, + "steps_per_print": 10, + "wall_clock_breakdown": true, + + "train-data-paths": ["/mnt/ssd-2/pile_deduped/pile_20B_tokenizer_text_document"], + "valid-data-paths": ["/mnt/ssd-2/pile_deduped/pile_20B_tokenizer_text_document"], + "test-data-paths": ["/mnt/ssd-2/pile_deduped/pile_20B_tokenizer_text_document"], + + "tokenizer-type": "HFTokenizer", + "vocab-file": "/mnt/ssd-2/pile/20B_tokenizer.json" + +} diff --git a/configs/pythia/410M.yml b/configs/pythia/410M.yml new file mode 100755 index 000000000..95afe9cd9 --- /dev/null +++ b/configs/pythia/410M.yml @@ -0,0 +1,85 @@ +{ + "pipe_parallel_size": 1, + "model_parallel_size": 1, + + "num_layers": 24, + "hidden_size": 1024, + "num_attention_heads": 16, + "seq_length": 2048, + "max_position_embeddings": 2048, + "pos_emb": "rotary", + "rotary_pct": 0.25, + "no_weight_tying": true, + "gpt_j_residual": true, + "output_layer_parallelism": "column", + + "attention_config": [[["flash"], 24]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": true, + + "init_method": "small_init", + "output_layer_init_method": "wang_init", + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0003, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + "min_lr": 0.00003, + + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 500000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 500000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + "train_micro_batch_size_per_gpu": 32, + "gas": 1, + "data_impl": "mmap", + "num_workers": 1, + + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 12, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + "train_iters": 143000, + "lr_decay_iters": 143000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + 
"checkpoint_factor": 1000, + "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], + "eval_interval": 143000, + "eval_iters": 10, + + "log_interval": 10, + "steps_per_print": 10, + "wall_clock_breakdown": true, + + "tokenizer_type": "HFTokenizer" +} diff --git a/configs/pythia/6-9B.yml b/configs/pythia/6-9B.yml new file mode 100755 index 000000000..869129f30 --- /dev/null +++ b/configs/pythia/6-9B.yml @@ -0,0 +1,84 @@ +{ + "pipe_parallel_size": 1, + "model_parallel_size": 2, + + "num_layers": 32, + "hidden_size": 4096, + "num_attention_heads": 32, + "seq_length": 2048, + "max_position_embeddings": 2048, + "norm": "layernorm", + "pos_emb": "rotary", + "rotary_pct": 0.25, + "no_weight_tying": true, + "gpt_j_residual": true, + "output_layer_parallelism": "column", + + "attention_config": [[["flash"], 32]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": true, + + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00012, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + + "min_lr": 0.000012, + + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1260000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1260000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + "train_micro_batch_size_per_gpu": 8, + "gradient_accumulation_steps": 2, + "data_impl": "mmap", + + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 12, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + "train_iters": 143000, + "lr_decay_iters": 143000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 1000, + "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], + "eval_interval": 143000, + "eval_iters": 10, + + "log_interval": 10, + "steps_per_print": 10, + "wall_clock_breakdown": true, + + "tokenizer_type": "HFTokenizer" +} diff --git a/configs/pythia/70M.yml b/configs/pythia/70M.yml new file mode 100755 index 000000000..a58553b4e --- /dev/null +++ b/configs/pythia/70M.yml @@ -0,0 +1,85 @@ +{ + "pipe_parallel_size": 1, + "model_parallel_size": 1, + + "num_layers": 6, + "hidden_size": 512, + "num_attention_heads": 8, + "seq_length": 2048, + "max_position_embeddings": 2048, + "pos_emb": "rotary", + "rotary_pct": 0.25, + "no_weight_tying": true, + "gpt_j_residual": true, + "output_layer_parallelism": "column", + + "attention_config": [[["flash"], 6]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": true, + + "init_method": "small_init", + "output_layer_init_method": "wang_init", + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.001, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + "min_lr": 0.0001, + + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 500000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 500000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + "train_micro_batch_size_per_gpu": 32, + "gas": 1, + "data_impl": "mmap", + "num_workers": 1, + + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + "gradient_clipping": 1.0, + 
"weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 12, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + "train_iters": 143000, + "lr_decay_iters": 143000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 1000, + "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], + "eval_interval": 100000, + "eval_iters": 10, + + "log_interval": 10, + "steps_per_print": 10, + "wall_clock_breakdown": true, + + "tokenizer_type": "HFTokenizer" +} diff --git a/configs/slurm_125M.yml b/configs/slurm_125M.yml new file mode 100644 index 000000000..2ac60e534 --- /dev/null +++ b/configs/slurm_125M.yml @@ -0,0 +1,66 @@ +{ + "pipe_parallel_size": 1, + "model_parallel_size": 1, + "num_layers": 12, + "hidden_size": 768, + "num_attention_heads": 12, + "seq_length": 2048, + "max_position_embeddings": 2048, + "norm": "layernorm", + "pos_emb": "rotary", + "no_weight_tying": true, + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": true, + "rope_fusion": false, + "layernorm_fusion": false, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0006, + "betas": [0.9, 0.999], + "eps": 1.0e-8 + } + }, + "zero_optimization": { + "stage": 0, + "allgather_partitions": true, + "allgather_bucket_size": 500000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 500000000, + "contiguous_gradients": true + }, + "train_micro_batch_size_per_gpu": 4, + "data_impl": "mmap", + "split": "949,50,1", + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + "gradient_clipping": 1.0, + "weight_decay": 0.0, + "hidden_dropout": 0.0, + "attention_dropout": 0.0, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "train_iters": 320000, + "lr_decay_iters": 320000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, + "log_interval": 100, + "steps_per_print": 10, + "keep_last_n_checkpoints": 4, + "wall_clock_breakdown": true, + "launcher": "slurm", + "deepspeed_slurm": true, + "comment": "neox" +} diff --git a/configs/slurm_local.json b/configs/slurm_local.json new file mode 100644 index 000000000..36e16089b --- /dev/null +++ b/configs/slurm_local.json @@ -0,0 +1,11 @@ +{ + "vocab-file": "data/gpt2-vocab.json", + "merge-file": "data/gpt2-merges.txt", + "save": "checkpoints", + "checkpoint_validation_with_forward_pass": false, + "tensorboard-dir": "tensorboard", + "log-dir": "logs", + "use_wandb": true, + "wandb_host": "https://api.wandb.ai", + "wandb_project": "neox" +} diff --git a/configs/slurm_local.yml b/configs/slurm_local.yml new file mode 100644 index 000000000..1a2b73aba --- /dev/null +++ b/configs/slurm_local.yml @@ -0,0 +1,12 @@ +{ + "data_path": "data/enwik8/enwik8_text_document", + "vocab_file": "data/gpt2-vocab.json", + "merge_file": "data/gpt2-merges.txt", + "save": "checkpoints", + "checkpoint_validation_with_forward_pass": false, + "tensorboard_dir": "tensorboard", + "log_dir": "logs", + "use_wandb": true, + "wandb_host": "https://api.wandb.ai", + "wandb_project": "neox" +} diff --git a/configs/sparse.yml b/configs/sparse.yml new file mode 100644 index 000000000..7251c88b7 --- /dev/null +++ b/configs/sparse.yml @@ -0,0 +1,15 @@ +# Add 
this to your config for sparse attention every other layer +{ + "attention_config": [[["local", "global"], "all"]], + + # sparsity config: + # (these are the defaults for local sliding window sparsity, training will work without this here, but it's left in for + # illustrative purposes) + # see https://www.deepspeed.ai/tutorials/sparse-attention/#how-to-config-sparsity-structures for + # more detailed config instructions and available parameters + + "sparsity_config": { + "block": 16, # block size + "num_local_blocks": 32, + } +} diff --git a/configs/text_generation.yml b/configs/text_generation.yml new file mode 100644 index 000000000..5a49d61e5 --- /dev/null +++ b/configs/text_generation.yml @@ -0,0 +1,21 @@ +# Parameters used for text generation +# Make sure `load` is specified somewhere else +{ + # Text gen type: `input-file`, `unconditional` or `interactive` + "text_gen_type": "unconditional", + + # Params for all + "maximum_tokens": 102, + "prompt_end": "\n", + "temperature": 1.0, + "top_p": 0.0, + "top_k": 0, + "recompute": false, + + # `unconditional`: samples + "num_samples": 10, + + # input/output file + "sample_input_file": "sample_input.txt", + "sample_output_file": "sample_output.txt", +} From a55d69c400c53828d74f5addb57ca3569f4a6d09 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Tue, 13 Feb 2024 16:09:41 +0000 Subject: [PATCH 35/64] cleanup: remove llemma lr sched changes, re-add requirements/ folder --- megatron/checkpointing.py | 86 +++++++++---- megatron/learning_rates.py | 29 ++--- megatron/neox_arguments/neox_args.py | 12 +- megatron/training.py | 20 +-- requirements.txt | 116 ------------------ requirements/requirements-dev.txt | 7 ++ requirements/requirements-flashattention.txt | 1 + requirements/requirements-onebitadam.txt | 1 + requirements/requirements-s3.txt | 2 + requirements/requirements-sparseattention.txt | 1 + requirements/requirements-tensorboard.txt | 1 + requirements/requirements-wandb.txt | 1 + requirements/requirements.txt | 16 +++ 13 files changed, 105 insertions(+), 188 deletions(-) delete mode 100644 requirements.txt create mode 100644 requirements/requirements-dev.txt create mode 100644 requirements/requirements-flashattention.txt create mode 100644 requirements/requirements-onebitadam.txt create mode 100644 requirements/requirements-s3.txt create mode 100644 requirements/requirements-sparseattention.txt create mode 100644 requirements/requirements-tensorboard.txt create mode 100644 requirements/requirements-wandb.txt create mode 100644 requirements/requirements.txt diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index 74a0ec491..12b81e202 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -1,7 +1,7 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # This file is based on code by the authors denoted below and has been modified from its original version. # -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -27,8 +27,18 @@ import sys import numpy as np -import boto3 -import hf_transfer +try: + import boto3 +except ModuleNotFoundError: + print( + "For s3 checkpointing, please install boto3 either using requirements/requirements-s3.txt or https://github.com/boto/boto3" + ) +try: + import hf_transfer +except ModuleNotFoundError: + print( + "For s3 checkpointing, please install hf_transfer either using requirements/requirements-s3.txt or https://github.com/huggingface/hf_transfer" + ) import torch from glob import glob @@ -211,6 +221,8 @@ def save_ds_checkpoint(iteration, model, neox_args): f.write(config_data) else: json.dump(config_data, f) + + def multiprocessing_starmap(func, args, num_processes=None): """Wrapper to allow for re-usable multiprocessing pools with `spawn` context handling Args: @@ -219,8 +231,11 @@ def multiprocessing_starmap(func, args, num_processes=None): num_processes (int, optional): Number of processes to spawn. Defaults to `multiprocessing.cpu_count() - 1` """ import multiprocessing + num_processes = num_processes or (multiprocessing.cpu_count() - 1) - with multiprocessing.get_context("spawn").Pool(processes=num_processes) as process_pool: + with multiprocessing.get_context("spawn").Pool( + processes=num_processes + ) as process_pool: process_pool.starmap(func, args) process_pool.terminate() process_pool.join() @@ -247,7 +262,7 @@ def _upload( chunks in parallel (cannot exceed max_files). Defaults to 63 max_retries (int, optional): Number of retries for each chunk. Defaults to 5 """ - s3 = boto3.client('s3') + s3 = boto3.client("s3") bucket = s3_key.split("s3://")[1].split("/")[0] key = s3_key.split(bucket)[1].lstrip("/") @@ -298,24 +313,42 @@ def _upload( def upload_checkpoint(iteration, neox_args): - local_checkpoint_path = os.path.join(os.path.abspath(neox_args.save), get_checkpoint_tag(iteration)) - local_checkpoint_list = sorted(filter( - lambda x: os.path.isfile(x), - [str(p) for p in Path(local_checkpoint_path).rglob("*")], - )) + local_checkpoint_path = os.path.join( + os.path.abspath(neox_args.save), get_checkpoint_tag(iteration) + ) + local_checkpoint_list = sorted( + filter( + lambda x: os.path.isfile(x), + [str(p) for p in Path(local_checkpoint_path).rglob("*")], + ) + ) remote_checkpoint_path = os.path.join( - neox_args.s3_path, os.path.basename(neox_args.save), get_checkpoint_tag(iteration)) + neox_args.s3_path, + os.path.basename(neox_args.save), + get_checkpoint_tag(iteration), + ) remote_checkpoint_list = [ - os.path.join(remote_checkpoint_path, os.path.relpath(local_checkpoint, local_checkpoint_path)) + os.path.join( + remote_checkpoint_path, + os.path.relpath(local_checkpoint, local_checkpoint_path), + ) for local_checkpoint in local_checkpoint_list ] - inputs = zip(local_checkpoint_list, remote_checkpoint_list, [neox_args.s3_chunk_size] * len(local_checkpoint_list)) + inputs = zip( + local_checkpoint_list, + remote_checkpoint_list, + [neox_args.s3_chunk_size] * len(local_checkpoint_list), + ) - print_rank_0(f"[RANK {torch.distributed.get_rank()}] Uploading checkpoint `{local_checkpoint_path}` to `{remote_checkpoint_path}`...") + print_rank_0( + f"[RANK {torch.distributed.get_rank()}] Uploading checkpoint `{local_checkpoint_path}` to `{remote_checkpoint_path}`..." 
+ ) start = time.time() multiprocessing_starmap(_upload, inputs) total_time = time.time() - start - print_rank_0(f"[RANK {torch.distributed.get_rank()}] Uploaded checkpoint `{local_checkpoint_path}` to `{remote_checkpoint_path}` in {total_time:.2f}s") + print_rank_0( + f"[RANK {torch.distributed.get_rank()}] Uploaded checkpoint `{local_checkpoint_path}` to `{remote_checkpoint_path}` in {total_time:.2f}s" + ) def save_checkpoint(neox_args, iteration, model, optimizer, lr_scheduler): @@ -327,19 +360,19 @@ def save_checkpoint(neox_args, iteration, model, optimizer, lr_scheduler): raise ValueError("Must be using deepspeed to use neox") torch.distributed.barrier() + upload_to_s3 = torch.distributed.get_rank() == 0 and neox_args.s3_path is not None + if upload_to_s3: + upload_checkpoint(iteration, neox_args) + # Wait so everyone is done (necessary) + torch.distributed.barrier() if neox_args.keep_last_n_checkpoints is not None: delete_old_checkpoints(neox_args.save, neox_args.keep_last_n_checkpoints) # Wait so everyone is done (not necessary) torch.distributed.barrier() - upload_to_s3 = torch.distributed.get_rank() == 0 and neox_args.s3_path is not None - if upload_to_s3: - upload_checkpoint(iteration, neox_args) - # Wait so everyone is done (necessary) - torch.distributed.barrier() - + def load_checkpoint( neox_args, model, optimizer, lr_scheduler, inference=False, iteration=None ): @@ -386,9 +419,12 @@ def load_checkpoint( if neox_args.finetune: iteration = 0 else: - iteration = state_dict.get("iteration") or state_dict.get( - "total_iters" - ) # total_iters backward compatible with older checkpoints + if "iteration" in state_dict: + iteration = state_dict["iteration"] + else: + iteration = state_dict.get( + "total_iters" + ) # total_iters backward compatible with older checkpoints if iteration is None: raise ValueError( f"Unable to load iteration from checkpoint {checkpoint_name} with keys {state_dict.keys()}, exiting" diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index e68cb7a56..9db951aa0 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -1,7 +1,7 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # This file is based on code by the authors denoted below and has been modified from its original version. # -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -34,7 +34,6 @@ def __init__( decay_style, last_iter, min_lr=0.0, - decay_lr_to=None, use_checkpoint_lr_scheduler=True, override_lr_scheduler=False, use_mup=False, @@ -56,8 +55,6 @@ def __init__( assert not self.use_checkpoint_lr_scheduler, ( "both override and " "use-checkpoint are set." 
) - assert not (decay_lr_to and decay_style!="cosine") - self.decay_lr_to = decay_lr_to # Set the learning rate self.step(self.num_iters) @@ -74,21 +71,19 @@ def get_lr(self): num_iters_ = num_iters_ - self.warmup_iter if self.decay_style == "linear": - lr = self.start_lr * (self.end_iter - num_iters_) / self.end_iter + end_iter_ = self.end_iter - self.warmup_iter + lr = self.start_lr * (end_iter_ - num_iters_) / end_iter_ elif self.decay_style == "cosine": - half_period = self.end_iter - self.warmup_iter - lr = ( - self.start_lr * ( - self.decay_lr_to + - (1 - self.decay_lr_to) * - 0.5 * ( - math.cos(math.pi * num_iters_/half_period) + 1 - ) - ) - ) + end_iter_ = self.end_iter - self.warmup_iter + lr = self.min_lr + ( + (self.start_lr - self.min_lr) + / 2.0 + * (math.cos(math.pi * num_iters_ / end_iter_) + 1) + ) elif self.decay_style == "exponential": # exp(-0.693) = 1/2 - lr = self.start_lr * math.exp(-0.693 * num_iters_ / self.end_iter) + end_iter = self.end_iter - self.warmup_iter + lr = self.start_lr * math.exp(-0.693 * num_iters_ / end_iter) else: lr = self.start_lr return max(lr, self.min_lr) diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index 1d412cfab..0c80a9c23 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -470,21 +470,11 @@ class NeoXArgsLRScheduler(NeoXArgsTemplate): Minimum value for learning rate. The scheduler clips values below this threshold. """ - decay_lr_to: float = None - """ - If using cosine decay, the ratio of max lr that the lr is decayed to, prior to any clipping based on the value of `min_lr`. - """ - - warmup: float = None + warmup: float = 0.01 """ Percentage of total iterations to warmup on (.01 = 1 percent of all training iters). """ - warmup_iters: int = None - """ - Number of iterations to warm up for. Incompatible with `warmup`. - """ - override_lr_scheduler: bool = False """ Reset the values of the scheduler (learning rate,warmup iterations, minimum learning rate, maximum number of iterations, and decay style from input arguments and ignore values from checkpoints. Note that all the above values will be reset. diff --git a/megatron/training.py b/megatron/training.py index b28b24e7e..f821dc2c1 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -594,12 +594,7 @@ def get_learning_rate_scheduler(optimizer, neox_args): num_iters = neox_args.train_iters num_iters = max(1, num_iters) init_step = 0 - - assert not (neox_args.warmup_iters and neox_args.warmup) - if neox_args.warmup: - warmup_iter = neox_args.warmup * num_iters - elif neox_args.warmup_iters: - warmup_iter = neox_args.warmup_iters + warmup_iter = neox_args.warmup * num_iters lr_scheduler = AnnealingLR( optimizer, start_lr=neox_args.lr, @@ -608,7 +603,6 @@ def get_learning_rate_scheduler(optimizer, neox_args): decay_style=neox_args.lr_decay_style, last_iter=init_step, min_lr=neox_args.min_lr, - decay_lr_to=neox_args.decay_lr_to, use_checkpoint_lr_scheduler=neox_args.use_checkpoint_lr_scheduler, override_lr_scheduler=neox_args.override_lr_scheduler, use_mup=neox_args.use_mup, @@ -779,18 +773,6 @@ def train( valid_data_iterator, ): """Train the model function.""" - # evaluate on step 0 - prefix = "iteration 0" - evaluate_and_print_results( - neox_args=neox_args, - prefix=prefix, - forward_step_func=forward_step, - data_iterator=valid_data_iterator, - model=model, - iteration=0, - verbose=False, - timers=timers, - ) # Turn on training mode which enables dropout. 
model.train() diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index d00b0a8e1..000000000 --- a/requirements.txt +++ /dev/null @@ -1,116 +0,0 @@ -absl-py==1.4.0 -aiohttp==3.8.4 -aiosignal==1.3.1 -appdirs==1.4.4 -async-timeout==4.0.2 -attrs==23.1.0 -best-download==0.0.9 -boto3==1.28.22 -botocore==1.31.22 -certifi==2023.5.7 -chardet==5.1.0 -charset-normalizer==3.1.0 -click==8.1.4 -cmake==3.26.4 -colorama==0.4.6 -CPCargo @ git+https://github.com/samikama/CPCargo@efbf0a5f2ad893c0eee4caae6098001b74be62d8 -DataProperty==1.0.0 -datasets==2.13.1 -DeepSpeed @ git+https://github.com/EleutherAI/DeeperSpeed.git@new-fix#egg=deepspeed -dill==0.3.6 -docker-pycreds==0.4.0 -einops==0.6.1 -filelock==3.12.2 -flash-attn==2.3.3 -frozenlist==1.3.3 -fsspec==2023.6.0 -ftfy==6.1.1 -fused-kernels @ file:///fsx/hailey/math-lm/gpt-neox/megatron/fused_kernels -gitdb==4.0.10 -GitPython==3.1.32 -hf_transfer==0.1.3 -hjson==3.1.0 -huggingface-hub==0.16.4 -idna==3.4 -Jinja2==3.1.2 -jmespath==1.0.1 -joblib==1.3.1 -jsonlines==3.1.0 -lit==16.0.6 -lm-dataformat @ git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836 -lm-eval==0.3.0 -MarkupSafe==2.1.3 -mbstrdecoder==1.1.3 -mpmath==1.3.0 -multidict==6.0.4 -multiprocess==0.70.14 -networkx==3.1 -ninja==1.11.1 -nltk==3.8.1 -numexpr==2.8.4 -numpy==1.25.0 -nvidia-cublas-cu11==11.10.3.66 -nvidia-cuda-cupti-cu11==11.7.101 -nvidia-cuda-nvrtc-cu11==11.7.99 -nvidia-cuda-runtime-cu11==11.7.99 -nvidia-cudnn-cu11==8.5.0.96 -nvidia-cufft-cu11==10.9.0.58 -nvidia-curand-cu11==10.2.10.91 -nvidia-cusolver-cu11==11.4.0.1 -nvidia-cusparse-cu11==11.7.4.91 -nvidia-nccl-cu11==2.14.3 -nvidia-nvtx-cu11==11.7.91 -openai==0.27.8 -packaging==23.1 -pandas==2.0.3 -pathtools==0.1.2 -pathvalidate==3.0.0 -portalocker==2.7.0 -protobuf==4.23.4 -psutil==5.9.5 -py-cpuinfo==9.0.0 -pyarrow==12.0.1 -pybind11==2.10.4 -pycountry==22.3.5 -pydantic==1.10.11 -pytablewriter==1.0.0 -python-dateutil==2.8.2 -pytz==2023.3 -PyYAML==6.0 -regex==2023.6.3 -rehash==1.0.1 -requests==2.31.0 -rouge-score==0.1.2 -s3transfer==0.6.1 -sacrebleu==1.5.0 -safetensors==0.3.1 -scikit-learn==1.3.0 -scipy==1.11.1 -sentencepiece==0.1.99 -sentry-sdk==1.28.1 -setproctitle==1.3.2 -six==1.16.0 -smmap==5.0.0 -sqlitedict==2.1.0 -sympy==1.12 -tabledata==1.3.1 -tcolorpy==0.1.3 -threadpoolctl==3.1.0 -tiktoken==0.4.0 -tokenizers==0.13.3 -torch==2.0.1 -tqdm==4.65.0 -tqdm-multiprocess==0.0.11 -transformers==4.31.0 -triton==2.0.0 -typepy==1.3.1 -typing_extensions==4.7.1 -tzdata==2023.3 -ujson==5.8.0 -urllib3==1.26.16 -wandb==0.15.5 -watchdog==3.0.0 -wcwidth==0.2.6 -xxhash==3.2.0 -yarl==1.9.2 -zstandard==0.21.0 \ No newline at end of file diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt new file mode 100644 index 000000000..6c58478f3 --- /dev/null +++ b/requirements/requirements-dev.txt @@ -0,0 +1,7 @@ +autopep8>=1.5.6 +clang-format>=13.0.1 +pre-commit>=2.17.0 +pytest>=6.2.3 +pytest-cov>=2.11.1 +pytest-forked>=1.3.0 +pytest-xdist diff --git a/requirements/requirements-flashattention.txt b/requirements/requirements-flashattention.txt new file mode 100644 index 000000000..8cebdaa50 --- /dev/null +++ b/requirements/requirements-flashattention.txt @@ -0,0 +1 @@ +flash-attn==2.2.1 diff --git a/requirements/requirements-onebitadam.txt b/requirements/requirements-onebitadam.txt new file mode 100644 index 000000000..349e3b39a --- /dev/null +++ b/requirements/requirements-onebitadam.txt @@ -0,0 +1 @@ +cupy-cuda111>=8.6.0 diff --git a/requirements/requirements-s3.txt 
b/requirements/requirements-s3.txt new file mode 100644 index 000000000..fbf031fe4 --- /dev/null +++ b/requirements/requirements-s3.txt @@ -0,0 +1,2 @@ +boto3 +hf-transfer>=0.1.3 diff --git a/requirements/requirements-sparseattention.txt b/requirements/requirements-sparseattention.txt new file mode 100644 index 000000000..09386fdcb --- /dev/null +++ b/requirements/requirements-sparseattention.txt @@ -0,0 +1 @@ +triton==2.0.0.dev20221202 diff --git a/requirements/requirements-tensorboard.txt b/requirements/requirements-tensorboard.txt new file mode 100644 index 000000000..6d5967364 --- /dev/null +++ b/requirements/requirements-tensorboard.txt @@ -0,0 +1 @@ +tensorboard==2.13.0 diff --git a/requirements/requirements-wandb.txt b/requirements/requirements-wandb.txt new file mode 100644 index 000000000..1df18b051 --- /dev/null +++ b/requirements/requirements-wandb.txt @@ -0,0 +1 @@ +wandb>=0.10.28 diff --git a/requirements/requirements.txt b/requirements/requirements.txt new file mode 100644 index 000000000..6247ece3d --- /dev/null +++ b/requirements/requirements.txt @@ -0,0 +1,16 @@ +best_download +git+https://github.com/EleutherAI/DeeperSpeed.git@b9260436e7da3e297fc6bedfd27d9e69fbba6f5c#egg=deepspeed +ftfy>=6.0.1 +git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836 +huggingface_hub>=0.11.0 +jinja2==3.1.3 +lm_eval>=0.4.0,<=0.4.1 +mpi4py>=3.0.3 +numpy>=1.22.0 +pybind11>=2.6.2 +regex +sentencepiece +six +tiktoken>=0.1.2 +tokenizers>=0.12.1 +transformers==4.36.0 From a521a82cbc6a896ffb658e3e30894717d75e8fb9 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Tue, 13 Feb 2024 16:17:00 +0000 Subject: [PATCH 36/64] docs: add explanation of intermediate_size behavior --- megatron/neox_arguments/neox_args.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index 0c80a9c23..dceeaa073 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -107,8 +107,11 @@ class NeoXArgsModel(NeoXArgsTemplate): intermediate_size: int = None """ - Transformer intermediate size. + Transformer intermediate size. Currently only used for "mlp_type": "llama". + + If not passed, will be set to a reasonable default. """ + num_attention_heads: int = None """ Number of transformer attention heads. Used for number of query heads only, if num_kv_heads is set. 
From 4df0c4e8a2c5c1341186171f078a5dcf7b9d7609 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Tue, 13 Feb 2024 16:43:03 +0000 Subject: [PATCH 37/64] args: add argument checking for num_kv_heads, clean up usage syntax --- megatron/neox_arguments/arguments.py | 10 ++++++++++ megatron/neox_arguments/neox_args.py | 15 +++++++++------ 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py index 2aadd9670..0e1aec2d6 100644 --- a/megatron/neox_arguments/arguments.py +++ b/megatron/neox_arguments/arguments.py @@ -1034,6 +1034,16 @@ def calculate_derived(self): # Can't have a default value as an empty dict so need to set it here self.update_value("sparsity_config", {}) + # Multi-query or grouped-query attention settings + if self.num_kv_heads is not None: + # need KV heads <= query heads, and KV heads dividing query heads evenly + assert (self.num_attention_heads % self.num_kv_heads == 0), "num_kv_heads must evenly divide num_attention_heads and be no greater than it" + + if self.num_kv_heads < self.num_attention_heads: + # GQA / MQA not compatible with sparse attention configurations + assert not self.sparsity_config, "Sparse attention not compatible with GQA or MQA" + assert all(attn_type == "flash" for attn_type in self.attention_config), "GQA / MQA currently only compatible with Flash Attention 2.0" + # Adding equal dataset weights if none are provided if self.train_data_paths and (self.train_data_weights is None): self.train_data_weights = [1.0] * len(self.train_data_paths) diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index dceeaa073..4eff24e77 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -114,17 +114,20 @@ class NeoXArgsModel(NeoXArgsTemplate): num_attention_heads: int = None """ - Number of transformer attention heads. Used for number of query heads only, if num_kv_heads is set. + Number of transformer attention heads. + + If num_kv_heads is set, will control only number of query heads. """ num_kv_heads: int = None """ - Number of transformer key/value attention heads. Need not be set if using attention_type `multihead` or `multiquery`. - """ + Number of transformer key/value attention heads. - attention_type: Literal["multihead", "multiquery", "groupedquery"] = "multihead" - """ - Whether to use multi-head, multi-query (https://arxiv.org/pdf/1911.02150.pdf), or grouped-query (https://arxiv.org/pdf/2305.13245.pdf) attention. + If set to None or the same value as num_attention_heads, will perform multi-head attention (MHA). + If set to < num_attention_heads but > 1, will perform grouped-query attention (GQA) (https://arxiv.org/pdf/2305.13245.pdf) + If set to 1, will perform multi-query attention. + + Must be < num_attention_heads and divide num_attention_heads evenly. 
""" seq_length: int = None From beb66d436ee1fa6b573c925cf1ad87d4a044030e Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Tue, 13 Feb 2024 18:40:49 +0000 Subject: [PATCH 38/64] args: prevent num KV heads < TP worldsize --- megatron/neox_arguments/arguments.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py index 0e1aec2d6..a3e45349b 100644 --- a/megatron/neox_arguments/arguments.py +++ b/megatron/neox_arguments/arguments.py @@ -732,8 +732,14 @@ def configure_distributed_args(self): if self.deepspeed_slurm: os.environ["LOCAL_RANK"] = os.environ["SLURM_LOCALID"] os.environ["RANK"] = os.environ["SLURM_PROCID"] - os.environ["WORLD_SIZE"] = os.environ["SLURM_NTASKS"] if os.environ.get("SLURM_NTASKS") is not None \ - else str(int(os.environ["SLURM_NNODES"]) * int(os.environ["SLURM_NTASKS_PER_NODE"])) + os.environ["WORLD_SIZE"] = ( + os.environ["SLURM_NTASKS"] + if os.environ.get("SLURM_NTASKS") is not None + else str( + int(os.environ["SLURM_NNODES"]) + * int(os.environ["SLURM_NTASKS_PER_NODE"]) + ) + ) self.update_value("local_rank", int(os.getenv("LOCAL_RANK", "0"))) self.update_value("rank", int(os.getenv("RANK", "0"))) @@ -1037,12 +1043,21 @@ def calculate_derived(self): # Multi-query or grouped-query attention settings if self.num_kv_heads is not None: # need KV heads <= query heads, and KV heads dividing query heads evenly - assert (self.num_attention_heads % self.num_kv_heads == 0), "num_kv_heads must evenly divide num_attention_heads and be no greater than it" + assert ( + self.num_attention_heads % self.num_kv_heads == 0 + ), "num_kv_heads must evenly divide num_attention_heads and be no greater than it" if self.num_kv_heads < self.num_attention_heads: # GQA / MQA not compatible with sparse attention configurations - assert not self.sparsity_config, "Sparse attention not compatible with GQA or MQA" - assert all(attn_type == "flash" for attn_type in self.attention_config), "GQA / MQA currently only compatible with Flash Attention 2.0" + assert ( + not self.sparsity_config + ), "Sparse attention not compatible with GQA or MQA" + assert all( + attn_type == "flash" for attn_type in self.attention_config + ), "GQA / MQA currently only compatible with Flash Attention 2.0" + assert ( + self.num_kv_heads % self.model_parallel_size == 0 + ), "Number of KV heads must be at least model_parallel_size for now!" 
# Adding equal dataset weights if none are provided if self.train_data_paths and (self.train_data_weights is None): From 08f80fe414abdfa157d928bd0254c3e4d43a7291 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Tue, 13 Feb 2024 18:42:35 +0000 Subject: [PATCH 39/64] readd triton flash attn func --- megatron/model/transformer.py | 201 +++++++++++++++++++++++----------- 1 file changed, 136 insertions(+), 65 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 841e38152..583e1eda8 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -166,7 +166,7 @@ def __init__( else: ff_dim = int(2 * neox_args.hidden_size * 4 / 3) ff_dim = self.multiple_of * ((ff_dim + multiple_of - 1) // multiple_of) - + self.w1 = mpu.ColumnParallelLinear( neox_args=neox_args, input_size=neox_args.hidden_size, @@ -227,22 +227,23 @@ def __init__( skip_bias_add=False, mup_rescale_parameters=is_last_layer, # rescale params only called if neox_args.use_mup = True, despite it not being included here ) -# else: -# print( -# 'ERROR: Output layer parallelism over the hidden dim is currently broken (https://github.com/EleutherAI/gpt-neox/issues/905). Please run with output_layer_parallelism = "column" until this issue is fixed.' -# ) -# exit() -# self.final_linear = mpu.RowParallelLinear( -# neox_args=neox_args, -# input_size=neox_args.hidden_size, -# output_size=neox_args.padded_vocab_size, -# bias=False, -# input_is_parallel=False, -# init_method=init_method, -# parallel_output=parallel_output, -# skip_bias_add=False, -# mup_rescale_parameters=is_last_layer, # only called if neox_args.use_mup = True, despite it not being included here -# ) + + # else: + # print( + # 'ERROR: Output layer parallelism over the hidden dim is currently broken (https://github.com/EleutherAI/gpt-neox/issues/905). Please run with output_layer_parallelism = "column" until this issue is fixed.' + # ) + # exit() + # self.final_linear = mpu.RowParallelLinear( + # neox_args=neox_args, + # input_size=neox_args.hidden_size, + # output_size=neox_args.padded_vocab_size, + # bias=False, + # input_is_parallel=False, + # init_method=init_method, + # parallel_output=parallel_output, + # skip_bias_add=False, + # mup_rescale_parameters=is_last_layer, # only called if neox_args.use_mup = True, despite it not being included here + # ) def forward(self, hidden_states): return self.final_linear(hidden_states) @@ -292,22 +293,28 @@ def __init__( self.attention_type = neox_args.attention_type if self.attention_type != "multihead": - self.num_kv_heads_per_partition = mpu.divide(neox_args.num_kv_heads, world_size) # TODO: we want to clone single-kv heads across ranks... - self.kv_hidden_size = neox_args.num_kv_heads * self.hidden_size_per_attention_head + self.num_kv_heads_per_partition = mpu.divide( + neox_args.num_kv_heads, world_size + ) # TODO: we want to clone single-kv heads across ranks... + self.kv_hidden_size = ( + neox_args.num_kv_heads * self.hidden_size_per_attention_head + ) else: - self.num_kv_heads_per_partition = self.num_attention_heads_per_partition #None - self.kv_hidden_size = neox_args.hidden_size #None + self.num_kv_heads_per_partition = ( + self.num_attention_heads_per_partition + ) # None + self.kv_hidden_size = neox_args.hidden_size # None if self.attention_type == "multihead": # Strided linear layer. 
self.query_key_value = mpu.ColumnParallelLinear( - neox_args=neox_args, - input_size=neox_args.hidden_size, - output_size=3 * neox_args.hidden_size, - gather_output=False, - init_method=init_method, - bias=neox_args.use_bias_in_attn_linear, - ) + neox_args=neox_args, + input_size=neox_args.hidden_size, + output_size=3 * neox_args.hidden_size, + gather_output=False, + init_method=init_method, + bias=neox_args.use_bias_in_attn_linear, + ) else: self.query_key_value = mpu.ColumnParallelLinear( neox_args=neox_args, @@ -318,7 +325,6 @@ def __init__( bias=neox_args.use_bias_in_attn_linear, ) - coeff = None self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) if self.apply_query_key_layer_scaling: @@ -352,10 +358,10 @@ def __init__( else self.hidden_size_per_attention_head ) self.rotary_emb = RotaryEmbedding( - dim, - base=neox_args.rotary_emb_base, + dim, + base=neox_args.rotary_emb_base, max_seq_len=neox_args.seq_length, - precision=neox_args.params_dtype + precision=neox_args.params_dtype, ) else: self.rotary_emb = None @@ -372,10 +378,21 @@ def __init__( ) else: if self.use_flash_attention: - from flash_attn.flash_attn_interface import flash_attn_func, flash_attn_varlen_func - self.flash_triton_fn = None + from megatron.model.flash_attention import ( + flash_attn_unpadded_qkvpacked_func_cuda, + flash_attn_unpadded_kvpacked_func_cuda, + flash_attn_unpadded_unpacked_func_triton, + ) + from flash_attn.flash_attn_interface import ( + flash_attn_func, + flash_attn_varlen_func, + ) + + self.flash_triton_fn = flash_attn_unpadded_unpacked_func_triton self.flash_qkv_fn = flash_attn_func - self.flash_varlen_qkv_fn = flash_attn_varlen_func + self.flash_varlen_qkv_fn = ( + flash_attn_varlen_func # TODO: use neox's flash attention interface + ) else: self.scale_mask_softmax = FusedScaleMaskSoftmax( input_in_fp16=self.fp16, @@ -520,10 +537,10 @@ def flash_attention(self, query_layer, key_layer, value_layer): # [sk, b, np, hn] -> [b, sk, np, hn] -> [b * sk, 1, np, hn] key_layer = key_layer.transpose(0, 1).reshape( - output_size[0], output_size[3], self.num_kv_heads_per_partition, -1 + output_size[0], output_size[3], self.num_kv_heads_per_partition, -1 ) value_layer = value_layer.transpose(0, 1).reshape( - output_size[0], output_size[3], self.num_kv_heads_per_partition, -1 + output_size[0], output_size[3], self.num_kv_heads_per_partition, -1 ) batch_size = output_size[0] @@ -550,32 +567,46 @@ def flash_attention(self, query_layer, key_layer, value_layer): query_layer = query_layer.transpose(0, 1).reshape( output_size[0], output_size[2], output_size[1], -1 ) - - #print(key_layer.shape) - #print(value_layer.shape) + + # print(key_layer.shape) + # print(value_layer.shape) if not self.training: q_shape = query_layer.shape k_shape = key_layer.shape v_shape = value_layer.shape output = self.flash_varlen_qkv_fn( - query_layer.reshape((q_shape[0]*q_shape[1], q_shape[2], q_shape[3])), - key_layer.reshape((k_shape[0]*k_shape[1], k_shape[2], k_shape[3])), - value_layer.reshape((v_shape[0]*v_shape[1], v_shape[2], v_shape[3])), - cu_seqlens_q, cu_seqlens_k, - max_seqlen_q, max_seqlen_k, + query_layer.reshape( + (q_shape[0] * q_shape[1], q_shape[2], q_shape[3]) + ), + key_layer.reshape( + (k_shape[0] * k_shape[1], k_shape[2], k_shape[3]) + ), + value_layer.reshape( + (v_shape[0] * v_shape[1], v_shape[2], v_shape[3]) + ), + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, softmax_scale=None, causal=True, - window_size=(self.sliding_window_width, -1) if self.sliding_window_width is not None else 
(-1, -1), + window_size=(self.sliding_window_width, -1) + if self.sliding_window_width is not None + else (-1, -1), ) output = output.reshape(q_shape) else: output = self.flash_qkv_fn( - query_layer, key_layer, value_layer, + query_layer, + key_layer, + value_layer, self.dropout_p if self.training else 0.0, softmax_scale=None, causal=True, - window_size=(self.sliding_window_width, -1) if self.sliding_window_width is not None else (-1, -1), + window_size=(self.sliding_window_width, -1) + if self.sliding_window_width is not None + else (-1, -1), ) matmul_result = output @@ -628,7 +659,7 @@ def forward(self, hidden_states, attention_mask, layer_past=None): # Query, Key, and Value # ===================== - if self.attention_type=="multihead": + if self.attention_type == "multihead": # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] mixed_x_layer, _ = self.query_key_value(hidden_states) @@ -643,36 +674,77 @@ def forward(self, hidden_states, attention_mask, layer_past=None): (query_layer, key_layer, value_layer) = mpu.split_tensor_along_last_dim( mixed_x_layer, 3 ) - else: + else: # Attention heads [sq, b, h] --> [sq, b, (np + 2 * num. (query / num. kv)) * hn)] mixed_x_layer, _ = self.query_key_value(hidden_states) # TODO: instead split here into [sq, b, np * hn], 2 [sq, b, np/kv_ratio * hn] and then reshape? # TODO: check equivalence (in the multihead case(?)) # TODO: refactor this out into an mpu.utils fn like split_tensor_along_last_dim - mixed_x_layer = mixed_x_layer.reshape((mixed_x_layer.shape[0], mixed_x_layer.shape[1], self.num_attention_heads_per_partition, int(self.hidden_size_per_attention_head * (1 + 2 * (self.num_kv_heads_per_partition / self.num_attention_heads_per_partition))))) + mixed_x_layer = mixed_x_layer.reshape( + ( + mixed_x_layer.shape[0], + mixed_x_layer.shape[1], + self.num_attention_heads_per_partition, + int( + self.hidden_size_per_attention_head + * ( + 1 + + 2 + * ( + self.num_kv_heads_per_partition + / self.num_attention_heads_per_partition + ) + ) + ), + ) + ) (query_layer, key_layer, value_layer) = [ - x.contiguous() for x in torch.split( - mixed_x_layer, [ - self.hidden_size_per_attention_head, - int((self.num_kv_heads_per_partition / self.num_attention_heads_per_partition) * self.hidden_size_per_attention_head), - int((self.num_kv_heads_per_partition / self.num_attention_heads_per_partition) * self.hidden_size_per_attention_head) - ], - dim=mixed_x_layer.dim() - 1 - ) + x.contiguous() + for x in torch.split( + mixed_x_layer, + [ + self.hidden_size_per_attention_head, + int( + ( + self.num_kv_heads_per_partition + / self.num_attention_heads_per_partition + ) + * self.hidden_size_per_attention_head + ), + int( + ( + self.num_kv_heads_per_partition + / self.num_attention_heads_per_partition + ) + * self.hidden_size_per_attention_head + ), + ], + dim=mixed_x_layer.dim() - 1, + ) ] # [sq, b, (np * (1 + 2 * num. (query / num. 
kv)) * hn)] --> [sq, b, np, (1 + 2 * nq / nkv) * hn] - #new_tensor_shape = mixed_x_layer.size()[:-1] + ( + # new_tensor_shape = mixed_x_layer.size()[:-1] + ( # self.num_attention_heads_per_partition + ???, # self.hidden_size_per_attention_head, # [sq, b, np * hn] --> [sq, b, np, hn] - new_query_shape = (query_layer.size(0), query_layer.size(1), self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) + new_query_shape = ( + query_layer.size(0), + query_layer.size(1), + self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head, + ) query_layer = query_layer.view(*new_query_shape) - new_kv_shape = (key_layer.size(0), key_layer.size(1), self.num_kv_heads_per_partition, self.hidden_size_per_attention_head,) + new_kv_shape = ( + key_layer.size(0), + key_layer.size(1), + self.num_kv_heads_per_partition, + self.hidden_size_per_attention_head, + ) key_layer = key_layer.view(*new_kv_shape) @@ -681,10 +753,9 @@ def forward(self, hidden_states, attention_mask, layer_past=None): # mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) ## [sq, b, np, 3 * hn - #(query_layer, key_layer, value_layer) = mpu.split_tensor_along_last_dim( + # (query_layer, key_layer, value_layer) = mpu.split_tensor_along_last_dim( # mixed_x_layer, 3 - #) - + # ) if exists(self.rotary_emb): if exists(self.rotary_ndims): From 732588088d62cf6739e95d112839c4d5698155f7 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Tue, 13 Feb 2024 18:48:40 +0000 Subject: [PATCH 40/64] cleanup: use tools/ dir from main --- tools/README.md | 15 + tools/__init__.py | 0 tools/bash/README.md | 8 + tools/bash/kill.sh | 1 + tools/bash/killall.sh | 1 + tools/bash/sync.sh | 28 + tools/bash/sync_cmd.sh | 22 + tools/bash/syncdir.sh | 29 + tools/ckpts/README.md | 133 ++++ tools/ckpts/convert_hf_to_sequential.py | 646 +++++++++++++++ tools/ckpts/convert_neox_to_hf.py | 739 ++++++++++++++++++ .../convert_raw_llama_weights_to_neox.py | 693 ++++++++++++++++ tools/ckpts/inspect_checkpoints.py | 335 ++++++++ tools/ckpts/merge20b.py | 282 +++++++ tools/ckpts/upload.py | 52 ++ tools/convert_gqa_llama.py | 672 ++++++++++++++++ tools/datasets/README.md | 118 +++ tools/datasets/corpora.py | 364 +++++++++ tools/datasets/merge_datasets.py | 86 ++ tools/datasets/multinode_prepare_data.sh | 59 ++ tools/datasets/preprocess_data.py | 246 ++++++ tools/datasets/preprocess_data_with_mask.py | 386 +++++++++ 22 files changed, 4915 insertions(+) create mode 100644 tools/README.md create mode 100644 tools/__init__.py create mode 100644 tools/bash/README.md create mode 100755 tools/bash/kill.sh create mode 100755 tools/bash/killall.sh create mode 100755 tools/bash/sync.sh create mode 100644 tools/bash/sync_cmd.sh create mode 100755 tools/bash/syncdir.sh create mode 100644 tools/ckpts/README.md create mode 100644 tools/ckpts/convert_hf_to_sequential.py create mode 100644 tools/ckpts/convert_neox_to_hf.py create mode 100644 tools/ckpts/convert_raw_llama_weights_to_neox.py create mode 100644 tools/ckpts/inspect_checkpoints.py create mode 100644 tools/ckpts/merge20b.py create mode 100644 tools/ckpts/upload.py create mode 100644 tools/convert_gqa_llama.py create mode 100644 tools/datasets/README.md create mode 100644 tools/datasets/corpora.py create mode 100644 tools/datasets/merge_datasets.py create mode 100644 tools/datasets/multinode_prepare_data.sh create mode 100644 tools/datasets/preprocess_data.py create mode 100644 tools/datasets/preprocess_data_with_mask.py diff --git a/tools/README.md b/tools/README.md new file mode 100644 index 
000000000..ccfb1c5f8 --- /dev/null +++ b/tools/README.md @@ -0,0 +1,15 @@ +# GPT-NeoX Auxiliary Tools + +This directory contains a number of auxiliary tools that are useful for working with GPT-NeoX but not part of the main training code. + +## Bash + +This directory contains some simple, frequently used bash commands to make working on multiple machines easier. + +## Checkpoints + +This directory contains tools for manipulating and converting checkpoints including changing the parallelism settings of a pretrained model, converting between GPT-NeoX and the transformers library, and updating checkpoints trained with Version 1.x of this library to be compatible with Version 2.x. + +## Datasets + +This directory contains tools for downloading and preprocessing datasets to the format expected by the GPT-NeoX library. diff --git a/tools/__init__.py b/tools/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tools/bash/README.md b/tools/bash/README.md new file mode 100644 index 000000000..95a307240 --- /dev/null +++ b/tools/bash/README.md @@ -0,0 +1,8 @@ +# Bash Scripts +Useful for running distributed per-node scripts on e.g. Kubernetes + +* `kill.sh` kills all python processes +* `killall.sh` uses pdsh to kill all `train.py` processes on the nodes listed in `/job/hosts/` +* `sync_cmd.sh` uses pdsh to run a command on all the nodes listed in `/job/hosts/` +* `sync.sh` uses pdcp to copy every file in a provided path to all of the nodes listed in `/job/hosts/` +* `syncdir.sh` uses pdcp to copy every file in a provided path to all of the nodes listed in `/job/hosts/` diff --git a/tools/bash/kill.sh b/tools/bash/kill.sh new file mode 100755 index 000000000..bccd46d7e --- /dev/null +++ b/tools/bash/kill.sh @@ -0,0 +1 @@ +pkill -9 python diff --git a/tools/bash/killall.sh b/tools/bash/killall.sh new file mode 100755 index 000000000..d2f3528bd --- /dev/null +++ b/tools/bash/killall.sh @@ -0,0 +1 @@ +pdsh -f 1024 -R ssh -w ^/job/hosts 'pkill -f train.py' diff --git a/tools/bash/sync.sh b/tools/bash/sync.sh new file mode 100755 index 000000000..fd9377dbb --- /dev/null +++ b/tools/bash/sync.sh @@ -0,0 +1,28 @@ +# Copyright (c) 2024, EleutherAI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/usr/bin/env bash + +# Push files to all nodes +# Usage +# sync.sh file [file2..] + +echo Number of files to upload: $# + +for file in "$@" +do + full_path=$(realpath $file) + echo Uploading $full_path + pdcp -f 1024 -R ssh -w ^/job/hosts $full_path $full_path +done diff --git a/tools/bash/sync_cmd.sh b/tools/bash/sync_cmd.sh new file mode 100644 index 000000000..2d8a617ca --- /dev/null +++ b/tools/bash/sync_cmd.sh @@ -0,0 +1,22 @@ +# Copyright (c) 2024, EleutherAI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/usr/bin/env bash + +# Runs a command in parallel across all nodes +# Usage +# sync_cmd.sh 'echo "hello world"' + +echo "Command: $1"; +pdsh -R ssh -w ^/job/hosts $1 diff --git a/tools/bash/syncdir.sh b/tools/bash/syncdir.sh new file mode 100755 index 000000000..229c9af9a --- /dev/null +++ b/tools/bash/syncdir.sh @@ -0,0 +1,29 @@ +# Copyright (c) 2024, EleutherAI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/usr/bin/env bash + +# Push files to all nodes +# Usage +# syncdir.sh file [file2..] + +echo Number of files to upload: $# + +for file in "$@" +do + full_path=$(realpath $file) + parentdir="$(dirname "$full_path")" + echo Uploading $full_path to $parentdir + pdcp -f 1024 -R ssh -w ^/job/hosts -r $full_path $parentdir +done diff --git a/tools/ckpts/README.md b/tools/ckpts/README.md new file mode 100644 index 000000000..24d5cf31c --- /dev/null +++ b/tools/ckpts/README.md @@ -0,0 +1,133 @@ +# Checkpoint Scripts + + +## Utilities + +### `inspect_checkpoints.py` +Reports information about a saved checkpoint. +``` +usage: inspect_checkpoints.py [-h] [--attributes [ATTRIBUTES ...]] [--interactive] [--compare] [--diff] dir + +positional arguments: + dir The checkpoint dir to inspect. Must be either: - a directory containing pickle binaries saved with 'torch.save' ending in .pt or .ckpt - a single path to a .pt or .ckpt file - two comma separated directories - + in which case the script will *compare* the two checkpoints + +options: + -h, --help show this help message and exit + --attributes [ATTRIBUTES ...] + Name of one or several attributes to query. To access an attribute within a nested structure, use '/' as separator. + --interactive, -i Drops into interactive shell after printing the summary. + --compare, -c If true, script will compare two directories separated by commas + --diff, -d In compare mode, only print diffs +``` + +## HuggingFace Scripts + +### `convert_hf_to_sequential.py` +A script for converting publicly available Huggingface (HF) checkpoints to NeoX format. + +Note that this script requires access to corresponding config files for equivalent NeoX models to those found in Hugging face. 
+ +``` +Example usage: (Converts the 70M Pythia model to NeoX format) +================================================================ +OMPI_COMM_WORLD_RANK=0 CUDA_VISIBLE_DEVICES=0 python tools/ckpts/convert_hf_to_sequential.py \ + --hf-model-name pythia-70m-v0 \ + --revision 143000 \ + --output-dir checkpoints/neox_converted/pythia/70m \ + --cache-dir checkpoints/HF \ + --config configs/pythia/70M.yml configs/local_setup.yml \ + --test + + +For multi-gpu support we must initialize deepspeed: +NOTE: This requires manually changing the arguments below. +================================================================ +CUDA_VISIBLE_DEVICES=0,1,2,3 python ./deepy.py tools/ckpts/convert_hf_to_sequential.py \ + -d configs pythia/70M.yml local_setup.yml +``` +### `convert_module_to_hf.py` +Converts a NeoX model with pipeline parallelism greater than 1 to a HuggingFace transformers `GPTNeoXForCausalLM` model + +Note that this script does not support all NeoX features. +Please investigate carefully whether your model is compatible with all architectures supported by the GPTNeoXForCausalLM class in HF. + +(e.g. position embeddings such as AliBi may not be supported by Huggingface's GPT-NeoX architecture) + +``` +usage: convert_module_to_hf.py [-h] [--input_dir INPUT_DIR] [--config_file CONFIG_FILE] [--output_dir OUTPUT_DIR] [--upload] + +Merge MP partitions and convert to HF Model. + +options: + -h, --help show this help message and exit + --input_dir INPUT_DIR + Path to NeoX checkpoint, e.g. /path/to/model/global_step143000 + --config_file CONFIG_FILE + Path to config file for the input NeoX checkpoint. + --output_dir OUTPUT_DIR + Output dir, where to save the HF Model, tokenizer, and configs + --upload Set to true in order to upload to the HF Hub directly. +``` + +### `convert_sequential_to_hf.py` +Converts a NeoX model without pipeline parallelism to a HuggingFace transformers `GPTNeoXForCausalLM` model. + +``` +usage: convert_sequential_to_hf.py [-h] [--input_dir INPUT_DIR] [--config_file CONFIG_FILE] [--output_dir OUTPUT_DIR] [--upload] + +Merge MP partitions and convert to HF Model. + +options: + -h, --help show this help message and exit + --input_dir INPUT_DIR + Path to NeoX checkpoint, e.g. /path/to/model/global_step143000 + --config_file CONFIG_FILE + Path to config file for the input NeoX checkpoint. + --output_dir OUTPUT_DIR + Output dir, where to save the HF Model, tokenizer, and configs + --upload Set to true in order to upload to the HF Hub directly. +``` +### `upload.py` +Uploads a _converted_ checkpoint to the HuggingFace hub. + +``` +python upload.py +``` +## NeoX-20B Scripts + +### `merge20b.py` +Reduces model and pipeline parallelism of a 20B checkpoint to 1 and 1. + +``` +usage: merge20b.py [-h] [--input_dir INPUT_DIR] [--output_dir OUTPUT_DIR] + +Merge 20B checkpoint. + +options: + -h, --help show this help message and exit + --input_dir INPUT_DIR + Checkpoint dir, which should contain (e.g. a folder named "global_step150000") + --output_dir OUTPUT_DIR + Output dir, to save the 1-GPU weights configs +``` +## Llama Scripts + +### `convert_raw_llama_weights_to_neox.py` +Takes a Llama checkpoint and puts it into a NeoX-compatible format. + +``` +usage: convert_raw_llama_weights_to_neox.py [-h] [--input_dir INPUT_DIR] [--model_size {7B,13B,30B,65B,tokenizer_only}] [--output_dir OUTPUT_DIR] [--num_output_shards NUM_OUTPUT_SHARDS] [--pipeline_parallel] + +Convert raw LLaMA checkpoints to GPT-NeoX format. 
+ +options: + -h, --help show this help message and exit + --input_dir INPUT_DIR + Location of LLaMA weights, which contains tokenizer.model and model folders + --model_size {7B,13B,30B,65B,tokenizer_only} + --output_dir OUTPUT_DIR + Location to write GPT-NeoX mode + --num_output_shards NUM_OUTPUT_SHARDS + --pipeline_parallel Only use if PP>1 +``` diff --git a/tools/ckpts/convert_hf_to_sequential.py b/tools/ckpts/convert_hf_to_sequential.py new file mode 100644 index 000000000..5a5f3bbad --- /dev/null +++ b/tools/ckpts/convert_hf_to_sequential.py @@ -0,0 +1,646 @@ +import sys +import os +import copy +import deepspeed + +# import time + +import argparse +import torch + +import numpy as np + +from functools import reduce +from transformers import GPTNeoXForCausalLM, GPTNeoXConfig + +sys.path.append( + os.path.abspath( + os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir) + ) +) +from megatron.neox_arguments import NeoXArgs +from megatron.training import get_model, get_optimizer, get_learning_rate_scheduler +from megatron.initialize import initialize_megatron +from megatron import mpu +from megatron.checkpointing import load_checkpoint, save_checkpoint + +# from megatron.utils import ( +# Timers, +# init_wandb, +# ) + +""" +A script for converting publicly available Huggingface (HF) checkpoints NeoX format. + +Note that this script requires access to corresponding config files for equivalent NeoX models to those found in Hugging face. + +Example usage: (Converts the 70M Pythia model to NeoX format) +================================================================ +OMPI_COMM_WORLD_RANK=0 CUDA_VISIBLE_DEVICES=0 python tools/ckpts/convert_hf_to_sequential.py \ + --hf-model-name pythia-70m-v0 \ + --revision 143000 \ + --output-dir checkpoints/neox_converted/pythia/70m \ + --cache-dir checkpoints/HF \ + --config configs/pythia/70M.yml configs/local_setup.yml \ + --test + + +For multi-gpu support we must initialize deepspeed: +NOTE: This requires manually changing the arguments below. +================================================================ +CUDA_VISIBLE_DEVICES=0,1,2,3 python ./deepy.py tools/ckpts/convert_hf_to_sequential.py \ + -d configs pythia/70M.yml local_setup.yml +""" + +MULTI_GPU_ARGS = " ".join( + [ + "--hf-model-name pythia-70m-v0", + "--revision 143000", + "--output-dir checkpoints/neox_converted/pythia/70m", + "--cache-dir checkpoints/HF", + "--config configs/pythia/70M.yml configs/local_setup.yml", + "--test", + ] +) + + +def convert_hf_to_sequential(hf_model, seq_state_dict): + """Converts the weights of a HuggingFace model to neox 2.0 format. 
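+
+    In the flattened ``sequential`` naming used below, index 0 is the word
+    embedding, the transformer blocks occupy indices 2 .. num_layers + 1, the
+    final layer norm sits at index num_layers + 3, and the LM head at
+    index num_layers + 4.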
+ + :param hf_model: the huggingface model + :param seq_state_dict: the state dict of the equivalent neox model + + returns the updated sequential state dict + """ + num_layers = hf_model.config.num_hidden_layers + # Embedding is layer idx 0 + seq_state_dict[ + "sequential.0.word_embeddings.weight" + ] = hf_model.gpt_neox.embed_in.state_dict()["weight"] + + for layer_hf in range(num_layers): + # offset by 2 + layer_seq = layer_hf + 2 + + # get layer from hf model + hf_layer = hf_model.gpt_neox.layers[layer_hf] + hf_layer_sd = hf_layer.state_dict() + + for key in hf_model.gpt_neox.layers[0].state_dict().keys(): + + if key in ["attention.bias", "attention.masked_bias"]: + continue + seq_state_dict[f"sequential.{layer_seq}.{key}"] = hf_layer_sd[key] + + # Load final layer norm + layer_seq = num_layers + 3 + seq_state_dict[ + f"sequential.{layer_seq}.norm.weight" + ] = hf_model.gpt_neox.final_layer_norm.state_dict()["weight"] + seq_state_dict[ + f"sequential.{layer_seq}.norm.bias" + ] = hf_model.gpt_neox.final_layer_norm.state_dict()["bias"] + + # output embedding / LM head + layer_seq += 1 + seq_state_dict[ + f"sequential.{layer_seq}.final_linear.weight" + ] = hf_model.embed_out.state_dict()["weight"] + + +def shard_sequential_mp(num_mp_ranks, sequential): + """Shards the sequential model into model parallel ranks. + + :param num_mp_ranks: the number of model parallel ranks + :param sequential: the state dict of the sequential model at mp=1 + + returns a dict of state dicts for each mp rank + """ + ranks = {x: dict() for x in range(num_mp_ranks)} + for k, v in sequential.items(): + if reduce( + np.logical_or, + [ + x in k + for x in [ + "layernorm", + "rotary_emb", + "dense_4h_to_h.bias", + "norm.weight", + "norm.bias", + "attention.dense.bias", + ] + ], + ): + # no splitting + for x in range(num_mp_ranks): + ranks[x][k] = v + else: + if len(v.shape) == 1: + size_per_rank = v.shape[0] / num_mp_ranks + if size_per_rank % 128 != 0.0: + padded_size = (128 - (size_per_rank % 128)) + size_per_rank + size_diff = int((padded_size * 4) - v.shape[max_]) + zero_pad = torch.zeros((size_diff)) + v = torch.cat([v, zero_pad], dim=max_) + else: + padded_size = size_per_rank + + assert size_per_rank % 1.0 == 0.0 + assert padded_size % 1.0 == 0.0 + + padded_size = int(padded_size) + size_per_rank = int(size_per_rank) + + for x in range(num_mp_ranks): + if size_per_rank != padded_size: + # need to pad + ranks[x][k] = v[padded_size * x : padded_size * (x + 1)] + else: + ranks[x][k] = v[size_per_rank * x : size_per_rank * (x + 1)] + + elif len(v.shape) == 2: + + if reduce( + np.logical_or, + [ + x in k + for x in [ + "attention.dense.weight", + "mlp.dense_4h_to_h.weight", + ] + ], + ): # column parallel + max_, min_ = 1, 0 + elif reduce( + np.logical_or, + [ + x in k + for x in [ + "mlp.dense_h_to_4h.weight", + "mlp.dense_h_to_4h.bias", + "attention.query_key_value.weight", + "attention.query_key_value.bias", + "word_embeddings.weight", + "final_linear.weight", + ] + ], + ): + # row parallel + max_, min_ = 0, 1 + else: + raise Exception("Unknown weight to shard: {}".format(k)) + + size_per_rank = v.shape[max_] / num_mp_ranks + if size_per_rank % 128 != 0.0: + padded_size = (128 - (size_per_rank % 128)) + size_per_rank + size_diff = int((padded_size * num_mp_ranks) - v.shape[max_]) + + assert ( + size_diff > 0 + ), "[ERROR] size diff is negative: {} for size_per_rank: {}, k:{}, shape:{}, padded_size:{}".format( + size_diff, size_per_rank, k, v.shape, padded_size + ) + + zero_pad = ( + torch.zeros((size_diff, 
v.shape[min_])) + if max_ == 0 + else torch.zeros((v.shape[min_], size_diff)) + ) + + v = torch.cat([v, zero_pad], dim=max_) + else: + padded_size = size_per_rank + + assert size_per_rank % 1.0 == 0.0 + assert padded_size % 1.0 == 0.0 + + padded_size = int(padded_size) + size_per_rank = int(size_per_rank) + + for x in range(num_mp_ranks): + if size_per_rank != padded_size: + # need to pad + ranks[x][k] = ( + v[padded_size * x : padded_size * (x + 1), :] + if max_ == 0 + else v[:, padded_size * x : padded_size * (x + 1)] + ) + else: + ranks[x][k] = ( + v[size_per_rank * x : size_per_rank * (x + 1), ...] + if max_ == 0 + else v[:, size_per_rank * x : size_per_rank * (x + 1)] + ) + + else: + raise NotImplementedError() + + return ranks + + +def replace_sharded_seq(mp_checkpoints, mp_sharded_seq): + """replaces the values within checkpointed configs with those + from the sharded sequential object.""" + + for mp_idx, shard in mp_sharded_seq.items(): + mp_key = f"mp_rank_{mp_idx:02}_model_states.pt" + + # use for loop instead of direct assignment + # to check for compatibility + for k, v in mp_checkpoints[mp_key]["module"].items(): + try: + mp_checkpoints[mp_key]["module"][k] = shard[k] + except KeyError: + print("ERROR key:{} not found in shard.".format(k)) + + +def shard_pp(sequential, mp_rank, num_layers): + """Shards the model into layers. + + :param sequential: the state dict of the sequential model at mp=1 + :param mp_rank: the model parallel rank of the layers + + returns a dict of state dicts for each layer + """ + suffix = f"-model_{mp_rank:02}-model_states.pt" + + layers_seq = dict() + layers_seq[f"layer_00" + suffix] = { + "word_embeddings.weight": sequential[f"sequential.0.word_embeddings.weight"] + } + layers_seq[f"layer_{num_layers+3:02}" + suffix] = { + "norm.weight": sequential[f"sequential.{num_layers+3}.norm.weight"], + "norm.bias": sequential[f"sequential.{num_layers+3}.norm.bias"], + } + + layers_seq[f"layer_{num_layers+4:02}" + suffix] = { + "final_linear.weight": sequential[ + f"sequential.{num_layers+4}.final_linear.weight" + ] + } + + for layer in range(2, num_layers + 2): + layer_keys = [x for x in sequential if ".{}.".format(layer) in x] + layers_seq[f"layer_{layer:02}" + suffix] = { + k.split(".{}.".format(layer))[1]: sequential[k] for k in layer_keys + } + + return layers_seq + + +def shard_pp_mp(num_mp_ranks, sequential, num_layers): + """Shards the model into layers and model parallel ranks. + + :param num_mp_ranks: the number of model parallel ranks + :param sequential: the state dict of the sequential model at mp=1 + :param num_layers: the number of layers in the model + + returns a dict of state dicts for each layer for each model parallel rank + """ + mp_sharded = shard_sequential_mp(num_mp_ranks=num_mp_ranks, sequential=sequential) + + layers_pp_mp = {} + for mp_rank, d in mp_sharded.items(): + layers_pp_mp.update( + shard_pp(sequential=d, mp_rank=mp_rank, num_layers=num_layers) + ) + return layers_pp_mp + + +def convert(hf_model, ckpt_dir, output_dir): + """Converts a huggingface model to a NeoX checkpoint for different + model parallel and pipeline parallel settings degrees. 
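+
+    The target layout is inferred from the contents of ckpt_dir: a single
+    mp_rank_* file with no layer_* files means pp=0/mp=1, several mp_rank_*
+    files mean mp>1, and the presence of layer_* files means the checkpoint
+    was saved with pipeline parallelism.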
+ + :param hf_model: the huggingface model + :param ckpt_dir: the directory containing the NeoX checkpoint + :param output_dir: the directory to save the converted checkpoint + returns None + """ + + os.listdir(ckpt_dir) + + ckpts, layers = {}, {} + for x in os.listdir(ckpt_dir): + if x.startswith("mp_rank"): + ckpts[x] = torch.load(os.path.join(ckpt_dir, x)) + elif x.startswith("layer"): + layers[x] = torch.load(os.path.join(ckpt_dir, x)) + + assert len(layers) + len(ckpts) > 0, "No checkpoints found in {}".format(ckpt_dir) + + os.makedirs(output_dir, exist_ok=True) + seq_state_dict = dict() + convert_hf_to_sequential(hf_model, seq_state_dict) + + if len(ckpts) == 1 and len(layers) == 0: + # pp=0, mp=1 + key = list(ckpts.keys())[0] + ckpts[key]["module"] = seq_state_dict + to_save = ckpts + + elif len(ckpts) > 1 and len(layers) == 0: + # pp=0, mp>1 + sharded_seq = shard_sequential_mp( + num_mp_ranks=len(ckpts), sequential=seq_state_dict + ) + replace_sharded_seq(mp_checkpoints=ckpts, mp_sharded_seq=sharded_seq) + to_save = ckpts + + elif len(ckpts) == 1 and len(layers) > 1: + # pp>0, mp==1 + to_save = shard_pp( + sequential=seq_state_dict, + mp_rank=0, + num_layers=hf_model.config.num_hidden_layers, + ) + + elif len(ckpts) > 1 and len(layers) > 1: + # pp>0, mp>1 + to_save = shard_pp_mp( + num_mp_ranks=len(ckpts), + sequential=seq_state_dict, + num_layers=hf_model.config.num_hidden_layers, + ) + + else: + raise NotImplementedError( + "Not implemented for len(ckpts)={} and len(layers)={}".format( + len(ckpts), len(layers) + ) + ) + + for k, v in to_save.items(): + print("saving {}...".format(os.path.join(output_dir, k))) + torch.save(v, os.path.join(ckpt_dir, k)) + + # copy the checkpoint to the output_dir + print("rm {}/*".format(output_dir)) + os.system("rm {}/*".format(output_dir)) + os.makedirs(output_dir, exist_ok=True) + print("cp {} {}".format(os.path.join(ckpt_dir, "*"), output_dir)) + os.system("cp {} {}".format(os.path.join(ckpt_dir, "*"), output_dir)) + + # set latest file within the output_dir + latest_file = os.path.join("/".join(output_dir.split("/")[:-1]), "latest") + os.system("rm " + latest_file) + with open(latest_file, "w") as f: + f.write(output_dir.split("/")[-1]) + + +def consume_neox_args2(args_parsed, overwrite_values=None): + """ + Deepspeed launcher needs to pass the arguments for `pretrain_gpt2.py` across to all machines. + + In order not to have any problems with different configs being mismatched across machines, we instead read the .yaml configuration file from the main rank, + then serialize the arguments to a dictionary, which the deepspeed launcher broadcasts to all machines (`--megatron_config`). + + We then instantiate a new NeoXArgs from the dictionary (`.from_dict`). This should ensure args are never inconsistent across machines. + """ + + with open(args_parsed.megatron_config) as jsonfile: + megatron_config = json.load(jsonfile) + if args_parsed.deepspeed_config is not None: + overwrite_values = NeoXArgs.set_up_autotuning( + args_parsed.deepspeed_config, overwrite_values + ) + if overwrite_values is not None: + megatron_config.update(overwrite_values) + return NeoXArgs.from_dict(args_dict=megatron_config) + + +def get_non_existing_dir(tmp_dir): + while os.path.exists(tmp_dir): + tmp_dir = os.path.join(tmp_dir, "tmp_dir") + return tmp_dir + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Convert a Hugging Face GPT-NeoX model back to a sequential model compatible with GPT-NeoX training." 
+ ) + parser.add_argument( + "--revision", + type=int, + default=143000, + help="Revision or step of the Pythia model to convert.", + ) + parser.add_argument( + "--output-dir", + type=str, + help="Path to save the converted GPT-NeoX model checkpoint.", + ) + parser.add_argument( + "--config", + nargs="*", + default=[], + help="Path to the config file for the equivalent NeoX model.", + ) + parser.add_argument( + "--test", + action="store_true", + help="If set, will run a test to ensure the conversion was successful.", + ) + parser.add_argument( + "--download-only", + action="store_true", + help="If set, script will only download the model and not convert it.", + ) + + parser.add_argument( + "--ckpt-tmp-dir", + default="/tmp/ckpt_tmp_dir", + help="Directory to store cached hugging face checkpoints. [WARNING: MUST BE VISIBLE TO ALL RANKS]", + ) + parser.add_argument( + "--hf-model-name", + type=str, + help="Name of the hugging face model to download from EleutherAI/{hf-model-name}.}", + ) + + parser.add_argument( + "--cache-dir", + default="/gpfs/alpine/csc499/proj-shared/hf_checkpoints", + help="Directory to store cached hugging face checkpoints.", + ) + try: + if int(os.environ["WORLD_SIZE"]) > 1: + args = parser.parse_args(MULTI_GPU_ARGS.split(" ")) + else: + args = parser.parse_args() + except KeyError: + args = parser.parse_args() + + tmp_cache_dir = get_non_existing_dir(args.ckpt_tmp_dir) + + if args.download_only: + hf_model = GPTNeoXForCausalLM.from_pretrained( + f"EleutherAI/{args.hf_model_name}", + revision=f"step{args.revision}", + cache_dir=os.path.join( + args.cache_dir, f"{args.hf_model_name}/step{args.revision}" + ), + ).half() + exit(0) + else: + print("======================================================================") + print( + "Warning the following script will delete files within {}".format( + args.output_dir + ) + ) + print( + "Warning the following script will delete this directory {}".format( + tmp_cache_dir + ) + ) + print("======================================================================") + # time.sleep(5) + + if int(os.environ.get("OMPI_COMM_WORLD_SIZE", 1)) > 1: + neox_args = consume_neox_args2(args2) + else: + neox_args = NeoXArgs.from_ymls(args.config) + neox_args.configure_distributed_args() + neox_args.build_tokenizer() + neox_args.initialize_tensorboard_writer() + + # setup logging and timers + # init_wandb(neox_args=neox_args) + # timers = Timers( + # use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer + # ) + initialize_megatron(neox_args=neox_args) + + torch.distributed.barrier() + + model = get_model(neox_args=neox_args, use_cache=True) + optimizer, param_groups = get_optimizer(model=model, neox_args=neox_args) + lr_scheduler = get_learning_rate_scheduler(optimizer=optimizer, neox_args=neox_args) + + model, optimizer, _, lr_scheduler = deepspeed.initialize( + model=model, + optimizer=optimizer, + # args=neox_args, + lr_scheduler=lr_scheduler, + dist_init_required=False, + model_parameters=None, + config_params=neox_args.deepspeed_config, + mpu=mpu if not neox_args.is_pipe_parallel else None, + ) + + if os.environ.get("OMPI_COMM_WORLD_RANK", "1") == "0": + os.makedirs(f"{tmp_cache_dir}", exist_ok=True) + + torch.distributed.barrier() + neox_args.save = tmp_cache_dir + + save_checkpoint( + neox_args=neox_args, + iteration=0, + model=model, + optimizer=optimizer, + lr_scheduler=lr_scheduler, + ) + print(os.listdir(f"{tmp_cache_dir}")) + ckpt_dir = os.path.join(tmp_cache_dir, "global_step0") + + if 
torch.distributed.get_rank() == 0: + config = GPTNeoXConfig.from_pretrained( + f"EleutherAI/{args.hf_model_name}", + revision=f"step{args.revision}", + cache_dir=os.path.join( + args.cache_dir, f"{args.hf_model_name}/step{args.revision}" + ), + ) + # does not change the weights, but is needed to align logits + config.update({"hidden_act": "gelu_fast"}) + hf_model = GPTNeoXForCausalLM.from_pretrained( + f"EleutherAI/{args.hf_model_name}", + revision=f"step{args.revision}", + config=config, + cache_dir=os.path.join( + args.cache_dir, f"{args.hf_model_name}/step{args.revision}" + ), + ).half() + print("==========================================") + print("Loaded Hugging Face model successfully!") + print("==========================================") + convert(hf_model, ckpt_dir=ckpt_dir, output_dir=args.output_dir) + + if os.environ.get("OMPI_COMM_WORLD_RANK", "1") == "0": + # cleanup temp dir + os.system(f"rm -r {tmp_cache_dir}") + + torch.distributed.barrier() + + # verify the conversion can be loaded + neox_args.load = "/".join(args.output_dir.split("/")[:-1]) + print(neox_args.load) + neox_args.finetune = True + load_checkpoint( + neox_args=neox_args, + model=model, + optimizer=optimizer, + lr_scheduler=lr_scheduler, + iteration=None, + ) + print("==========================================") + print("Converted checkpoint successfully loaded!") + print("==========================================") + + if args.test and torch.distributed.get_world_size() == 1: + # only implemented for world size 1 + + with torch.no_grad(): + # torch.backends.cudnn.benchmark = False + # torch.use_deterministic_algorithms(True) #setting the CUBLAS_WORKSPACE_CONFIG=:4096:8 environment variable is required for this to work (tested for A6000) + model.eval() + hf_model.eval() + + b = 10 + seq_len = 32 + inputs = torch.randint(0, 50304, (b, seq_len), dtype=torch.long).cuda() + mask = ( + (torch.triu(torch.ones(seq_len, seq_len)) != 1).transpose(0, 1).cuda() + ) + pos_ids = torch.arange(0, seq_len).unsqueeze(0).cuda() + + torch.manual_seed(0) + outputs_neox = model.cuda()( + (inputs, pos_ids, mask.unsqueeze(0).unsqueeze(0)), neox_args=neox_args + ) + + torch.manual_seed(0) + outputs = hf_model.cuda()(input_ids=inputs) + + print("HF logits .sum(): ", outputs.logits.to(torch.float32).sum()) + print("NeoX logits .sum(): ", outputs_neox.to(torch.float32).sum()) + + print( + "\nLogit comparison summary for {} sequences of length {}:".format( + b, seq_len + ) + ) + print("=============================================================") + for i in range(b): + abs_diff = ( + outputs.logits[i, ...].to(torch.float32) + - outputs_neox[i, ...].to(torch.float32) + ).abs() + print( + "[Random sequence {}] (hflogits - neoxlogits).abs() -- mean: {:.5f}\tmax: {:.5f}\tmin: {:.5f}\tmedian: {:.5f}".format( + i, + abs_diff.mean(), + abs_diff.max(), + abs_diff.min(), + abs_diff.median(), + ) + ) + + elif args.test: + print( + "[INFO] Checkpoint conversion logit test not implemented for distributed world_size > 1. Current world_size: {}".format( + torch.distributed.get_world_size() + ) + ) diff --git a/tools/ckpts/convert_neox_to_hf.py b/tools/ckpts/convert_neox_to_hf.py new file mode 100644 index 000000000..a9cd4277e --- /dev/null +++ b/tools/ckpts/convert_neox_to_hf.py @@ -0,0 +1,739 @@ +# Copyright (c) 2023, EleutherAI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys + +import yaml +import argparse +from tqdm import tqdm + +import torch +from transformers import ( + MistralConfig, + LlamaConfig, + GPTNeoXConfig, + AutoModelForCausalLM, + AutoConfig, +) + +from typing import List, Literal + +sys.path.append( + os.path.abspath( + os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir) + ) +) +from megatron.tokenizer import build_tokenizer + + +""" +A script for converting saved NeoX Checkpoints to Huggingface (HF) compatible GPT-NeoX type models. + +Note that this script does not support all NeoX features. +Please investigate carefully whether your model is compatible with all architectures supported by the GPTNeoXForCausalLM class in HF. + +(e.g. position embeddings such as AliBi may not be supported by Huggingface's GPT-NeoX architecture). +""" + + +# Model definitions: a list of keys, and where they fall in terms of handling them in the presence of TP. +# in format : {model arch: {param type: {param in neox: param in HF}}} + +MODEL_KEYS = { + "neox": { + "COLUMN_PARALLEL_LINEAR_KEYS": { + "mlp.dense_h_to_4h.weight": "mlp.dense_h_to_4h.weight", + "mlp.dense_h_to_4h.bias": "mlp.dense_h_to_4h.bias", + "attention.query_key_value.weight": "attention.query_key_value.weight", + "attention.query_key_value.bias": "attention.query_key_value.bias", # TODO: handle GQA separately? + }, + "ROW_PARALLEL_LINEAR_KEYS": { + "attention.dense.weight": "attention.dense.weight", + "mlp.dense_4h_to_h.weight": "mlp.dense_4h_to_h.weight", + }, + "ROW_PARALLEL_BIAS_KEYS": { + "mlp.dense_4h_to_h.bias": "mlp.dense_4h_to_h.bias", + "attention.dense.bias": "attention.dense.bias", + }, + "NORM_KEYS": { + "input_layernorm.weight": "input_layernorm.weight", + "input_layernorm.bias": "input_layernorm.bias", + "post_attention_layernorm.weight": "post_attention_layernorm.weight", + "post_attention_layernorm.bias": "post_attention_layernorm.bias", + }, + "FINAL_NORM_KEYS": { + "norm.weight": "weight", + "norm.bias": "bias", + }, + }, + "llama": { + "COLUMN_PARALLEL_LINEAR_KEYS": { + "mlp.w1.weight": "mlp.gate_proj.weight", + "mlp.w3.weight": "mlp.up_proj.weight", + }, + "ROW_PARALLEL_LINEAR_KEYS": { + "attention.dense.weight": "self_attn.o_proj.weight", + "mlp.w2.weight": "mlp.down_proj.weight", + }, + "ROW_PARALLEL_BIAS_KEYS": {}, # No biases in RowParallelLinear layers + "NORM_KEYS": { + "input_layernorm.scale": "input_layernorm.weight", + "post_attention_layernorm.scale": "post_attention_layernorm.weight", + }, + "FINAL_NORM_KEYS": { + "norm.scale": "weight", + }, + "GQA_QKV_KEYS": { # because Llama can have Grouped Query Attention and has separate Q, K, and V linear proj params, handle them separately. 
+ "attention.query_key_value.weight": [ + "self_attn.q_proj.weight", + "self_attn.k_proj.weight", + "self_attn.v_proj.weight", + ], + }, + }, +} + +MODEL_KEYS["mistral"] = MODEL_KEYS["llama"] + + +def load_partitions( + input_checkpoint_path: str, mp_partitions: int, layer_idx: int, sequential: bool +) -> List[torch.Tensor]: + """Returns a list containing all states from a model (across MP partitions)""" + + if sequential: + filename_format = f"mp_rank_{{i:02}}_model_states.pt" + else: + filename_format = f"layer_{layer_idx:02}-model_{{i:02}}-model_states.pt" + + loaded_tp_ranks = [ + torch.load( + os.path.join( + input_checkpoint_path, + filename_format.format(i=i), + ), + map_location=torch.device("cuda" if torch.cuda.is_available() else "cpu"), + ) + for i in range(mp_partitions) + ] + + return loaded_tp_ranks + + +def get_state( + state_dicts: List[torch.Tensor], key: str, layer_idx: int, sequential: bool +) -> torch.Tensor: + """Helper that returns a list containing a given weight's state from each MP partition, for a given layer in the model.""" + + if sequential: + # use the correct key into the sequential dict for given weight/provided key + key = f"sequential.{layer_idx}.{key}" + + return [state_dict["module"][key] for state_dict in state_dicts] + else: + # For the PipelineModule case, we don't need any key / module prefix. just grab this weight value. + # layer_idx is also ignored because we've loaded only this layer's weights, ahead of time. + key = key + + return [state_dict[key] for state_dict in state_dicts] + + +def get_key(loaded_config, key, default=None): + """ + Search for a given key in a NeoX yaml. normalizes underscores -> hyphens + """ + key = key.replace("_", "-") + try: + return loaded_config[key] + except KeyError: + key = key.replace("-", "_") + try: + return loaded_config[key] + except KeyError: + return default + + +def create_config(neox_config, architecture="neox"): + """take in a loaded yaml from NeoX and assign relevant values to HF config. + Returns: GPTNeoXConfig() object + """ + + def gated_size(hidden_dim): + # takes in a hidden dim and calculates intermediate dim of a LLaMAParallelMLP. + # (only used if intermediate_size not specified in config) + # hidden-size * 8 / 3 , rounded up to nearest multiple of 256 + ff_dim = int(2 * hidden_dim * 4 / 3) + ff_dim = 256 * ((ff_dim + 256 - 1) // 256) + return ff_dim + + class TokenizerArgs: + # kinda hacky. + # this is to get something with the same interface as is used in build_tokenizer() + # without diving into loading a neox_args object or using argparse etc. + def __init__(self, neox_config): + self.make_vocab_size_divisible_by = get_key( + neox_config, "make-vocab-size-divisible-by", default=128 + ) + self.model_parallel_size = get_key(neox_config, "model-parallel-size") + self.vocab_file = get_key(neox_config, "vocab-file") + self.merge_file = get_key(neox_config, "merge-file") + self.tokenizer_type = get_key(neox_config, "tokenizer-type") + + self.rank = 0 + + args = TokenizerArgs(neox_config) + tokenizer = build_tokenizer(args) + try: # GPT2TokenizerFast raises NotImplementedError + pad_token = tokenizer.pad + except: + pad_token = ( + 1 # pad defaulting to 1. 
follows convention from GPT-NeoX-20b tokenizer + ) + + # TODO: change the default value here based on discussion regarding `gpt_j_tied` config parameter's default + use_tied_lns = get_key(neox_config, "gpt-j-tied", False) + + if use_tied_lns: + raise NotImplementedError( + """ERROR: Huggingface Transformers does not yet support a single shared layernorm + per transformer block for GPT-NeoX models trained w/ GPT-J parallel residuals. + See https://github.com/EleutherAI/gpt-neox/pull/481 for further details.""" + ) + + # set all config values. + + # shared config parameters. + args = { + "vocab_size": args.padded_vocab_size, + "hidden_size": get_key(neox_config, "hidden-size"), + "num_hidden_layers": get_key(neox_config, "num-layers"), + "num_attention_heads": get_key(neox_config, "num-attention-heads"), + "max_position_embeddings": get_key(neox_config, "max-position-embeddings"), + "initializer_range": get_key(neox_config, "init-method-std", 0.02), + "tie_word_embeddings": (not get_key(neox_config, "no-weight-tying", False)), + "use_cache": True, + } + if architecture == "mistral" or architecture == "llama": + args.update( + { + "intermediate_size": get_key( + neox_config, + "intermediate-size", + gated_size(get_key(neox_config, "hidden-size")), + ), + "num_key_value_heads": get_key( + neox_config, + "num-kv-heads", + get_key(neox_config, "num-attention-heads"), + ), + "hidden_act": get_key(neox_config, "activation", default="silu"), + "rms_norm_eps": get_key(neox_config, "rms-norm-epsilon", 1.0e-6), + "bos_token_id": tokenizer.eod, + "eos_token_id": tokenizer.eod, + "rope_theta": get_key(neox_config, "rotary-emb-base", 10000.0), + } + ) + + if args["num_attention_heads"] != args["num_key_value_heads"]: + assert ( + False + ), "Got num_key_value_heads != num_attention_heads, but Grouped-Query Attention is not yet supported by NeoX." + + if architecture == "mistral": + # mistral-specific options + args.update( + { + "sliding_window": get_key( + neox_config, "sliding-window-width", 4096 + ), + } + ) + hf_config = MistralConfig(**args) + elif architecture == "llama": + # llama-specific options + args.update( + { + # NeoX library defaults to using bias in attention + "attention_bias": get_key( + neox_config, "use_bias_in_attn_linear", True + ), + } + ) + hf_config = LlamaConfig(**args) + else: + # GPT-NeoX HF model class-specific options + args.update( + { + "rotary_pct": get_key(neox_config, "rotary-pct", default=1.0), + "rotary_emb_base": get_key( + neox_config, "rotary-emb-base", default=1000.0 + ), + "use_parallel_residual": get_key(neox_config, "gpt-j-residual", False), + "layer_norm_eps": get_key(neox_config, "layernorm-epsilon", 1e-5), + } + ) + hf_config = GPTNeoXConfig(**args) + + return hf_config + + +def reshard_and_split_qkv( + param_mapping: dict, # a dictionary mapping the QKV weight keys in GPT-NeoX -> a list of keys representing the Q, K, and V weight keys the HF model will use + hf_config: AutoConfig, # a HF model config for the model + loaded_tp_ranks: List[torch.Tensor], + layer_idx: int, + sequential: bool, +): + """ + A helper function which performs reshaping and sharding to make the QKV projection from NeoX compatible with HF Llama models, + even when grouped-query attention is required. + """ + for key, hf_keys in param_mapping.items(): + assert ( + isinstance(hf_keys, list) and len(hf_keys) == 3 + ), "Must map QKV to precisely 3 resulting weight matrices." + + for key, hf_keys in param_mapping.items(): + # we first merge the QKV proj. 
across TP ranks + sharded_qkv = torch.stack( + get_state(loaded_tp_ranks, key, layer_idx, sequential), dim=0 + ) + # should now have shape [TP_SIZE, (hidden_size + 2 * kv_hidden_size) / TP_SIZE, hidden_size]. + + sharded_qkv = sharded_qkv.view( + len(loaded_tp_ranks), + hf_config.num_attention_heads // len(loaded_tp_ranks), + int( + hf_config.hidden_size + // hf_config.num_attention_heads + * ( + 1 + + 2 * hf_config.num_key_value_heads / hf_config.num_attention_heads + ) + ), + hf_config.hidden_size, + ) # is meant to convert to shape [TP_SIZE, NUM_QUERY_HEADS_PER_SHARD, dims_per_head * (1 + 2 * kv-to-q head ratio), hidden_size] + + q, k, v = torch.split( + sharded_qkv, + [ + hf_config.hidden_size // hf_config.num_attention_heads, + int( + (hf_config.num_key_value_heads / hf_config.num_attention_heads) + * hf_config.hidden_size + // hf_config.num_attention_heads + ), + int( + (hf_config.num_key_value_heads / hf_config.num_attention_heads) + * hf_config.hidden_size + // hf_config.num_attention_heads + ), + ], + dim=2, + ) + # splits along the (dims_per_head * (1 + 2 * kv-to-q head ratio)_ dim to get 3 tensors: + # 1 x [TP_SIZE, NUM_Q_HEADS_PER_SHARD, dims_per_head, hidden_size] and 2 x [TP_SIZE, NUM_Q_HEADS_PER_SHARD, (dims_per_head / kv-to-q head ratio), hidden_size] + # these are the Q, and K, V tensors respectively. + + # we have to do additional reshape for each individual tensor now, + # into the expected square (or smaller than square, for K/V tensors) shape + q, k, v = q.squeeze(dim=2), k.squeeze(dim=2), v.squeeze(dim=2) + q = q.view( + hf_config.num_attention_heads, + hf_config.hidden_size // hf_config.num_attention_heads, + hf_config.hidden_size, + ).reshape(hf_config.hidden_size, hf_config.hidden_size) + k = k.reshape( + hf_config.num_key_value_heads, + hf_config.hidden_size // hf_config.num_attention_heads, + hf_config.hidden_size, + ).reshape( + hf_config.hidden_size + // hf_config.num_attention_heads + * hf_config.num_key_value_heads, + hf_config.hidden_size, + ) + v = v.reshape( + hf_config.num_key_value_heads, + hf_config.hidden_size // hf_config.num_attention_heads, + hf_config.hidden_size, + ).reshape( + hf_config.hidden_size + // hf_config.num_attention_heads + * hf_config.num_key_value_heads, + hf_config.hidden_size, + ) + + # return these + state_dict = {} + for hf_key, proj in zip(hf_keys, [q, k, v]): + state_dict[hf_key] = proj.clone() + return state_dict + + +def convert( + input_checkpoint_path, + loaded_config, + output_checkpoint_path, + sequential: bool = True, + precision: Literal["auto", "fp16", "bf16", "fp32"] = "auto", + architecture: Literal["neox", "llama", "mistral"] = "neox", +): + """convert a NeoX checkpoint to a HF model format. + should perform model-parallel merging correctly + but only supports features allowed by HF GPT-NeoX implementation (e.g. 
rotary embeddings) + """ + + ARCH = MODEL_KEYS[architecture] + + hf_config = create_config(loaded_config, architecture=architecture) + + hf_model = AutoModelForCausalLM.from_config(hf_config) + + if architecture == "neox": + hf_transformer = hf_model.gpt_neox + else: + hf_transformer = hf_model.model + + if precision == "auto": + print("Auto-detecting precision to save model into...") + # save model in FP16 if Deepspeed fp16 was used in config, else 32 bit + fp16 = get_key(loaded_config, "fp16") + + if fp16: + try: + # current behavior is to pass "fp16": {"enabled": true}, when using upstream Deepspeed + if fp16["enabled"]: + hf_model.half() + print("Saving weights in fp16 precision...") + except: + try: + # attempt to access bf16 dict in yaml file, if fp16 not enabled + bf16 = get_key(loaded_config, "bf16") + if bf16: + hf_model.to(dtype=torch.bfloat16) + print("Saving weights in bf16 precision...") + except: + hf_model.to(dtype=torch.float) + print( + "Model not trained in fp16 / bf16 mixed precision, saving weights in fp32..." + ) + else: + name_to_dtype = { + "bf16": torch.bfloat16, + "fp16": torch.float16, + "fp32": torch.float, + } + print(f"Saving model into specified {precision} precision...") + hf_model.to(dtype=name_to_dtype[precision]) + + mp_partitions = get_key(loaded_config, "model-parallel-size") + + # Sequential saves all model states from an MP rank in one file. + # so we only load the MP ranks only once and index into them with get_state(). + # for the pipeline-parallel case (pipeline-parallel-size >= 1), + # we must load the correct layer's states at each step. + # (this does mean that less memory is required for PP conversion.) + loaded_tp_ranks = load_partitions( + input_checkpoint_path, mp_partitions, layer_idx=0, sequential=sequential + ) + + ### Embedding layer ### + # Embedding is layer idx 0 + if architecture == "neox": + embed_in = hf_transformer.embed_in + else: + embed_in = hf_transformer.embed_tokens + embed_in.load_state_dict( # TODO: embed_in is not always model's name for embedding + { + "weight": torch.cat( + get_state( + loaded_tp_ranks, + "word_embeddings.weight", + layer_idx=0, + sequential=sequential, + ), + dim=0, + ) + } + ) + assert ( + hf_config.vocab_size == embed_in.weight.shape[0] + ), f"ERROR: calculated vocab size {hf_config.vocab_size} != embed param size {embed_in.shape[0]}" + ### End Embedding Layer ### + + for layer_i in tqdm(range(get_key(loaded_config, "num-layers"))): + + # get layer from hf model + hf_layer = hf_transformer.layers[layer_i] # TODO: model module names + + if not sequential: + # in the non-sequential case, must load from each layer individually. 
+ # use layer index + 2 bc of embed layer and a dummy _pre_transformer_block, which are "layers 0 and 1" + loaded_tp_ranks = load_partitions( + input_checkpoint_path, + mp_partitions, + layer_idx=layer_i + 2, + sequential=sequential, + ) + + # + 2 bc of embed layer and a dummy _pre_transformer_block + state_dict = {} + for key, hf_key in ARCH["ROW_PARALLEL_LINEAR_KEYS"].items(): + state_dict[hf_key] = torch.cat( + get_state( + loaded_tp_ranks, key, layer_idx=layer_i + 2, sequential=sequential + ), + dim=1, + ) + + # average layernorm stats over mp ranks + for key, hf_key in ARCH["NORM_KEYS"].items(): + state_dict[hf_key] = sum( + get_state( + loaded_tp_ranks, key, layer_idx=layer_i + 2, sequential=sequential + ) + ) / len(loaded_tp_ranks) + + # LinearWithTPMerge + for key, hf_key in ARCH["COLUMN_PARALLEL_LINEAR_KEYS"].items(): + state_dict[hf_key] = torch.cat( + get_state( + loaded_tp_ranks, key, layer_idx=layer_i + 2, sequential=sequential + ), + dim=0, + ) + + # LinearWithTPSplitBias + for key, hf_key in ARCH["ROW_PARALLEL_BIAS_KEYS"].items(): + state_dict[hf_key] = sum( + get_state( + loaded_tp_ranks, key, layer_idx=layer_i + 2, sequential=sequential + ) + ) + + # Just take one + if "attention.rotary_emb.inv_freq" in hf_layer.state_dict(): + state_dict["attention.rotary_emb.inv_freq"] = get_state( + loaded_tp_ranks, + "attention.rotary_emb.inv_freq", + layer_idx=layer_i + 2, + sequential=sequential, + )[0] + + if "attention.bias" in hf_layer.state_dict(): + state_dict["attention.bias"] = hf_layer.state_dict()["attention.bias"] + if "attention.masked_bias" in hf_layer.state_dict(): + state_dict["attention.masked_bias"] = hf_layer.state_dict()[ + "attention.masked_bias" + ] + + # some architectures, like Mistral and Llama, have the following which must be handled specially: + # - Q, K, V projections are performed separately, so we must split apart GPT-NeoX library's single QKV proj + # - Support for Grouped-Query Attention, meaning the Q and the K, V projections may not be the same size + if "GQA_QKV_KEYS" in ARCH: + state_dict.update( + reshard_and_split_qkv( + param_mapping=ARCH["GQA_QKV_KEYS"], + hf_config=hf_config, + loaded_tp_ranks=loaded_tp_ranks, + layer_idx=layer_i + 2, + sequential=sequential, + ) + ) + # load state_dict into layer + hf_layer.load_state_dict(state_dict) + + if not sequential: + loaded_tp_ranks = load_partitions( + input_checkpoint_path, + mp_partitions, + get_key(loaded_config, "num-layers") + 3, + sequential=sequential, + ) + # Load final layer norm + if architecture == "neox": + lm_head = hf_model.embed_out + else: + lm_head = hf_model.lm_head + norm_state_dict = {} + for key, hf_key in ARCH["FINAL_NORM_KEYS"].items(): + norm_state_dict[hf_key] = sum( + get_state( + loaded_tp_ranks, + key, + layer_idx=get_key(loaded_config, "num-layers") + 3, + sequential=sequential, + ) + ) / len(loaded_tp_ranks) + + if architecture == "neox": + final_layer_norm = hf_transformer.final_layer_norm + else: + final_layer_norm = hf_transformer.norm + + final_layer_norm.load_state_dict(norm_state_dict) + + # Load output embedding + if not sequential: + loaded_tp_ranks = load_partitions( + input_checkpoint_path, + mp_partitions, + get_key(loaded_config, "num-layers") + 4, + sequential=sequential, + ) + # output embedding / LM head + if architecture == "neox": # name of lm head / final linear proj varies + lm_head = hf_model.embed_out + else: + lm_head = hf_model.lm_head + lm_head.load_state_dict( + { + "weight": torch.cat( + get_state( + loaded_tp_ranks, + "final_linear.weight", + 
layer_idx=get_key(loaded_config, "num-layers") + 4, + sequential=sequential, + ), + dim=0, + ), + } + ) + + del loaded_tp_ranks + + return hf_model + + +def main(input_args=None, overwrite_values=None): + from huggingface_hub import create_repo, HfApi + + parser = argparse.ArgumentParser( + description="Merge MP partitions and convert to HF Model." + ) + parser.add_argument( + "--input_dir", + type=str, + help="Path to NeoX checkpoint, e.g. /path/to/model/global_step143000", + ) + parser.add_argument( + "--config_file", + type=str, + help="Path to config file for the input NeoX checkpoint.", + ) + parser.add_argument( + "--output_dir", + type=str, + help="Output dir, where to save the HF Model, tokenizer, and configs", + ) + parser.add_argument( + "--precision", + type=str, + default="auto", + help="What precision to save the model into. Defaults to auto, which auto-detects which 16-bit dtype to save into, or falls back to fp32.", + ) + parser.add_argument( + "--no_save_tokenizer", + action="store_true", + help="Whether to skip saving the tokenizer alongside a model.", + ) + parser.add_argument( + "--architecture", + type=str, + default="neox", + help="What HF model class type to export into.", + ) + args = parser.parse_args(input_args) + + # validate arguments + assert args.precision in [ + "auto", + "fp16", + "bf16", + "fp32", + ], f"expected --precision to be one of 'auto', 'fp16', 'bf16', 'fp32' but got '{args.precision}' !" + assert args.architecture in [ + "neox", + "llama", + "mistral", + ], f"expected --architecture to be one of 'neox', 'mistral', 'llama', but got '{args.architecture}' !" + + if args.architecture == "mistral": + # Mistral Support Coming Soon + assert ( + False + ), "Got num_key_value_heads != num_attention_heads, but Grouped-Query Attention is not yet supported by NeoX." + + with open(args.config_file) as f: + loaded_config = yaml.full_load(f) + if overwrite_values: + loaded_config.update(overwrite_values) + + # Determine the checkpoint format of the model. + # DeepSpeed saves models wrapped in a PipelineModule differently from those not. + # PipelineModule models are saved as per-layer state dicts per TP shard, + # while Sequential model state dicts are saved all together in one mp_rank_xx_model_states.pt + # file per tensor/model parallel shard. + pipeline_world_size = get_key(loaded_config, "pipe-parallel-size", 1) + if pipeline_world_size == 0: + sequential = True + print( + f"Detected 'pipe-parallel-size' of {pipeline_world_size}, assuming model is saved as Sequential..." + ) + else: + sequential = False + print( + f"Detected 'pipe-parallel-size' of {pipeline_world_size}, assuming model is saved as PipelineModule..." + ) + + # convert the model to HF. + hf_model = convert( + args.input_dir, + loaded_config, + args.output_dir, + sequential=sequential, + architecture=args.architecture, + ) + + # Save to disk. + hf_model.save_pretrained(args.output_dir) + + if not args.no_save_tokenizer: + # save tokenizer to directory as well, for easy loading of model as a HF model. + tokenizer_type = get_key(loaded_config, "tokenizer-type") + + if tokenizer_type == "HFTokenizer": # TODO: handle sentencepiece tokenizers? + print(f"saving tokenizer from file {get_key(loaded_config, 'vocab-file')}") + print( + "Warning: please check that your model config and tokenizer end with the correct special tokens (EOS, BOS)." 
+ ) + from transformers import PreTrainedTokenizerFast + + tokenizer = PreTrainedTokenizerFast( + tokenizer_file=get_key(loaded_config, "vocab-file") + ) + print("loaded tokenizer: ", tokenizer) + tokenizer.save_pretrained(args.output_dir) + print("tokenizer saved!") + + +if __name__ == "__main__": + + # before running script: + # `pip install --upgrade transformers` + # `huggingface-cli login` + # + main() diff --git a/tools/ckpts/convert_raw_llama_weights_to_neox.py b/tools/ckpts/convert_raw_llama_weights_to_neox.py new file mode 100644 index 000000000..6fc9f8a7e --- /dev/null +++ b/tools/ckpts/convert_raw_llama_weights_to_neox.py @@ -0,0 +1,693 @@ +# Copyright (c) 2024, EleutherAI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +import torch +import json +import math +import tqdm.auto as tqdm + + +INTERMEDIATE_SIZE_MAP = { + "7B": 11008, + "13B": 13824, + "30B": 17920, + "34B": 22016, + "65B": 22016, + "70B": 28672, + "mistral-7B-v0.1": 14336, +} +NUM_SHARDS = { + "7B": 1, + "13B": 2, + "30B": 4, + "34B": 4, + "65B": 8, + "70B": 8, + "mistral-7B-v0.1": 1, +} + + +def compute_intermediate_size(n): + return int(math.ceil(n * 8 / 3) + 255) // 256 * 256 + + +def read_json(path): + with open(path, "r") as f: + return json.load(f) + + +def write_json(text, path): + with open(path, "w") as f: + json.dump(text, f) + + +def write_file(text, path): + with open(path, "w") as f: + f.write(text) + + +def convert_model_pipeline( + output_base_path, input_base_path, model_size: str, num_output_shards: int +): + assert model_size in NUM_SHARDS + + model_path = os.path.join(output_base_path, "global_step0") + os.makedirs(model_path, exist_ok=True) + write_file("global_step0", os.path.join(output_base_path, "latest")) + + params = read_json(os.path.join(input_base_path, "params.json")) + num_input_shards = NUM_SHARDS[model_size] + num_layers = params["n_layers"] + num_heads = params["n_heads"] + if "n_kv_heads" in params: + assert ( + False + ), "Found `n_kv_heads` != `n_heads` in checkpoint config. 
However, Grouped-Query Attention is not yet supported by NeoX" + num_kv_heads = params["n_kv_heads"] + else: + num_kv_heads = num_heads + num_kv_heads_per_input_shard = num_kv_heads // num_input_shards + num_heads_per_input_shard = num_heads // num_input_shards + num_heads_per_output_shard = num_heads // num_output_shards + num_kv_heads_per_output_shard = num_kv_heads // num_output_shards + hidden_size = params["dim"] + dims_per_head = hidden_size // num_heads + # base = 10000.0 + # inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) + + def permute_rotary(w): + if w.shape == (num_heads, dims_per_head, hidden_size): + N_HEADS = num_heads + elif w.shape == (num_kv_heads, dims_per_head, hidden_size): + N_HEADS = num_kv_heads + else: + assert False + return ( + w.view(N_HEADS, dims_per_head // 2, 2, hidden_size) + .transpose(1, 2) + .reshape(N_HEADS, dims_per_head, hidden_size) + ) + + pbar = tqdm.tqdm(total=num_input_shards + num_layers + 3) + + pbar.set_description(f"Loading shard") + loaded = [] + for i in range(num_input_shards): + loaded.append( + torch.load( + os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), + map_location="cpu", + ) + ) + pbar.set_description(f"Loaded shard {i}/{num_input_shards}") + pbar.update(1) + helper = Helper( + loaded=loaded, + model_path=model_path, + num_output_shards=num_output_shards, + model_size=model_size, + pipeline_parallel=False, + ) + + sequential_cache = [{} for _ in range(num_output_shards)] + + # Embedding in + embeddings_in = torch.cat( + [ + loaded[rank]["tok_embeddings.weight"].cpu() + for rank in range(num_input_shards) + ], + dim=1, + ) + print(embeddings_in.shape) + helper.save_shards( + {"word_embeddings.weight": helper.shard(embeddings_in, dim=0)}, layer_i=0 + ) + helper.del_loaded("tok_embeddings.weight") + pbar.set_description(f"Saved embeddings") + pbar.update(1) + + # Norms + helper.save_duplicates( + {"norm.scale": loaded[0]["norm.weight"]}, layer_i=num_layers + 3 + ) + helper.del_loaded("norm.weight") + pbar.set_description(f"Saved final norm") + pbar.update(1) + + # Embedding out + embeddings_out = torch.cat( + [loaded[rank]["output.weight"].cpu() for rank in range(num_input_shards)], dim=0 + ) + helper.save_shards( + {"final_linear.weight": helper.shard(embeddings_out, dim=0)}, + layer_i=num_layers + 4, + ) + helper.del_loaded("output.weight") + pbar.set_description(f"Saved out embeddings") + pbar.update(1) + + # Layers + if model_size == "7B": + rope_freqs = loaded[0]["layers.0.attention.inner_attention.rope.freqs"] + helper.del_loaded("layers.0.attention.inner_attention.rope.freqs") + elif "mistral" in model_size: + rope_freqs = None + else: + rope_freqs = loaded[0]["rope.freqs"] + helper.del_loaded("rope.freqs") + for layer_i in range(num_layers): + + # Linear + attn_wo = helper.shard( + torch.cat( + [ + loaded[rank][f"layers.{layer_i}.attention.wo.weight"] + for rank in range(num_input_shards) + ], + dim=1, + ), + dim=1, + ) + mlp_w1 = helper.shard( + torch.cat( + [ + loaded[rank][f"layers.{layer_i}.feed_forward.w1.weight"] + for rank in range(num_input_shards) + ], + dim=0, + ), + dim=0, + ) + mlp_w2 = helper.shard( + torch.cat( + [ + loaded[rank][f"layers.{layer_i}.feed_forward.w2.weight"] + for rank in range(num_input_shards) + ], + dim=1, + ), + dim=1, + ) + mlp_w3 = helper.shard( + torch.cat( + [ + loaded[rank][f"layers.{layer_i}.feed_forward.w3.weight"] + for rank in range(num_input_shards) + ], + dim=0, + ), + dim=0, + ) + 
helper.del_loaded(f"layers.{layer_i}.attention.wo.weight") + helper.del_loaded(f"layers.{layer_i}.feed_forward.w1.weight") + helper.del_loaded(f"layers.{layer_i}.feed_forward.w2.weight") + helper.del_loaded(f"layers.{layer_i}.feed_forward.w3.weight") + + # Attention + w_q = permute_rotary( + torch.cat( + [ + loaded[rank][f"layers.{layer_i}.attention.wq.weight"].view( + num_heads_per_input_shard, dims_per_head, hidden_size + ) + for rank in range(num_input_shards) + ], + dim=0, + ) + ) + w_k = permute_rotary( + torch.cat( + [ + loaded[rank][f"layers.{layer_i}.attention.wk.weight"].view( + num_kv_heads_per_input_shard, dims_per_head, hidden_size + ) + for rank in range(num_input_shards) + ], + dim=0, + ) + ).view(num_heads, int(dims_per_head * (num_kv_heads / num_heads)), hidden_size) + + w_v = torch.cat( + [ + loaded[rank][f"layers.{layer_i}.attention.wv.weight"].view( + num_kv_heads_per_input_shard, dims_per_head, hidden_size + ) + for rank in range(num_input_shards) + ], + dim=0, + ).view(num_heads, int(dims_per_head * (num_kv_heads / num_heads)), hidden_size) + + sharded_qkv = torch.cat( + [ + helper.shard( + w_q, dim=0 + ), # num_output_shards, num_heads_per_output_shard, dims_per_head, hidden_size + helper.shard(w_k, dim=0), + helper.shard(w_v, dim=0), + ], + dim=2, + ) # num_output_shards, num_heads_per_output_shard, QKV=3, dims_per_head, hidden_size + + sharded_qkv = sharded_qkv.view( + num_output_shards, + num_heads_per_output_shard * dims_per_head + + 2 * num_kv_heads_per_output_shard * dims_per_head, + hidden_size, + ) + helper.del_loaded(f"layers.{layer_i}.attention.wq.weight") + helper.del_loaded(f"layers.{layer_i}.attention.wk.weight") + helper.del_loaded(f"layers.{layer_i}.attention.wv.weight") + + # Duplicated + input_layernorm = loaded[0][f"layers.{layer_i}.attention_norm.weight"] + post_attention_layernorm = loaded[0][f"layers.{layer_i}.ffn_norm.weight"] + helper.del_loaded(f"layers.{layer_i}.attention_norm.weight") + helper.del_loaded(f"layers.{layer_i}.ffn_norm.weight") + + for out_rank in range(num_output_shards): + helper.save( + { + "attention.query_key_value.weight": sharded_qkv[out_rank], + # Sharded layers + "attention.dense.weight": attn_wo[out_rank].clone(), + "mlp.w1.weight": mlp_w1[out_rank].clone(), + "mlp.w2.weight": mlp_w2[out_rank].clone(), + "mlp.w3.weight": mlp_w3[out_rank].clone(), + # Duplicated layers + "input_layernorm.scale": input_layernorm, + "post_attention_layernorm.scale": post_attention_layernorm, + **( + {"attention.rotary_emb.inv_freq": rope_freqs} + if "mistral" not in model_size + else {} + ), + }, + layer_i=layer_i + 2, + rank=out_rank, + ) + + pbar.set_description(f"Saved layer {layer_i} / {num_layers}") + pbar.update(1) + + model_state = { + "dp_world_size": 1, + "mp_world_size": num_output_shards, + "module": {}, + "optimizer": {}, + "global_steps": 1, + "skipped_steps": 1, + "iteration": 1, + } + for rank in range(num_output_shards): + torch.save( + model_state, os.path.join(model_path, f"mp_rank_{rank:02d}_model_states.pt") + ) + pbar.set_description("Done.") + + +def convert_model_sequential( + output_base_path, input_base_path, model_size: str, num_output_shards: int +): + assert model_size in NUM_SHARDS + + model_path = os.path.join(output_base_path, "global_step0") + os.makedirs(model_path, exist_ok=True) + write_file("global_step0", os.path.join(output_base_path, "latest")) + + params = read_json(os.path.join(input_base_path, "params.json")) + num_input_shards = NUM_SHARDS[model_size] + num_layers = params["n_layers"] + num_heads 
= params["n_heads"] + if "n_kv_heads" in params: + assert ( + False + ), "Found `n_kv_heads` != `n_heads` in checkpoint config. However, Grouped-Query Attention is not yet supported by NeoX" + num_kv_heads = params["n_kv_heads"] + else: + num_kv_heads = num_heads + num_kv_heads_per_input_shard = num_kv_heads // num_input_shards + num_heads_per_input_shard = num_heads // num_input_shards + num_heads_per_output_shard = num_heads // num_output_shards + num_kv_heads_per_output_shard = num_kv_heads // num_output_shards + hidden_size = params["dim"] + dims_per_head = hidden_size // num_heads + # base = 10000.0 + # inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) + + def permute_rotary(w): + if w.shape == (num_heads, dims_per_head, hidden_size): + N_HEADS = num_heads + elif w.shape == (num_kv_heads, dims_per_head, hidden_size): + N_HEADS = num_kv_heads + else: + assert False + return ( + w.view(N_HEADS, dims_per_head // 2, 2, hidden_size) + .transpose(1, 2) + .reshape(N_HEADS, dims_per_head, hidden_size) + ) + + pbar = tqdm.tqdm(total=num_input_shards + num_output_shards) + + pbar.set_description(f"Loading shard") + loaded = [] + for i in range(num_input_shards): + loaded.append( + torch.load( + os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), + map_location="cpu", + ) + ) + pbar.set_description(f"Loaded shard {i}/{num_input_shards}") + pbar.update(1) + helper = Helper( + loaded=loaded, + model_path=model_path, + num_output_shards=num_output_shards, + model_size=model_size, + pipeline_parallel=False, + ) + + # Embedding in + embeddings_in = torch.cat( + [ + loaded[rank]["tok_embeddings.weight"].cpu() + for rank in range(num_input_shards) + ], + dim=1, + ) + + helper.add_sequential_shard( + {"word_embeddings.weight": helper.shard(embeddings_in, dim=0)}, layer_i=0 + ) + helper.del_loaded("tok_embeddings.weight") + + # Norms + helper.add_sequential_duplicates( + {"norm.scale": loaded[0]["norm.weight"]}, layer_i=num_layers + 3 + ) + helper.del_loaded("norm.weight") + + # Embedding out + embeddings_out = torch.cat( + [loaded[rank]["output.weight"].cpu() for rank in range(num_input_shards)], dim=0 + ) + helper.add_sequential_shard( + {"final_linear.weight": helper.shard(embeddings_out, dim=0)}, + layer_i=num_layers + 4, + ) + helper.del_loaded("output.weight") + + # Layers + if model_size == "7B": + rope_freqs = loaded[0]["layers.0.attention.inner_attention.rope.freqs"] + helper.del_loaded("layers.0.attention.inner_attention.rope.freqs") + elif "mistral" in model_size: + rope_freqs = None + else: + rope_freqs = loaded[0]["rope.freqs"] + helper.del_loaded("rope.freqs") + for layer_i in range(num_layers): + + # Linear + attn_wo = helper.shard( + torch.cat( + [ + loaded[rank][f"layers.{layer_i}.attention.wo.weight"] + for rank in range(num_input_shards) + ], + dim=1, + ), + dim=1, + ) + mlp_w1 = helper.shard( + torch.cat( + [ + loaded[rank][f"layers.{layer_i}.feed_forward.w1.weight"] + for rank in range(num_input_shards) + ], + dim=0, + ), + dim=0, + ) + mlp_w2 = helper.shard( + torch.cat( + [ + loaded[rank][f"layers.{layer_i}.feed_forward.w2.weight"] + for rank in range(num_input_shards) + ], + dim=1, + ), + dim=1, + ) + mlp_w3 = helper.shard( + torch.cat( + [ + loaded[rank][f"layers.{layer_i}.feed_forward.w3.weight"] + for rank in range(num_input_shards) + ], + dim=0, + ), + dim=0, + ) + helper.del_loaded(f"layers.{layer_i}.attention.wo.weight") + helper.del_loaded(f"layers.{layer_i}.feed_forward.w1.weight") + 
helper.del_loaded(f"layers.{layer_i}.feed_forward.w2.weight") + helper.del_loaded(f"layers.{layer_i}.feed_forward.w3.weight") + + # Attention + w_q = permute_rotary( + torch.cat( + [ + loaded[rank][f"layers.{layer_i}.attention.wq.weight"].view( + num_heads_per_input_shard, dims_per_head, hidden_size + ) + for rank in range(num_input_shards) + ], + dim=0, + ) + ) + + w_k = permute_rotary( + torch.cat( + [ + loaded[rank][f"layers.{layer_i}.attention.wk.weight"].view( + num_kv_heads_per_input_shard, dims_per_head, hidden_size + ) + for rank in range(num_input_shards) + ], + dim=0, + ) + ).view(num_heads, int(dims_per_head * (num_kv_heads / num_heads)), hidden_size) + + w_v = torch.cat( + [ + loaded[rank][f"layers.{layer_i}.attention.wv.weight"].view( + num_kv_heads_per_input_shard, dims_per_head, hidden_size + ) + for rank in range(num_input_shards) + ], + dim=0, + ).view(num_heads, int(dims_per_head * (num_kv_heads / num_heads)), hidden_size) + + sharded_qkv = torch.cat( + [ + helper.shard( + w_q, dim=0 + ), # num_output_shards, num_heads_per_output_shard, dims_per_head, hidden_size + helper.shard(w_k, dim=0), + helper.shard(w_v, dim=0), + ], + dim=2, + ) # num_output_shards, num_heads_per_output_shard, QKV=3, dims_per_head, hidden_size + + sharded_qkv = sharded_qkv.view( + num_output_shards, + num_heads_per_output_shard * dims_per_head + + 2 * num_kv_heads_per_output_shard * dims_per_head, + hidden_size, + ) + + helper.del_loaded(f"layers.{layer_i}.attention.wq.weight") + helper.del_loaded(f"layers.{layer_i}.attention.wk.weight") + helper.del_loaded(f"layers.{layer_i}.attention.wv.weight") + + # Duplicated + input_layernorm = loaded[0][f"layers.{layer_i}.attention_norm.weight"] + post_attention_layernorm = loaded[0][f"layers.{layer_i}.ffn_norm.weight"] + helper.del_loaded(f"layers.{layer_i}.attention_norm.weight") + helper.del_loaded(f"layers.{layer_i}.ffn_norm.weight") + + for out_rank in range(num_output_shards): + helper.add_sequential( + { + "attention.query_key_value.weight": sharded_qkv[out_rank], + # Sharded layers + "attention.dense.weight": attn_wo[out_rank].clone(), + "mlp.w1.weight": mlp_w1[out_rank].clone(), + "mlp.w2.weight": mlp_w2[out_rank].clone(), + "mlp.w3.weight": mlp_w3[out_rank].clone(), + # Duplicated layers + "input_layernorm.scale": input_layernorm, + "post_attention_layernorm.scale": post_attention_layernorm, + **( + {"attention.rotary_emb.inv_freq": rope_freqs} + if "mistral" not in model_size + else {} + ), + }, + layer_i=layer_i + 2, + rank=out_rank, + ) + + for rank in range(num_output_shards): + model_state = { + "dp_world_size": 1, + "mp_world_size": num_output_shards, + "module": helper.sequential_cache[rank], + "optimizer": {}, + "global_steps": 1, + "skipped_steps": 1, + "iteration": 1, + } + torch.save( + model_state, os.path.join(model_path, f"mp_rank_{rank:02d}_model_states.pt") + ) + pbar.set_description(f"Saved shard {rank}") + pbar.update(1) + pbar.set_description("Done.") + + +class Helper: + def __init__( + self, loaded, model_size, num_output_shards, model_path, pipeline_parallel + ): + self.loaded = loaded + self.model_size = model_size + self.num_output_shards = num_output_shards + self.model_path = model_path + + self.pipeline_parallel = pipeline_parallel + self.sequential_cache = [{} for _ in range(num_output_shards)] + + def del_loaded(self, key: str): + # Remove from memory as we go along + for loaded_shared in self.loaded: + del loaded_shared[key] + + def save_shards(self, dictionary, layer_i: int): + for k, v in dictionary.items(): + assert 
v.shape[0] == self.num_output_shards + for rank in range(self.num_output_shards): + torch.save( + {k: v[rank].clone() for k, v in dictionary.items()}, + self.save_path(layer_i=layer_i, rank=rank), + ) + + def save_duplicates(self, dictionary, layer_i: int): + for rank in range(self.num_output_shards): + torch.save( + {k: v.clone() for k, v in dictionary.items()}, + self.save_path(layer_i=layer_i, rank=rank), + ) + + def save(self, obj, layer_i, rank): + torch.save(obj, self.save_path(layer_i=layer_i, rank=rank)) + + def shard(self, x, dim): + x_shape = list(x.shape) + assert x_shape[dim] % self.num_output_shards == 0 + new_x_shape = ( + x_shape[:dim] + + [self.num_output_shards, x_shape[dim] // self.num_output_shards] + + x_shape[dim + 1 :] + ) + x = x.view(*new_x_shape) + return torch.movedim(x, 0, dim) + + def save_path(self, layer_i, rank): + return os.path.join( + self.model_path, f"layer_{layer_i:02d}-model_{rank:02d}-model_states.pt" + ) + + def add_sequential_shard(self, dictionary, layer_i): + assert not self.pipeline_parallel + for k, v in dictionary.items(): + for rank in range(self.num_output_shards): + self.sequential_cache[rank][f"sequential.{layer_i}.{k}"] = v[ + rank + ].clone() + + def add_sequential_duplicates(self, dictionary, layer_i): + assert not self.pipeline_parallel + for k, v in dictionary.items(): + for rank in range(self.num_output_shards): + self.sequential_cache[rank][f"sequential.{layer_i}.{k}"] = v.clone() + + def add_sequential(self, dictionary, layer_i, rank): + assert not self.pipeline_parallel + for k, v in dictionary.items(): + self.sequential_cache[rank][f"sequential.{layer_i}.{k}"] = v.clone() + + +def main(): + parser = argparse.ArgumentParser( + description="Convert raw LLaMA or Mistral checkpoints to GPT-NeoX format." + ) + parser.add_argument( + "--input_dir", + help="Location of parent directory, which contains tokenizer.model and model weights subfolders", + ) + parser.add_argument( + "--model_size", + choices=["7B", "mistral-7B-v0.1", "13B", "30B", "34B", "65B", "tokenizer_only"], + ) + parser.add_argument( + "--output_dir", + help="Location to write GPT-NeoX model", + ) + parser.add_argument( + "--num_output_shards", + type=int, + default=1, + ) + parser.add_argument( + "--pipeline_parallel", + action="store_true", + help="Only use if PP>1", + ) + args = parser.parse_args() + if args.pipeline_parallel: + print("parallel") + convert_model_pipeline( + output_base_path=args.output_dir, + input_base_path=os.path.join(args.input_dir, args.model_size), + model_size=args.model_size, + num_output_shards=args.num_output_shards, + ) + else: + print("sequential") + convert_model_sequential( + output_base_path=args.output_dir, + input_base_path=os.path.join(args.input_dir, args.model_size), + model_size=args.model_size, + num_output_shards=args.num_output_shards, + ) + + +if __name__ == "__main__": + main() diff --git a/tools/ckpts/inspect_checkpoints.py b/tools/ckpts/inspect_checkpoints.py new file mode 100644 index 000000000..a23d456ca --- /dev/null +++ b/tools/ckpts/inspect_checkpoints.py @@ -0,0 +1,335 @@ +# Copyright (c) 2024, EleutherAI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Adapted from https://github.com/awaelchli/pytorch-lightning-snippets/blob/master/checkpoint/peek.py + +import code +import os +import re +from argparse import ArgumentParser, Namespace +from collections.abc import Mapping, Sequence +from pathlib import Path + +import torch + + +class COLORS: + BLUE = "\033[94m" + CYAN = "\033[96m" + GREEN = "\033[92m" + RED = "\033[31m" + YELLOW = "\033[33m" + MAGENTA = "\033[35m" + WHITE = "\033[37m" + UNDERLINE = "\033[4m" + END = "\033[0m" + + +PRIMITIVE_TYPES = (int, float, bool, str, type) + + +def natural_sort(l): + convert = lambda text: int(text) if text.isdigit() else text.lower() + alphanum_key = lambda key: [convert(c) for c in re.split("([0-9]+)", str(key))] + return sorted(l, key=alphanum_key) + + +def sizeof_fmt(num, suffix="B"): + for unit in ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]: + if abs(num) < 1024.0: + return "%3.1f%s%s" % (num, unit, suffix) + num /= 1024.0 + return "%.1f%s%s" % (num, "Yi", suffix) + + +def pretty_print(contents: dict): + """Prints a nice summary of the top-level contents in a checkpoint dictionary.""" + col_size = max(len(str(k)) for k in contents) + for k, v in sorted(contents.items()): + key_length = len(str(k)) + line = " " * (col_size - key_length) + line += f"{k}: {COLORS.BLUE}{type(v).__name__}{COLORS.END}" + if isinstance(v, dict): + pretty_print(v) + elif isinstance(v, PRIMITIVE_TYPES): + line += f" = " + line += f"{COLORS.CYAN}{repr(v)}{COLORS.END}" + elif isinstance(v, Sequence): + line += ", " + line += f"{COLORS.CYAN}len={len(v)}{COLORS.END}" + elif isinstance(v, torch.Tensor): + if v.ndimension() in (0, 1) and v.numel() == 1: + line += f" = " + line += f"{COLORS.CYAN}{v.item()}{COLORS.END}" + else: + line += ", " + line += f"{COLORS.CYAN}shape={list(v.shape)}{COLORS.END}" + line += ", " + line += f"{COLORS.CYAN}dtype={v.dtype}{COLORS.END}" + line += ( + ", " + + f"{COLORS.CYAN}size={sizeof_fmt(v.nelement() * v.element_size())}{COLORS.END}" + ) + print(line) + + +def common_entries(*dcts): + if not dcts: + return + for i in set(dcts[0]).intersection(*dcts[1:]): + yield (i,) + tuple(d[i] for d in dcts) + + +def pretty_print_double(contents1: dict, contents2: dict, args): + """Prints a nice summary of the top-level contents in a checkpoint dictionary.""" + col_size = max( + max(len(str(k)) for k in contents1), max(len(str(k)) for k in contents2) + ) + common_keys = list(contents1.keys() & contents2.keys()) + uncommon_keys_1 = [i for i in contents2.keys() if i not in common_keys] + uncommon_keys_2 = [i for i in contents1.keys() if i not in common_keys] + diffs_found = False + if uncommon_keys_1 + uncommon_keys_2: + diffs_found = True + if uncommon_keys_1: + print( + f"{COLORS.RED}{len(uncommon_keys_1)} key(s) found in ckpt 1 that isn't present in ckpt 2:{COLORS.END} \n\t{COLORS.BLUE}{' '.join(uncommon_keys_1)}{COLORS.END}" + ) + if uncommon_keys_2: + print( + f"{COLORS.RED}{len(uncommon_keys_2)} key(s) found in ckpt 2 that isn't present in ckpt 1:{COLORS.END} \n\t{COLORS.BLUE}{' '.join(uncommon_keys_2)}{COLORS.END}" + ) + for k, v1, v2 in sorted(common_entries(contents1, contents2)): + 
key_length = len(str(k)) + line = " " * (col_size - key_length) + if type(v1) != type(v2): + print( + f"{COLORS.RED}{k} is a different type between ckpt1 and ckpt2: ({type(v1).__name__} vs. {type(v2).__name__}){COLORS.END}" + ) + continue + else: + prefix = f"{k}: {COLORS.BLUE}{type(v1).__name__} | {type(v2).__name__}{COLORS.END}" + if isinstance(v1, dict): + pretty_print_double(v1, v2, args) + elif isinstance(v1, PRIMITIVE_TYPES): + if repr(v1) != repr(v2): + c = COLORS.RED + line += f" = " + line += f"{c}{repr(v1)} | {repr(v2)}{COLORS.END}" + else: + c = COLORS.CYAN + if not args.diff: + line += f" = " + line += f"{c}{repr(v1)} | {repr(v2)}{COLORS.END}" + elif isinstance(v1, Sequence): + if len(v1) != len(v2): + c = COLORS.RED + line += ", " + line += f"{c}len={len(v1)} | len={len(v2)}{COLORS.END}" + else: + c = COLORS.CYAN + if not args.diff: + line += ", " + line += f"{c}len={len(v1)} | len={len(v2)}{COLORS.END}" + elif isinstance(v1, torch.Tensor): + if v1.ndimension() != v2.ndimension(): + c = COLORS.RED + else: + c = COLORS.CYAN + + if (v1.ndimension() in (0, 1) and v1.numel() == 1) and ( + v2.ndimension() in (0, 1) and v2.numel() == 1 + ): + if not args.diff: + line += f" = " + line += f"{c}{v1.item()} | {c}{v2.item()}{COLORS.END}" + else: + if list(v1.shape) != list(v2.shape): + c = COLORS.RED + line += ", " + line += f"{c}shape={list(v1.shape)} | shape={list(v2.shape)}{COLORS.END}" + else: + c = COLORS.CYAN + if not args.diff: + line += ", " + line += f"{c}shape={list(v1.shape)} | shape={list(v2.shape)}{COLORS.END}" + if v1.dtype != v2.dtype: + c = COLORS.RED + line += f"{c}dtype={v1.dtype} | dtype={v2.dtype}{COLORS.END}" + + else: + c = COLORS.CYAN + if not args.diff: + line += ", " + line += f"{c}dtype={v1.dtype} | dtype={v2.dtype}{COLORS.END}" + if list(v1.shape) == list(v2.shape): + if torch.allclose(v1, v2): + if not args.diff: + line += f", {COLORS.CYAN}VALUES EQUAL{COLORS.END}" + else: + line += f", {COLORS.RED}VALUES DIFFER{COLORS.END}" + + if line.replace(" ", "") != "": + line = prefix + line + print(line) + diffs_found = True + if args.diff and not diffs_found: + pass + else: + if not args.diff: + print("\n") + + return diffs_found + + +def get_attribute(obj: object, name: str) -> object: + if isinstance(obj, Mapping): + return obj[name] + if isinstance(obj, Namespace): + return obj.name + return getattr(object, name) + + +def get_files(pth): + if os.path.isdir(pth): + files = list(Path(pth).glob("*.pt")) + list(Path(pth).glob("*.ckpt")) + elif os.path.isfile(pth): + assert pth.endswith(".pt") or pth.endswith(".ckpt") + files = [Path(pth)] + else: + raise ValueError("Dir / File not found.") + return natural_sort(files) + + +def peek(args: Namespace): + + files = get_files(args.dir) + + for file in files: + file = Path(file).absolute() + print(f"{COLORS.GREEN}{file.name}:{COLORS.END}") + ckpt = torch.load(file, map_location=torch.device("cpu")) + selection = dict() + attribute_names = args.attributes or list(ckpt.keys()) + for name in attribute_names: + parts = name.split("/") + current = ckpt + for part in parts: + current = get_attribute(current, part) + selection.update({name: current}) + pretty_print(selection) + print("\n") + + if args.interactive: + code.interact( + banner="Entering interactive shell. 
You can access the checkpoint contents through the local variable 'checkpoint'.", + local={"checkpoint": ckpt, "torch": torch}, + ) + + +def get_shared_fnames(files_1, files_2): + names_1 = [Path(i).name for i in files_1] + names_1_parent = Path(files_1[0]).parent + names_2 = [Path(i).name for i in files_2] + names_2_parent = Path(files_2[0]).parent + shared_names = list(set.intersection(*map(set, [names_1, names_2]))) + return [names_1_parent / i for i in shared_names], [ + names_2_parent / i for i in shared_names + ] + + +def get_selection(filename, args): + ckpt = torch.load(filename, map_location=torch.device("cpu")) + selection = dict() + attribute_names = args.attributes or list(ckpt.keys()) + for name in attribute_names: + parts = name.split("/") + current = ckpt + for part in parts: + current = get_attribute(current, part) + selection.update({name: current}) + return selection + + +def compare(args: Namespace): + dirs = [i.strip() for i in args.dir.split(",")] + assert len(dirs) == 2, "Only works with 2 directories / files" + files_1 = get_files(dirs[0]) + files_2 = get_files(dirs[1]) + files_1, files_2 = get_shared_fnames(files_1, files_2) + + for file1, file2 in zip(files_1, files_2): + file1 = Path(file1).absolute() + file2 = Path(file2).absolute() + print(f"COMPARING {COLORS.GREEN}{file1.name} & {file2.name}:{COLORS.END}") + selection_1 = get_selection(file1, args) + selection_2 = get_selection(file2, args) + diffs_found = pretty_print_double(selection_1, selection_2, args) + if args.diff and diffs_found: + print( + f"{COLORS.RED}THE ABOVE DIFFS WERE FOUND IN {file1.name} & {file2.name} ^{COLORS.END}\n" + ) + + if args.interactive: + code.interact( + banner="Entering interactive shell. You can access the checkpoint contents through the local variable 'selection_1' / 'selection_2'.\nPress Ctrl-D to exit.", + local={ + "selection_1": selection_1, + "selection_2": selection_2, + "torch": torch, + }, + ) + + +def main(): + parser = ArgumentParser() + parser.add_argument( + "dir", + type=str, + help="The checkpoint dir to inspect. Must be either: \ + - a directory containing pickle binaries saved with 'torch.save' ending in .pt or .ckpt \ + - a single path to a .pt or .ckpt file \ + - two comma separated directories - in which case the script will *compare* the two checkpoints", + ) + parser.add_argument( + "--attributes", + nargs="*", + help="Name of one or several attributes to query. To access an attribute within a nested structure, use '/' as separator.", + default=None, + ) + parser.add_argument( + "--interactive", + "-i", + action="store_true", + help="Drops into interactive shell after printing the summary.", + ) + parser.add_argument( + "--compare", + "-c", + action="store_true", + help="If true, script will compare two directories separated by commas", + ) + parser.add_argument( + "--diff", "-d", action="store_true", help="In compare mode, only print diffs" + ) + + args = parser.parse_args() + if args.compare: + compare(args) + else: + peek(args) + + +if __name__ == "__main__": + main() diff --git a/tools/ckpts/merge20b.py b/tools/ckpts/merge20b.py new file mode 100644 index 000000000..09029bb5b --- /dev/null +++ b/tools/ckpts/merge20b.py @@ -0,0 +1,282 @@ +# Copyright (c) 2024, EleutherAI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+import torch
+import yaml
+import shutil
+from tqdm import auto as tqdm_lib
+
+
+VOCAB_SIZE = 50432
+IGNORED_MODEL_STATE_KEYS = [
+    "optimizer",
+    "random_rng_state",
+    "np_rng_state",
+    "torch_rng_state",
+    "cuda_rng_state",
+    "rng_tracker_states",
+]
+
+
+def modify_config(input_config_path, output_config_path, output_dir):
+    with open(input_config_path) as f:
+        loaded_config = yaml.full_load(f)
+
+    # replace model/pipeline parallel
+    loaded_config["model_parallel_size"] = 1
+    loaded_config["pipe_parallel_size"] = 1
+
+    # replace load / save directories:
+    loaded_config["load"] = output_dir
+    loaded_config["save"] = output_dir
+
+    # replace some other paths
+    loaded_config["vocab_file"] = os.path.join(output_dir, "20B_tokenizer.json")
+    loaded_config["log_dir"] = "./logs"
+
+    # keep the padded vocab size unchanged: with model parallelism removed, force
+    # 'make_vocab_size_divisible_by' to the full padded vocab size (50432) so the
+    # merged model's embedding dimension still matches the original checkpoint
+    loaded_config["make_vocab_size_divisible_by"] = VOCAB_SIZE
+
+    # remove zero optimizer
+    loaded_config["zero_optimization"]["stage"] = 0
+
+    with open(output_config_path, "w") as f:
+        yaml.dump(loaded_config, f)
+
+
+def modify_model_states(input_model_state_path, output_model_state_path):
+    model_state = torch.load(input_model_state_path)
+    for key in IGNORED_MODEL_STATE_KEYS:
+        del model_state[key]
+    model_state["mp_world_size"] = 1
+    model_state["dp_world_size"] = 1  # could make this configurable?
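+    # the merged checkpoint is meant to be loaded on a single GPU, so the world
+    # sizes recorded in the DeepSpeed model state (and the saved neox args below)
+    # must all agree with model_parallel_size = 1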
+ model_state["args"]["model_parallel_size"] = 1 + model_state["args"]["make_vocab_size_divisible_by"] = VOCAB_SIZE + torch.save(model_state, output_model_state_path) + + +def merge_model_weights(input_checkpoint_path, output_checkpoint_path): + pbar = tqdm_lib.tqdm(total=47) + + # Load transformer layers + for layer_i in range(44): + pbar.set_description(f"Merging layer {layer_i}") + filename_tp1 = f"layer_{layer_i + 2:02d}-model_00-model_states.pt" + filename_tp2 = f"layer_{layer_i + 2:02d}-model_01-model_states.pt" + loaded_tp1 = torch.load(os.path.join(input_checkpoint_path, filename_tp1)) + loaded_tp2 = torch.load(os.path.join(input_checkpoint_path, filename_tp2)) + # noinspection PyDictCreation + merged = {} + + # RowParallelLinear + merged["mlp.dense_4h_to_h.weight"] = torch.cat( + [ + loaded_tp1["mlp.dense_4h_to_h.weight"], + loaded_tp2["mlp.dense_4h_to_h.weight"], + ], + dim=1, + ) + merged["attention.dense.weight"] = torch.cat( + [ + loaded_tp1["attention.dense.weight"], + loaded_tp2["attention.dense.weight"], + ], + dim=1, + ) + merged["mlp.dense_4h_to_h.bias"] = ( + loaded_tp1["mlp.dense_4h_to_h.bias"] + loaded_tp2["mlp.dense_4h_to_h.bias"] + ) + merged["attention.dense.bias"] = ( + loaded_tp1["attention.dense.bias"] + loaded_tp2["attention.dense.bias"] + ) + + # Layer Norms + merged["input_layernorm.weight"] = ( + loaded_tp1["input_layernorm.weight"] + loaded_tp2["input_layernorm.weight"] + ) / 2 + merged["input_layernorm.bias"] = ( + loaded_tp1["input_layernorm.bias"] + loaded_tp2["input_layernorm.bias"] + ) / 2 + merged["post_attention_layernorm.weight"] = ( + loaded_tp1["post_attention_layernorm.weight"] + + loaded_tp2["post_attention_layernorm.weight"] + ) / 2 + merged["post_attention_layernorm.bias"] = ( + loaded_tp1["post_attention_layernorm.bias"] + + loaded_tp2["post_attention_layernorm.bias"] + ) / 2 + + # ColumnParallelLinear + merged["mlp.dense_h_to_4h.weight"] = torch.cat( + [ + loaded_tp1["mlp.dense_h_to_4h.weight"], + loaded_tp2["mlp.dense_h_to_4h.weight"], + ], + dim=0, + ) + merged["mlp.dense_h_to_4h.bias"] = torch.cat( + [ + loaded_tp1["mlp.dense_h_to_4h.bias"], + loaded_tp2["mlp.dense_h_to_4h.bias"], + ], + dim=0, + ) + merged["attention.query_key_value.weight"] = torch.cat( + [ + loaded_tp1["attention.query_key_value.weight"], + loaded_tp2["attention.query_key_value.weight"], + ], + dim=0, + ) + merged["attention.query_key_value.bias"] = torch.cat( + [ + loaded_tp1["attention.query_key_value.bias"], + loaded_tp2["attention.query_key_value.bias"], + ], + dim=0, + ) + + # Just take one + merged["attention.rotary_emb.inv_freq"] = loaded_tp1[ + "attention.rotary_emb.inv_freq" + ] + + torch.save(merged, os.path.join(output_checkpoint_path, filename_tp1)) + del loaded_tp1 + del loaded_tp2 + pbar.update(1) + + # Load input embedding + pbar.set_description(f"Merging input embedding") + loaded_tp1 = torch.load( + os.path.join(input_checkpoint_path, "layer_00-model_00-model_states.pt") + ) + loaded_tp2 = torch.load( + os.path.join(input_checkpoint_path, "layer_00-model_01-model_states.pt") + ) + merged = { + "word_embeddings.weight": torch.cat( + [ + loaded_tp1["word_embeddings.weight"], + loaded_tp2["word_embeddings.weight"], + ], + dim=0, + ) + } + torch.save( + merged, + os.path.join(output_checkpoint_path, "layer_00-model_00-model_states.pt"), + ) + del loaded_tp1 + del loaded_tp2 + pbar.update(1) + + # Load final layer norm + pbar.set_description(f"Merging final layer norm") + loaded_tp1 = torch.load( + os.path.join(input_checkpoint_path, 
"layer_47-model_00-model_states.pt") + ) + loaded_tp2 = torch.load( + os.path.join(input_checkpoint_path, "layer_47-model_01-model_states.pt") + ) + merged = { + "norm.weight": (loaded_tp1["norm.weight"] + loaded_tp2["norm.weight"]) / 2, + "norm.bias": (loaded_tp1["norm.bias"] + loaded_tp2["norm.bias"]) / 2, + } + torch.save( + merged, + os.path.join(output_checkpoint_path, "layer_47-model_00-model_states.pt"), + ) + del loaded_tp1 + del loaded_tp2 + pbar.update(1) + + # Load output embedding + pbar.set_description(f"Merging output embedding") + loaded_tp1 = torch.load( + os.path.join(input_checkpoint_path, "layer_48-model_00-model_states.pt") + ) + loaded_tp2 = torch.load( + os.path.join(input_checkpoint_path, "layer_48-model_01-model_states.pt") + ) + merged = { + "final_linear.weight": torch.cat( + [ + loaded_tp1["final_linear.weight"], + loaded_tp2["final_linear.weight"], + ], + dim=0, + ), + } + torch.save( + merged, + os.path.join(output_checkpoint_path, "layer_48-model_00-model_states.pt"), + ) + del loaded_tp1 + del loaded_tp2 + pbar.update(1) + pbar.set_description("Done.") + + +def merge(input_dir, output_dir): + input_checkpoint_path = os.path.join(input_dir, "global_step150000") + output_checkpoint_path = os.path.join(output_dir, "global_step150000") + os.makedirs(output_checkpoint_path, exist_ok=True) + os.makedirs(os.path.join(output_dir, "configs"), exist_ok=True) + for i in range(8): + modify_model_states( + input_model_state_path=os.path.join( + input_checkpoint_path, f"mp_rank_{i:02d}_model_states.pt" + ), + output_model_state_path=os.path.join( + output_checkpoint_path, f"mp_rank_{i:02d}_model_states.pt" + ), + ) + modify_config( + input_config_path=os.path.join(input_dir, "configs", "20B.yml"), + output_config_path=os.path.join(output_dir, "configs", "20B.yml"), + output_dir=output_dir, + ) + merge_model_weights( + input_checkpoint_path=input_checkpoint_path, + output_checkpoint_path=output_checkpoint_path, + ) + shutil.copyfile( + os.path.join(input_dir, "20B_tokenizer.json"), + os.path.join(output_dir, "20B_tokenizer.json"), + ) + with open(os.path.join(output_dir, "latest"), "w") as f: + f.write("global_step150000") + + +def main(): + parser = argparse.ArgumentParser(description="Merge 20B checkpoint.") + parser.add_argument( + "--input_dir", + type=str, + help='Checkpoint dir, which should contain (e.g. a folder named "global_step150000")', + ) + parser.add_argument( + "--output_dir", type=str, help="Output dir, to save the 1-GPU weights configs" + ) + args = parser.parse_args() + merge(args.input_dir, args.output_dir) + + +if __name__ == "__main__": + main() diff --git a/tools/ckpts/upload.py b/tools/ckpts/upload.py new file mode 100644 index 000000000..01d585be8 --- /dev/null +++ b/tools/ckpts/upload.py @@ -0,0 +1,52 @@ +# Copyright (c) 2024, EleutherAI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import sys + +from huggingface_hub import HfApi, create_repo + +converted_ckpt = sys.argv[1] +repo_name = sys.argv[2] +branch_name = sys.argv[3] +try: + create_repo(repo_name, repo_type="model", private=False) +except: + print("repo {repo_name} already exists!") + pass + +files = os.listdir(converted_ckpt) + +api = HfApi() +if branch_name != "main": + try: + api.create_branch( + repo_id=repo_name, + repo_type="model", + branch=branch_name, + ) + except: + print(f"branch {branch_name} already exists, try again...") +print(f"to upload: {files}") +for file in files: + print(f"Uploading {file} to branch {branch_name}...") + api.upload_file( + path_or_fileobj=os.path.join(converted_ckpt, file), + path_in_repo=file, + repo_id=repo_name, + repo_type="model", + commit_message=f"Upload {file}", + revision=branch_name, + ) + print(f"Successfully uploaded {file} !") diff --git a/tools/convert_gqa_llama.py b/tools/convert_gqa_llama.py new file mode 100644 index 000000000..7d0e98049 --- /dev/null +++ b/tools/convert_gqa_llama.py @@ -0,0 +1,672 @@ +# Copyright (c) 2023, EleutherAI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +import torch +import json +import math +import tqdm.auto as tqdm + + +INTERMEDIATE_SIZE_MAP = { + "7B": 11008, + "13B": 13824, + "30B": 17920, + "34B": 22016, + "65B": 22016, + "70B": 28672, + "Mistral-7B-v0.1": 14336, +} +NUM_SHARDS = { + "7B": 1, + "13B": 2, + "30B": 4, + "34B": 4, + "65B": 8, + "70B": 8, + "Mistral-7B-v0.1": 1, +} + + +def compute_intermediate_size(n): + return int(math.ceil(n * 8 / 3) + 255) // 256 * 256 + + +def read_json(path): + with open(path, "r") as f: + return json.load(f) + + +def write_json(text, path): + with open(path, "w") as f: + json.dump(text, f) + + +def write_file(text, path): + with open(path, "w") as f: + f.write(text) + + +def convert_model_pipeline( + output_base_path, input_base_path, model_size: str, num_output_shards: int +): + assert model_size in NUM_SHARDS + + model_path = os.path.join(output_base_path, "global_step0") + os.makedirs(model_path, exist_ok=True) + write_file("global_step0", os.path.join(output_base_path, "latest")) + + params = read_json(os.path.join(input_base_path, "params.json")) + num_input_shards = NUM_SHARDS[model_size] + num_layers = params["n_layers"] + num_heads = params["n_heads"] + if "n_kv_heads" in params: + num_kv_heads = params["n_kv_heads"] + else: + num_kv_heads = num_heads + num_kv_heads_per_input_shard = num_kv_heads // num_input_shards + num_heads_per_input_shard = num_heads // num_input_shards + num_heads_per_output_shard = num_heads // num_output_shards + num_kv_heads_per_output_shard = num_kv_heads // num_output_shards + hidden_size = params["dim"] + dims_per_head = hidden_size // num_heads + # base = 10000.0 + # inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) + + def permute_rotary(w): + if w.shape == (num_heads, dims_per_head, hidden_size): + N_HEADS = num_heads + elif w.shape == (num_kv_heads, dims_per_head, 
hidden_size): + N_HEADS = num_kv_heads + else: + assert False + return ( + w.view(N_HEADS, dims_per_head // 2, 2, hidden_size) + .transpose(1, 2) + .reshape(N_HEADS, dims_per_head, hidden_size) + ) + + pbar = tqdm.tqdm(total=num_input_shards + num_layers + 3) + + pbar.set_description(f"Loading shard") + loaded = [] + for i in range(num_input_shards): + loaded.append( + torch.load( + os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), + map_location="cpu", + ) + ) + pbar.set_description(f"Loaded shard {i}/{num_input_shards}") + pbar.update(1) + helper = Helper( + loaded=loaded, + model_path=model_path, + num_output_shards=num_output_shards, + model_size=model_size, + pipeline_parallel=False, + ) + + sequential_cache = [{} for _ in range(num_output_shards)] + + # Embedding in + embeddings_in = torch.cat( + [ + loaded[rank]["tok_embeddings.weight"].cpu() + for rank in range(num_input_shards) + ], + dim=1, + ) + print(embeddings_in.shape) + helper.save_shards( + {"word_embeddings.weight": helper.shard(embeddings_in, dim=0)}, layer_i=0 + ) + helper.del_loaded("tok_embeddings.weight") + pbar.set_description(f"Saved embeddings") + pbar.update(1) + + # Norms + helper.save_duplicates( + {"norm.scale": loaded[0]["norm.weight"]}, layer_i=num_layers + 3 + ) + helper.del_loaded("norm.weight") + pbar.set_description(f"Saved final norm") + pbar.update(1) + + # Embedding out + embeddings_out = torch.cat( + [loaded[rank]["output.weight"].cpu() for rank in range(num_input_shards)], dim=0 + ) + helper.save_shards( + {"final_linear.weight": helper.shard(embeddings_out, dim=0)}, + layer_i=num_layers + 4, + ) + helper.del_loaded("output.weight") + pbar.set_description(f"Saved out embeddings") + pbar.update(1) + + # Layers + if model_size == "7B": + rope_freqs = loaded[0]["layers.0.attention.inner_attention.rope.freqs"] + helper.del_loaded("layers.0.attention.inner_attention.rope.freqs") + else: + rope_freqs = loaded[0]["rope.freqs"] + helper.del_loaded("rope.freqs") + for layer_i in range(num_layers): + + # Linear + attn_wo = helper.shard( + torch.cat( + [ + loaded[rank][f"layers.{layer_i}.attention.wo.weight"] + for rank in range(num_input_shards) + ], + dim=1, + ), + dim=1, + ) + mlp_w1 = helper.shard( + torch.cat( + [ + loaded[rank][f"layers.{layer_i}.feed_forward.w1.weight"] + for rank in range(num_input_shards) + ], + dim=0, + ), + dim=0, + ) + mlp_w2 = helper.shard( + torch.cat( + [ + loaded[rank][f"layers.{layer_i}.feed_forward.w2.weight"] + for rank in range(num_input_shards) + ], + dim=1, + ), + dim=1, + ) + mlp_w3 = helper.shard( + torch.cat( + [ + loaded[rank][f"layers.{layer_i}.feed_forward.w3.weight"] + for rank in range(num_input_shards) + ], + dim=0, + ), + dim=0, + ) + helper.del_loaded(f"layers.{layer_i}.attention.wo.weight") + helper.del_loaded(f"layers.{layer_i}.feed_forward.w1.weight") + helper.del_loaded(f"layers.{layer_i}.feed_forward.w2.weight") + helper.del_loaded(f"layers.{layer_i}.feed_forward.w3.weight") + + # Attention + w_q = permute_rotary( + torch.cat( + [ + loaded[rank][f"layers.{layer_i}.attention.wq.weight"].view( + num_heads_per_input_shard, dims_per_head, hidden_size + ) + for rank in range(num_input_shards) + ], + dim=0, + ) + ) + w_k = permute_rotary( + torch.cat( + [ + loaded[rank][f"layers.{layer_i}.attention.wk.weight"].view( + num_kv_heads_per_input_shard, dims_per_head, hidden_size + ) + for rank in range(num_input_shards) + ], + dim=0, + ) + ).view(num_heads, int(dims_per_head * (num_kv_heads / num_heads)), hidden_size) + + w_v = torch.cat( + [ + 
loaded[rank][f"layers.{layer_i}.attention.wv.weight"].view( + num_kv_heads_per_input_shard, dims_per_head, hidden_size + ) + for rank in range(num_input_shards) + ], + dim=0, + ).view(num_heads, int(dims_per_head * (num_kv_heads / num_heads)), hidden_size) + + sharded_qkv = torch.cat( + [ + helper.shard( + w_q, dim=0 + ), # num_output_shards, num_heads_per_output_shard, dims_per_head, hidden_size + helper.shard(w_k, dim=0), + helper.shard(w_v, dim=0), + ], + dim=2, + ) # num_output_shards, num_heads_per_output_shard, QKV=3, dims_per_head, hidden_size + + sharded_qkv = sharded_qkv.view( + num_output_shards, + num_heads_per_output_shard * dims_per_head + + 2 * num_kv_heads_per_output_shard * dims_per_head, + hidden_size, + ) + helper.del_loaded(f"layers.{layer_i}.attention.wq.weight") + helper.del_loaded(f"layers.{layer_i}.attention.wk.weight") + helper.del_loaded(f"layers.{layer_i}.attention.wv.weight") + + # Duplicated + input_layernorm = loaded[0][f"layers.{layer_i}.attention_norm.weight"] + post_attention_layernorm = loaded[0][f"layers.{layer_i}.ffn_norm.weight"] + helper.del_loaded(f"layers.{layer_i}.attention_norm.weight") + helper.del_loaded(f"layers.{layer_i}.ffn_norm.weight") + + for out_rank in range(num_output_shards): + helper.save( + { + "attention.query_key_value.weight": sharded_qkv[out_rank], + # Sharded layers + "attention.dense.weight": attn_wo[out_rank].clone(), + "mlp.w1.weight": mlp_w1[out_rank].clone(), + "mlp.w2.weight": mlp_w2[out_rank].clone(), + "mlp.w3.weight": mlp_w3[out_rank].clone(), + # Duplicated layers + "input_layernorm.scale": input_layernorm, + "post_attention_layernorm.scale": post_attention_layernorm, + "attention.rotary_emb.inv_freq": rope_freqs, + }, + layer_i=layer_i + 2, + rank=out_rank, + ) + + pbar.set_description(f"Saved layer {layer_i} / {num_layers}") + pbar.update(1) + + model_state = { + "dp_world_size": 1, + "mp_world_size": num_output_shards, + "module": {}, + "optimizer": {}, + "global_steps": 1, + "skipped_steps": 1, + "iteration": 1, + } + for rank in range(num_output_shards): + torch.save( + model_state, os.path.join(model_path, f"mp_rank_{rank:02d}_model_states.pt") + ) + pbar.set_description("Done.") + + +def convert_model_sequential( + output_base_path, input_base_path, model_size: str, num_output_shards: int +): + assert model_size in NUM_SHARDS + + model_path = os.path.join(output_base_path, "global_step0") + os.makedirs(model_path, exist_ok=True) + write_file("global_step0", os.path.join(output_base_path, "latest")) + + params = read_json(os.path.join(input_base_path, "params.json")) + num_input_shards = NUM_SHARDS[model_size] + num_layers = params["n_layers"] + num_heads = params["n_heads"] + if "n_kv_heads" in params: + num_kv_heads = params["n_kv_heads"] + else: + num_kv_heads = num_heads + num_kv_heads_per_input_shard = num_kv_heads // num_input_shards + num_heads_per_input_shard = num_heads // num_input_shards + num_heads_per_output_shard = num_heads // num_output_shards + num_kv_heads_per_output_shard = num_kv_heads // num_output_shards + hidden_size = params["dim"] + dims_per_head = hidden_size // num_heads + # base = 10000.0 + # inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) + + def permute_rotary(w): + if w.shape == (num_heads, dims_per_head, hidden_size): + N_HEADS = num_heads + elif w.shape == (num_kv_heads, dims_per_head, hidden_size): + N_HEADS = num_kv_heads + else: + assert False + return ( + w.view(N_HEADS, dims_per_head // 2, 2, hidden_size) + .transpose(1, 2) + 
.reshape(N_HEADS, dims_per_head, hidden_size) + ) + + pbar = tqdm.tqdm(total=num_input_shards + num_output_shards) + + pbar.set_description(f"Loading shard") + loaded = [] + for i in range(num_input_shards): + loaded.append( + torch.load( + os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), + map_location="cpu", + ) + ) + pbar.set_description(f"Loaded shard {i}/{num_input_shards}") + pbar.update(1) + helper = Helper( + loaded=loaded, + model_path=model_path, + num_output_shards=num_output_shards, + model_size=model_size, + pipeline_parallel=False, + ) + + # Embedding in + embeddings_in = torch.cat( + [ + loaded[rank]["tok_embeddings.weight"].cpu() + for rank in range(num_input_shards) + ], + dim=1, + ) + + helper.add_sequential_shard( + {"word_embeddings.weight": helper.shard(embeddings_in, dim=0)}, layer_i=0 + ) + helper.del_loaded("tok_embeddings.weight") + + # Norms + helper.add_sequential_duplicates( + {"norm.scale": loaded[0]["norm.weight"]}, layer_i=num_layers + 3 + ) + helper.del_loaded("norm.weight") + + # Embedding out + embeddings_out = torch.cat( + [loaded[rank]["output.weight"].cpu() for rank in range(num_input_shards)], dim=0 + ) + helper.add_sequential_shard( + {"final_linear.weight": helper.shard(embeddings_out, dim=0)}, + layer_i=num_layers + 4, + ) + helper.del_loaded("output.weight") + + # Layers + if model_size == "7B": + rope_freqs = loaded[0]["layers.0.attention.inner_attention.rope.freqs"] + helper.del_loaded("layers.0.attention.inner_attention.rope.freqs") + else: + rope_freqs = loaded[0]["rope.freqs"] + helper.del_loaded("rope.freqs") + for layer_i in range(num_layers): + + # Linear + attn_wo = helper.shard( + torch.cat( + [ + loaded[rank][f"layers.{layer_i}.attention.wo.weight"] + for rank in range(num_input_shards) + ], + dim=1, + ), + dim=1, + ) + mlp_w1 = helper.shard( + torch.cat( + [ + loaded[rank][f"layers.{layer_i}.feed_forward.w1.weight"] + for rank in range(num_input_shards) + ], + dim=0, + ), + dim=0, + ) + mlp_w2 = helper.shard( + torch.cat( + [ + loaded[rank][f"layers.{layer_i}.feed_forward.w2.weight"] + for rank in range(num_input_shards) + ], + dim=1, + ), + dim=1, + ) + mlp_w3 = helper.shard( + torch.cat( + [ + loaded[rank][f"layers.{layer_i}.feed_forward.w3.weight"] + for rank in range(num_input_shards) + ], + dim=0, + ), + dim=0, + ) + helper.del_loaded(f"layers.{layer_i}.attention.wo.weight") + helper.del_loaded(f"layers.{layer_i}.feed_forward.w1.weight") + helper.del_loaded(f"layers.{layer_i}.feed_forward.w2.weight") + helper.del_loaded(f"layers.{layer_i}.feed_forward.w3.weight") + + # Attention + w_q = permute_rotary( + torch.cat( + [ + loaded[rank][f"layers.{layer_i}.attention.wq.weight"].view( + num_heads_per_input_shard, dims_per_head, hidden_size + ) + for rank in range(num_input_shards) + ], + dim=0, + ) + ) + w_k = permute_rotary( + torch.cat( + [ + loaded[rank][f"layers.{layer_i}.attention.wk.weight"].view( + num_kv_heads_per_input_shard, dims_per_head, hidden_size + ) + for rank in range(num_input_shards) + ], + dim=0, + ) + ).view(num_heads, int(dims_per_head * (num_kv_heads / num_heads)), hidden_size) + w_v = torch.cat( + [ + loaded[rank][f"layers.{layer_i}.attention.wv.weight"].view( + num_kv_heads_per_input_shard, dims_per_head, hidden_size + ) + for rank in range(num_input_shards) + ], + dim=0, + ).view(num_heads, int(dims_per_head * (num_kv_heads / num_heads)), hidden_size) + sharded_qkv = torch.cat( + [ + helper.shard( + w_q, dim=0 + ), # num_output_shards, num_heads_per_output_shard, dims_per_head, hidden_size + 
helper.shard(w_k, dim=0), + helper.shard(w_v, dim=0), + ], + dim=2, + ) # num_output_shards, num_heads_per_output_shard, QKV=3, dims_per_head, hidden_size + + sharded_qkv = sharded_qkv.view( + num_output_shards, + num_heads_per_output_shard * dims_per_head + + 2 * num_kv_heads_per_output_shard * dims_per_head, + hidden_size, + ) + + helper.del_loaded(f"layers.{layer_i}.attention.wq.weight") + helper.del_loaded(f"layers.{layer_i}.attention.wk.weight") + helper.del_loaded(f"layers.{layer_i}.attention.wv.weight") + + # Duplicated + input_layernorm = loaded[0][f"layers.{layer_i}.attention_norm.weight"] + post_attention_layernorm = loaded[0][f"layers.{layer_i}.ffn_norm.weight"] + helper.del_loaded(f"layers.{layer_i}.attention_norm.weight") + helper.del_loaded(f"layers.{layer_i}.ffn_norm.weight") + + for out_rank in range(num_output_shards): + helper.add_sequential( + { + "attention.query_key_value.weight": sharded_qkv[out_rank], + # Sharded layers + "attention.dense.weight": attn_wo[out_rank].clone(), + "mlp.w1.weight": mlp_w1[out_rank].clone(), + "mlp.w2.weight": mlp_w2[out_rank].clone(), + "mlp.w3.weight": mlp_w3[out_rank].clone(), + # Duplicated layers + "input_layernorm.scale": input_layernorm, + "post_attention_layernorm.scale": post_attention_layernorm, + "attention.rotary_emb.inv_freq": rope_freqs, + }, + layer_i=layer_i + 2, + rank=out_rank, + ) + + for rank in range(num_output_shards): + model_state = { + "dp_world_size": 1, + "mp_world_size": num_output_shards, + "module": helper.sequential_cache[rank], + "optimizer": {}, + "global_steps": 1, + "skipped_steps": 1, + "iteration": 1, + } + torch.save( + model_state, os.path.join(model_path, f"mp_rank_{rank:02d}_model_states.pt") + ) + pbar.set_description(f"Saved shard {rank}") + pbar.update(1) + pbar.set_description("Done.") + + +class Helper: + def __init__( + self, loaded, model_size, num_output_shards, model_path, pipeline_parallel + ): + self.loaded = loaded + self.model_size = model_size + self.num_output_shards = num_output_shards + self.model_path = model_path + + self.pipeline_parallel = pipeline_parallel + self.sequential_cache = [{} for _ in range(num_output_shards)] + + def del_loaded(self, key: str): + # Remove from memory as we go along + for loaded_shared in self.loaded: + del loaded_shared[key] + + def save_shards(self, dictionary, layer_i: int): + for k, v in dictionary.items(): + assert v.shape[0] == self.num_output_shards + for rank in range(self.num_output_shards): + torch.save( + {k: v[rank].clone() for k, v in dictionary.items()}, + self.save_path(layer_i=layer_i, rank=rank), + ) + + def save_duplicates(self, dictionary, layer_i: int): + for rank in range(self.num_output_shards): + torch.save( + {k: v.clone() for k, v in dictionary.items()}, + self.save_path(layer_i=layer_i, rank=rank), + ) + + def save(self, obj, layer_i, rank): + torch.save(obj, self.save_path(layer_i=layer_i + 2, rank=rank)) + + def shard(self, x, dim): + x_shape = list(x.shape) + assert x_shape[dim] % self.num_output_shards == 0 + new_x_shape = ( + x_shape[:dim] + + [self.num_output_shards, x_shape[dim] // self.num_output_shards] + + x_shape[dim + 1 :] + ) + x = x.view(*new_x_shape) + return torch.movedim(x, 0, dim) + + def save_path(self, layer_i, rank): + return os.path.join( + self.model_path, f"layer_{layer_i:02d}-model_{rank:02d}-model_states.pt" + ) + + def add_sequential_shard(self, dictionary, layer_i): + assert not self.pipeline_parallel + for k, v in dictionary.items(): + for rank in range(self.num_output_shards): + 
self.sequential_cache[rank][f"sequential.{layer_i}.{k}"] = v[ + rank + ].clone() + + def add_sequential_duplicates(self, dictionary, layer_i): + assert not self.pipeline_parallel + for k, v in dictionary.items(): + for rank in range(self.num_output_shards): + self.sequential_cache[rank][f"sequential.{layer_i}.{k}"] = v.clone() + + def add_sequential(self, dictionary, layer_i, rank): + assert not self.pipeline_parallel + for k, v in dictionary.items(): + self.sequential_cache[rank][f"sequential.{layer_i}.{k}"] = v.clone() + + +def main(): + parser = argparse.ArgumentParser( + description="Convert raw LLaMA checkpoints to GPT-NeoX format." + ) + parser.add_argument( + "--input_dir", + help="Location of LLaMA weights, which contains tokenizer.model and model folders", + ) + parser.add_argument( + "--model_size", + choices=["7B", "Mistral-7b-v0.1", "13B", "30B", "34B", "65B", "tokenizer_only"], + ) + parser.add_argument( + "--output_dir", + help="Location to write GPT-NeoX mode", + ) + parser.add_argument( + "--num_output_shards", + type=int, + default=1, + ) + parser.add_argument( + "--pipeline_parallel", + action="store_true", + help="Only use if PP>1", + ) + args = parser.parse_args() + if args.pipeline_parallel: + print("parallel") + convert_model_pipeline( + output_base_path=args.output_dir, + input_base_path=os.path.join(args.input_dir, args.model_size), + model_size=args.model_size, + num_output_shards=args.num_output_shards, + ) + else: + print("sequential") + convert_model_sequential( + output_base_path=args.output_dir, + input_base_path=os.path.join(args.input_dir, args.model_size), + model_size=args.model_size, + num_output_shards=args.num_output_shards, + ) + + +if __name__ == "__main__": + main() diff --git a/tools/datasets/README.md b/tools/datasets/README.md new file mode 100644 index 000000000..9b459e1e3 --- /dev/null +++ b/tools/datasets/README.md @@ -0,0 +1,118 @@ +# Data Scripts + +## `preprocess_data.py` +Takes a raw dataset, splits it up, tokenizes it, and saves it as numpy files that can be memmapped and used efficiently by the training code. + +``` +usage: preprocess_data.py [-h] --input INPUT [--jsonl-keys JSONL_KEYS [JSONL_KEYS ...]] [--num-docs NUM_DOCS] + --tokenizer-type + {HFGPT2Tokenizer,HFTokenizer,GPT2BPETokenizer,CharLevelTokenizer,TiktokenTokenizer,SPMTokenizer} + [--vocab-file VOCAB_FILE] [--merge-file MERGE_FILE] [--append-eod] [--ftfy] --output-prefix + OUTPUT_PREFIX [--dataset-impl {lazy,cached,mmap}] [--workers WORKERS] + [--log-interval LOG_INTERVAL] + +options: + -h, --help show this help message and exit + +input data: + --input INPUT Path to input jsonl files or lmd archive(s) - if using multiple archives, put them in a comma + separated list + --jsonl-keys JSONL_KEYS [JSONL_KEYS ...] + space separate listed of keys to extract from jsonl. Defa + --num-docs NUM_DOCS Optional: Number of documents in the input data (if known) for an accurate progress bar. + +tokenizer: + --tokenizer-type {HFGPT2Tokenizer,HFTokenizer,GPT2BPETokenizer,CharLevelTokenizer,TiktokenTokenizer,SPMTokenizer} + What type of tokenizer to use. + --vocab-file VOCAB_FILE + Path to the vocab file + --merge-file MERGE_FILE + Path to the BPE merge file (if necessary). + --append-eod Append an token to the end of a document. + --ftfy Use ftfy to clean text + +output data: + --output-prefix OUTPUT_PREFIX + Path to binary output file without suffix + --dataset-impl {lazy,cached,mmap} + Dataset implementation to use. 
Default: mmap + +runtime: + --workers WORKERS Number of worker processes to launch + --log-interval LOG_INTERVAL + Interval between progress updates +``` +## `preprocess_data_with_mask.py` +Does the same but also creates `label` tensors if the dataset has labels. + +N.B. If using this, you **must** specify your data when training/finetuning with the following configs +```json +"train_data_paths": ["train_documents"], +"test_data_paths": ["test_documents"], +"valid_data_paths": ["test_documents"], +"label_data_paths": ["label_documents"] +``` + +the `"data_path"` option will not work with `"label_data_paths"`. + + +``` +usage: preprocess_data_with_mask.py [-h] --input INPUT [--jsonl-keys JSONL_KEYS [JSONL_KEYS ...]] + [--mask-before-token MASK_BEFORE_TOKEN] [--num-docs NUM_DOCS] --tokenizer-type + {HFGPT2Tokenizer,HFTokenizer,GPT2BPETokenizer,CharLevelTokenizer} + [--vocab-file VOCAB_FILE] [--merge-file MERGE_FILE] [--append-eod] [--ftfy] + --output-prefix OUTPUT_PREFIX [--dataset-impl {lazy,cached,mmap}] + [--workers WORKERS] [--log-interval LOG_INTERVAL] + +options: + -h, --help show this help message and exit + +input data: + --input INPUT Path to input jsonl files or lmd archive(s) - if using multiple archives, put them in a comma + separated list + --jsonl-keys JSONL_KEYS [JSONL_KEYS ...] + space separate listed of keys to extract from jsonl. Defa + --mask-before-token MASK_BEFORE_TOKEN + apply loss masks before certain token(s). If multi-token pattern, separate by commas without + space, e.g. --mask-before-token 0,1,1270 to use the token pattern [0,1,1270]. + --num-docs NUM_DOCS Optional: Number of documents in the input data (if known) for an accurate progress bar. + +tokenizer: + --tokenizer-type {HFGPT2Tokenizer,HFTokenizer,GPT2BPETokenizer,CharLevelTokenizer} + What type of tokenizer to use. + --vocab-file VOCAB_FILE + Path to the vocab file + --merge-file MERGE_FILE + Path to the BPE merge file (if necessary). + --append-eod Append an token to the end of a document. + --ftfy Use ftfy to clean text + +output data: + --output-prefix OUTPUT_PREFIX + Path to binary output file without suffix + --dataset-impl {lazy,cached,mmap} + Dataset implementation to use. Default: mmap + +runtime: + --workers WORKERS Number of worker processes to launch + --log-interval LOG_INTERVAL + Interval between progress updates +``` +## `multinode_prepare_data.sh` +Does the same but distributed over multiple nodes. + +``` +# USAGE: +# This script allows you to prepare your dataset using multiple nodes by chunking the individual files and distributed the chunks +# over the processes. +# This bash script takes a single text file as input argument. +# The text file contains a valid filepath in each line, leading to a jsonl-file. +# Furthermore an environment variable for the rank and the world size needs to be set. +# These default to the SLURM and OMPI variables in this order of priority, but they can be set manually as well +# using the variables $RANK and $WORLD_SIZE, which will overwrite the cluster-specific variables. +# You can also add all arguments of the prepare_data.py script to this script and it will simply pass them through. +``` + + +## `corpora.py` +Has information for common datasets. Primarily meant for use in top-level `prepare_data.py` script. 
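+
+As a rough sketch (illustrative; assumes the repository root is on `PYTHONPATH` so that `tools.datasets.corpora` is importable), the same registry can also be driven directly from Python rather than through `prepare_data.py`:
+
+```python
+# Downloads enwik8 into ./data/enwik8 (unless it is already present) and tokenizes it
+# with the GPT-2 BPE tokenizer; the gpt2 vocab/merge files are fetched automatically
+# for this tokenizer type.
+from tools.datasets.corpora import prepare_dataset
+
+prepare_dataset(
+    dataset_name="enwik8",              # any key of DATA_DOWNLOADERS
+    tokenizer_type="GPT2BPETokenizer",
+    data_dir="./data",                  # defaults to $DATA_DIR or ./data when omitted
+    force_redownload=False,
+    num_workers=1,                      # enwik8 is forced to a single worker anyway
+)
+```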
diff --git a/tools/datasets/corpora.py b/tools/datasets/corpora.py new file mode 100644 index 000000000..2c440dc0a --- /dev/null +++ b/tools/datasets/corpora.py @@ -0,0 +1,364 @@ +# Copyright (c) 2024, EleutherAI +# This file is based on code by the authors denoted below and has been modified from its original version. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os +from abc import ABC, abstractmethod +from multiprocessing import cpu_count + +""" +This registry is for automatically downloading and extracting datasets. + +To register a class you need to inherit the DataDownloader class, and provide name and url attributes, and (optionally) +the number of documents. + +When done, add it to the DATA_DOWNLOADERS dict. The function process_data runs the pre-processing for the selected +dataset. +""" + +GPT2_VOCAB_URL = "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json" +GPT2_MERGE_URL = "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt" + + +class DataDownloader(ABC): + """Dataset registry class to automatically download / extract datasets""" + + def __init__( + self, + tokenizer_type=None, + merge_file=None, + vocab_file=None, + data_dir=None, + force_redownload=None, + num_workers=None, + ): + if tokenizer_type is None: + tokenizer_type = "GPT2BPETokenizer" + if data_dir is None: + data_dir = os.environ.get("DATA_DIR", "./data") + if merge_file is None: + merge_file = f"{data_dir}/gpt2-merges.txt" + if force_redownload is None: + force_redownload = False + if vocab_file is None: + if tokenizer_type == "GPT2BPETokenizer": + vocab_file = f"{data_dir}/gpt2-vocab.json" + elif tokenizer_type == "HFGPT2Tokenizer": + vocab_file = "gpt2" + elif tokenizer_type == "CharLevelTokenizer": + pass + else: + assert vocab_file is not None, "No vocab file provided" + if num_workers is None: + num_workers = cpu_count() + self._tokenizer_type = tokenizer_type + self._merge_file = merge_file + self._vocab_file = vocab_file + self._data_dir = data_dir + self._force_redownload = force_redownload + self._num_workers = num_workers + + @property + def base_dir(self): + """base data directory""" + return self._data_dir + + @property + @abstractmethod + def name(self): + """name of dataset""" + pass + + @property + @abstractmethod + def urls(self): + """URLs from which to download dataset""" + pass + + @property + def tokenizer_type(self): + """tokenizer type to use when tokenizing data""" + return self._tokenizer_type + + @property + def merge_file(self): + """Merge file for tokenizer""" + return self._merge_file + + @property + def vocab_file(self): + """Vocab file for tokenizer""" + return self._vocab_file + + @property + def num_workers(self): + """Number of workers to use in preprocessing""" + return self._num_workers + + @property + def num_docs(self): + """Number of documents in the dataset (if known)""" + return None + + @property + def ftfy(self): + """Use ftfy (https://github.com/LuminosoInsight/python-ftfy) to fix text encodings""" + return False + + def 
exists(self): + """Checks if the dataset is present""" + return os.path.isdir(f"{self.base_dir}/{self.name}") + + def download(self): + """downloads dataset""" + os.makedirs(os.path.join(self.base_dir, self.name), exist_ok=True) + for url in self.urls: + try: + os_cmd = f"wget {url} -O {os.path.join(self.base_dir, self.name, os.path.basename(url))}" + if os.system(os_cmd) != 0: + raise Exception( + f"Cannot download file at URL {url}: server may be down" + ) + except Exception as e: + raise Exception(f"Download error: {e}") + + def tokenize(self): + """tokenizes dataset""" + parent_folder = os.path.join(self.base_dir, self.name) + jsonl_filepath = ",".join( + [os.path.join(parent_folder, os.path.basename(url)) for url in self.urls] + ) + + cmd = f"python tools/datasets/preprocess_data.py \ + --input {jsonl_filepath} \ + --output-prefix {parent_folder}/{self.name} \ + --vocab {self.vocab_file} \ + --dataset-impl mmap \ + --tokenizer-type {self.tokenizer_type} \ + --merge-file {self.merge_file} \ + --append-eod \ + --workers {self.num_workers} " + + if self.num_docs is not None: + cmd += f"--num-docs {self.num_docs} " + + if self.ftfy: + cmd += f"--ftfy " + + os.system(cmd) + + def prepare(self): + if self._force_redownload: + self.download() + else: + if not self.exists(): + self.download() + + self.tokenize() + + +class Enron(DataDownloader): + name = "enron" + urls = ["http://eaidata.bmk.sh/data/enron_emails.jsonl.zst"] + num_docs = 517401 + + +class PileSubset(DataDownloader): + name = "pile_00" + urls = ["https://the-eye.eu/public/AI/pile/train/00.jsonl.zst"] + + +class Pile(DataDownloader): + name = "pile" + urls = [ + f"https://the-eye.eu/public/AI/pile/train/{i:02}.jsonl.zst" for i in range(30) + ] + + +class Github(DataDownloader): + name = "github" + urls = ["http://eaidata.bmk.sh/data/github_small.jsonl.zst"] + + +class ArXiv(DataDownloader): + name = "arxiv" + urls = [ + "https://the-eye.eu/public/AI/pile_preliminary_components/2020-09-08-arxiv-extracts-nofallback-until-2007-068.tar.gz" + ] + + +class EuroParl(DataDownloader): + name = "europarl" + urls = [ + "https://the-eye.eu/public/AI/pile_preliminary_components/EuroParliamentProceedings_1996_2011.jsonl.zst" + ] + + +class FreeLaw(DataDownloader): + name = "freelaw" + urls = [ + "https://the-eye.eu/public/AI/pile_preliminary_components/FreeLaw_Opinions.jsonl.zst" + ] + + +class NiH(DataDownloader): + name = "nih" + urls = [ + "https://the-eye.eu/public/AI/pile_preliminary_components/NIH_ExPORTER_awarded_grant_text.jsonl.zst" + ] + + +class PubMed(DataDownloader): + name = "pubmed" + urls = [ + "https://the-eye.eu/public/AI/pile_preliminary_components/PMC_extracts.tar.gz" + ] + + +class Books1(DataDownloader): + name = "books1" + urls = ["https://the-eye.eu/public/AI/pile_preliminary_components/books1.tar.gz"] + + +class Books3(DataDownloader): + name = "books3" + urls = ["https://the-eye.eu/public/AI/pile_preliminary_components/books3.tar.gz"] + + +class HackerNews(DataDownloader): + name = "hackernews" + urls = ["https://the-eye.eu/public/AI/pile_preliminary_components/hn.tar.gz"] + num_docs = 373000 + + +class OpenWebText2(DataDownloader): + name = "openwebtext2" + urls = [ + "https://huggingface.co/datasets/segyges/OpenWebText2/resolve/main/openwebtext2.jsonl.zst.tar" + ] + num_docs = 17103000 + + +class StackExchange(DataDownloader): + name = "stackexchange" + urls = [ + "https://the-eye.eu/public/AI/pile_preliminary_components/stackexchange_dataset.tar" + ] + + +class UbuntuIRC(DataDownloader): + name = "ubuntu_irc" + 
urls = [ + "https://the-eye.eu/public/AI/pile_preliminary_components/ubuntu_irc_until_2020_9_1.jsonl.zst" + ] + + +class YoutubeSubtitles(DataDownloader): + name = "youtube_subtitles" + urls = [ + "https://the-eye.eu/public/AI/pile_preliminary_components/yt_subs.jsonl.zst" + ] + + +class C4(DataDownloader): + name = "c4" + urls = [ + f"https://the-eye.eu/eleuther_staging/c4/en/c4-train.{i:05}-of-01024.json.gz" + for i in range(1024) + ] + + +class C4OpenWebText(DataDownloader): + name = "c4_openwebtext" + urls = [ + f"https://the-eye.eu/eleuther_staging/c4/realnewslike/c4-train.{i:05}-of-00512.json.gz" + for i in range(512) + ] + + +class Enwik8(DataDownloader): + name = "enwik8" + urls = ["http://mattmahoney.net/dc/enwik8.zip"] + + +def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir): + if tokenizer_type is None or tokenizer_type == "GPT2BPETokenizer": + GPT2_VOCAB_FP = f"{data_dir}//gpt2-vocab.json" + GPT2_MERGE_FP = f"{data_dir}/gpt2-merges.txt" + if not os.path.isfile(GPT2_VOCAB_FP): + os.system(f"wget {GPT2_VOCAB_URL} -O {GPT2_VOCAB_FP}") + if not os.path.isfile(GPT2_MERGE_FP): + os.system(f"wget {GPT2_MERGE_URL} -O {GPT2_MERGE_FP}") + + +DATA_DOWNLOADERS = { + "pass": "pass", + "enron": Enron, + "pile_subset": PileSubset, + "pile": Pile, + "github": Github, + "arxiv": ArXiv, + "europarl": EuroParl, + "freelaw": FreeLaw, + "nih": NiH, + "pubmed": PubMed, + "books1": Books1, + "books3": Books3, + "hackernews": HackerNews, + "openwebtext2": OpenWebText2, + "stackexchange": StackExchange, + "ubuntu_irc": UbuntuIRC, + "youtube_subtitles": YoutubeSubtitles, + "c4": C4, + "c4_openwebtext": C4OpenWebText, + "enwik8": Enwik8, +} + + +def prepare_dataset( + dataset_name: str, + tokenizer_type: str = None, + data_dir: str = None, + vocab_file: str = None, + merge_file: str = None, + force_redownload: bool = None, + num_workers: int = None, +): + """ + Downloads + tokenizes a dataset in the registry (dataset_name) and saves output .npy files to data_dir. 
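+
+    dataset_name: key into the DATA_DOWNLOADERS registry (e.g. "enwik8", "pile").
+    tokenizer_type / vocab_file / merge_file: forwarded to the preprocessing script;
+        for the default GPT2BPETokenizer, the vocab and merge files are downloaded
+        automatically if not given.
+    force_redownload: download again even if the extracted dataset directory exists.
+    num_workers: tokenizer worker processes (defaults to cpu_count(); forced to 1 for enwik8).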
+ """ + if data_dir is None: + data_dir = os.environ.get("DATA_DIR", "./data") + os.makedirs(data_dir, exist_ok=True) + maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir) + DownloaderClass = DATA_DOWNLOADERS.get(dataset_name.lower(), None) + if DownloaderClass is None: + raise NotImplementedError( + f'Dataset "{dataset_name}" not recognized - please choose from {list(DATA_DOWNLOADERS.keys())}' + ) + elif DownloaderClass == "pass": + # pass on building dataset (for unit tests) + pass + else: + num_workers = 1 if dataset_name == "enwik8" else num_workers + d = DownloaderClass( + tokenizer_type=tokenizer_type, + vocab_file=vocab_file, + merge_file=merge_file, + data_dir=data_dir, + force_redownload=force_redownload, + num_workers=num_workers, + ) + d.prepare() diff --git a/tools/datasets/merge_datasets.py b/tools/datasets/merge_datasets.py new file mode 100644 index 000000000..21567bd80 --- /dev/null +++ b/tools/datasets/merge_datasets.py @@ -0,0 +1,86 @@ +import os +import sys +import json +import argparse + +sys.path.append( + os.path.abspath( + os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir) + ) +) + +from megatron.data import indexed_dataset + + +def main(args): + + prefixes = set() + for basename in os.listdir(args.input): + prefix, ext = os.path.splitext(basename) + + if prefix in prefixes: + continue + + if not os.path.isfile(os.path.join(args.input, basename)): + continue + + ext_pair = ".bin" if ext == ".idx" else ".idx" + assert os.path.isfile( + os.path.join(args.input, prefix) + ext_pair + ), f"ERROR: {ext_pair} file not provided for {os.path.join(args.input, prefix)}" + + prefixes.add(prefix) + + builder = None + for prefix in sorted(prefixes): + if builder is None: + dataset = indexed_dataset.make_dataset( + os.path.join(args.input, prefix), "infer" + ) + + if isinstance(dataset, indexed_dataset.MMapIndexedDataset): + builder = indexed_dataset.MMapIndexedDatasetBuilder( + args.output_prefix + ".bin", dtype=dataset._index.dtype + ) + else: + builder = indexed_dataset.IndexedDatasetBuilder( + args.output_prefix + ".bin" + ) + + del dataset + + builder.merge_file_(os.path.join(args.input, prefix)) + + builder.finalize(args.output_prefix + ".idx") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + group = parser.add_argument_group(title="input data") + group.add_argument( + "--input", + type=str, + required=True, + help="Path to directory containing all document files to merge", + ) + + group = parser.add_argument_group(title="output data") + group.add_argument( + "--output-prefix", + type=str, + required=True, + help="Path to binary output file without suffix", + ) + + args = parser.parse_args() + + assert os.path.isdir( + args.input + ), f"ERROR: {args.input} is not a directory or does not exist" + + assert os.path.isdir( + os.path.dirname(args.output_prefix) + ), f"ERROR: {os.path.dirname(args.output_prefix)} is not a directory or does not exist" + + main(args) diff --git a/tools/datasets/multinode_prepare_data.sh b/tools/datasets/multinode_prepare_data.sh new file mode 100644 index 000000000..87cb8ef31 --- /dev/null +++ b/tools/datasets/multinode_prepare_data.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +# USAGE: +# This script allows you to prepare your dataset using multiple nodes by chunking the individual files and distributed the chunks +# over the processes. +# This bash script takes a single text file as input argument. +# The text file contains a valid filepath in each line, leading to a jsonl-file. 
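+# Example (paths are illustrative):
+#   bash tools/datasets/multinode_prepare_data.sh jsonl_paths.txt \
+#       --tokenizer-type HFTokenizer --vocab-file ./data/tokenizer.json --output-prefix ./data/mydataset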
+# Furthermore an environment variable for the rank and the world size needs to be set. +# These default to the SLURM and OMPI variables in this order of priority, but they can be set manually as well +# using the variables $RANK and $WORLD_SIZE, which will overwrite the cluster-specific variables. +# You can also add all arguments of the prepare_data.py script to this script and it will simply pass them through. + +# Parse command-line arguments +text_file="$1" +rank="${RANK:-${SLURM_PROCID:-$OMPI_COMM_WORLD_RANK}}" +world_size="${WORLD_SIZE:-${SLURM_NTASKS:-$OMPI_COMM_WORLD_SIZE}}" +num_lines=$(wc -l < "$text_file") +chunk_size=$((num_lines / world_size)) +start_line=$((rank * chunk_size + 1)) +end_line=$((start_line + chunk_size - 1)) + +# Make sure the last chunk includes all remaining lines +if [[ $rank == $((world_size - 1)) ]]; then + end_line=$num_lines +fi + +# Select the chunk of the text file that corresponds to the rank +chunk_file="chunk_${rank}.txt" +sed -n "${start_line},${end_line}p" "$text_file" > "$chunk_file" + +# Parse additional flags to be passed to the Python script +shift 1 # Shift past the first three arguments +py_args="" +prefix_arg="" +while [[ $# -gt 0 ]]; do + case "$1" in + --output-prefix=*) prefix_arg="$1"; shift;; + --output-prefix) prefix_arg="$1 $2"; shift 2;; + --*) py_args="$py_args $1 $2"; shift 2;; + *) echo "Unknown argument: $1"; exit 1;; + esac +done + +# Add the rank to the --output-prefix argument if it is set +if [[ -n "$prefix_arg" ]]; then + py_args="$py_args $prefix_arg$rank" +else + # Inject a default --output-prefix argument containing the rank + py_args="$py_args --output-prefix rank${rank}" +fi + + +echo "processing $chunk_file with rank $rank at world size $world_size" +echo "using the following args: $py_args" +# Call the Python script with the list of file paths in the chunk +python tools/datasets/preprocess_data.py --input $(tr '\n' ',' < "$chunk_file" | sed 's/,$/\n/') $py_args + +# Clean up +rm "$chunk_file" diff --git a/tools/datasets/preprocess_data.py b/tools/datasets/preprocess_data.py new file mode 100644 index 000000000..32e656ace --- /dev/null +++ b/tools/datasets/preprocess_data.py @@ -0,0 +1,246 @@ +# Copyright (c) 2024, EleutherAI +# This file is based on code by the authors denoted below and has been modified from its original version. +# +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
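+
+# Example standalone usage (input/output paths below are placeholders):
+#   python tools/datasets/preprocess_data.py \
+#     --input ./data/mydataset.jsonl.zst \
+#     --output-prefix ./data/mydataset \
+#     --vocab-file ./data/gpt2-vocab.json \
+#     --merge-file ./data/gpt2-merges.txt \
+#     --tokenizer-type GPT2BPETokenizer \
+#     --append-eod \
+#     --workers 8
+# This writes ./data/mydataset_text_document.bin and .idx, one pair per --jsonl-keys entry.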
+ +"""Processing data for pretraining.""" + +import argparse +import multiprocessing +import os +import sys + +import lm_dataformat as lmd +import numpy as np + +sys.path.append( + os.path.abspath( + os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir) + ) +) +import time +import tqdm +import torch +import ftfy + +from megatron.tokenizer import build_tokenizer +from megatron.data import indexed_dataset +from threading import Semaphore + + +class Encoder(object): + def __init__(self, args): + self.args = args + + def initializer(self): + # Use Encoder class as a container for global data + Encoder.tokenizer = build_tokenizer(self.args) + + def encode(self, text): + if self.args.ftfy: + text = ftfy.fix_text(text) + ids = {} + for key in self.args.jsonl_keys: + doc_ids = [] + text_ids = Encoder.tokenizer.tokenize(text) + if len(text_ids) > 0: + doc_ids.append(text_ids) + if self.args.append_eod: + doc_ids[-1].append(Encoder.tokenizer.eod) + ids[key] = doc_ids + return ids, len(text) + + +def get_args(input_args=None): + parser = argparse.ArgumentParser() + group = parser.add_argument_group(title="input data") + group.add_argument( + "--input", + type=str, + required=True, + help="Path to input jsonl files or lmd archive(s) - if using multiple archives, put them in a comma separated " + "list", + ) + group.add_argument( + "--jsonl-keys", + nargs="+", + default=["text"], + help="space separate listed of keys to extract from jsonl. Defa", + ) + group.add_argument( + "--num-docs", + default=None, + help="Optional: Number of documents in the input data (if known) for an accurate progress bar.", + type=int, + ) + group = parser.add_argument_group(title="tokenizer") + group.add_argument( + "--tokenizer-type", + type=str, + required=True, + choices=[ + "HFGPT2Tokenizer", + "HFTokenizer", + "GPT2BPETokenizer", + "CharLevelTokenizer", + "TiktokenTokenizer", + "SPMTokenizer", + ], + help="What type of tokenizer to use.", + ) + group.add_argument( + "--vocab-file", type=str, default=None, help="Path to the vocab file" + ) + group.add_argument( + "--merge-file", + type=str, + default=None, + help="Path to the BPE merge file (if necessary).", + ) + group.add_argument( + "--append-eod", + action="store_true", + help="Append an token to the end of a document.", + ) + group.add_argument("--ftfy", action="store_true", help="Use ftfy to clean text") + group = parser.add_argument_group(title="output data") + group.add_argument( + "--output-prefix", + type=str, + required=True, + help="Path to binary output file without suffix", + ) + group.add_argument( + "--dataset-impl", + type=str, + default="mmap", + choices=["lazy", "cached", "mmap"], + help="Dataset implementation to use. Default: mmap", + ) + + group = parser.add_argument_group(title="runtime") + group.add_argument( + "--workers", type=int, default=1, help="Number of worker processes to launch" + ) + group.add_argument( + "--log-interval", + type=int, + default=100, + help="Interval between progress updates", + ) + args = parser.parse_args(input_args) + args.keep_empty = False + + # some default/dummy values for the tokenizer + args.rank = 0 + args.make_vocab_size_divisible_by = 128 + args.model_parallel_size = 1 + + return args + + +def yield_from_files(fnames: list, semaphore): + """ + Iterator over input documents using lm_dataformat. Should be able to handle jsons / texts / + other compressed formats. Also filters out empty documents. 
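+    The semaphore passed in is acquired once per yielded document (and once per
+    file); the consumer in main() releases it after each document is tokenized,
+    which bounds how far reading can run ahead of encoding.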
+ + :param fnames: list of filenames + """ + + def yielder(fname, semaphore): + for f in filter(lambda x: x, lmd.Reader(fname).stream_data()): + semaphore.acquire() + yield f + + for fname in fnames: + semaphore.acquire() + + yield from yielder(fname, semaphore) + + +def main(input_args=None): + args = get_args(input_args) + encoder = Encoder(args) + tokenizer = build_tokenizer(args) + print(f"Vocab size: {tokenizer.vocab_size}") + print(f"Output prefix: {args.output_prefix}") + + # build a semaphore object to stop `yield_from_files` from getting ahead of encoder.encode and + # hence building up memory + semaphore = Semaphore(10000 + args.workers) + + # use multiprocessing to iterate over input documents + fin = yield_from_files(args.input.split(","), semaphore) + + if args.workers > 1: + pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer) + encoded_docs = pool.imap(encoder.encode, fin, chunksize=25) + else: + encoder.initializer() + encoded_docs = (encoder.encode(doc) for doc in fin) + + # make a dataset builder for each key in args.jsonl_keys + # each key will output to a different file beginning with args.output_prefix + output_bin_files = {} + output_idx_files = {} + builders = {} + for key in args.jsonl_keys: + output_bin_files[key] = "{}_{}_{}.bin".format( + args.output_prefix, key, "document" + ) + output_idx_files[key] = "{}_{}_{}.idx".format( + args.output_prefix, key, "document" + ) + builders[key] = indexed_dataset.make_builder( + output_bin_files[key], + impl=args.dataset_impl, + vocab_size=tokenizer.vocab_size, + ) + + # actually do tokenization + proc_start = time.time() + total_bytes_processed = 0 + pbar = tqdm.tqdm() + for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1): + total_bytes_processed += bytes_processed + + # release semaphore so `yield_from_files` can add another file to the buffer + semaphore.release() + + # add each tokenized document / sentence + for key, sentences in doc.items(): + for sentence in sentences: + builders[key].add_item(np.array(sentence, dtype=builders[key].dtype)) + # separate with eos token + builders[key].end_document() + + # log progress + if i % args.log_interval == 0: + current = time.time() + elapsed = current - proc_start + mbs = total_bytes_processed / elapsed / 1024 / 1024 + pbar.set_description( + f"Processed {i}{'' if args.num_docs is None else '/' + str(args.num_docs)} documents ({i / elapsed :.2f} docs/s, {mbs:.2f} MB/s)." + ) + if i != 0: + pbar.update(args.log_interval) + + # save output file + for key in args.jsonl_keys: + builders[key].finalize(output_idx_files[key]) + + +if __name__ == "__main__": + main() diff --git a/tools/datasets/preprocess_data_with_mask.py b/tools/datasets/preprocess_data_with_mask.py new file mode 100644 index 000000000..ec2fddbe4 --- /dev/null +++ b/tools/datasets/preprocess_data_with_mask.py @@ -0,0 +1,386 @@ +# Copyright (c) 2024, EleutherAI +# This file is based on code by the authors denoted below and has been modified from its original version. +# +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +""" +A script for processing a dataset such that corresponding labels are also produced. These are then used to perform masked finetuning +(for example, finetuning a model to only output the text following some delimiter in the finetuning dataset such as "Answer: " +rather than generating the entire "Question: ... Answer: " turns of conversation. + +To run this script, first edit `tools/datasets/corpora.py` such that the command to call `tools/datasets/preprocess_data.py` is as follows: + +``` +cmd = f"python tools/datasets/preprocess_data_with_mask.py \ + --input {jsonl_filepath} \ + --output-prefix {parent_folder}/{self.name} \ + --vocab {self.vocab_file} \ + --dataset-impl mmap \ + --tokenizer-type {self.tokenizer_type} \ + --merge-file {self.merge_file} \ + --append-eod \ + --mask-before-token X,Y,Z \ + --workers {self.num_workers} " + +if self.num_docs is not None: + cmd += f"--num-docs {self.num_docs} " + +if self.ftfy: + cmd += f"--ftfy " +``` +where --mask-before-token must be the (comma-separated) list of tokens produced by encoding your delimiter string. +Up to and including the first occurrence of this token sequence in a document, all tokens will have their loss mask zeroed out when the label dataset is provided to NeoX. + +Then, specify +``` +"train_data_paths": ["/path/to/dataset/name_text_document"], +"label_data_paths": ["/path/to/dataset/name_label_document"] +``` +in your YML config. This will then allow for finetuning on the data with loss masks set appropriately. +(However, be warned that NeoX packs documents to fill context windows, which may degrade performance in some finetuning situations where instead padding out to the context length may be preferred.) +""" + +import argparse +import multiprocessing +import os +import sys +import re + +import lm_dataformat as lmd +import numpy as np + +sys.path.append( + os.path.abspath( + os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir) + ) +) +import time +import tqdm +import torch +import ftfy + +from megatron.tokenizer import build_tokenizer +from megatron.data import indexed_dataset +from threading import Semaphore +from functools import lru_cache + + +@lru_cache(maxsize=None) +def build_nxt(pattern: tuple) -> tuple: + # The function is being cached. Use tuple to avoid the cache being tampered out of scope. + nxt = [0] + current = 1 + match_idx = 0 + + while current < len(pattern): + if pattern[match_idx] == pattern[current]: + current += 1 + match_idx += 1 + nxt.append(match_idx) + elif match_idx != 0: + match_idx = nxt[match_idx - 1] + else: + nxt.append(0) + current += 1 + + return tuple(nxt) + + +def kmp(seq, pattern, first_appearance=False): + """ + Search for the location of a subsequence in a list. Not sure if there is a python built-in + implementation of kmp somewhere... 
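+    Returns the start indices of all matches of `pattern` in `seq` (only the
+    first one if first_appearance=True), e.g. kmp([5, 1, 2, 3, 1, 2], [1, 2])
+    returns [1, 4].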
+ """ + nxt = build_nxt(tuple(pattern)) + current = 0 + match_idx = 0 + + matched = [] + + while current < len(seq): + if seq[current] == pattern[match_idx]: + current += 1 + match_idx += 1 + elif match_idx != 0: + match_idx = nxt[match_idx - 1] + else: + current += 1 + + if match_idx == len(pattern): + matched.append(current - len(pattern)) + if first_appearance: + return matched + match_idx = nxt[match_idx - 1] + + return matched + + +class Encoder(object): + def __init__(self, args): + self.args = args + + def initializer(self): + # Use Encoder class as a container for global data + Encoder.tokenizer = build_tokenizer(self.args) + + def encode(self, text): + if self.args.ftfy: + text = ftfy.fix_text(text) + if isinstance(text, str): + text = {"text": text} + ids = {} + for key in self.args.jsonl_keys: + doc_ids = [] + text_ids = Encoder.tokenizer.tokenize(text["text"]) + if len(text_ids) > 0: + doc_ids.append(text_ids) + if self.args.append_eod: + doc_ids[-1].append(Encoder.tokenizer.eod) + ids[key] = doc_ids + return ids, len(text) + + +def get_args(): + parser = argparse.ArgumentParser() + group = parser.add_argument_group(title="input data") + group.add_argument( + "--input", + type=str, + required=True, + help="Path to input jsonl files or lmd archive(s) - if using multiple archives, put them in a comma separated " + "list", + ) + group.add_argument( + "--jsonl-keys", + nargs="+", + default=["text"], + help="space separate listed of keys to extract from jsonl. Defa", + ) + group.add_argument( + "--mask-before-token", + default=None, + help="apply loss masks before certain token(s). If multi-token pattern, separate by commas without space, e.g. --mask-before-token 0,1,1270 to use the token pattern [0,1,1270].", + type=str, + ) + group.add_argument( + "--num-docs", + default=None, + help="Optional: Number of documents in the input data (if known) for an accurate progress bar.", + type=int, + ) + group = parser.add_argument_group(title="tokenizer") + group.add_argument( + "--tokenizer-type", + type=str, + required=True, + choices=[ + "HFGPT2Tokenizer", + "HFTokenizer", + "GPT2BPETokenizer", + "CharLevelTokenizer", + ], + help="What type of tokenizer to use.", + ) + group.add_argument( + "--vocab-file", type=str, default=None, help="Path to the vocab file" + ) + group.add_argument( + "--merge-file", + type=str, + default=None, + help="Path to the BPE merge file (if necessary).", + ) + group.add_argument( + "--append-eod", + action="store_true", + help="Append an token to the end of a document.", + ) + group.add_argument("--ftfy", action="store_true", help="Use ftfy to clean text") + group = parser.add_argument_group(title="output data") + group.add_argument( + "--output-prefix", + type=str, + required=True, + help="Path to binary output file without suffix", + ) + group.add_argument( + "--dataset-impl", + type=str, + default="mmap", + choices=["lazy", "cached", "mmap"], + help="Dataset implementation to use. 
Default: mmap", + ) + + group = parser.add_argument_group(title="runtime") + group.add_argument( + "--workers", type=int, default=1, help="Number of worker processes to launch" + ) + group.add_argument( + "--log-interval", + type=int, + default=100, + help="Interval between progress updates", + ) + args = parser.parse_args() + args.keep_empty = False + + # some default/dummy values for the tokenizer + args.rank = 0 + args.make_vocab_size_divisible_by = 128 + args.model_parallel_size = 1 + + return args + + +def yield_from_files(fnames: list, semaphore): + """ + Iterator over input documents using lm_dataformat. Should be able to handle jsons / texts / + other compressed formats. Also filters out empty documents. + + :param fnames: list of filenames + """ + + def yielder(fname, semaphore): + for f in filter(lambda x: x, lmd.Reader(fname).stream_data()): + semaphore.acquire() + yield f + + for fname in fnames: + semaphore.acquire() + + yield from yielder(fname, semaphore) + + +def mask(sentence: list, pivot_tokens: list, include_pivot=True): + inds = kmp(sentence, pivot_tokens) + if not inds: + return sentence + index = inds[0] + if include_pivot: + index += len(pivot_tokens) + + return [-100] * index + sentence[index:] + + +def main(): + args = get_args() + encoder = Encoder(args) + tokenizer = build_tokenizer(args) + print(f"Vocab size: {tokenizer.vocab_size}") + print(f"Output prefix: {args.output_prefix}") + + # build a semaphore object to stop `yield_from_files` from getting ahead of encoder.encode and + # hence building up memory + semaphore = Semaphore(10000 + args.workers) + + # use multiprocessing to iterate over input documents + fin = yield_from_files(args.input.split(","), semaphore) + + if args.workers > 1: + pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer) + encoded_docs = pool.imap(encoder.encode, fin, chunksize=25) + else: + encoder.initializer() + encoded_docs = (encoder.encode(doc) for doc in fin) + + if args.mask_before_token is not None: + token_mask = [ + int(re.sub(r"[^0-9]", "", r)) + for r in args.mask_before_token.split(",") + if re.sub(r"[^0-9]", "", r) + ] + else: + token_mask = [] + + # make a dataset builder for each key in args.jsonl_keys + # each key will output to a different file beginning with args.output_prefix + output_bin_files = {} + output_idx_files = {} + builders = {} + for key in args.jsonl_keys: + output_bin_files[key] = "{}_{}_{}.bin".format( + args.output_prefix, key, "document" + ) + output_idx_files[key] = "{}_{}_{}.idx".format( + args.output_prefix, key, "document" + ) + builders[key] = indexed_dataset.make_builder( + output_bin_files[key], + impl=args.dataset_impl, + vocab_size=tokenizer.vocab_size, + ) + if token_mask: + assert ( + "label" not in args.jsonl_keys + ), "label should not be included as it will be generated according to the mask." 
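+        # Build a parallel "label" dataset whose documents mirror the "text"
+        # documents, except that every token up to and including the first
+        # occurrence of the pivot pattern is replaced with -100, e.g.
+        # mask([5, 7, 9, 11], [7, 9]) -> [-100, -100, -100, 11].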
+ key = "label" + output_bin_files[key] = "{}_{}_{}.bin".format( + args.output_prefix, key, "document" + ) + output_idx_files[key] = "{}_{}_{}.idx".format( + args.output_prefix, key, "document" + ) + builders[key] = indexed_dataset.make_builder( + output_bin_files[key], + impl=args.dataset_impl, + vocab_size=tokenizer.vocab_size, + ) + int32_labels = ["text", "label"] + for l in int32_labels: + builders[l]._dtype = np.int32 + + # actually do tokenization + proc_start = time.time() + total_bytes_processed = 0 + pbar = tqdm.tqdm() + for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1): + total_bytes_processed += bytes_processed + + # release semaphore so `yield_from_files` can add another file to the buffer + semaphore.release() + + # add each tokenized document / sentence + for key, sentences in doc.items(): + for sentence in sentences: + builders[key].add_item(np.array(sentence, dtype=builders[key].dtype)) + if token_mask: + masked_sentence = mask(sentence, token_mask) + builders["label"].add_item( + np.array(masked_sentence, dtype=builders["text"].dtype) + ) + # separate with eos token + builders[key].end_document() + if token_mask: + builders["label"].end_document() + + # log progress + if i % args.log_interval == 0: + current = time.time() + elapsed = current - proc_start + mbs = total_bytes_processed / elapsed / 1024 / 1024 + pbar.set_description( + f"Processed {i}{'' if args.num_docs is None else '/' + str(args.num_docs)} documents ({i / elapsed} docs/s, {mbs} MB/s)." + ) + if i != 0: + pbar.update(args.log_interval) + + # save output file + update_keys = args.jsonl_keys + ["label"] if token_mask else args.jsonl_keys + for key in update_keys: + builders[key].finalize(output_idx_files[key]) + + +if __name__ == "__main__": + main() From 9b2331f5d33a65ed317a304cbc4ab71774556c97 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Tue, 13 Feb 2024 18:49:15 +0000 Subject: [PATCH 41/64] docs: re-add mistral , GQA as supported --- README.md | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 2a06ac9bf..d5df6bede 100644 --- a/README.md +++ b/README.md @@ -507,13 +507,14 @@ Though NeoX supports a number of different architectural configurations, includi NeoX supports export of compatible models into the following architectures: - GPTNeoXForCausalLM -- LlamaForCausalLM (GQA Support Coming Soon -- all Llama 1 models and Llama 2 / Codellama up to size 13B supported) +- LlamaForCausalLM +- MistralForCausalLM Training a model which does not fit into one of these Hugging Face Transformers architectures cleanly will require writing custom modeling code for the exported model. To convert a GPT-NeoX library checkpoint to Hugging Face-loadable format, run: ```bash -python ./tools/ckpts/convert_neox_to_hf.py --input_dir /path/to/model/global_stepXXX --config_file your_config.yml --output_dir hf_model/save/location --precision {auto,fp16,bf16,fp32} --architecture {neox,llama} +python ./tools/ckpts/convert_neox_to_hf.py --input_dir /path/to/model/global_stepXXX --config_file your_config.yml --output_dir hf_model/save/location --precision {auto,fp16,bf16,fp32} --architecture {neox,mistral,llama} ``` Then to upload a model to [the Hugging Face Hub](https://huggingface.co/), run: @@ -527,11 +528,11 @@ and input the requested information, including HF hub user token. NeoX supplies several utilities for converting a pretrained model checkpoint into a format that can be trained within the library. 
-The following models can be loaded in GPT-NeoX: +The following models or model families can be loaded in GPT-NeoX: - Llama 1 -- Llama 2 (Up to size 13B) -- CodeLlama (Up to size 13B) -- Mistral-7b-v0.1 (Coming Soon!) +- Llama 2 +- CodeLlama +- Mistral-7b-v0.1 We provide two utilities for converting from two different checkpoint formats into a format compatible with GPT-NeoX. From d6accd89182e0f95de9ab3a587a952b175497b0d Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Tue, 13 Feb 2024 18:51:40 +0000 Subject: [PATCH 42/64] cleanup: delete duplicate tools/ files --- tools/convert_gqa_llama.py | 672 --------------------- tools/convert_hf_to_sequential.py | 573 ------------------ tools/convert_module_to_hf.py | 334 ---------- tools/convert_raw_llama_weights_to_neox.py | 635 ------------------- tools/convert_sequential_to_hf.py | 373 ------------ tools/corpora.py | 364 ----------- tools/inspect_checkpoints.py | 335 ---------- tools/kill.sh | 1 - tools/killall.sh | 1 - tools/merge20b.py | 282 --------- tools/merge_datasets.py | 84 --- tools/merge_mp_partitions.py | 293 --------- tools/multinode_prepare_data.sh | 59 -- tools/preprocess_data.py | 244 -------- tools/preprocess_data_with_mask.py | 381 ------------ tools/sync.sh | 28 - tools/sync_cmd.sh | 22 - tools/syncdir.sh | 29 - tools/upload.py | 52 -- 19 files changed, 4762 deletions(-) delete mode 100644 tools/convert_gqa_llama.py delete mode 100644 tools/convert_hf_to_sequential.py delete mode 100644 tools/convert_module_to_hf.py delete mode 100644 tools/convert_raw_llama_weights_to_neox.py delete mode 100644 tools/convert_sequential_to_hf.py delete mode 100644 tools/corpora.py delete mode 100644 tools/inspect_checkpoints.py delete mode 100755 tools/kill.sh delete mode 100755 tools/killall.sh delete mode 100644 tools/merge20b.py delete mode 100644 tools/merge_datasets.py delete mode 100644 tools/merge_mp_partitions.py delete mode 100644 tools/multinode_prepare_data.sh delete mode 100644 tools/preprocess_data.py delete mode 100644 tools/preprocess_data_with_mask.py delete mode 100755 tools/sync.sh delete mode 100644 tools/sync_cmd.sh delete mode 100755 tools/syncdir.sh delete mode 100644 tools/upload.py diff --git a/tools/convert_gqa_llama.py b/tools/convert_gqa_llama.py deleted file mode 100644 index 7d0e98049..000000000 --- a/tools/convert_gqa_llama.py +++ /dev/null @@ -1,672 +0,0 @@ -# Copyright (c) 2023, EleutherAI -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse -import os -import torch -import json -import math -import tqdm.auto as tqdm - - -INTERMEDIATE_SIZE_MAP = { - "7B": 11008, - "13B": 13824, - "30B": 17920, - "34B": 22016, - "65B": 22016, - "70B": 28672, - "Mistral-7B-v0.1": 14336, -} -NUM_SHARDS = { - "7B": 1, - "13B": 2, - "30B": 4, - "34B": 4, - "65B": 8, - "70B": 8, - "Mistral-7B-v0.1": 1, -} - - -def compute_intermediate_size(n): - return int(math.ceil(n * 8 / 3) + 255) // 256 * 256 - - -def read_json(path): - with open(path, "r") as f: - return json.load(f) - - -def write_json(text, path): - with open(path, "w") as f: - json.dump(text, f) - - -def write_file(text, path): - with open(path, "w") as f: - f.write(text) - - -def convert_model_pipeline( - output_base_path, input_base_path, model_size: str, num_output_shards: int -): - assert model_size in NUM_SHARDS - - model_path = os.path.join(output_base_path, "global_step0") - os.makedirs(model_path, exist_ok=True) - write_file("global_step0", os.path.join(output_base_path, "latest")) - - params = read_json(os.path.join(input_base_path, "params.json")) - num_input_shards = NUM_SHARDS[model_size] - num_layers = params["n_layers"] - num_heads = params["n_heads"] - if "n_kv_heads" in params: - num_kv_heads = params["n_kv_heads"] - else: - num_kv_heads = num_heads - num_kv_heads_per_input_shard = num_kv_heads // num_input_shards - num_heads_per_input_shard = num_heads // num_input_shards - num_heads_per_output_shard = num_heads // num_output_shards - num_kv_heads_per_output_shard = num_kv_heads // num_output_shards - hidden_size = params["dim"] - dims_per_head = hidden_size // num_heads - # base = 10000.0 - # inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) - - def permute_rotary(w): - if w.shape == (num_heads, dims_per_head, hidden_size): - N_HEADS = num_heads - elif w.shape == (num_kv_heads, dims_per_head, hidden_size): - N_HEADS = num_kv_heads - else: - assert False - return ( - w.view(N_HEADS, dims_per_head // 2, 2, hidden_size) - .transpose(1, 2) - .reshape(N_HEADS, dims_per_head, hidden_size) - ) - - pbar = tqdm.tqdm(total=num_input_shards + num_layers + 3) - - pbar.set_description(f"Loading shard") - loaded = [] - for i in range(num_input_shards): - loaded.append( - torch.load( - os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), - map_location="cpu", - ) - ) - pbar.set_description(f"Loaded shard {i}/{num_input_shards}") - pbar.update(1) - helper = Helper( - loaded=loaded, - model_path=model_path, - num_output_shards=num_output_shards, - model_size=model_size, - pipeline_parallel=False, - ) - - sequential_cache = [{} for _ in range(num_output_shards)] - - # Embedding in - embeddings_in = torch.cat( - [ - loaded[rank]["tok_embeddings.weight"].cpu() - for rank in range(num_input_shards) - ], - dim=1, - ) - print(embeddings_in.shape) - helper.save_shards( - {"word_embeddings.weight": helper.shard(embeddings_in, dim=0)}, layer_i=0 - ) - helper.del_loaded("tok_embeddings.weight") - pbar.set_description(f"Saved embeddings") - pbar.update(1) - - # Norms - helper.save_duplicates( - {"norm.scale": loaded[0]["norm.weight"]}, layer_i=num_layers + 3 - ) - helper.del_loaded("norm.weight") - pbar.set_description(f"Saved final norm") - pbar.update(1) - - # Embedding out - embeddings_out = torch.cat( - [loaded[rank]["output.weight"].cpu() for rank in range(num_input_shards)], dim=0 - ) - helper.save_shards( - {"final_linear.weight": helper.shard(embeddings_out, dim=0)}, - layer_i=num_layers + 4, - ) - 
helper.del_loaded("output.weight") - pbar.set_description(f"Saved out embeddings") - pbar.update(1) - - # Layers - if model_size == "7B": - rope_freqs = loaded[0]["layers.0.attention.inner_attention.rope.freqs"] - helper.del_loaded("layers.0.attention.inner_attention.rope.freqs") - else: - rope_freqs = loaded[0]["rope.freqs"] - helper.del_loaded("rope.freqs") - for layer_i in range(num_layers): - - # Linear - attn_wo = helper.shard( - torch.cat( - [ - loaded[rank][f"layers.{layer_i}.attention.wo.weight"] - for rank in range(num_input_shards) - ], - dim=1, - ), - dim=1, - ) - mlp_w1 = helper.shard( - torch.cat( - [ - loaded[rank][f"layers.{layer_i}.feed_forward.w1.weight"] - for rank in range(num_input_shards) - ], - dim=0, - ), - dim=0, - ) - mlp_w2 = helper.shard( - torch.cat( - [ - loaded[rank][f"layers.{layer_i}.feed_forward.w2.weight"] - for rank in range(num_input_shards) - ], - dim=1, - ), - dim=1, - ) - mlp_w3 = helper.shard( - torch.cat( - [ - loaded[rank][f"layers.{layer_i}.feed_forward.w3.weight"] - for rank in range(num_input_shards) - ], - dim=0, - ), - dim=0, - ) - helper.del_loaded(f"layers.{layer_i}.attention.wo.weight") - helper.del_loaded(f"layers.{layer_i}.feed_forward.w1.weight") - helper.del_loaded(f"layers.{layer_i}.feed_forward.w2.weight") - helper.del_loaded(f"layers.{layer_i}.feed_forward.w3.weight") - - # Attention - w_q = permute_rotary( - torch.cat( - [ - loaded[rank][f"layers.{layer_i}.attention.wq.weight"].view( - num_heads_per_input_shard, dims_per_head, hidden_size - ) - for rank in range(num_input_shards) - ], - dim=0, - ) - ) - w_k = permute_rotary( - torch.cat( - [ - loaded[rank][f"layers.{layer_i}.attention.wk.weight"].view( - num_kv_heads_per_input_shard, dims_per_head, hidden_size - ) - for rank in range(num_input_shards) - ], - dim=0, - ) - ).view(num_heads, int(dims_per_head * (num_kv_heads / num_heads)), hidden_size) - - w_v = torch.cat( - [ - loaded[rank][f"layers.{layer_i}.attention.wv.weight"].view( - num_kv_heads_per_input_shard, dims_per_head, hidden_size - ) - for rank in range(num_input_shards) - ], - dim=0, - ).view(num_heads, int(dims_per_head * (num_kv_heads / num_heads)), hidden_size) - - sharded_qkv = torch.cat( - [ - helper.shard( - w_q, dim=0 - ), # num_output_shards, num_heads_per_output_shard, dims_per_head, hidden_size - helper.shard(w_k, dim=0), - helper.shard(w_v, dim=0), - ], - dim=2, - ) # num_output_shards, num_heads_per_output_shard, QKV=3, dims_per_head, hidden_size - - sharded_qkv = sharded_qkv.view( - num_output_shards, - num_heads_per_output_shard * dims_per_head - + 2 * num_kv_heads_per_output_shard * dims_per_head, - hidden_size, - ) - helper.del_loaded(f"layers.{layer_i}.attention.wq.weight") - helper.del_loaded(f"layers.{layer_i}.attention.wk.weight") - helper.del_loaded(f"layers.{layer_i}.attention.wv.weight") - - # Duplicated - input_layernorm = loaded[0][f"layers.{layer_i}.attention_norm.weight"] - post_attention_layernorm = loaded[0][f"layers.{layer_i}.ffn_norm.weight"] - helper.del_loaded(f"layers.{layer_i}.attention_norm.weight") - helper.del_loaded(f"layers.{layer_i}.ffn_norm.weight") - - for out_rank in range(num_output_shards): - helper.save( - { - "attention.query_key_value.weight": sharded_qkv[out_rank], - # Sharded layers - "attention.dense.weight": attn_wo[out_rank].clone(), - "mlp.w1.weight": mlp_w1[out_rank].clone(), - "mlp.w2.weight": mlp_w2[out_rank].clone(), - "mlp.w3.weight": mlp_w3[out_rank].clone(), - # Duplicated layers - "input_layernorm.scale": input_layernorm, - "post_attention_layernorm.scale": 
post_attention_layernorm, - "attention.rotary_emb.inv_freq": rope_freqs, - }, - layer_i=layer_i + 2, - rank=out_rank, - ) - - pbar.set_description(f"Saved layer {layer_i} / {num_layers}") - pbar.update(1) - - model_state = { - "dp_world_size": 1, - "mp_world_size": num_output_shards, - "module": {}, - "optimizer": {}, - "global_steps": 1, - "skipped_steps": 1, - "iteration": 1, - } - for rank in range(num_output_shards): - torch.save( - model_state, os.path.join(model_path, f"mp_rank_{rank:02d}_model_states.pt") - ) - pbar.set_description("Done.") - - -def convert_model_sequential( - output_base_path, input_base_path, model_size: str, num_output_shards: int -): - assert model_size in NUM_SHARDS - - model_path = os.path.join(output_base_path, "global_step0") - os.makedirs(model_path, exist_ok=True) - write_file("global_step0", os.path.join(output_base_path, "latest")) - - params = read_json(os.path.join(input_base_path, "params.json")) - num_input_shards = NUM_SHARDS[model_size] - num_layers = params["n_layers"] - num_heads = params["n_heads"] - if "n_kv_heads" in params: - num_kv_heads = params["n_kv_heads"] - else: - num_kv_heads = num_heads - num_kv_heads_per_input_shard = num_kv_heads // num_input_shards - num_heads_per_input_shard = num_heads // num_input_shards - num_heads_per_output_shard = num_heads // num_output_shards - num_kv_heads_per_output_shard = num_kv_heads // num_output_shards - hidden_size = params["dim"] - dims_per_head = hidden_size // num_heads - # base = 10000.0 - # inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) - - def permute_rotary(w): - if w.shape == (num_heads, dims_per_head, hidden_size): - N_HEADS = num_heads - elif w.shape == (num_kv_heads, dims_per_head, hidden_size): - N_HEADS = num_kv_heads - else: - assert False - return ( - w.view(N_HEADS, dims_per_head // 2, 2, hidden_size) - .transpose(1, 2) - .reshape(N_HEADS, dims_per_head, hidden_size) - ) - - pbar = tqdm.tqdm(total=num_input_shards + num_output_shards) - - pbar.set_description(f"Loading shard") - loaded = [] - for i in range(num_input_shards): - loaded.append( - torch.load( - os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), - map_location="cpu", - ) - ) - pbar.set_description(f"Loaded shard {i}/{num_input_shards}") - pbar.update(1) - helper = Helper( - loaded=loaded, - model_path=model_path, - num_output_shards=num_output_shards, - model_size=model_size, - pipeline_parallel=False, - ) - - # Embedding in - embeddings_in = torch.cat( - [ - loaded[rank]["tok_embeddings.weight"].cpu() - for rank in range(num_input_shards) - ], - dim=1, - ) - - helper.add_sequential_shard( - {"word_embeddings.weight": helper.shard(embeddings_in, dim=0)}, layer_i=0 - ) - helper.del_loaded("tok_embeddings.weight") - - # Norms - helper.add_sequential_duplicates( - {"norm.scale": loaded[0]["norm.weight"]}, layer_i=num_layers + 3 - ) - helper.del_loaded("norm.weight") - - # Embedding out - embeddings_out = torch.cat( - [loaded[rank]["output.weight"].cpu() for rank in range(num_input_shards)], dim=0 - ) - helper.add_sequential_shard( - {"final_linear.weight": helper.shard(embeddings_out, dim=0)}, - layer_i=num_layers + 4, - ) - helper.del_loaded("output.weight") - - # Layers - if model_size == "7B": - rope_freqs = loaded[0]["layers.0.attention.inner_attention.rope.freqs"] - helper.del_loaded("layers.0.attention.inner_attention.rope.freqs") - else: - rope_freqs = loaded[0]["rope.freqs"] - helper.del_loaded("rope.freqs") - for layer_i in range(num_layers): - - # Linear - attn_wo 
= helper.shard( - torch.cat( - [ - loaded[rank][f"layers.{layer_i}.attention.wo.weight"] - for rank in range(num_input_shards) - ], - dim=1, - ), - dim=1, - ) - mlp_w1 = helper.shard( - torch.cat( - [ - loaded[rank][f"layers.{layer_i}.feed_forward.w1.weight"] - for rank in range(num_input_shards) - ], - dim=0, - ), - dim=0, - ) - mlp_w2 = helper.shard( - torch.cat( - [ - loaded[rank][f"layers.{layer_i}.feed_forward.w2.weight"] - for rank in range(num_input_shards) - ], - dim=1, - ), - dim=1, - ) - mlp_w3 = helper.shard( - torch.cat( - [ - loaded[rank][f"layers.{layer_i}.feed_forward.w3.weight"] - for rank in range(num_input_shards) - ], - dim=0, - ), - dim=0, - ) - helper.del_loaded(f"layers.{layer_i}.attention.wo.weight") - helper.del_loaded(f"layers.{layer_i}.feed_forward.w1.weight") - helper.del_loaded(f"layers.{layer_i}.feed_forward.w2.weight") - helper.del_loaded(f"layers.{layer_i}.feed_forward.w3.weight") - - # Attention - w_q = permute_rotary( - torch.cat( - [ - loaded[rank][f"layers.{layer_i}.attention.wq.weight"].view( - num_heads_per_input_shard, dims_per_head, hidden_size - ) - for rank in range(num_input_shards) - ], - dim=0, - ) - ) - w_k = permute_rotary( - torch.cat( - [ - loaded[rank][f"layers.{layer_i}.attention.wk.weight"].view( - num_kv_heads_per_input_shard, dims_per_head, hidden_size - ) - for rank in range(num_input_shards) - ], - dim=0, - ) - ).view(num_heads, int(dims_per_head * (num_kv_heads / num_heads)), hidden_size) - w_v = torch.cat( - [ - loaded[rank][f"layers.{layer_i}.attention.wv.weight"].view( - num_kv_heads_per_input_shard, dims_per_head, hidden_size - ) - for rank in range(num_input_shards) - ], - dim=0, - ).view(num_heads, int(dims_per_head * (num_kv_heads / num_heads)), hidden_size) - sharded_qkv = torch.cat( - [ - helper.shard( - w_q, dim=0 - ), # num_output_shards, num_heads_per_output_shard, dims_per_head, hidden_size - helper.shard(w_k, dim=0), - helper.shard(w_v, dim=0), - ], - dim=2, - ) # num_output_shards, num_heads_per_output_shard, QKV=3, dims_per_head, hidden_size - - sharded_qkv = sharded_qkv.view( - num_output_shards, - num_heads_per_output_shard * dims_per_head - + 2 * num_kv_heads_per_output_shard * dims_per_head, - hidden_size, - ) - - helper.del_loaded(f"layers.{layer_i}.attention.wq.weight") - helper.del_loaded(f"layers.{layer_i}.attention.wk.weight") - helper.del_loaded(f"layers.{layer_i}.attention.wv.weight") - - # Duplicated - input_layernorm = loaded[0][f"layers.{layer_i}.attention_norm.weight"] - post_attention_layernorm = loaded[0][f"layers.{layer_i}.ffn_norm.weight"] - helper.del_loaded(f"layers.{layer_i}.attention_norm.weight") - helper.del_loaded(f"layers.{layer_i}.ffn_norm.weight") - - for out_rank in range(num_output_shards): - helper.add_sequential( - { - "attention.query_key_value.weight": sharded_qkv[out_rank], - # Sharded layers - "attention.dense.weight": attn_wo[out_rank].clone(), - "mlp.w1.weight": mlp_w1[out_rank].clone(), - "mlp.w2.weight": mlp_w2[out_rank].clone(), - "mlp.w3.weight": mlp_w3[out_rank].clone(), - # Duplicated layers - "input_layernorm.scale": input_layernorm, - "post_attention_layernorm.scale": post_attention_layernorm, - "attention.rotary_emb.inv_freq": rope_freqs, - }, - layer_i=layer_i + 2, - rank=out_rank, - ) - - for rank in range(num_output_shards): - model_state = { - "dp_world_size": 1, - "mp_world_size": num_output_shards, - "module": helper.sequential_cache[rank], - "optimizer": {}, - "global_steps": 1, - "skipped_steps": 1, - "iteration": 1, - } - torch.save( - model_state, 
os.path.join(model_path, f"mp_rank_{rank:02d}_model_states.pt") - ) - pbar.set_description(f"Saved shard {rank}") - pbar.update(1) - pbar.set_description("Done.") - - -class Helper: - def __init__( - self, loaded, model_size, num_output_shards, model_path, pipeline_parallel - ): - self.loaded = loaded - self.model_size = model_size - self.num_output_shards = num_output_shards - self.model_path = model_path - - self.pipeline_parallel = pipeline_parallel - self.sequential_cache = [{} for _ in range(num_output_shards)] - - def del_loaded(self, key: str): - # Remove from memory as we go along - for loaded_shared in self.loaded: - del loaded_shared[key] - - def save_shards(self, dictionary, layer_i: int): - for k, v in dictionary.items(): - assert v.shape[0] == self.num_output_shards - for rank in range(self.num_output_shards): - torch.save( - {k: v[rank].clone() for k, v in dictionary.items()}, - self.save_path(layer_i=layer_i, rank=rank), - ) - - def save_duplicates(self, dictionary, layer_i: int): - for rank in range(self.num_output_shards): - torch.save( - {k: v.clone() for k, v in dictionary.items()}, - self.save_path(layer_i=layer_i, rank=rank), - ) - - def save(self, obj, layer_i, rank): - torch.save(obj, self.save_path(layer_i=layer_i + 2, rank=rank)) - - def shard(self, x, dim): - x_shape = list(x.shape) - assert x_shape[dim] % self.num_output_shards == 0 - new_x_shape = ( - x_shape[:dim] - + [self.num_output_shards, x_shape[dim] // self.num_output_shards] - + x_shape[dim + 1 :] - ) - x = x.view(*new_x_shape) - return torch.movedim(x, 0, dim) - - def save_path(self, layer_i, rank): - return os.path.join( - self.model_path, f"layer_{layer_i:02d}-model_{rank:02d}-model_states.pt" - ) - - def add_sequential_shard(self, dictionary, layer_i): - assert not self.pipeline_parallel - for k, v in dictionary.items(): - for rank in range(self.num_output_shards): - self.sequential_cache[rank][f"sequential.{layer_i}.{k}"] = v[ - rank - ].clone() - - def add_sequential_duplicates(self, dictionary, layer_i): - assert not self.pipeline_parallel - for k, v in dictionary.items(): - for rank in range(self.num_output_shards): - self.sequential_cache[rank][f"sequential.{layer_i}.{k}"] = v.clone() - - def add_sequential(self, dictionary, layer_i, rank): - assert not self.pipeline_parallel - for k, v in dictionary.items(): - self.sequential_cache[rank][f"sequential.{layer_i}.{k}"] = v.clone() - - -def main(): - parser = argparse.ArgumentParser( - description="Convert raw LLaMA checkpoints to GPT-NeoX format." 
- ) - parser.add_argument( - "--input_dir", - help="Location of LLaMA weights, which contains tokenizer.model and model folders", - ) - parser.add_argument( - "--model_size", - choices=["7B", "Mistral-7b-v0.1", "13B", "30B", "34B", "65B", "tokenizer_only"], - ) - parser.add_argument( - "--output_dir", - help="Location to write GPT-NeoX mode", - ) - parser.add_argument( - "--num_output_shards", - type=int, - default=1, - ) - parser.add_argument( - "--pipeline_parallel", - action="store_true", - help="Only use if PP>1", - ) - args = parser.parse_args() - if args.pipeline_parallel: - print("parallel") - convert_model_pipeline( - output_base_path=args.output_dir, - input_base_path=os.path.join(args.input_dir, args.model_size), - model_size=args.model_size, - num_output_shards=args.num_output_shards, - ) - else: - print("sequential") - convert_model_sequential( - output_base_path=args.output_dir, - input_base_path=os.path.join(args.input_dir, args.model_size), - model_size=args.model_size, - num_output_shards=args.num_output_shards, - ) - - -if __name__ == "__main__": - main() diff --git a/tools/convert_hf_to_sequential.py b/tools/convert_hf_to_sequential.py deleted file mode 100644 index 4ed5c67f4..000000000 --- a/tools/convert_hf_to_sequential.py +++ /dev/null @@ -1,573 +0,0 @@ -import sys -import os -import copy -import deepspeed -# import time - -import argparse -import torch - -import numpy as np - -from functools import reduce -from transformers import GPTNeoXForCausalLM, GPTNeoXConfig - -sys.path.append( - os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) -) -from megatron.neox_arguments import NeoXArgs -from megatron.training import get_model, get_optimizer, get_learning_rate_scheduler -from megatron.initialize import initialize_megatron -from megatron import mpu -from megatron.checkpointing import load_checkpoint, save_checkpoint -# from megatron.utils import ( -# Timers, -# init_wandb, -# ) - -""" -A script for converting publicly available Huggingface (HF) checkpoints NeoX format. - -Note that this script requires access to correspoinding config files for equivalent NeoX models to those found ing Hugging face. - -Example usage: (Converts the 70M Pythia model to NeoX format) -================================================================ -OMPI_COMM_WORLD_RANK=0 CUDA_VISIBLE_DEVICES=0 python tools/convert_hf_to_sequential.py \ - --hf-model-name pythia-70m-v0 \ - --revision 143000 \ - --output-dir checkpoints/neox_converted/pythia/70m \ - --cache-dir checkpoints/HF \ - --config configs/pythia/70M.yml configs/local_setup.yml \ - --test - - -For multi-gpu support we must initiliaze deepspeed: -NOTE: This requires manually changing the arguments below. -================================================================ -CUDA_VISIBLE_DEVICES=0,1,2,3 python ./deepy.py tools/convert_hf_to_sequential.py \ - -d configs pythia/70M.yml local_setup.yml -""" - -MULTI_GPU_ARGS = " ".join( - [ - "--hf-model-name pythia-70m-v0", - "--revision 143000", - "--output-dir checkpoints/neox_converted/pythia/70m", - "--cache-dir checkpoints/HF", - "--config configs/pythia/70M.yml configs/local_setup.yml", - "--test", - ] - ) - - - -def convert_hf_to_sequential(hf_model,seq_state_dict): - """Converts the weights of a HuggingFace model to neox 2.0 format. 
- - :param hf_model: the huggingface model - :param seq_state_dict: the state dict of the equivalent neox model - - returns the updated sequential state dict - """ - num_layers = hf_model.config.num_hidden_layers - # Embedding is layer idx 0 - seq_state_dict['sequential.0.word_embeddings.weight'] =\ - hf_model.gpt_neox.embed_in.state_dict()['weight'] - - for layer_hf in range(num_layers): - # offset by 2 - layer_seq = layer_hf + 2 - - # get layer from hf model - hf_layer = hf_model.gpt_neox.layers[layer_hf] - hf_layer_sd = hf_layer.state_dict() - - - for key in hf_model.gpt_neox.layers[0].state_dict().keys(): - - if key in ["attention.bias","attention.masked_bias"]: - continue - seq_state_dict[f"sequential.{layer_seq}.{key}"] = hf_layer_sd[key] - - # Load final layer norm - layer_seq = num_layers + 3 - seq_state_dict[f"sequential.{layer_seq}.norm.weight"] = \ - hf_model.gpt_neox.final_layer_norm.state_dict()['weight'] - seq_state_dict[f"sequential.{layer_seq}.norm.bias"] = \ - hf_model.gpt_neox.final_layer_norm.state_dict()['bias'] - - # output embedding / LM head - layer_seq += 1 - seq_state_dict[f"sequential.{layer_seq}.final_linear.weight"] = \ - hf_model.embed_out.state_dict()['weight'] - - - -def shard_sequential_mp(num_mp_ranks,sequential): - """Shards the sequential model into model parallel ranks. - - :param num_mp_ranks: the number of model parallel ranks - :param sequential: the state dict of the sequential model at mp=1 - - returns a dict of state dicts for each mp rank - """ - ranks = {x:dict() for x in range(num_mp_ranks)} - for k,v in sequential.items(): - if reduce(np.logical_or,[x in k for x in ['layernorm', - 'rotary_emb', - 'dense_4h_to_h.bias', - 'norm.weight', - 'norm.bias', - 'attention.dense.bias']]): - # no splitting - for x in range(num_mp_ranks): - ranks[x][k] = v - else: - if len(v.shape) == 1: - size_per_rank = v.shape[0] / num_mp_ranks - if size_per_rank % 128 != 0.: - padded_size = (128 - (size_per_rank % 128)) + size_per_rank - size_diff = int((padded_size * 4) - v.shape[max_]) - zero_pad = torch.zeros((size_diff)) - v = torch.cat([v,zero_pad],dim=max_) - else: - padded_size = size_per_rank - - assert size_per_rank % 1. == 0. - assert padded_size % 1. == 0. 
- - padded_size = int(padded_size) - size_per_rank = int(size_per_rank) - - for x in range(num_mp_ranks): - if size_per_rank != padded_size: - #need to pad - ranks[x][k] = v[padded_size * x : padded_size * (x+1)] - else: - ranks[x][k] = v[size_per_rank * x : size_per_rank * (x+1)] - - elif len(v.shape) == 2: - - if reduce(np.logical_or,[x in k for x in [ "attention.dense.weight", - "mlp.dense_4h_to_h.weight", ]]):\ - # column parallel - max_, min_ = 1, 0 - elif reduce(np.logical_or,[x in k for x in [ "mlp.dense_h_to_4h.weight", - "mlp.dense_h_to_4h.bias", - "attention.query_key_value.weight", - "attention.query_key_value.bias", - "word_embeddings.weight", - "final_linear.weight" ]]): - # row parallel - max_, min_ = 0, 1 - else: - raise Exception("Unknown weight to shard: {}".format(k)) - - - size_per_rank = v.shape[max_] / num_mp_ranks - if size_per_rank % 128 != 0.: - padded_size = (128 - (size_per_rank % 128)) + size_per_rank - size_diff = int((padded_size * num_mp_ranks) - v.shape[max_]) - - assert size_diff > 0, \ - "[ERROR] size diff is negative: {} for size_per_rank: {}, k:{}, shape:{}, padded_size:{}".format( - size_diff,size_per_rank,k,v.shape,padded_size) - - zero_pad = torch.zeros((size_diff,v.shape[min_])) if max_ == 0 \ - else torch.zeros((v.shape[min_],size_diff)) - - v = torch.cat([v,zero_pad],dim=max_) - else: - padded_size = size_per_rank - - assert size_per_rank % 1. == 0. - assert padded_size % 1. == 0. - - padded_size = int(padded_size) - size_per_rank = int(size_per_rank) - - for x in range(num_mp_ranks): - if size_per_rank != padded_size: - #need to pad - ranks[x][k] = v[padded_size * x : padded_size * (x+1),:] if max_ == 0 \ - else v[:,padded_size * x : padded_size * (x+1)] - else: - ranks[x][k] = v[size_per_rank * x : size_per_rank * (x+1),...] if max_ == 0 \ - else v[:,size_per_rank * x : size_per_rank * (x+1)] - - else: - raise NotImplementedError() - - return ranks - - - -def replace_sharded_seq(mp_checkpoints,mp_sharded_seq): - """replaces the values within checkpointed configs with those - from the sharded sequential object.""" - - for mp_idx, shard in mp_sharded_seq.items(): - mp_key = f'mp_rank_{mp_idx:02}_model_states.pt' - - # use for loop instead of direct assignment - # to check for compatibility - for k,v in mp_checkpoints[mp_key]['module'].items(): - try: - mp_checkpoints[mp_key]['module'][k] = shard[k] - except KeyError: - print("ERROR key:{} not found in shard.".format(k)) - - -def shard_pp(sequential,mp_rank,num_layers): - """Shards the model into layers. 
- - :param sequential: the state dict of the sequential model at mp=1 - :param mp_rank: the model parallel rank of the layers - - returns a dict of state dicts for each layer - """ - suffix = f"-model_{mp_rank:02}-model_states.pt" - - layers_seq = dict() - layers_seq[f"layer_00" + suffix] = { - "word_embeddings.weight" : sequential[f"sequential.0.word_embeddings.weight"] - } - layers_seq[f"layer_{num_layers+3:02}" + suffix] = { - "norm.weight" : sequential[f"sequential.{num_layers+3}.norm.weight"], - "norm.bias" : sequential[f"sequential.{num_layers+3}.norm.bias"], - } - - layers_seq[f"layer_{num_layers+4:02}" + suffix] = { - "final_linear.weight" : sequential[f"sequential.{num_layers+4}.final_linear.weight"] - } - - for layer in range(2,num_layers+2): - layer_keys = [x for x in sequential if ".{}.".format(layer) in x] - layers_seq[f"layer_{layer:02}" + suffix] = \ - {k.split('.{}.'.format(layer))[1] : sequential[k] for k in layer_keys} - - return layers_seq - - -def shard_pp_mp(num_mp_ranks,sequential,num_layers): - """Shards the model into layers and model parallel ranks. - - :param num_mp_ranks: the number of model parallel ranks - :param sequential: the state dict of the sequential model at mp=1 - :param num_layers: the number of layers in the model - - returns a dict of state dicts for each layer for each model parallel rank - """ - mp_sharded = shard_sequential_mp(num_mp_ranks=num_mp_ranks, - sequential=sequential) - - layers_pp_mp = {} - for mp_rank, d in mp_sharded.items(): - layers_pp_mp.update( - shard_pp(sequential=d, - mp_rank=mp_rank, - num_layers=num_layers) - ) - return layers_pp_mp - -def convert(hf_model, ckpt_dir, output_dir): - """Converts a huggingface model to a NeoX checkpoint for different - model parallel and pipeline parallel settings degrees. 
- - :param hf_model: the huggingface model - :param ckpt_dir: the directory containing the NeoX checkpoint - :param output_dir: the directory to save the converted checkpoint - returns None - """ - - - os.listdir(ckpt_dir) - - ckpts, layers = {}, {} - for x in os.listdir(ckpt_dir): - if x.startswith("mp_rank"): - ckpts[x] = torch.load(os.path.join(ckpt_dir,x)) - elif x.startswith("layer"): - layers[x] = torch.load(os.path.join(ckpt_dir,x)) - - assert len(layers) + len(ckpts) > 0, "No checkpoints found in {}".format(ckpt_dir) - - os.makedirs(output_dir, exist_ok=True) - seq_state_dict = dict() - convert_hf_to_sequential(hf_model,seq_state_dict) - - if len(ckpts) == 1 and len(layers) == 0: - # pp=0, mp=1 - key = list(ckpts.keys())[0] - ckpts[key]['module'] = seq_state_dict - to_save = ckpts - - elif len(ckpts) > 1 and len(layers) == 0: - # pp=0, mp>1 - sharded_seq = shard_sequential_mp(num_mp_ranks=len(ckpts),sequential=seq_state_dict) - replace_sharded_seq(mp_checkpoints=ckpts,mp_sharded_seq=sharded_seq) - to_save = ckpts - - elif len(ckpts) == 1 and len(layers) > 1: - # pp>0, mp==1 - to_save = shard_pp(sequential=seq_state_dict, - mp_rank=0, - num_layers=hf_model.config.num_hidden_layers) - - elif len(ckpts) > 1 and len(layers) > 1: - # pp>0, mp>1 - to_save = shard_pp_mp(num_mp_ranks=len(ckpts), - sequential=seq_state_dict, - num_layers=hf_model.config.num_hidden_layers) - - else: - raise NotImplementedError("Not implemented for len(ckpts)={} and len(layers)={}".format( - len(ckpts),len(layers))) - - for k,v in to_save.items(): - print("saving {}...".format(os.path.join(output_dir,k))) - torch.save(v,os.path.join(ckpt_dir,k)) - - # copy the checkpoint to the output_dir - print("rm {}/*".format(output_dir)) - os.system("rm {}/*".format(output_dir)) - os.makedirs(output_dir,exist_ok=True) - print("cp {} {}".format(os.path.join(ckpt_dir,'*'),output_dir)) - os.system("cp {} {}".format(os.path.join(ckpt_dir,'*'),output_dir)) - - - # set latest file within the output_dir - latest_file = os.path.join("/".join(output_dir.split("/")[:-1]),'latest') - os.system('rm '+latest_file) - with open(latest_file,'w') as f: - f.write(output_dir.split("/")[-1]) - - - -def consume_neox_args2(args_parsed, overwrite_values=None): - """ - Deepspeed launcher needs to pass the arguments for `pretrain_gpt2.py` across to all machines. - - In order not to have any problems with different configs being mismatched across machines, we instead read the .yaml configuration file from the main rank, - then serialize the arguments to a dictionary, which the deepspeed launcher broadcasts to all machines (`--megatron_config`). - - We then instantiate a new NeoXArgs from the dictionary (`.from_dict`). This should ensure args are never inconsistent across machines. - """ - - with open(args_parsed.megatron_config) as jsonfile: - megatron_config = json.load(jsonfile) - if args_parsed.deepspeed_config is not None: - overwrite_values = NeoXArgs.set_up_autotuning( - args_parsed.deepspeed_config, overwrite_values - ) - if overwrite_values is not None: - megatron_config.update(overwrite_values) - return NeoXArgs.from_dict(args_dict=megatron_config) - -def get_non_existing_dir(tmp_dir): - while os.path.exists(tmp_dir): - tmp_dir = os.path.join(tmp_dir, "tmp_dir") - return tmp_dir - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Convert a Hugging Face GPT-NeoX model back to a sequential model compatible with GPT-NeoX training." 
- ) - parser.add_argument( - "--revision", - type=int, - default=143000, - help="Revision or step of the Pythia model to convert.", - ) - parser.add_argument( - "--output-dir", - type=str, - help="Path to save the converted GPT-NeoX model checkpoint.", - ) - parser.add_argument( - "--config", - nargs="*", - default=[], - help="Path to the config file for the equivalent NeoX model." - ) - parser.add_argument( - "--test", - action="store_true", - help="If set, will run a test to ensure the conversion was successful." - ) - parser.add_argument( - "--download-only", - action="store_true", - help="If set, script will only download the model and not convert it." - ) - - parser.add_argument( - "--ckpt-tmp-dir", - default="/tmp/ckpt_tmp_dir", - help="Directory to store cached hugging face checkpoints. [WARNING: MUST BE VISIBLE TO ALL RANKS]" - ) - parser.add_argument( - "--hf-model-name", - type=str, - help="Name of the hugging face model to download from EleutherAI/{hf-model-name}.}" - ) - - parser.add_argument( - "--cache-dir", - default="/gpfs/alpine/csc499/proj-shared/hf_checkpoints", - help="Directory to store cached hugging face checkpoints." - ) - try: - if int(os.environ['WORLD_SIZE']) > 1: - args = parser.parse_args(MULTI_GPU_ARGS.split(" ")) - else: - args = parser.parse_args() - except KeyError: - args = parser.parse_args() - - - tmp_cache_dir = get_non_existing_dir(args.ckpt_tmp_dir) - - if args.download_only: - hf_model = GPTNeoXForCausalLM.from_pretrained( - f"EleutherAI/{args.hf_model_name}", - revision=f"step{args.revision}", - cache_dir=os.path.join(args.cache_dir,f"{args.hf_model_name}/step{args.revision}") - ).half() - exit(0) - else: - print("======================================================================") - print("Warning the following script will delete files withing {}".format(args.output_dir)) - print("Warning the following script will delete this directory {}".format(tmp_cache_dir)) - print("======================================================================") - # time.sleep(5) - - - if int(os.environ.get('OMPI_COMM_WORLD_SIZE',1)) > 1: - neox_args = consume_neox_args2(args2) - else: - neox_args = NeoXArgs.from_ymls(args.config) - neox_args.configure_distributed_args() - neox_args.build_tokenizer() - neox_args.initialize_tensorboard_writer() - - - # setup logging and timers - # init_wandb(neox_args=neox_args) - # timers = Timers( - # use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer - # ) - initialize_megatron(neox_args=neox_args) - - torch.distributed.barrier() - - model = get_model(neox_args=neox_args, use_cache=True) - optimizer, param_groups = get_optimizer(model=model, neox_args=neox_args) - lr_scheduler = get_learning_rate_scheduler(optimizer=optimizer, neox_args=neox_args) - - model, optimizer, _, lr_scheduler = deepspeed.initialize( - model=model, - optimizer=optimizer, - args=neox_args, - lr_scheduler=lr_scheduler, - dist_init_required=False, - model_parameters=None, - config_params=neox_args.deepspeed_config, - mpu=mpu if not neox_args.is_pipe_parallel else None, - ) - - if os.environ['OMPI_COMM_WORLD_RANK'] == '0': - os.makedirs(f'{tmp_cache_dir}',exist_ok=True) - - torch.distributed.barrier() - neox_args.save = tmp_cache_dir - - save_checkpoint( - neox_args=neox_args, - iteration=0, - model=model, - optimizer=optimizer, - lr_scheduler=lr_scheduler, - ) - print(os.listdir(f'{tmp_cache_dir}')) - ckpt_dir = os.path.join(tmp_cache_dir,'global_step0') - - - if torch.distributed.get_rank() == 0: - config = 
GPTNeoXConfig.from_pretrained( - f"EleutherAI/{args.hf_model_name}", - revision=f"step{args.revision}", - cache_dir=os.path.join(args.cache_dir,f"{args.hf_model_name}/step{args.revision}")) - # does not change the weights, but is needed to align logits - config.update({'hidden_act':'gelu_fast'}) - hf_model = GPTNeoXForCausalLM.from_pretrained( - f"EleutherAI/{args.hf_model_name}", - revision=f"step{args.revision}", - config=config, - cache_dir=os.path.join(args.cache_dir,f"{args.hf_model_name}/step{args.revision}") - ).half() - print("==========================================") - print("Loaded Hugging Face model successfully!") - print("==========================================") - convert(hf_model, ckpt_dir=ckpt_dir, output_dir=args.output_dir) - - if os.environ['OMPI_COMM_WORLD_RANK'] == '0': - # cleanup temp dir - os.system(f"rm -r {tmp_cache_dir}") - - torch.distributed.barrier() - - #verify the conversion can be loaded - neox_args.load = "/".join(args.output_dir.split("/")[:-1]) - print(neox_args.load) - neox_args.finetune=True - load_checkpoint( - neox_args=neox_args, - model=model, - optimizer=optimizer, - lr_scheduler=lr_scheduler, - iteration=None, - ) - print("==========================================") - print("Converted checkpoint successfully loaded!") - print("==========================================") - - - if args.test and torch.distributed.get_world_size() == 1: - # only implemented for world size 1 - - - with torch.no_grad(): - # torch.backends.cudnn.benchmark = False - # torch.use_deterministic_algorithms(True) #setting the CUBLAS_WORKSPACE_CONFIG=:4096:8 environment variable is required for this to work (tested for A6000) - model.eval() - hf_model.eval() - - b = 10 - seq_len = 32 - inputs = torch.randint(0, 50304, (b, seq_len), dtype=torch.long).cuda() - mask = (torch.triu(torch.ones(seq_len, seq_len)) != 1).transpose(0, 1).cuda() - pos_ids = torch.arange(0,seq_len).unsqueeze(0).cuda() - - torch.manual_seed(0) - outputs_neox = model.cuda()((inputs,pos_ids,mask.unsqueeze(0).unsqueeze(0)), neox_args=neox_args) - - torch.manual_seed(0) - outputs = hf_model.cuda()(input_ids=inputs) - - print("HF logits .sum(): ", outputs.logits.to(torch.float32).sum()) - print("NeoX logits .sum(): ", outputs_neox.to(torch.float32).sum()) - - print("\nLogit comparison summary for {} sequences of length {}:".format(b,seq_len)) - print("=============================================================") - for i in range(b): - abs_diff = (outputs.logits[i,...].to(torch.float32) - outputs_neox[i,...].to(torch.float32)).abs() - print("[Random sequence {}] (hflogits - neoxlogits).abs() -- mean: {:.5f}\tmax: {:.5f}\tmin: {:.5f}\tmedian: {:.5f}".format( - i,abs_diff.mean(),abs_diff.max(),abs_diff.min(),abs_diff.median())) - - elif args.test: - print("[INFO] Checkpoint conversion logit test not implemented for distributed world_size > 1. Current world_size: {}".format(torch.distributed.get_world_size())) - - - diff --git a/tools/convert_module_to_hf.py b/tools/convert_module_to_hf.py deleted file mode 100644 index 905bdfa16..000000000 --- a/tools/convert_module_to_hf.py +++ /dev/null @@ -1,334 +0,0 @@ -# Copyright (c) 2023, EleutherAI -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys - -import yaml -import argparse -from tqdm import tqdm -from typing import List - -import torch -from transformers import GPTNeoXConfig, GPTNeoXForCausalLM - - -sys.path.append( - os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) -) -from megatron.tokenizer import build_tokenizer - - -""" -A script for converting saved NeoX Checkpoints to Huggingface (HF) compatible GPT-NeoX type models. - -Note that this script does not support all NeoX features. -Please investigate carefully whether your model is compatible with all architectures supported by the GPTNeoXForCausalLM class in HF. - -(e.g. position embeddings such as AliBi may not be supported by Huggingface's GPT-NeoX architecture. -""" - - -def load_partitions( - input_checkpoint_path, mp_partitions, layer_idx -) -> List[torch.Tensor]: - """Returns a list containing all weights in a given layer from a model (across MP partitions)""" - - loaded_tp_ranks = [ - torch.load( - os.path.join( - input_checkpoint_path, - f"layer_{layer_idx:02}-model_{i:02}-model_states.pt", - ), - map_location=torch.device("cuda" if torch.cuda.is_available() else "cpu"), - ) - for i in range(mp_partitions) - ] - - return loaded_tp_ranks - - -def get_key(loaded_config, key, default=None): - """ - Search for a given key in a NeoX yaml. normalizes underscores -> hyphens - """ - key = key.replace("_", "-") - try: - return loaded_config[key] - except KeyError: - key = key.replace("-", "_") - try: - return loaded_config[key] - except KeyError: - return default - - -def create_config(neox_config): - """take in a loaded yaml from NeoX and assign relevant values to HF config. - Returns: GPTNeoXConfig() object - """ - - class TokenizerArgs: - # kinda hacky. - # this is to get something with the same interface as is used in build_tokenizer() - # without diving into loading a neox_args object or using argparse etc. - def __init__(self, neox_config): - self.make_vocab_size_divisible_by = get_key( - neox_config, "make-vocab-size-divisible-by", default=128 - ) - self.model_parallel_size = get_key(neox_config, "model-parallel-size") - self.vocab_file = get_key(neox_config, "vocab-file") - self.merge_file = get_key(neox_config, "merge-file") - self.tokenizer_type = get_key(neox_config, "tokenizer-type") - - self.rank = 0 - - args = TokenizerArgs(neox_config) - tokenizer = build_tokenizer(args) - try: # GPT2TokenizerFast raises NotImplementedError - pad_token = tokenizer.pad - except: - pad_token = ( - 1 # pad defaulting to 1. follows convention from GPT-NeoX-20b tokenizer - ) - - # TODO: change the default value here based on discussion regarding `gpt_j_tied` config parameter's default - use_tied_lns = get_key(neox_config, "gpt-j-tied", False) - - if use_tied_lns: - raise NotImplementedError( - """ERROR: Huggingface Transformers does not yet support a single shared layernorm - per transformer block for GPT-NeoX models trained w/ GPT-J parallel residuals. - See https://github.com/EleutherAI/gpt-neox/pull/481 for further details.""" - ) - - # set all config values. 
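The config block below is built almost entirely through `get_key`, so the hyphen/underscore normalization above is what lets one script read NeoX YAMLs written in either spelling. A minimal illustration of that behavior (the config dict here is made up, and the snippet assumes the `get_key` defined above):

cfg = {"hidden-size": 768, "num_layers": 12}
assert get_key(cfg, "hidden_size") == 768               # underscore spelling resolves the hyphenated key
assert get_key(cfg, "num-layers") == 12                 # and the reverse lookup also works
assert get_key(cfg, "rotary-pct", default=1.0) == 1.0   # absent keys fall back to the supplied default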
- hf_config = GPTNeoXConfig( - vocab_size=args.padded_vocab_size, - hidden_size=get_key(neox_config, "hidden-size"), - num_hidden_layers=get_key(neox_config, "num-layers"), - num_attention_heads=get_key(neox_config, "num-attention-heads"), - intermediate_size=(get_key(neox_config, "hidden-size") * 4), - hidden_act=get_key(neox_config, "activation", default="gelu"), - rotary_pct=get_key(neox_config, "rotary-pct", default=1.0), - rotary_emb_base=get_key(neox_config, "rotary-emb-base", default=10000), - max_position_embeddings=get_key(neox_config, "max-position-embeddings"), - initializer_range=get_key(neox_config, "init-method-std", 0.02), - layer_norm_eps=get_key(neox_config, "layernorm-epsilon", 1e-5), - use_cache=True, - bos_token_id=tokenizer.eod, - eos_token_id=tokenizer.eod, - tie_word_embeddings=(not get_key(neox_config, "no-weight-tying", False)), - use_parallel_residual=get_key(neox_config, "gpt-j-residual", False), - ) - return hf_config - - -def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): - """convert a NeoX checkpoint to a HF model format. - should perform model-parallel merging correctly - but only supports features allowed by HF GPT-NeoX implementation (e.g. rotary embeddings) - """ - - hf_config = GPTNeoXConfig() - - hf_config = create_config(loaded_config) - - hf_model = GPTNeoXForCausalLM(hf_config) - - # save model in fp16/bf16 if Deepspeed fp16 or bf16 mixed precision was used in config, else 32 bit weights - fp16 = get_key(loaded_config, "fp16") - if fp16: - try: - # this conditional is quite messy because there were a number of ways to specify bf16 or fp16 training - # in DeeperSpeed v1.0 . - if (fp16.get("fp16", None) or fp16["enabled"]) and not (fp16.get("type", None) == "bfloat16"): - hf_model.half() - print("Saving weights in fp16 precision...") - elif fp16.get("type", None) == "bfloat16": - hf_model.to(dtype=torch.bfloat16) - print("Saving weights in bf16 precision...") - except: - print("Model not trained in fp16 / bf16 mixed precision, saving weights in fp32...") - - mp_partitions = get_key(loaded_config, "model-parallel-size") - - ### Embedding layer ### - loaded_tp_ranks = load_partitions(input_checkpoint_path, mp_partitions, 0) - hf_model.gpt_neox.embed_in.load_state_dict( - { - "weight": torch.cat( - [t["word_embeddings.weight"] for t in loaded_tp_ranks], dim=0 - ) - } - ) - - assert ( - hf_config.vocab_size == hf_model.gpt_neox.embed_in.weight.shape[0] - ), f"ERROR: calculated vocab size {hf_config.vocab_size} != embed param size {hf_model.gpt_neox.embed_in.shape[0]}" - ### End Embedding Layer ### - - for layer_i in tqdm(range(get_key(loaded_config, "num-layers"))): - - # get layer from hf model - hf_layer = hf_model.gpt_neox.layers[layer_i] - - # + 2 bc of embed layer and a dummy _pre_transformer_block - loaded_tp_ranks = load_partitions( - input_checkpoint_path, mp_partitions, layer_i + 2 - ) - - state_dict = {} - for key in [ - "attention.dense.weight", - "mlp.dense_4h_to_h.weight", - ]: - state_dict[key] = torch.cat([t[key] for t in loaded_tp_ranks], dim=1) - - # average layernorm stats over mp ranks - for key in [ - "input_layernorm.weight", - "input_layernorm.bias", - "post_attention_layernorm.weight", - "post_attention_layernorm.bias", - ]: - state_dict[key] = (sum([t[key] for t in loaded_tp_ranks])) / len( - loaded_tp_ranks - ) - - # LinearWithTPMerge - for key in [ - "mlp.dense_h_to_4h.weight", - "mlp.dense_h_to_4h.bias", - "attention.query_key_value.weight", - "attention.query_key_value.bias", - ]: - state_dict[key] = 
torch.cat([t[key] for t in loaded_tp_ranks], dim=0) - - # LinearWithTPSplitBias - for key in [ - "mlp.dense_4h_to_h.bias", - "attention.dense.bias", - ]: - state_dict[key] = sum([t[key] for t in loaded_tp_ranks]) - - # Just take one - state_dict["attention.rotary_emb.inv_freq"] = loaded_tp_ranks[0][ - "attention.rotary_emb.inv_freq" - ] - state_dict["attention.bias"] = hf_layer.state_dict()["attention.bias"] - state_dict["attention.masked_bias"] = hf_layer.state_dict()[ - "attention.masked_bias" - ] - - # load state_dict into layer - hf_layer.load_state_dict(state_dict) - - # Load final layer norm - loaded_tp_ranks = load_partitions( - input_checkpoint_path, mp_partitions, get_key(loaded_config, "num-layers") + 3 - ) - - hf_model.gpt_neox.final_layer_norm.load_state_dict( - { - "weight": (sum([t["norm.weight"] for t in loaded_tp_ranks])) - / len(loaded_tp_ranks), - "bias": (sum([t["norm.bias"] for t in loaded_tp_ranks])) - / len(loaded_tp_ranks), - } - ) - del loaded_tp_ranks - - # Load output embedding - loaded_tp_ranks = load_partitions( - input_checkpoint_path, mp_partitions, get_key(loaded_config, "num-layers") + 4 - ) - - hf_model.embed_out.load_state_dict( - { - "weight": torch.cat( - [t["final_linear.weight"] for t in loaded_tp_ranks], dim=0 - ), - } - ) - - del loaded_tp_ranks - - return hf_model - - -if __name__ == "__main__": - - # before running script: - # `pip install --upgrade transformers` - # `huggingface-cli login` - # - from huggingface_hub import create_repo, HfApi - - parser = argparse.ArgumentParser( - description="Merge MP partitions and convert to HF Model." - ) - parser.add_argument( - "--input_dir", - type=str, - help="Path to NeoX checkpoint, e.g. /path/to/model/global_step143000", - ) - parser.add_argument( - "--config_file", - type=str, - help="Path to config file for the input NeoX checkpoint.", - ) - parser.add_argument( - "--output_dir", - type=str, - help="Output dir, where to save the HF Model, tokenizer, and configs", - ) - parser.add_argument( - "--upload", - action="store_true", - help="Set to true in order to upload to the HF Hub directly.", - ) - args = parser.parse_args() - - with open(args.config_file) as f: - loaded_config = yaml.full_load(f) - - hf_model = convert(args.input_dir, loaded_config, args.output_dir) - - hf_model.save_pretrained(args.output_dir) - - # save tokenizer to directory as well, for easy loading of model as a HF model - tokenizer_type = get_key(loaded_config, "tokenizer-type") - - if tokenizer_type == "HFTokenizer": - print(f"saving tokenizer from file {get_key(loaded_config, 'vocab-file')}") - from transformers import PreTrainedTokenizerFast - - tokenizer = PreTrainedTokenizerFast( - tokenizer_file=get_key(loaded_config, "vocab-file") - ) - print("loaded tokenizer: ", tokenizer) - tokenizer.save_pretrained(args.output_dir) - print("tokenizer saved!") - - if args.upload: - repo_name = input("Provide a repository name for the HF Hub: ") - create_repo(repo_name, repo_type="model", private=False, use_auth_token=True) - - api = HfApi() - api.upload_folder( - folder_path=args.output_dir, - repo_id=repo_name, - repo_type="model", - ) diff --git a/tools/convert_raw_llama_weights_to_neox.py b/tools/convert_raw_llama_weights_to_neox.py deleted file mode 100644 index 5940fa856..000000000 --- a/tools/convert_raw_llama_weights_to_neox.py +++ /dev/null @@ -1,635 +0,0 @@ -# Copyright (c) 2023, EleutherAI -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -import torch -import json -import math -import tqdm.auto as tqdm - - -INTERMEDIATE_SIZE_MAP = { - "7B": 11008, - "13B": 13824, - "30B": 17920, - "65B": 22016, -} -NUM_SHARDS = { - "7B": 1, - "13B": 2, - "30B": 4, - "65B": 8, -} - - -def compute_intermediate_size(n): - return int(math.ceil(n * 8 / 3) + 255) // 256 * 256 - - -def read_json(path): - with open(path, "r") as f: - return json.load(f) - - -def write_json(text, path): - with open(path, "w") as f: - json.dump(text, f) - - -def write_file(text, path): - with open(path, "w") as f: - f.write(text) - - -def convert_model_pipeline( - output_base_path, input_base_path, model_size: str, num_output_shards: int -): - assert model_size in NUM_SHARDS - - model_path = os.path.join(output_base_path, "global_step0") - os.makedirs(model_path, exist_ok=True) - write_file("global_step0", os.path.join(output_base_path, "latest")) - - params = read_json(os.path.join(input_base_path, "params.json")) - num_input_shards = NUM_SHARDS[model_size] - num_layers = params["n_layers"] - num_heads = params["n_heads"] - num_heads_per_input_shard = num_heads // num_input_shards - num_heads_per_output_shard = num_heads // num_output_shards - hidden_size = params["dim"] - dims_per_head = hidden_size // num_heads - # base = 10000.0 - # inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) - - def permute_rotary(w): - assert w.shape == (num_heads, dims_per_head, hidden_size) - return ( - w.view(num_heads, dims_per_head // 2, 2, hidden_size) - .transpose(1, 2) - .reshape(num_heads, dims_per_head, hidden_size) - ) - - pbar = tqdm.tqdm(total=num_input_shards + num_layers + 3) - - pbar.set_description(f"Loading shard") - loaded = [] - for i in range(num_input_shards): - loaded.append( - torch.load( - os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), - map_location="cpu", - ) - ) - pbar.set_description(f"Loaded shard {i}/{num_input_shards}") - pbar.update(1) - helper = Helper( - loaded=loaded, - model_path=model_path, - num_output_shards=num_output_shards, - model_size=model_size, - pipeline_parallel=False, - ) - - sequential_cache = [{} for _ in range(num_output_shards)] - - # Embedding in - embeddings_in = torch.cat( - [ - loaded[rank]["tok_embeddings.weight"].cpu() - for rank in range(num_input_shards) - ], - dim=1, - ) - helper.save_shards( - {"word_embeddings.weight": helper.shard(embeddings_in, dim=0)}, layer_i=0 - ) - helper.del_loaded("tok_embeddings.weight") - pbar.set_description(f"Saved embeddings") - pbar.update(1) - - # Norms - helper.save_duplicates( - {"norm.scale": loaded[0]["norm.weight"]}, layer_i=num_layers + 3 - ) - helper.del_loaded("norm.weight") - pbar.set_description(f"Saved final norm") - pbar.update(1) - - # Embedding out - embeddings_out = torch.cat( - [loaded[rank]["output.weight"].cpu() for rank in range(num_input_shards)], dim=0 - ) - helper.save_shards( - {"final_linear.weight": helper.shard(embeddings_out, dim=0)}, - layer_i=num_layers + 4, - ) - helper.del_loaded("output.weight") - pbar.set_description(f"Saved out embeddings") - pbar.update(1) - - # Layers - 
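The trickiest step in the layer loop below is `permute_rotary`: the raw LLaMA checkpoint stores the rotary dimensions of `wq`/`wk` as interleaved pairs, and the view/transpose/reshape reorders the rows of each head so the even-indexed rows come first and the odd-indexed rows second, i.e. the half-split layout that the NeoX-style rotary embedding expects. A toy, self-contained sketch of that reordering (the tiny shapes are made up purely to make the permutation visible, not taken from any real model):

import torch

num_heads, dims_per_head, hidden_size = 1, 6, 1  # toy sizes

def permute_rotary(w):
    # Same view/transpose/reshape as in the converter above.
    return (
        w.view(num_heads, dims_per_head // 2, 2, hidden_size)
        .transpose(1, 2)
        .reshape(num_heads, dims_per_head, hidden_size)
    )

w = torch.arange(dims_per_head).view(1, dims_per_head, 1)  # rows 0..5 stand in for weight rows
print(permute_rotary(w).flatten().tolist())                # [0, 2, 4, 1, 3, 5]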
if model_size == "7B": - rope_freqs = loaded[0]["layers.0.attention.inner_attention.rope.freqs"] - helper.del_loaded("layers.0.attention.inner_attention.rope.freqs") - else: - rope_freqs = loaded[0]["rope.freqs"] - helper.del_loaded("rope.freqs") - for layer_i in range(num_layers): - - # Linear - attn_wo = helper.shard( - torch.cat( - [ - loaded[rank][f"layers.{layer_i}.attention.wo.weight"] - for rank in range(num_input_shards) - ], - dim=1, - ), - dim=1, - ) - mlp_w1 = helper.shard( - torch.cat( - [ - loaded[rank][f"layers.{layer_i}.feed_forward.w1.weight"] - for rank in range(num_input_shards) - ], - dim=0, - ), - dim=0, - ) - mlp_w2 = helper.shard( - torch.cat( - [ - loaded[rank][f"layers.{layer_i}.feed_forward.w2.weight"] - for rank in range(num_input_shards) - ], - dim=1, - ), - dim=1, - ) - mlp_w3 = helper.shard( - torch.cat( - [ - loaded[rank][f"layers.{layer_i}.feed_forward.w3.weight"] - for rank in range(num_input_shards) - ], - dim=0, - ), - dim=0, - ) - helper.del_loaded(f"layers.{layer_i}.attention.wo.weight") - helper.del_loaded(f"layers.{layer_i}.feed_forward.w1.weight") - helper.del_loaded(f"layers.{layer_i}.feed_forward.w2.weight") - helper.del_loaded(f"layers.{layer_i}.feed_forward.w3.weight") - - # Attention - w_q = permute_rotary( - torch.cat( - [ - loaded[rank][f"layers.{layer_i}.attention.wq.weight"].view( - num_heads_per_input_shard, dims_per_head, hidden_size - ) - for rank in range(num_input_shards) - ], - dim=0, - ) - ) - w_k = permute_rotary( - torch.cat( - [ - loaded[rank][f"layers.{layer_i}.attention.wk.weight"].view( - num_heads_per_input_shard, dims_per_head, hidden_size - ) - for rank in range(num_input_shards) - ], - dim=0, - ) - ) - w_v = torch.cat( - [ - loaded[rank][f"layers.{layer_i}.attention.wv.weight"].view( - num_heads_per_input_shard, dims_per_head, hidden_size - ) - for rank in range(num_input_shards) - ], - dim=0, - ) - sharded_qkv = torch.stack( - [ - helper.shard( - w_q, dim=0 - ), # num_output_shards, num_heads_per_output_shard, dims_per_head, hidden_size - helper.shard(w_k, dim=0), - helper.shard(w_v, dim=0), - ], - dim=2, - ) # num_output_shards, num_heads_per_output_shard, QKV=3, dims_per_head, hidden_size - sharded_qkv = sharded_qkv.view( - num_output_shards, - num_heads_per_output_shard * 3 * dims_per_head, - hidden_size, - ) - helper.del_loaded(f"layers.{layer_i}.attention.wq.weight") - helper.del_loaded(f"layers.{layer_i}.attention.wk.weight") - helper.del_loaded(f"layers.{layer_i}.attention.wv.weight") - - # Duplicated - input_layernorm = loaded[0][f"layers.{layer_i}.attention_norm.weight"] - post_attention_layernorm = loaded[0][f"layers.{layer_i}.ffn_norm.weight"] - helper.del_loaded(f"layers.{layer_i}.attention_norm.weight") - helper.del_loaded(f"layers.{layer_i}.ffn_norm.weight") - - for out_rank in range(num_output_shards): - helper.save( - { - "attention.query_key_value.weight": sharded_qkv[out_rank], - # Sharded layers - "attention.dense.weight": attn_wo[out_rank].clone(), - "mlp.w1.weight": mlp_w1[out_rank].clone(), - "mlp.w2.weight": mlp_w2[out_rank].clone(), - "mlp.w3.weight": mlp_w3[out_rank].clone(), - # Duplicated layers - "input_layernorm.scale": input_layernorm, - "post_attention_layernorm.scale": post_attention_layernorm, - "attention.rotary_emb.inv_freq": rope_freqs, - }, - layer_i=layer_i + 2, - rank=out_rank, - ) - - pbar.set_description(f"Saved layer {layer_i} / {num_layers}") - pbar.update(1) - - model_state = { - "dp_world_size": 1, - "mp_world_size": num_output_shards, - "module": {}, - "optimizer": {}, - 
"global_steps": 1, - "skipped_steps": 1, - "iteration": 1, - } - for rank in range(num_output_shards): - torch.save( - model_state, os.path.join(model_path, f"mp_rank_{rank:02d}_model_states.pt") - ) - pbar.set_description("Done.") - - -def convert_model_sequential( - output_base_path, input_base_path, model_size: str, num_output_shards: int -): - assert model_size in NUM_SHARDS - - model_path = os.path.join(output_base_path, "global_step0") - os.makedirs(model_path, exist_ok=True) - write_file("global_step0", os.path.join(output_base_path, "latest")) - - params = read_json(os.path.join(input_base_path, "params.json")) - num_input_shards = NUM_SHARDS[model_size] - num_layers = params["n_layers"] - num_heads = params["n_heads"] - num_heads_per_input_shard = num_heads // num_input_shards - num_heads_per_output_shard = num_heads // num_output_shards - hidden_size = params["dim"] - dims_per_head = hidden_size // num_heads - # base = 10000.0 - # inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) - - def permute_rotary(w): - assert w.shape == (num_heads, dims_per_head, hidden_size) - return ( - w.view(num_heads, dims_per_head // 2, 2, hidden_size) - .transpose(1, 2) - .reshape(num_heads, dims_per_head, hidden_size) - ) - - pbar = tqdm.tqdm(total=num_input_shards + num_output_shards) - - pbar.set_description(f"Loading shard") - loaded = [] - for i in range(num_input_shards): - loaded.append( - torch.load( - os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), - map_location="cpu", - ) - ) - pbar.set_description(f"Loaded shard {i}/{num_input_shards}") - pbar.update(1) - helper = Helper( - loaded=loaded, - model_path=model_path, - num_output_shards=num_output_shards, - model_size=model_size, - pipeline_parallel=False, - ) - - # Embedding in - embeddings_in = torch.cat( - [ - loaded[rank]["tok_embeddings.weight"].cpu() - for rank in range(num_input_shards) - ], - dim=1, - ) - helper.add_sequential_shard( - {"word_embeddings.weight": helper.shard(embeddings_in, dim=0)}, layer_i=0 - ) - helper.del_loaded("tok_embeddings.weight") - - # Norms - helper.add_sequential_duplicates( - {"norm.scale": loaded[0]["norm.weight"]}, layer_i=num_layers + 3 - ) - helper.del_loaded("norm.weight") - - # Embedding out - embeddings_out = torch.cat( - [loaded[rank]["output.weight"].cpu() for rank in range(num_input_shards)], dim=0 - ) - helper.add_sequential_shard( - {"final_linear.weight": helper.shard(embeddings_out, dim=0)}, - layer_i=num_layers + 4, - ) - helper.del_loaded("output.weight") - - # Layers - if model_size == "7B": - rope_freqs = loaded[0]["layers.0.attention.inner_attention.rope.freqs"] - helper.del_loaded("layers.0.attention.inner_attention.rope.freqs") - else: - rope_freqs = loaded[0]["rope.freqs"] - helper.del_loaded("rope.freqs") - for layer_i in range(num_layers): - - # Linear - attn_wo = helper.shard( - torch.cat( - [ - loaded[rank][f"layers.{layer_i}.attention.wo.weight"] - for rank in range(num_input_shards) - ], - dim=1, - ), - dim=1, - ) - mlp_w1 = helper.shard( - torch.cat( - [ - loaded[rank][f"layers.{layer_i}.feed_forward.w1.weight"] - for rank in range(num_input_shards) - ], - dim=0, - ), - dim=0, - ) - mlp_w2 = helper.shard( - torch.cat( - [ - loaded[rank][f"layers.{layer_i}.feed_forward.w2.weight"] - for rank in range(num_input_shards) - ], - dim=1, - ), - dim=1, - ) - mlp_w3 = helper.shard( - torch.cat( - [ - loaded[rank][f"layers.{layer_i}.feed_forward.w3.weight"] - for rank in range(num_input_shards) - ], - dim=0, - ), - dim=0, - ) - 
helper.del_loaded(f"layers.{layer_i}.attention.wo.weight") - helper.del_loaded(f"layers.{layer_i}.feed_forward.w1.weight") - helper.del_loaded(f"layers.{layer_i}.feed_forward.w2.weight") - helper.del_loaded(f"layers.{layer_i}.feed_forward.w3.weight") - - # Attention - w_q = permute_rotary( - torch.cat( - [ - loaded[rank][f"layers.{layer_i}.attention.wq.weight"].view( - num_heads_per_input_shard, dims_per_head, hidden_size - ) - for rank in range(num_input_shards) - ], - dim=0, - ) - ) - w_k = permute_rotary( - torch.cat( - [ - loaded[rank][f"layers.{layer_i}.attention.wk.weight"].view( - num_heads_per_input_shard, dims_per_head, hidden_size - ) - for rank in range(num_input_shards) - ], - dim=0, - ) - ) - w_v = torch.cat( - [ - loaded[rank][f"layers.{layer_i}.attention.wv.weight"].view( - num_heads_per_input_shard, dims_per_head, hidden_size - ) - for rank in range(num_input_shards) - ], - dim=0, - ) - sharded_qkv = torch.stack( - [ - helper.shard( - w_q, dim=0 - ), # num_output_shards, num_heads_per_output_shard, dims_per_head, hidden_size - helper.shard(w_k, dim=0), - helper.shard(w_v, dim=0), - ], - dim=2, - ) # num_output_shards, num_heads_per_output_shard, QKV=3, dims_per_head, hidden_size - sharded_qkv = sharded_qkv.view( - num_output_shards, - num_heads_per_output_shard * 3 * dims_per_head, - hidden_size, - ) - helper.del_loaded(f"layers.{layer_i}.attention.wq.weight") - helper.del_loaded(f"layers.{layer_i}.attention.wk.weight") - helper.del_loaded(f"layers.{layer_i}.attention.wv.weight") - - # Duplicated - input_layernorm = loaded[0][f"layers.{layer_i}.attention_norm.weight"] - post_attention_layernorm = loaded[0][f"layers.{layer_i}.ffn_norm.weight"] - helper.del_loaded(f"layers.{layer_i}.attention_norm.weight") - helper.del_loaded(f"layers.{layer_i}.ffn_norm.weight") - - for out_rank in range(num_output_shards): - helper.add_sequential( - { - "attention.query_key_value.weight": sharded_qkv[out_rank], - # Sharded layers - "attention.dense.weight": attn_wo[out_rank].clone(), - "mlp.w1.weight": mlp_w1[out_rank].clone(), - "mlp.w2.weight": mlp_w2[out_rank].clone(), - "mlp.w3.weight": mlp_w3[out_rank].clone(), - # Duplicated layers - "input_layernorm.scale": input_layernorm, - "post_attention_layernorm.scale": post_attention_layernorm, - "attention.rotary_emb.inv_freq": rope_freqs, - }, - layer_i=layer_i + 2, - rank=out_rank, - ) - - for rank in range(num_output_shards): - model_state = { - "dp_world_size": 1, - "mp_world_size": num_output_shards, - "module": helper.sequential_cache[rank], - "optimizer": {}, - "global_steps": 1, - "skipped_steps": 1, - "iteration": 1, - } - torch.save( - model_state, os.path.join(model_path, f"mp_rank_{rank:02d}_model_states.pt") - ) - pbar.set_description(f"Saved shard {rank}") - pbar.update(1) - pbar.set_description("Done.") - - -class Helper: - def __init__( - self, loaded, model_size, num_output_shards, model_path, pipeline_parallel - ): - self.loaded = loaded - self.model_size = model_size - self.num_output_shards = num_output_shards - self.model_path = model_path - - self.pipeline_parallel = pipeline_parallel - self.sequential_cache = [{} for _ in range(num_output_shards)] - - def del_loaded(self, key: str): - # Remove from memory as we go along - for loaded_shared in self.loaded: - del loaded_shared[key] - - def save_shards(self, dictionary, layer_i: int): - for k, v in dictionary.items(): - assert v.shape[0] == self.num_output_shards - for rank in range(self.num_output_shards): - torch.save( - {k: v[rank].clone() for k, v in 
dictionary.items()}, - self.save_path(layer_i=layer_i, rank=rank), - ) - - def save_duplicates(self, dictionary, layer_i: int): - for rank in range(self.num_output_shards): - torch.save( - {k: v.clone() for k, v in dictionary.items()}, - self.save_path(layer_i=layer_i, rank=rank), - ) - - def save(self, obj, layer_i, rank): - torch.save(obj, self.save_path(layer_i=layer_i + 2, rank=rank)) - - def shard(self, x, dim): - x_shape = list(x.shape) - assert x_shape[dim] % self.num_output_shards == 0 - new_x_shape = ( - x_shape[:dim] - + [self.num_output_shards, x_shape[dim] // self.num_output_shards] - + x_shape[dim + 1 :] - ) - x = x.view(*new_x_shape) - return torch.movedim(x, 0, dim) - - def save_path(self, layer_i, rank): - return os.path.join( - self.model_path, f"layer_{layer_i:02d}-model_{rank:02d}-model_states.pt" - ) - - def add_sequential_shard(self, dictionary, layer_i): - assert not self.pipeline_parallel - for k, v in dictionary.items(): - for rank in range(self.num_output_shards): - self.sequential_cache[rank][f"sequential.{layer_i}.{k}"] = v[ - rank - ].clone() - - def add_sequential_duplicates(self, dictionary, layer_i): - assert not self.pipeline_parallel - for k, v in dictionary.items(): - for rank in range(self.num_output_shards): - self.sequential_cache[rank][f"sequential.{layer_i}.{k}"] = v.clone() - - def add_sequential(self, dictionary, layer_i, rank): - assert not self.pipeline_parallel - for k, v in dictionary.items(): - self.sequential_cache[rank][f"sequential.{layer_i}.{k}"] = v.clone() - - -def main(): - parser = argparse.ArgumentParser( - description="Convert raw LLaMA checkpoints to GPT-NeoX format." - ) - parser.add_argument( - "--input_dir", - help="Location of LLaMA weights, which contains tokenizer.model and model folders", - ) - parser.add_argument( - "--model_size", - choices=["7B", "13B", "30B", "65B", "tokenizer_only"], - ) - parser.add_argument( - "--output_dir", - help="Location to write GPT-NeoX mode", - ) - parser.add_argument( - "--num_output_shards", - type=int, - default=1, - ) - parser.add_argument( - "--pipeline_parallel", - action="store_true", - help="Only use if PP>1", - ) - args = parser.parse_args() - if args.pipeline_parallel: - print("parallel") - convert_model_pipeline( - output_base_path=args.output_dir, - input_base_path=os.path.join(args.input_dir, args.model_size), - model_size=args.model_size, - num_output_shards=args.num_output_shards, - ) - else: - print("sequential") - convert_model_sequential( - output_base_path=args.output_dir, - input_base_path=os.path.join(args.input_dir, args.model_size), - model_size=args.model_size, - num_output_shards=args.num_output_shards, - ) - - -if __name__ == "__main__": - main() diff --git a/tools/convert_sequential_to_hf.py b/tools/convert_sequential_to_hf.py deleted file mode 100644 index 5a66219bf..000000000 --- a/tools/convert_sequential_to_hf.py +++ /dev/null @@ -1,373 +0,0 @@ -# Copyright (c) 2023, EleutherAI -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import sys - -import yaml -import argparse -from tqdm import tqdm - -import torch -from transformers import GPTNeoXConfig, GPTNeoXForCausalLM - -from typing import List - -sys.path.append( - os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) -) -from megatron.tokenizer import build_tokenizer - - -""" -A script for converting saved NeoX Checkpoints to Huggingface (HF) compatible GPT-NeoX type models. - -Note that this script does not support all NeoX features. -Please investigate carefully whether your model is compatible with all architectures supported by the GPTNeoXForCausalLM class in HF. - -(e.g. position embeddings such as AliBi may not be supported by Huggingface's GPT-NeoX architecture. -""" - - -def load_partitions(input_checkpoint_path, mp_partitions) -> List[torch.Tensor]: - """Returns a list containing all states from a model (across MP partitions)""" - - loaded_tp_ranks = [ - torch.load( - os.path.join( - input_checkpoint_path, - f"mp_rank_{i:02}_model_states.pt", - ), - map_location=torch.device("cuda" if torch.cuda.is_available() else "cpu"), - ) - for i in range(mp_partitions) - ] - - return loaded_tp_ranks - - -def get_state( - state_dicts: List[torch.Tensor], - key: str, - layer_idx: int, -) -> torch.Tensor: - """Accesses all MP partitions of a given layer/weight's state.""" - # main DeepSpeed saves each MP partition - key = f"sequential.{layer_idx}.{key}" - - return [state_dict["module"][key] for state_dict in state_dicts] - - -def get_key(loaded_config, key, default=None): - """ - Search for a given key in a NeoX yaml. normalizes underscores -> hyphens - """ - key = key.replace("_", "-") - try: - return loaded_config[key] - except KeyError: - key = key.replace("-", "_") - try: - return loaded_config[key] - except KeyError: - return default - - -def create_config(neox_config): - """take in a loaded yaml from NeoX and assign relevant values to HF config. - Returns: GPTNeoXConfig() object - """ - - class TokenizerArgs: - # kinda hacky. - # this is to get something with the same interface as is used in build_tokenizer() - # without diving into loading a neox_args object or using argparse etc. - def __init__(self, neox_config): - self.make_vocab_size_divisible_by = get_key( - neox_config, "make-vocab-size-divisible-by", default=128 - ) - self.model_parallel_size = get_key(neox_config, "model-parallel-size") - self.vocab_file = get_key(neox_config, "vocab-file") - self.merge_file = get_key(neox_config, "merge-file") - self.tokenizer_type = get_key(neox_config, "tokenizer-type") - - self.rank = 0 - - args = TokenizerArgs(neox_config) - tokenizer = build_tokenizer(args) - try: # GPT2TokenizerFast raises NotImplementedError - pad_token = tokenizer.pad - except: - pad_token = ( - 1 # pad defaulting to 1. follows convention from GPT-NeoX-20b tokenizer - ) - - # TODO: change the default value here based on discussion regarding `gpt_j_tied` config parameter's default - use_tied_lns = get_key(neox_config, "gpt-j-tied", False) - - if use_tied_lns: - raise NotImplementedError( - """ERROR: Huggingface Transformers does not yet support a single shared layernorm - per transformer block for GPT-NeoX models trained w/ GPT-J parallel residuals. - See https://github.com/EleutherAI/gpt-neox/pull/481 for further details.""" - ) - - # set all config values. 
- hf_config = GPTNeoXConfig( - vocab_size=args.padded_vocab_size, - hidden_size=get_key(neox_config, "hidden-size"), - num_hidden_layers=get_key(neox_config, "num-layers"), - num_attention_heads=get_key(neox_config, "num-attention-heads"), - intermediate_size=(get_key(neox_config, "hidden-size") * 4), - hidden_act=get_key(neox_config, "activation", default="gelu"), - rotary_pct=get_key(neox_config, "rotary-pct", default=1.0), - rotary_emb_base=get_key(neox_config, "rotary-emb-base", default=10000), - max_position_embeddings=get_key(neox_config, "max-position-embeddings"), - initializer_range=get_key(neox_config, "init-method-std", 0.02), - layer_norm_eps=get_key(neox_config, "layernorm-epsilon", 1e-5), - use_cache=True, - bos_token_id=tokenizer.eod, - eos_token_id=tokenizer.eod, - tie_word_embeddings=(not get_key(neox_config, "no-weight-tying", False)), - use_parallel_residual=get_key(neox_config, "gpt-j-residual", False), - ) - return hf_config - - -def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): - """convert a NeoX checkpoint to a HF model format. - should perform model-parallel merging correctly - but only supports features allowed by HF GPT-NeoX implementation (e.g. rotary embeddings) - """ - - hf_config = GPTNeoXConfig() - - hf_config = create_config(loaded_config) - - hf_model = GPTNeoXForCausalLM( - hf_config - ) - - # save model in FP16 if Deepspeed fp16 was used in config, else 32 bit - fp16 = get_key(loaded_config, "fp16") - # save model in fp16/bf16 if Deepspeed fp16 or bf16 mixed precision was used in config, else 32 bit weights - fp16 = get_key(loaded_config, "fp16") - if fp16: - try: - # current behavior is to pass "fp16": {"enabled": true}, when using upstream Deepspeed - if fp16["enabled"]: - hf_model.half() - print("Saving weights in fp16 precision...") - except: - try: - # attempt to access bf16 dict in yaml file, if fp16 not enabled - bf16 = get_key(loaded_config, "bf16") - if bf16: - hf_model.to(dtype=torch.bfloat16) - print("Saving weights in bf16 precision...") - except: - print("Model not trained in fp16 / bf16 mixed precision, saving weights in fp32...") - - mp_partitions = get_key(loaded_config, "model-parallel-size") - - # DeepSpeed main saves all model states from an MP rank in one file. 
load the MP ranks only once and index into them with get_state() - loaded_tp_ranks = load_partitions(input_checkpoint_path, mp_partitions) - - ### Embedding layer ### - # Embedding is layer idx 0 - hf_model.gpt_neox.embed_in.load_state_dict( - { - "weight": torch.cat( - get_state(loaded_tp_ranks, "word_embeddings.weight", 0), dim=0 - ) - } - ) - assert ( - hf_config.vocab_size == hf_model.gpt_neox.embed_in.weight.shape[0] - ), f"ERROR: calculated vocab size {hf_config.vocab_size} != embed param size {hf_model.gpt_neox.embed_in.shape[0]}" - ### End Embedding Layer ### - - for layer_i in tqdm(range(get_key(loaded_config, "num-layers"))): - - # get layer from hf model - hf_layer = hf_model.gpt_neox.layers[layer_i] - - # + 2 bc of embed layer and a dummy _pre_transformer_block - state_dict = {} - for key in [ - "attention.dense.weight", - "mlp.dense_4h_to_h.weight", - ]: - state_dict[key] = torch.cat( - get_state(loaded_tp_ranks, key, layer_i + 2), dim=1 - ) - - # average layernorm stats over mp ranks - for key in [ - "input_layernorm.weight", - "input_layernorm.bias", - "post_attention_layernorm.weight", - "post_attention_layernorm.bias", - ]: - state_dict[key] = sum(get_state(loaded_tp_ranks, key, layer_i + 2)) / len( - loaded_tp_ranks - ) - - # LinearWithTPMerge - for key in [ - "mlp.dense_h_to_4h.weight", - "mlp.dense_h_to_4h.bias", - "attention.query_key_value.weight", - "attention.query_key_value.bias", - ]: - state_dict[key] = torch.cat( - get_state(loaded_tp_ranks, key, layer_i + 2), dim=0 - ) - - # LinearWithTPSplitBias - for key in [ - "mlp.dense_4h_to_h.bias", - "attention.dense.bias", - ]: - state_dict[key] = sum(get_state(loaded_tp_ranks, key, layer_i + 2)) - - # Just take one - state_dict["attention.rotary_emb.inv_freq"] = get_state( - loaded_tp_ranks, "attention.rotary_emb.inv_freq", layer_i + 2 - )[0] - - state_dict["attention.bias"] = hf_layer.state_dict()["attention.bias"] - state_dict["attention.masked_bias"] = hf_layer.state_dict()[ - "attention.masked_bias" - ] - - # load state_dict into layer - hf_layer.load_state_dict(state_dict) - - # Load final layer norm - hf_model.gpt_neox.final_layer_norm.load_state_dict( - { - "weight": ( - sum( - get_state( - loaded_tp_ranks, - "norm.weight", - get_key(loaded_config, "num-layers") + 3, - ) - ) - ) - / len(loaded_tp_ranks), - "bias": ( - sum( - get_state( - loaded_tp_ranks, - "norm.bias", - get_key(loaded_config, "num-layers") + 3, - ) - ) - ) - / len(loaded_tp_ranks), - } - ) - # output embedding / LM head - hf_model.embed_out.load_state_dict( - { - "weight": torch.cat( - get_state( - loaded_tp_ranks, - "final_linear.weight", - get_key(loaded_config, "num-layers") + 4, - ), - dim=0, - ), - } - ) - - del loaded_tp_ranks - - return hf_model - - -if __name__ == "__main__": - - # before running script: - # `pip install --upgrade transformers` - # `huggingface-cli login` - # - from huggingface_hub import create_repo, HfApi - - parser = argparse.ArgumentParser( - description="Merge MP partitions and convert to HF Model." - ) - parser.add_argument( - "--input_dir", - type=str, - help="Path to NeoX checkpoint, e.g. 
/path/to/model/global_step143000", - ) - parser.add_argument( - "--config_file", - type=str, - help="Path to config file for the input NeoX checkpoint.", - ) - parser.add_argument( - "--output_dir", - type=str, - help="Output dir, where to save the HF Model, tokenizer, and configs", - ) - parser.add_argument( - "--upload", - action="store_true", - help="Set to true in order to upload to the HF Hub directly.", - ) - args = parser.parse_args() - - with open(args.config_file) as f: - loaded_config = yaml.full_load(f) - - hf_model = convert(args.input_dir, loaded_config, args.output_dir) - - hf_model.save_pretrained(args.output_dir) - - # save tokenizer to directory as well, for easy loading of model as a HF model - tokenizer_type = get_key(loaded_config, "tokenizer-type") - - if tokenizer_type == "HFTokenizer": - print(f"saving tokenizer from file {get_key(loaded_config, 'vocab-file')}") - from transformers import PreTrainedTokenizerFast - - tokenizer = PreTrainedTokenizerFast( - tokenizer_file=get_key(loaded_config, "vocab-file") - ) - print("loaded tokenizer: ", tokenizer) - tokenizer.save_pretrained(args.output_dir) - print("tokenizer saved!") - - print( - tokenizer.decode( - hf_model.generate( - tokenizer.encode("Hello, I am testing ", return_tensors="pt") - )[0] - ) - ) - - if args.upload: - repo_name = input("Provide a repository name for the HF Hub: ") - create_repo(repo_name, repo_type="model", private=False, use_auth_token=True) - - api = HfApi() - api.upload_folder( - folder_path=args.output_dir, - repo_id=repo_name, - repo_type="model", - ) diff --git a/tools/corpora.py b/tools/corpora.py deleted file mode 100644 index b9e846454..000000000 --- a/tools/corpora.py +++ /dev/null @@ -1,364 +0,0 @@ -# Copyright (c) 2021, EleutherAI -# This file is based on code by the authors denoted below and has been modified from its original version. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import os -from abc import ABC, abstractmethod -from multiprocessing import cpu_count - -""" -This registry is for automatically downloading and extracting datasets. - -To register a class you need to inherit the DataDownloader class, and provide name and url attributes, and (optionally) -the number of documents. - -When done, add it to the DATA_DOWNLOADERS dict. The function process_data runs the pre-processing for the selected -dataset. 
-""" - -GPT2_VOCAB_URL = "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json" -GPT2_MERGE_URL = "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt" - - -class DataDownloader(ABC): - """Dataset registry class to automatically download / extract datasets""" - - def __init__( - self, - tokenizer_type=None, - merge_file=None, - vocab_file=None, - data_dir=None, - force_redownload=None, - num_workers=None, - ): - if tokenizer_type is None: - tokenizer_type = "GPT2BPETokenizer" - if data_dir is None: - data_dir = os.environ.get("DATA_DIR", "./data") - if merge_file is None: - merge_file = f"{data_dir}/gpt2-merges.txt" - if force_redownload is None: - force_redownload = False - if vocab_file is None: - if tokenizer_type == "GPT2BPETokenizer": - vocab_file = f"{data_dir}/gpt2-vocab.json" - elif tokenizer_type == "HFGPT2Tokenizer": - vocab_file = "gpt2" - elif tokenizer_type == "CharLevelTokenizer": - pass - else: - assert vocab_file is not None, "No vocab file provided" - if num_workers is None: - num_workers = cpu_count() - self._tokenizer_type = tokenizer_type - self._merge_file = merge_file - self._vocab_file = vocab_file - self._data_dir = data_dir - self._force_redownload = force_redownload - self._num_workers = num_workers - - @property - def base_dir(self): - """base data directory""" - return self._data_dir - - @property - @abstractmethod - def name(self): - """name of dataset""" - pass - - @property - @abstractmethod - def urls(self): - """URLs from which to download dataset""" - pass - - @property - def tokenizer_type(self): - """tokenizer type to use when tokenizing data""" - return self._tokenizer_type - - @property - def merge_file(self): - """Merge file for tokenizer""" - return self._merge_file - - @property - def vocab_file(self): - """Vocab file for tokenizer""" - return self._vocab_file - - @property - def num_workers(self): - """Number of workers to use in preprocessing""" - return self._num_workers - - @property - def num_docs(self): - """Number of documents in the dataset (if known)""" - return None - - @property - def ftfy(self): - """Use ftfy (https://github.com/LuminosoInsight/python-ftfy) to fix text encodings""" - return False - - def exists(self): - """Checks if the dataset is present""" - return os.path.isdir(f"{self.base_dir}/{self.name}") - - def download(self): - """downloads dataset""" - os.makedirs(os.path.join(self.base_dir, self.name), exist_ok=True) - for url in self.urls: - try: - os_cmd = f"wget {url} -O {os.path.join(self.base_dir, self.name, os.path.basename(url))}" - if os.system(os_cmd) != 0: - raise Exception( - f"Cannot download file at URL {url}: server may be down" - ) - except Exception as e: - raise Exception(f"Download error: {e}") - - def tokenize(self): - """tokenizes dataset""" - parent_folder = os.path.join(self.base_dir, self.name) - jsonl_filepath = ",".join( - [os.path.join(parent_folder, os.path.basename(url)) for url in self.urls] - ) - - cmd = f"python tools/preprocess_data.py \ - --input {jsonl_filepath} \ - --output-prefix {parent_folder}/{self.name} \ - --vocab {self.vocab_file} \ - --dataset-impl mmap \ - --tokenizer-type {self.tokenizer_type} \ - --merge-file {self.merge_file} \ - --append-eod \ - --workers {self.num_workers} " - - if self.num_docs is not None: - cmd += f"--num-docs {self.num_docs} " - - if self.ftfy: - cmd += f"--ftfy " - - os.system(cmd) - - def prepare(self): - if self._force_redownload: - self.download() - else: - if not self.exists(): - self.download() - - self.tokenize() - - 
-class Enron(DataDownloader): - name = "enron" - urls = ["http://eaidata.bmk.sh/data/enron_emails.jsonl.zst"] - num_docs = 517401 - - -class PileSubset(DataDownloader): - name = "pile_00" - urls = ["https://the-eye.eu/public/AI/pile/train/00.jsonl.zst"] - - -class Pile(DataDownloader): - name = "pile" - urls = [ - f"https://the-eye.eu/public/AI/pile/train/{i:02}.jsonl.zst" for i in range(30) - ] - - -class Github(DataDownloader): - name = "github" - urls = ["http://eaidata.bmk.sh/data/github_small.jsonl.zst"] - - -class ArXiv(DataDownloader): - name = "arxiv" - urls = [ - "https://the-eye.eu/public/AI/pile_preliminary_components/2020-09-08-arxiv-extracts-nofallback-until-2007-068.tar.gz" - ] - - -class EuroParl(DataDownloader): - name = "europarl" - urls = [ - "https://the-eye.eu/public/AI/pile_preliminary_components/EuroParliamentProceedings_1996_2011.jsonl.zst" - ] - - -class FreeLaw(DataDownloader): - name = "freelaw" - urls = [ - "https://the-eye.eu/public/AI/pile_preliminary_components/FreeLaw_Opinions.jsonl.zst" - ] - - -class NiH(DataDownloader): - name = "nih" - urls = [ - "https://the-eye.eu/public/AI/pile_preliminary_components/NIH_ExPORTER_awarded_grant_text.jsonl.zst" - ] - - -class PubMed(DataDownloader): - name = "pubmed" - urls = [ - "https://the-eye.eu/public/AI/pile_preliminary_components/PMC_extracts.tar.gz" - ] - - -class Books1(DataDownloader): - name = "books1" - urls = ["https://the-eye.eu/public/AI/pile_preliminary_components/books1.tar.gz"] - - -class Books3(DataDownloader): - name = "books3" - urls = ["https://the-eye.eu/public/AI/pile_preliminary_components/books3.tar.gz"] - - -class HackerNews(DataDownloader): - name = "hackernews" - urls = ["https://the-eye.eu/public/AI/pile_preliminary_components/hn.tar.gz"] - num_docs = 373000 - - -class OpenWebText2(DataDownloader): - name = "openwebtext2" - urls = [ - "https://the-eye.eu/public/AI/pile_preliminary_components/openwebtext2.jsonl.zst.tar" - ] - num_docs = 17103000 - - -class StackExchange(DataDownloader): - name = "stackexchange" - urls = [ - "https://the-eye.eu/public/AI/pile_preliminary_components/stackexchange_dataset.tar" - ] - - -class UbuntuIRC(DataDownloader): - name = "ubuntu_irc" - urls = [ - "https://the-eye.eu/public/AI/pile_preliminary_components/ubuntu_irc_until_2020_9_1.jsonl.zst" - ] - - -class YoutubeSubtitles(DataDownloader): - name = "youtube_subtitles" - urls = [ - "https://the-eye.eu/public/AI/pile_preliminary_components/yt_subs.jsonl.zst" - ] - - -class C4(DataDownloader): - name = "c4" - urls = [ - f"https://the-eye.eu/eleuther_staging/c4/en/c4-train.{i:05}-of-01024.json.gz" - for i in range(1024) - ] - - -class C4OpenWebText(DataDownloader): - name = "c4_openwebtext" - urls = [ - f"https://the-eye.eu/eleuther_staging/c4/realnewslike/c4-train.{i:05}-of-00512.json.gz" - for i in range(512) - ] - - -class Enwik8(DataDownloader): - name = "enwik8" - urls = ["https://data.deepai.org/enwik8.zip"] - - -def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir): - if tokenizer_type is None or tokenizer_type == "GPT2BPETokenizer": - GPT2_VOCAB_FP = f"{data_dir}//gpt2-vocab.json" - GPT2_MERGE_FP = f"{data_dir}/gpt2-merges.txt" - if not os.path.isfile(GPT2_VOCAB_FP): - os.system(f"wget {GPT2_VOCAB_URL} -O {GPT2_VOCAB_FP}") - if not os.path.isfile(GPT2_MERGE_FP): - os.system(f"wget {GPT2_MERGE_URL} -O {GPT2_MERGE_FP}") - - -DATA_DOWNLOADERS = { - "pass": "pass", - "enron": Enron, - "pile_subset": PileSubset, - "pile": Pile, - "github": Github, - "arxiv": ArXiv, - "europarl": EuroParl, - 
"freelaw": FreeLaw, - "nih": NiH, - "pubmed": PubMed, - "books1": Books1, - "books3": Books3, - "hackernews": HackerNews, - "openwebtext2": OpenWebText2, - "stackexchange": StackExchange, - "ubuntu_irc": UbuntuIRC, - "youtube_subtitles": YoutubeSubtitles, - "c4": C4, - "c4_openwebtext": C4OpenWebText, - "enwik8": Enwik8, -} - - -def prepare_dataset( - dataset_name: str, - tokenizer_type: str = None, - data_dir: str = None, - vocab_file: str = None, - merge_file: str = None, - force_redownload: bool = None, - num_workers: int = None, -): - """ - Downloads + tokenizes a dataset in the registry (dataset_name) and saves output .npy files to data_dir. - """ - if data_dir is None: - data_dir = os.environ.get("DATA_DIR", "./data") - os.makedirs(data_dir, exist_ok=True) - maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir) - DownloaderClass = DATA_DOWNLOADERS.get(dataset_name.lower(), None) - if DownloaderClass is None: - raise NotImplementedError( - f'Dataset "{dataset_name}" not recognized - please choose from {list(DATA_DOWNLOADERS.keys())}' - ) - elif DownloaderClass == "pass": - # pass on building dataset (for unit tests) - pass - else: - num_workers = 1 if dataset_name == "enwik8" else num_workers - d = DownloaderClass( - tokenizer_type=tokenizer_type, - vocab_file=vocab_file, - merge_file=merge_file, - data_dir=data_dir, - force_redownload=force_redownload, - num_workers=num_workers, - ) - d.prepare() diff --git a/tools/inspect_checkpoints.py b/tools/inspect_checkpoints.py deleted file mode 100644 index 0bd5c05b4..000000000 --- a/tools/inspect_checkpoints.py +++ /dev/null @@ -1,335 +0,0 @@ -# Copyright (c) 2021, EleutherAI -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -# Adapted from https://github.com/awaelchli/pytorch-lightning-snippets/blob/master/checkpoint/peek.py - -import code -import os -import re -from argparse import ArgumentParser, Namespace -from collections.abc import Mapping, Sequence -from pathlib import Path - -import torch - - -class COLORS: - BLUE = "\033[94m" - CYAN = "\033[96m" - GREEN = "\033[92m" - RED = "\033[31m" - YELLOW = "\033[33m" - MAGENTA = "\033[35m" - WHITE = "\033[37m" - UNDERLINE = "\033[4m" - END = "\033[0m" - - -PRIMITIVE_TYPES = (int, float, bool, str, type) - - -def natural_sort(l): - convert = lambda text: int(text) if text.isdigit() else text.lower() - alphanum_key = lambda key: [convert(c) for c in re.split("([0-9]+)", str(key))] - return sorted(l, key=alphanum_key) - - -def sizeof_fmt(num, suffix="B"): - for unit in ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]: - if abs(num) < 1024.0: - return "%3.1f%s%s" % (num, unit, suffix) - num /= 1024.0 - return "%.1f%s%s" % (num, "Yi", suffix) - - -def pretty_print(contents: dict): - """Prints a nice summary of the top-level contents in a checkpoint dictionary.""" - col_size = max(len(str(k)) for k in contents) - for k, v in sorted(contents.items()): - key_length = len(str(k)) - line = " " * (col_size - key_length) - line += f"{k}: {COLORS.BLUE}{type(v).__name__}{COLORS.END}" - if isinstance(v, dict): - pretty_print(v) - elif isinstance(v, PRIMITIVE_TYPES): - line += f" = " - line += f"{COLORS.CYAN}{repr(v)}{COLORS.END}" - elif isinstance(v, Sequence): - line += ", " - line += f"{COLORS.CYAN}len={len(v)}{COLORS.END}" - elif isinstance(v, torch.Tensor): - if v.ndimension() in (0, 1) and v.numel() == 1: - line += f" = " - line += f"{COLORS.CYAN}{v.item()}{COLORS.END}" - else: - line += ", " - line += f"{COLORS.CYAN}shape={list(v.shape)}{COLORS.END}" - line += ", " - line += f"{COLORS.CYAN}dtype={v.dtype}{COLORS.END}" - line += ( - ", " - + f"{COLORS.CYAN}size={sizeof_fmt(v.nelement() * v.element_size())}{COLORS.END}" - ) - print(line) - - -def common_entries(*dcts): - if not dcts: - return - for i in set(dcts[0]).intersection(*dcts[1:]): - yield (i,) + tuple(d[i] for d in dcts) - - -def pretty_print_double(contents1: dict, contents2: dict, args): - """Prints a nice summary of the top-level contents in a checkpoint dictionary.""" - col_size = max( - max(len(str(k)) for k in contents1), max(len(str(k)) for k in contents2) - ) - common_keys = list(contents1.keys() & contents2.keys()) - uncommon_keys_1 = [i for i in contents2.keys() if i not in common_keys] - uncommon_keys_2 = [i for i in contents1.keys() if i not in common_keys] - diffs_found = False - if uncommon_keys_1 + uncommon_keys_2: - diffs_found = True - if uncommon_keys_1: - print( - f"{COLORS.RED}{len(uncommon_keys_1)} key(s) found in ckpt 1 that isn't present in ckpt 2:{COLORS.END} \n\t{COLORS.BLUE}{' '.join(uncommon_keys_1)}{COLORS.END}" - ) - if uncommon_keys_2: - print( - f"{COLORS.RED}{len(uncommon_keys_2)} key(s) found in ckpt 2 that isn't present in ckpt 1:{COLORS.END} \n\t{COLORS.BLUE}{' '.join(uncommon_keys_2)}{COLORS.END}" - ) - for k, v1, v2 in sorted(common_entries(contents1, contents2)): - key_length = len(str(k)) - line = " " * (col_size - key_length) - if type(v1) != type(v2): - print( - f"{COLORS.RED}{k} is a different type between ckpt1 and ckpt2: ({type(v1).__name__} vs. 
{type(v2).__name__}){COLORS.END}" - ) - continue - else: - prefix = f"{k}: {COLORS.BLUE}{type(v1).__name__} | {type(v2).__name__}{COLORS.END}" - if isinstance(v1, dict): - pretty_print_double(v1, v2, args) - elif isinstance(v1, PRIMITIVE_TYPES): - if repr(v1) != repr(v2): - c = COLORS.RED - line += f" = " - line += f"{c}{repr(v1)} | {repr(v2)}{COLORS.END}" - else: - c = COLORS.CYAN - if not args.diff: - line += f" = " - line += f"{c}{repr(v1)} | {repr(v2)}{COLORS.END}" - elif isinstance(v1, Sequence): - if len(v1) != len(v2): - c = COLORS.RED - line += ", " - line += f"{c}len={len(v1)} | len={len(v2)}{COLORS.END}" - else: - c = COLORS.CYAN - if not args.diff: - line += ", " - line += f"{c}len={len(v1)} | len={len(v2)}{COLORS.END}" - elif isinstance(v1, torch.Tensor): - if v1.ndimension() != v2.ndimension(): - c = COLORS.RED - else: - c = COLORS.CYAN - - if (v1.ndimension() in (0, 1) and v1.numel() == 1) and ( - v2.ndimension() in (0, 1) and v2.numel() == 1 - ): - if not args.diff: - line += f" = " - line += f"{c}{v1.item()} | {c}{v2.item()}{COLORS.END}" - else: - if list(v1.shape) != list(v2.shape): - c = COLORS.RED - line += ", " - line += f"{c}shape={list(v1.shape)} | shape={list(v2.shape)}{COLORS.END}" - else: - c = COLORS.CYAN - if not args.diff: - line += ", " - line += f"{c}shape={list(v1.shape)} | shape={list(v2.shape)}{COLORS.END}" - if v1.dtype != v2.dtype: - c = COLORS.RED - line += f"{c}dtype={v1.dtype} | dtype={v2.dtype}{COLORS.END}" - - else: - c = COLORS.CYAN - if not args.diff: - line += ", " - line += f"{c}dtype={v1.dtype} | dtype={v2.dtype}{COLORS.END}" - if list(v1.shape) == list(v2.shape): - if torch.allclose(v1, v2): - if not args.diff: - line += f", {COLORS.CYAN}VALUES EQUAL{COLORS.END}" - else: - line += f", {COLORS.RED}VALUES DIFFER{COLORS.END}" - - if line.replace(" ", "") != "": - line = prefix + line - print(line) - diffs_found = True - if args.diff and not diffs_found: - pass - else: - if not args.diff: - print("\n") - - return diffs_found - - -def get_attribute(obj: object, name: str) -> object: - if isinstance(obj, Mapping): - return obj[name] - if isinstance(obj, Namespace): - return obj.name - return getattr(object, name) - - -def get_files(pth): - if os.path.isdir(pth): - files = list(Path(pth).glob("*.pt")) + list(Path(pth).glob("*.ckpt")) - elif os.path.isfile(pth): - assert pth.endswith(".pt") or pth.endswith(".ckpt") - files = [Path(pth)] - else: - raise ValueError("Dir / File not found.") - return natural_sort(files) - - -def peek(args: Namespace): - - files = get_files(args.dir) - - for file in files: - file = Path(file).absolute() - print(f"{COLORS.GREEN}{file.name}:{COLORS.END}") - ckpt = torch.load(file, map_location=torch.device("cpu")) - selection = dict() - attribute_names = args.attributes or list(ckpt.keys()) - for name in attribute_names: - parts = name.split("/") - current = ckpt - for part in parts: - current = get_attribute(current, part) - selection.update({name: current}) - pretty_print(selection) - print("\n") - - if args.interactive: - code.interact( - banner="Entering interactive shell. 
You can access the checkpoint contents through the local variable 'checkpoint'.", - local={"checkpoint": ckpt, "torch": torch}, - ) - - -def get_shared_fnames(files_1, files_2): - names_1 = [Path(i).name for i in files_1] - names_1_parent = Path(files_1[0]).parent - names_2 = [Path(i).name for i in files_2] - names_2_parent = Path(files_2[0]).parent - shared_names = list(set.intersection(*map(set, [names_1, names_2]))) - return [names_1_parent / i for i in shared_names], [ - names_2_parent / i for i in shared_names - ] - - -def get_selection(filename, args): - ckpt = torch.load(filename, map_location=torch.device("cpu")) - selection = dict() - attribute_names = args.attributes or list(ckpt.keys()) - for name in attribute_names: - parts = name.split("/") - current = ckpt - for part in parts: - current = get_attribute(current, part) - selection.update({name: current}) - return selection - - -def compare(args: Namespace): - dirs = [i.strip() for i in args.dir.split(",")] - assert len(dirs) == 2, "Only works with 2 directories / files" - files_1 = get_files(dirs[0]) - files_2 = get_files(dirs[1]) - files_1, files_2 = get_shared_fnames(files_1, files_2) - - for file1, file2 in zip(files_1, files_2): - file1 = Path(file1).absolute() - file2 = Path(file2).absolute() - print(f"COMPARING {COLORS.GREEN}{file1.name} & {file2.name}:{COLORS.END}") - selection_1 = get_selection(file1, args) - selection_2 = get_selection(file2, args) - diffs_found = pretty_print_double(selection_1, selection_2, args) - if args.diff and diffs_found: - print( - f"{COLORS.RED}THE ABOVE DIFFS WERE FOUND IN {file1.name} & {file2.name} ^{COLORS.END}\n" - ) - - if args.interactive: - code.interact( - banner="Entering interactive shell. You can access the checkpoint contents through the local variable 'selection_1' / 'selection_2'.\nPress Ctrl-D to exit.", - local={ - "selection_1": selection_1, - "selection_2": selection_2, - "torch": torch, - }, - ) - - -def main(): - parser = ArgumentParser() - parser.add_argument( - "dir", - type=str, - help="The checkpoint dir to inspect. Must be either: \ - - a directory containing pickle binaries saved with 'torch.save' ending in .pt or .ckpt \ - - a single path to a .pt or .ckpt file \ - - two comma separated directories - in which case the script will *compare* the two checkpoints", - ) - parser.add_argument( - "--attributes", - nargs="*", - help="Name of one or several attributes to query. 
To access an attribute within a nested structure, use '/' as separator.", - default=None, - ) - parser.add_argument( - "--interactive", - "-i", - action="store_true", - help="Drops into interactive shell after printing the summary.", - ) - parser.add_argument( - "--compare", - "-c", - action="store_true", - help="If true, script will compare two directories separated by commas", - ) - parser.add_argument( - "--diff", "-d", action="store_true", help="In compare mode, only print diffs" - ) - - args = parser.parse_args() - if args.compare: - compare(args) - else: - peek(args) - - -if __name__ == "__main__": - main() diff --git a/tools/kill.sh b/tools/kill.sh deleted file mode 100755 index bccd46d7e..000000000 --- a/tools/kill.sh +++ /dev/null @@ -1 +0,0 @@ -pkill -9 python diff --git a/tools/killall.sh b/tools/killall.sh deleted file mode 100755 index d2f3528bd..000000000 --- a/tools/killall.sh +++ /dev/null @@ -1 +0,0 @@ -pdsh -f 1024 -R ssh -w ^/job/hosts 'pkill -f train.py' diff --git a/tools/merge20b.py b/tools/merge20b.py deleted file mode 100644 index 23c034cee..000000000 --- a/tools/merge20b.py +++ /dev/null @@ -1,282 +0,0 @@ -# Copyright (c) 2021, EleutherAI -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -import torch -import yaml -import shutil -from tqdm import auto as tqdm_lib - - -VOCAB_SIZE = 50432 -IGNORED_MODEL_STATE_KEYS = [ - "optimizer", - "random_rng_state", - "np_rng_state", - "torch_rng_state", - "cuda_rng_state", - "rng_tracker_states", -] - - -def modify_config(input_config_path, output_config_path, output_dir): - with open(input_config_path) as f: - loaded_config = yaml.full_load(f) - - # replace model/pipeline parallel - loaded_config["model-parallel-size"] = 1 - loaded_config["pipe-parallel-size"] = 1 - - # replace load / save directories: - loaded_config["load"] = output_dir - loaded_config["save"] = output_dir - - # replace some other paths - loaded_config["vocab-file"] = os.path.join(output_dir, "20B_tokenizer.json") - loaded_config["log-dir"] = "./logs" - - # we need to make sure the resulting vocab size is correct - # do this by modifying the 'make_vocab_size_divisible_by' argument to be - # orig * (orig_mp / mp_out) - loaded_config["make_vocab_size_divisible_by"] = VOCAB_SIZE - - # remove zero optimizer - loaded_config["zero_optimization"]["stage"] = 0 - - with open(output_config_path, "w") as f: - yaml.dump(loaded_config, f) - - -def modify_model_states(input_model_state_path, output_model_state_path): - model_state = torch.load(input_model_state_path) - for key in IGNORED_MODEL_STATE_KEYS: - del model_state[key] - model_state["mp_world_size"] = 1 - model_state["dp_world_size"] = 1 # could make this configurable? 
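# A condensed sketch (not part of this patch) of the per-layer merge rules that the removed
# merge_model_weights below applies to the two tensor-parallel shards of the 20B checkpoint;
# key names follow the layer_XX-model_0Y-model_states.pt files handled in this script.
import torch

def merge_two_tp_shards(a: dict, b: dict) -> dict:
    merged = {}
    for name, t in a.items():
        if name in ("mlp.dense_4h_to_h.weight", "attention.dense.weight"):
            merged[name] = torch.cat([t, b[name]], dim=1)   # row-parallel: concat on input dim
        elif name in ("mlp.dense_4h_to_h.bias", "attention.dense.bias"):
            merged[name] = t + b[name]                      # row-parallel bias: sum the partials
        elif "layernorm" in name:
            merged[name] = (t + b[name]) / 2                # replicated layer norms: average
        elif name == "attention.rotary_emb.inv_freq":
            merged[name] = t                                # identical on both ranks: take one
        else:
            merged[name] = torch.cat([t, b[name]], dim=0)   # column-parallel: concat on output dim
    return merged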
- model_state["args"]["model_parallel_size"] = 1 - model_state["args"]["make_vocab_size_divisible_by"] = VOCAB_SIZE - torch.save(model_state, output_model_state_path) - - -def merge_model_weights(input_checkpoint_path, output_checkpoint_path): - pbar = tqdm_lib.tqdm(total=47) - - # Load transformer layers - for layer_i in range(44): - pbar.set_description(f"Merging layer {layer_i}") - filename_tp1 = f"layer_{layer_i + 2:02d}-model_00-model_states.pt" - filename_tp2 = f"layer_{layer_i + 2:02d}-model_01-model_states.pt" - loaded_tp1 = torch.load(os.path.join(input_checkpoint_path, filename_tp1)) - loaded_tp2 = torch.load(os.path.join(input_checkpoint_path, filename_tp2)) - # noinspection PyDictCreation - merged = {} - - # RowParallelLinear - merged["mlp.dense_4h_to_h.weight"] = torch.cat( - [ - loaded_tp1["mlp.dense_4h_to_h.weight"], - loaded_tp2["mlp.dense_4h_to_h.weight"], - ], - dim=1, - ) - merged["attention.dense.weight"] = torch.cat( - [ - loaded_tp1["attention.dense.weight"], - loaded_tp2["attention.dense.weight"], - ], - dim=1, - ) - merged["mlp.dense_4h_to_h.bias"] = ( - loaded_tp1["mlp.dense_4h_to_h.bias"] + loaded_tp2["mlp.dense_4h_to_h.bias"] - ) - merged["attention.dense.bias"] = ( - loaded_tp1["attention.dense.bias"] + loaded_tp2["attention.dense.bias"] - ) - - # Layer Norms - merged["input_layernorm.weight"] = ( - loaded_tp1["input_layernorm.weight"] + loaded_tp2["input_layernorm.weight"] - ) / 2 - merged["input_layernorm.bias"] = ( - loaded_tp1["input_layernorm.bias"] + loaded_tp2["input_layernorm.bias"] - ) / 2 - merged["post_attention_layernorm.weight"] = ( - loaded_tp1["post_attention_layernorm.weight"] - + loaded_tp2["post_attention_layernorm.weight"] - ) / 2 - merged["post_attention_layernorm.bias"] = ( - loaded_tp1["post_attention_layernorm.bias"] - + loaded_tp2["post_attention_layernorm.bias"] - ) / 2 - - # ColumnParallelLinear - merged["mlp.dense_h_to_4h.weight"] = torch.cat( - [ - loaded_tp1["mlp.dense_h_to_4h.weight"], - loaded_tp2["mlp.dense_h_to_4h.weight"], - ], - dim=0, - ) - merged["mlp.dense_h_to_4h.bias"] = torch.cat( - [ - loaded_tp1["mlp.dense_h_to_4h.bias"], - loaded_tp2["mlp.dense_h_to_4h.bias"], - ], - dim=0, - ) - merged["attention.query_key_value.weight"] = torch.cat( - [ - loaded_tp1["attention.query_key_value.weight"], - loaded_tp2["attention.query_key_value.weight"], - ], - dim=0, - ) - merged["attention.query_key_value.bias"] = torch.cat( - [ - loaded_tp1["attention.query_key_value.bias"], - loaded_tp2["attention.query_key_value.bias"], - ], - dim=0, - ) - - # Just take one - merged["attention.rotary_emb.inv_freq"] = loaded_tp1[ - "attention.rotary_emb.inv_freq" - ] - - torch.save(merged, os.path.join(output_checkpoint_path, filename_tp1)) - del loaded_tp1 - del loaded_tp2 - pbar.update(1) - - # Load input embedding - pbar.set_description(f"Merging input embedding") - loaded_tp1 = torch.load( - os.path.join(input_checkpoint_path, "layer_00-model_00-model_states.pt") - ) - loaded_tp2 = torch.load( - os.path.join(input_checkpoint_path, "layer_00-model_01-model_states.pt") - ) - merged = { - "word_embeddings.weight": torch.cat( - [ - loaded_tp1["word_embeddings.weight"], - loaded_tp2["word_embeddings.weight"], - ], - dim=0, - ) - } - torch.save( - merged, - os.path.join(output_checkpoint_path, "layer_00-model_00-model_states.pt"), - ) - del loaded_tp1 - del loaded_tp2 - pbar.update(1) - - # Load final layer norm - pbar.set_description(f"Merging final layer norm") - loaded_tp1 = torch.load( - os.path.join(input_checkpoint_path, 
"layer_47-model_00-model_states.pt") - ) - loaded_tp2 = torch.load( - os.path.join(input_checkpoint_path, "layer_47-model_01-model_states.pt") - ) - merged = { - "norm.weight": (loaded_tp1["norm.weight"] + loaded_tp2["norm.weight"]) / 2, - "norm.bias": (loaded_tp1["norm.bias"] + loaded_tp2["norm.bias"]) / 2, - } - torch.save( - merged, - os.path.join(output_checkpoint_path, "layer_47-model_00-model_states.pt"), - ) - del loaded_tp1 - del loaded_tp2 - pbar.update(1) - - # Load output embedding - pbar.set_description(f"Merging output embedding") - loaded_tp1 = torch.load( - os.path.join(input_checkpoint_path, "layer_48-model_00-model_states.pt") - ) - loaded_tp2 = torch.load( - os.path.join(input_checkpoint_path, "layer_48-model_01-model_states.pt") - ) - merged = { - "final_linear.weight": torch.cat( - [ - loaded_tp1["final_linear.weight"], - loaded_tp2["final_linear.weight"], - ], - dim=0, - ), - } - torch.save( - merged, - os.path.join(output_checkpoint_path, "layer_48-model_00-model_states.pt"), - ) - del loaded_tp1 - del loaded_tp2 - pbar.update(1) - pbar.set_description("Done.") - - -def merge(input_dir, output_dir): - input_checkpoint_path = os.path.join(input_dir, "global_step150000") - output_checkpoint_path = os.path.join(output_dir, "global_step150000") - os.makedirs(output_checkpoint_path, exist_ok=True) - os.makedirs(os.path.join(output_dir, "configs"), exist_ok=True) - for i in range(8): - modify_model_states( - input_model_state_path=os.path.join( - input_checkpoint_path, f"mp_rank_{i:02d}_model_states.pt" - ), - output_model_state_path=os.path.join( - output_checkpoint_path, f"mp_rank_{i:02d}_model_states.pt" - ), - ) - modify_config( - input_config_path=os.path.join(input_dir, "configs", "20B.yml"), - output_config_path=os.path.join(output_dir, "configs", "20B.yml"), - output_dir=output_dir, - ) - merge_model_weights( - input_checkpoint_path=input_checkpoint_path, - output_checkpoint_path=output_checkpoint_path, - ) - shutil.copyfile( - os.path.join(input_dir, "20B_tokenizer.json"), - os.path.join(output_dir, "20B_tokenizer.json"), - ) - with open(os.path.join(output_dir, "latest"), "w") as f: - f.write("global_step150000") - - -def main(): - parser = argparse.ArgumentParser(description="Merge 20B checkpoint.") - parser.add_argument( - "--input_dir", - type=str, - help='Checkpoint dir, which should contain (e.g. 
a folder named "global_step150000")', - ) - parser.add_argument( - "--output_dir", type=str, help="Output dir, to save the 1-GPU weights configs" - ) - args = parser.parse_args() - merge(args.input_dir, args.output_dir) - - -if __name__ == "__main__": - main() diff --git a/tools/merge_datasets.py b/tools/merge_datasets.py deleted file mode 100644 index c5d1e6255..000000000 --- a/tools/merge_datasets.py +++ /dev/null @@ -1,84 +0,0 @@ -import os -import sys -import json -import argparse - -sys.path.append( - os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) -) - -from megatron.data import indexed_dataset - - -def main(args): - - prefixes = set() - for basename in os.listdir(args.input): - prefix, ext = os.path.splitext(basename) - - if prefix in prefixes: - continue - - if not os.path.isfile(os.path.join(args.input, basename)): - continue - - ext_pair = ".bin" if ext == ".idx" else ".idx" - assert os.path.isfile( - os.path.join(args.input, prefix) + ext_pair - ), f"ERROR: {ext_pair} file not provided for {os.path.join(args.input, prefix)}" - - prefixes.add(prefix) - - builder = None - for prefix in sorted(prefixes): - if builder is None: - dataset = indexed_dataset.make_dataset( - os.path.join(args.input, prefix), "infer" - ) - - if isinstance(dataset, indexed_dataset.MMapIndexedDataset): - builder = indexed_dataset.MMapIndexedDatasetBuilder( - args.output_prefix + ".bin", dtype=dataset._index.dtype - ) - else: - builder = indexed_dataset.IndexedDatasetBuilder( - args.output_prefix + ".bin" - ) - - del dataset - - builder.merge_file_(os.path.join(args.input, prefix)) - - builder.finalize(args.output_prefix + ".idx") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - group = parser.add_argument_group(title="input data") - group.add_argument( - "--input", - type=str, - required=True, - help="Path to directory containing all document files to merge", - ) - - group = parser.add_argument_group(title="output data") - group.add_argument( - "--output-prefix", - type=str, - required=True, - help="Path to binary output file without suffix", - ) - - args = parser.parse_args() - - assert os.path.isdir( - args.input - ), f"ERROR: {args.input} is not a directory or does not exist" - - assert os.path.isdir( - os.path.dirname(args.output_prefix) - ), f"ERROR: {os.path.dirname(args.output_prefix)} is not a directory or does not exist" - - main(args) diff --git a/tools/merge_mp_partitions.py b/tools/merge_mp_partitions.py deleted file mode 100644 index 6509718ff..000000000 --- a/tools/merge_mp_partitions.py +++ /dev/null @@ -1,293 +0,0 @@ -# Copyright (c) 2021, EleutherAI -# This file is based on code by the authors denoted below and has been modified from its original version. -# -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Merge model parallel partitions.""" - -import os -import sys - -sys.path.append( - os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) -) - -import torch - -from megatron import mpu -from megatron.checkpointing import ensure_directory_exists -from megatron.checkpointing import get_checkpoint_name -from megatron.checkpointing import get_checkpoint_tracker_filename -from megatron.global_vars import rebuild_tokenizer -from megatron.global_vars import _parse_args - - -def split_into_partitions(tensor, num_partitions, partition_dim, stride): - - per_partition_size = mpu.utils.divide(tensor.size(partition_dim), num_partitions) - per_partition_per_stride_size = mpu.utils.divide(per_partition_size, stride) - - partitions_list = torch.split( - tensor, per_partition_per_stride_size, dim=partition_dim - ) - - partitions = [] - for i in range(num_partitions): - partition = torch.cat(partitions_list[i::num_partitions], dim=partition_dim) - partitions.append(partition) - - return partitions - - -def merge_partitions(merged, partitions, partition_dim, stride): - - # Number and size of each partition. - num_partitions = len(partitions) - per_partition_size = None - for partition in partitions: - if per_partition_size is None: - per_partition_size = partition.size(partition_dim) - else: - assert per_partition_size == partition.size(partition_dim) - - def concat_partitions(partitions_): - with torch.no_grad(): - if (per_partition_size * num_partitions) == merged.size(partition_dim): - torch.cat(partitions_, dim=partition_dim, out=merged) - else: - print( - " ***WARNING*** sizes do not match. Will cut " - "the merged partitions by {} along dimension {} " - "to reduce the size from {} to {} ...".format( - (per_partition_size * num_partitions) - - merged.size(partition_dim), - partition_dim, - per_partition_size * num_partitions, - merged.size(partition_dim), - ) - ) - merged_ = torch.cat(partitions_, dim=partition_dim) - merged_split = torch.split( - merged_, merged.size(partition_dim), dim=partition_dim - ) - merged_ = merged_split[0] - assert merged_.size(partition_dim) == merged.size(partition_dim) - merged.data.copy_(merged_.data) - - # If stride is 1, then do simple concatenation. - if stride == 1: - concat_partitions(partitions) - return - - # For none unity strides, first split based on stride and then group. - per_partition_per_stride_size = mpu.utils.divide(per_partition_size, stride) - # Chunk and build a list. - chunks = None - for i, partition in enumerate(partitions): - chunk = torch.split(partition, per_partition_per_stride_size, dim=partition_dim) - - if chunks is None: - chunks = [0] * (num_partitions * len(chunk)) - chunks[i::num_partitions] = chunk - - # Concatinate. 
- concat_partitions(chunks) - - return - - -def get_model(model_type): - - if model_type == "GPT2": - from pretrain_gpt2 import model_provider - else: - raise Exception("unrecognized model type: {}".format(model_type)) - - model = model_provider() - model = model.half() - - return model - - -def get_parallel_checkpoint_name(path): - - tracker_filename = get_checkpoint_tracker_filename(path) - iteration = 0 - with open(tracker_filename, "r") as f: - metastring = f.read().strip() - iteration = int(metastring) - assert iteration > 0 - checkpoint_name = get_checkpoint_name(path, iteration) - - return checkpoint_name, iteration - - -def test_split_merge(): - - print("testing split and merge ...") - - # [QKV.ROW-COL] - tensor = torch.FloatTensor( - [ - [1.11, 1.12, 1.13, 1.14, 1.15], - [1.21, 1.22, 1.23, 1.24, 1.25], - [1.31, 1.32, 1.33, 1.34, 1.35], - [1.41, 1.42, 1.43, 1.44, 1.45], - [2.11, 2.12, 2.13, 2.14, 2.15], - [2.21, 2.22, 2.23, 2.24, 2.25], - [2.31, 2.32, 2.33, 2.34, 2.35], - [2.41, 2.42, 2.43, 2.44, 2.45], - [3.11, 3.12, 3.13, 3.14, 3.15], - [3.21, 3.22, 3.23, 3.24, 3.25], - [3.31, 3.32, 3.33, 3.34, 3.35], - [3.41, 3.42, 3.43, 3.44, 3.45], - ] - ) - - num_partitions = 2 - partition_dim = 0 - stride = 3 - partitions = split_into_partitions(tensor, num_partitions, partition_dim, stride) - - merged = torch.zeros_like(tensor) - merge_partitions(merged, partitions, partition_dim, stride) - - max_error = (merged - tensor).abs().max() - print(" > max error (should be zero): {}".format(max_error)) - - -def get_mp_merge_args(parser): - """Provide extra arguments required for merging.""" - group = parser.add_argument_group(title="mp merge") - - group.add_argument( - "--model-type", - type=str, - required=True, - choices=["BERT", "GPT2", "RACE", "MNLI", "QQP"], - help="Type of the model.", - ) - - return parser - - -def main(): - - # Args - args = _parse_args(extra_args_provider=get_mp_merge_args) - model_type = args.model_type - orig_model_parallel_size = args.model_parallel_size - args.model_parallel_size = 1 - tokenizer = rebuild_tokenizer(args) - - print("\n merging model parallel partitions ...") - print(" > number of partitions: {}".format(orig_model_parallel_size)) - print(" > checkpoint path: {}".format(args.load)) - print(" > model parameters:") - print(" number of tokens ................ {} ".format(tokenizer.vocab_size)) - print(" number of layers ................ {}".format(args.num_layers)) - print(" hidden size ..................... {}".format(args.hidden_size)) - print(" number of attention heads ....... {}".format(args.num_attention_heads)) - print( - " maximum position embeddings ..... {}".format(args.max_position_embeddings) - ) - - # Full model. - print("> building the full model ...") - mpu.initialize.set_model_parallel_world_size(1) - mpu.initialize.set_model_parallel_rank(0) - merged_model = get_model(model_type) - - # Build and load partitions. - partitions = [] - iteration = 0 - args.model_parallel_size = orig_model_parallel_size - tokenizer = rebuild_tokenizer(args) - mpu.initialize.set_model_parallel_world_size(args.model_parallel_size) - for rank in range(args.model_parallel_size): - mpu.initialize.set_model_parallel_rank(rank) - checkpoint_name, iteration = get_parallel_checkpoint_name(args.load) - print("> loading {} ...".format(checkpoint_name)) - model_ = get_model(model_type) - sd = torch.load(checkpoint_name, map_location="cpu") - model_.load_state_dict(sd["model"]) - partitions.append(model_) - - # Parameter generators so we can loop through them semiltaneouly. 
- merged_params_gen = merged_model.named_parameters() - partitions_params_gen = [partition.named_parameters() for partition in partitions] - while True: - try: - - # Get the params and check names. - name, merged_param = next(merged_params_gen) - print(" > working on {} ...".format(name)) - print( - " merged type: {}, size: {}".format( - merged_param.dtype, list(merged_param.size()) - ) - ) - partitions_param = [] - for rank, partition_params_gen in enumerate(partitions_params_gen): - partition_name, partition_param = next(partition_params_gen) - assert partition_name == name - partitions_param.append(partition_param) - print( - " partition {} type: {}, size: {}".format( - rank, partition_param.dtype, list(partition_param.size()) - ) - ) - - # For the non-parallel parameters, simply copy the rank 0 values. - if not hasattr(merged_param, "model_parallel"): - print(" none-parallel parameter, simple copy from rank 0") - with torch.no_grad(): - merged_param.data.copy_(partitions_param[0].data) - # For parallel parameters, merge the values - else: - print( - " parallel parameter merge with stride {} along " - "dimension {}".format( - merged_param.stride, merged_param.partition_dim - ) - ) - merge_partitions( - merged_param, - partitions_param, - merged_param.partition_dim, - merged_param.stride, - ) - - except StopIteration: - break - - # Save the model. - args.model_parallel_size = 1 - mpu.initialize.set_model_parallel_rank(0) - sd = {} - sd["model"] = merged_model.state_dict() - sd["iteration"] = iteration - merged_path = os.path.join(args.load, "merged") - checkpoint_name = get_checkpoint_name(merged_path, iteration) - ensure_directory_exists(checkpoint_name) - print("> saving merged model to {}".format(checkpoint_name)) - torch.save(sd, checkpoint_name) - - print("done :-)") - - -if __name__ == "__main__": - - main() diff --git a/tools/multinode_prepare_data.sh b/tools/multinode_prepare_data.sh deleted file mode 100644 index 1ff7244ae..000000000 --- a/tools/multinode_prepare_data.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -# USAGE: -# This script allows you to prepare your dataset using multiple nodes by chunking the individual files and distributed the chunks -# over the processes. -# This bash script takes a single text file as input argument. -# The text file contains a valid filepath in each line, leading to a jsonl-file. -# Furthermore an environment variable for the rank and the world size needs to be set. -# These default to the SLURM and OMPI variables in this order of priority, but they can be set manually as well -# using the variables $RANK and $WORLD_SIZE, which will overwrite the cluster-specific variables. -# You can also add all arguments of the prepare_data.py script to this script and it will simply pass them through. 
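# Example invocation (illustrative only; paths, file names and tokenizer settings are
# placeholders, not part of this script). RANK and WORLD_SIZE can be set explicitly when not
# running under SLURM or OpenMPI, and extra --key value flags are forwarded to
# tools/preprocess_data.py:
#
#   RANK=0 WORLD_SIZE=4 bash tools/multinode_prepare_data.sh filelist.txt \
#     --tokenizer-type HFTokenizer --vocab-file /path/to/tokenizer.json \
#     --output-prefix /scratch/mydata --workers 8
#
# where filelist.txt holds one jsonl path per line; the rank id is appended to the output
# prefix, so each rank writes its own /scratch/mydata<rank>_*_document files.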
- -# Parse command-line arguments -text_file="$1" -rank="${RANK:-${SLURM_PROCID:-$OMPI_COMM_WORLD_RANK}}" -world_size="${WORLD_SIZE:-${SLURM_NTASKS:-$OMPI_COMM_WORLD_SIZE}}" -num_lines=$(wc -l < "$text_file") -chunk_size=$((num_lines / world_size)) -start_line=$((rank * chunk_size + 1)) -end_line=$((start_line + chunk_size - 1)) - -# Make sure the last chunk includes all remaining lines -if [[ $rank == $((world_size - 1)) ]]; then - end_line=$num_lines -fi - -# Select the chunk of the text file that corresponds to the rank -chunk_file="chunk_${rank}.txt" -sed -n "${start_line},${end_line}p" "$text_file" > "$chunk_file" - -# Parse additional flags to be passed to the Python script -shift 1 # Shift past the first three arguments -py_args="" -prefix_arg="" -while [[ $# -gt 0 ]]; do - case "$1" in - --output-prefix=*) prefix_arg="$1"; shift;; - --output-prefix) prefix_arg="$1 $2"; shift 2;; - --*) py_args="$py_args $1 $2"; shift 2;; - *) echo "Unknown argument: $1"; exit 1;; - esac -done - -# Add the rank to the --output-prefix argument if it is set -if [[ -n "$prefix_arg" ]]; then - py_args="$py_args $prefix_arg$rank" -else - # Inject a default --output-prefix argument containing the rank - py_args="$py_args --output-prefix rank${rank}" -fi - - -echo "processing $chunk_file with rank $rank at world size $world_size" -echo "using the following args: $py_args" -# Call the Python script with the list of file paths in the chunk -python tools/preprocess_data.py --input $(tr '\n' ',' < "$chunk_file" | sed 's/,$/\n/') $py_args - -# Clean up -rm "$chunk_file" diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py deleted file mode 100644 index 862620eb8..000000000 --- a/tools/preprocess_data.py +++ /dev/null @@ -1,244 +0,0 @@ -# Copyright (c) 2021, EleutherAI -# This file is based on code by the authors denoted below and has been modified from its original version. -# -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Processing data for pretraining.""" - -import argparse -import multiprocessing -import os -import sys - -import lm_dataformat as lmd -import numpy as np - -sys.path.append( - os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) -) -import time -import tqdm -import torch -import ftfy - -from megatron.tokenizer import build_tokenizer -from megatron.data import indexed_dataset -from threading import Semaphore - - -class Encoder(object): - def __init__(self, args): - self.args = args - - def initializer(self): - # Use Encoder class as a container for global data - Encoder.tokenizer = build_tokenizer(self.args) - - def encode(self, text): - if self.args.ftfy: - text = ftfy.fix_text(text) - ids = {} - for key in self.args.jsonl_keys: - doc_ids = [] - text_ids = Encoder.tokenizer.tokenize(text) - if len(text_ids) > 0: - doc_ids.append(text_ids) - if self.args.append_eod: - doc_ids[-1].append(Encoder.tokenizer.eod) - ids[key] = doc_ids - return ids, len(text) - - -def get_args(): - parser = argparse.ArgumentParser() - group = parser.add_argument_group(title="input data") - group.add_argument( - "--input", - type=str, - required=True, - help="Path to input jsonl files or lmd archive(s) - if using multiple archives, put them in a comma separated " - "list", - ) - group.add_argument( - "--jsonl-keys", - nargs="+", - default=["text"], - help="space separate listed of keys to extract from jsonl. Defa", - ) - group.add_argument( - "--num-docs", - default=None, - help="Optional: Number of documents in the input data (if known) for an accurate progress bar.", - type=int, - ) - group = parser.add_argument_group(title="tokenizer") - group.add_argument( - "--tokenizer-type", - type=str, - required=True, - choices=[ - "HFGPT2Tokenizer", - "HFTokenizer", - "GPT2BPETokenizer", - "CharLevelTokenizer", - "TiktokenTokenizer", - "SPMTokenizer", - ], - help="What type of tokenizer to use.", - ) - group.add_argument( - "--vocab-file", type=str, default=None, help="Path to the vocab file" - ) - group.add_argument( - "--merge-file", - type=str, - default=None, - help="Path to the BPE merge file (if necessary).", - ) - group.add_argument( - "--append-eod", - action="store_true", - help="Append an token to the end of a document.", - ) - group.add_argument("--ftfy", action="store_true", help="Use ftfy to clean text") - group = parser.add_argument_group(title="output data") - group.add_argument( - "--output-prefix", - type=str, - required=True, - help="Path to binary output file without suffix", - ) - group.add_argument( - "--dataset-impl", - type=str, - default="mmap", - choices=["lazy", "cached", "mmap"], - help="Dataset implementation to use. Default: mmap", - ) - - group = parser.add_argument_group(title="runtime") - group.add_argument( - "--workers", type=int, default=1, help="Number of worker processes to launch" - ) - group.add_argument( - "--log-interval", - type=int, - default=100, - help="Interval between progress updates", - ) - args = parser.parse_args() - args.keep_empty = False - - # some default/dummy values for the tokenizer - args.rank = 0 - args.make_vocab_size_divisible_by = 128 - args.model_parallel_size = 1 - - return args - - -def yield_from_files(fnames: list, semaphore): - """ - Iterator over input documents using lm_dataformat. Should be able to handle jsons / texts / - other compressed formats. Also filters out empty documents. 
- - :param fnames: list of filenames - """ - - def yielder(fname, semaphore): - for f in filter(lambda x: x, lmd.Reader(fname).stream_data()): - semaphore.acquire() - yield f - - for fname in fnames: - semaphore.acquire() - - yield from yielder(fname, semaphore) - - -def main(): - args = get_args() - encoder = Encoder(args) - tokenizer = build_tokenizer(args) - print(f"Vocab size: {tokenizer.vocab_size}") - print(f"Output prefix: {args.output_prefix}") - - # build a semaphore object to stop `yield_from_files` from getting ahead of encoder.encode and - # hence building up memory - semaphore = Semaphore(10000 + args.workers) - - # use multiprocessing to iterate over input documents - fin = yield_from_files(args.input.split(","), semaphore) - - if args.workers > 1: - pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer) - encoded_docs = pool.imap(encoder.encode, fin, chunksize=25) - else: - encoder.initializer() - encoded_docs = (encoder.encode(doc) for doc in fin) - - # make a dataset builder for each key in args.jsonl_keys - # each key will output to a different file beginning with args.output_prefix - output_bin_files = {} - output_idx_files = {} - builders = {} - for key in args.jsonl_keys: - output_bin_files[key] = "{}_{}_{}.bin".format( - args.output_prefix, key, "document" - ) - output_idx_files[key] = "{}_{}_{}.idx".format( - args.output_prefix, key, "document" - ) - builders[key] = indexed_dataset.make_builder( - output_bin_files[key], - impl=args.dataset_impl, - vocab_size=tokenizer.vocab_size, - ) - - # actually do tokenization - proc_start = time.time() - total_bytes_processed = 0 - pbar = tqdm.tqdm() - for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1): - total_bytes_processed += bytes_processed - - # release semaphore so `yield_from_files` can add another file to the buffer - semaphore.release() - - # add each tokenized document / sentence - for key, sentences in doc.items(): - for sentence in sentences: - builders[key].add_item(np.array(sentence, dtype=builders[key].dtype)) - # separate with eos token - builders[key].end_document() - - # log progress - if i % args.log_interval == 0: - current = time.time() - elapsed = current - proc_start - mbs = total_bytes_processed / elapsed / 1024 / 1024 - pbar.set_description( - f"Processed {i}{'' if args.num_docs is None else '/' + str(args.num_docs)} documents ({i / elapsed :.2f} docs/s, {mbs:.2f} MB/s)." - ) - if i != 0: - pbar.update(args.log_interval) - - # save output file - for key in args.jsonl_keys: - builders[key].finalize(output_idx_files[key]) - - -if __name__ == "__main__": - main() diff --git a/tools/preprocess_data_with_mask.py b/tools/preprocess_data_with_mask.py deleted file mode 100644 index 636e852ce..000000000 --- a/tools/preprocess_data_with_mask.py +++ /dev/null @@ -1,381 +0,0 @@ -# Copyright (c) 2021, EleutherAI -# This file is based on code by the authors denoted below and has been modified from its original version. -# -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -""" -A script for processing a dataset such that corresponding labels are also produced. These are then used to perform masked finetuning -(for example, finetuning a model to only output the text following some delimiter in the finetuning dataset such as "Answer: " -rather than generating the entire "Question: ... Answer: " turns of conversation. - -To run this script, first edit `tools/corpora.py` such that the command to call `tools/preprocess_data.py` is as follows: - -``` -cmd = f"python tools/preprocess_data_with_mask.py \ - --input {jsonl_filepath} \ - --output-prefix {parent_folder}/{self.name} \ - --vocab {self.vocab_file} \ - --dataset-impl mmap \ - --tokenizer-type {self.tokenizer_type} \ - --merge-file {self.merge_file} \ - --append-eod \ - --mask-before-token X,Y,Z \ - --workers {self.num_workers} " - -if self.num_docs is not None: - cmd += f"--num-docs {self.num_docs} " - -if self.ftfy: - cmd += f"--ftfy " -``` -where --mask-before-token must be the (comma-separated) list of tokens produced by encoding your delimiter string. -Up to and including the first occurrence of this token sequence in a document, all tokens will have their loss mask zeroed out when the label dataset is provided to NeoX. - -Then, specify -``` -"train_data_paths": ["/path/to/dataset/name_text_document"], -"label_data_paths": ["/path/to/dataset/name_label_document"] -``` -in your YML config. This will then allow for finetuning on the data with loss masks set appropriately. -(However, be warned that NeoX packs documents to fill context windows, which may degrade performance in some finetuning situations where instead padding out to the context length may be preferred.) -""" - -import argparse -import multiprocessing -import os -import sys -import re - -import lm_dataformat as lmd -import numpy as np - -sys.path.append( - os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) -) -import time -import tqdm -import torch -import ftfy - -from megatron.tokenizer import build_tokenizer -from megatron.data import indexed_dataset -from threading import Semaphore -from functools import lru_cache - - -@lru_cache(maxsize=None) -def build_nxt(pattern: tuple) -> tuple: - # The function is being cached. Use tuple to avoid the cache being tampered out of scope. - nxt = [0] - current = 1 - match_idx = 0 - - while current < len(pattern): - if pattern[match_idx] == pattern[current]: - current += 1 - match_idx += 1 - nxt.append(match_idx) - elif match_idx != 0: - match_idx = nxt[match_idx - 1] - else: - nxt.append(0) - current += 1 - - return tuple(nxt) - - -def kmp(seq, pattern, first_appearance=False): - """ - Search for the location of a subsequence in a list. Not sure if there is a python built-in - implementation of kmp somewhere... 
- """ - nxt = build_nxt(tuple(pattern)) - current = 0 - match_idx = 0 - - matched = [] - - while current < len(seq): - if seq[current] == pattern[match_idx]: - current += 1 - match_idx += 1 - elif match_idx != 0: - match_idx = nxt[match_idx - 1] - else: - current += 1 - - if match_idx == len(pattern): - matched.append(current - len(pattern)) - if first_appearance: - return matched - match_idx = nxt[match_idx - 1] - - return matched - - -class Encoder(object): - def __init__(self, args): - self.args = args - - def initializer(self): - # Use Encoder class as a container for global data - Encoder.tokenizer = build_tokenizer(self.args) - - def encode(self, text): - if self.args.ftfy: - text = ftfy.fix_text(text) - if isinstance(text, str): - text = {"text": text} - ids = {} - for key in self.args.jsonl_keys: - doc_ids = [] - text_ids = Encoder.tokenizer.tokenize(text['text']) - if len(text_ids) > 0: - doc_ids.append(text_ids) - if self.args.append_eod: - doc_ids[-1].append(Encoder.tokenizer.eod) - ids[key] = doc_ids - return ids, len(text) - - -def get_args(): - parser = argparse.ArgumentParser() - group = parser.add_argument_group(title="input data") - group.add_argument( - "--input", - type=str, - required=True, - help="Path to input jsonl files or lmd archive(s) - if using multiple archives, put them in a comma separated " - "list", - ) - group.add_argument( - "--jsonl-keys", - nargs="+", - default=["text"], - help="space separate listed of keys to extract from jsonl. Defa", - ) - group.add_argument( - "--mask-before-token", - default=None, - help="apply loss masks before certain token(s). If multi-token pattern, separate by commas without space, e.g. --mask-before-token 0,1,1270 to use the token pattern [0,1,1270].", - type=str, - ) - group.add_argument( - "--num-docs", - default=None, - help="Optional: Number of documents in the input data (if known) for an accurate progress bar.", - type=int, - ) - group = parser.add_argument_group(title="tokenizer") - group.add_argument( - "--tokenizer-type", - type=str, - required=True, - choices=[ - "HFGPT2Tokenizer", - "HFTokenizer", - "GPT2BPETokenizer", - "CharLevelTokenizer", - ], - help="What type of tokenizer to use.", - ) - group.add_argument( - "--vocab-file", type=str, default=None, help="Path to the vocab file" - ) - group.add_argument( - "--merge-file", - type=str, - default=None, - help="Path to the BPE merge file (if necessary).", - ) - group.add_argument( - "--append-eod", - action="store_true", - help="Append an token to the end of a document.", - ) - group.add_argument("--ftfy", action="store_true", help="Use ftfy to clean text") - group = parser.add_argument_group(title="output data") - group.add_argument( - "--output-prefix", - type=str, - required=True, - help="Path to binary output file without suffix", - ) - group.add_argument( - "--dataset-impl", - type=str, - default="mmap", - choices=["lazy", "cached", "mmap"], - help="Dataset implementation to use. 
Default: mmap", - ) - - group = parser.add_argument_group(title="runtime") - group.add_argument( - "--workers", type=int, default=1, help="Number of worker processes to launch" - ) - group.add_argument( - "--log-interval", - type=int, - default=100, - help="Interval between progress updates", - ) - args = parser.parse_args() - args.keep_empty = False - - # some default/dummy values for the tokenizer - args.rank = 0 - args.make_vocab_size_divisible_by = 128 - args.model_parallel_size = 1 - - return args - - -def yield_from_files(fnames: list, semaphore): - """ - Iterator over input documents using lm_dataformat. Should be able to handle jsons / texts / - other compressed formats. Also filters out empty documents. - - :param fnames: list of filenames - """ - - def yielder(fname, semaphore): - for f in filter(lambda x: x, lmd.Reader(fname).stream_data()): - semaphore.acquire() - yield f - - for fname in fnames: - semaphore.acquire() - - yield from yielder(fname, semaphore) - - -def mask(sentence: list, pivot_tokens: list, include_pivot=True): - inds = kmp(sentence, pivot_tokens) - if not inds: - return sentence - index = inds[0] - if include_pivot: - index += len(pivot_tokens) - - return [-100] * index + sentence[index:] - - -def main(): - args = get_args() - encoder = Encoder(args) - tokenizer = build_tokenizer(args) - print(f"Vocab size: {tokenizer.vocab_size}") - print(f"Output prefix: {args.output_prefix}") - - # build a semaphore object to stop `yield_from_files` from getting ahead of encoder.encode and - # hence building up memory - semaphore = Semaphore(10000 + args.workers) - - # use multiprocessing to iterate over input documents - fin = yield_from_files(args.input.split(","), semaphore) - - if args.workers > 1: - pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer) - encoded_docs = pool.imap(encoder.encode, fin, chunksize=25) - else: - encoder.initializer() - encoded_docs = (encoder.encode(doc) for doc in fin) - - - if args.mask_before_token is not None: - token_mask = [int(re.sub(r'[^0-9]', '', r)) for r in args.mask_before_token.split(",") if re.sub(r'[^0-9]', '', r)] - else: - token_mask = [] - - - # make a dataset builder for each key in args.jsonl_keys - # each key will output to a different file beginning with args.output_prefix - output_bin_files = {} - output_idx_files = {} - builders = {} - for key in args.jsonl_keys: - output_bin_files[key] = "{}_{}_{}.bin".format( - args.output_prefix, key, "document" - ) - output_idx_files[key] = "{}_{}_{}.idx".format( - args.output_prefix, key, "document" - ) - builders[key] = indexed_dataset.make_builder( - output_bin_files[key], - impl=args.dataset_impl, - vocab_size=tokenizer.vocab_size, - ) - if token_mask: - assert "label" not in args.jsonl_keys, "label should not be included as it will be generated according to the mask." 
- key = "label" - output_bin_files[key] = "{}_{}_{}.bin".format( - args.output_prefix, key, "document" - ) - output_idx_files[key] = "{}_{}_{}.idx".format( - args.output_prefix, key, "document" - ) - builders[key] = indexed_dataset.make_builder( - output_bin_files[key], - impl=args.dataset_impl, - vocab_size=tokenizer.vocab_size, - ) - int32_labels = ["text", "label"] - for l in int32_labels: - builders[l]._dtype = np.int32 - - - # actually do tokenization - proc_start = time.time() - total_bytes_processed = 0 - pbar = tqdm.tqdm() - for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1): - total_bytes_processed += bytes_processed - - # release semaphore so `yield_from_files` can add another file to the buffer - semaphore.release() - - # add each tokenized document / sentence - for key, sentences in doc.items(): - for sentence in sentences: - builders[key].add_item(np.array(sentence, dtype=builders[key].dtype)) - if token_mask: - masked_sentence = mask(sentence, token_mask) - builders["label"].add_item(np.array(masked_sentence, dtype=builders["text"].dtype)) - # separate with eos token - builders[key].end_document() - if token_mask: - builders["label"].end_document() - - - # log progress - if i % args.log_interval == 0: - current = time.time() - elapsed = current - proc_start - mbs = total_bytes_processed / elapsed / 1024 / 1024 - pbar.set_description( - f"Processed {i}{'' if args.num_docs is None else '/' + str(args.num_docs)} documents ({i / elapsed} docs/s, {mbs} MB/s)." - ) - if i != 0: - pbar.update(args.log_interval) - - # save output file - update_keys = args.jsonl_keys + ["label"] if token_mask else args.jsonl_keys - for key in update_keys: - builders[key].finalize(output_idx_files[key]) - - -if __name__ == "__main__": - main() - diff --git a/tools/sync.sh b/tools/sync.sh deleted file mode 100755 index d3788cce4..000000000 --- a/tools/sync.sh +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) 2021, EleutherAI -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#!/usr/bin/env bash - -# Push files to all nodes -# Usage -# sync.sh file [file2..] - -echo Number of files to upload: $# - -for file in "$@" -do - full_path=$(realpath $file) - echo Uploading $full_path - pdcp -f 1024 -R ssh -w ^/job/hosts $full_path $full_path -done diff --git a/tools/sync_cmd.sh b/tools/sync_cmd.sh deleted file mode 100644 index 6525247fb..000000000 --- a/tools/sync_cmd.sh +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) 2021, EleutherAI -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -#!/usr/bin/env bash - -# Runs a command in parallel across all nodes -# Usage -# sync_cmd.sh 'echo "hello world"' - -echo "Command: $1"; -pdsh -R ssh -w ^/job/hosts $1 diff --git a/tools/syncdir.sh b/tools/syncdir.sh deleted file mode 100755 index f43edbd8c..000000000 --- a/tools/syncdir.sh +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (c) 2021, EleutherAI -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#!/usr/bin/env bash - -# Push files to all nodes -# Usage -# sync.sh file [file2..] - -echo Number of files to upload: $# - -for file in "$@" -do - full_path=$(realpath $file) - parentdir="$(dirname "$full_path")" - echo Uploading $full_path to $parentdir - pdcp -f 1024 -R ssh -w ^/job/hosts -r $full_path $parentdir -done diff --git a/tools/upload.py b/tools/upload.py deleted file mode 100644 index 324a7debe..000000000 --- a/tools/upload.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) 2021, EleutherAI -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import sys - -from huggingface_hub import HfApi, create_repo - -converted_ckpt = sys.argv[1] -repo_name = sys.argv[2] -branch_name = sys.argv[3] -try: - create_repo(repo_name, repo_type="model", private=False) -except: - print("repo {repo_name} already exists!") - pass - -files = os.listdir(converted_ckpt) - -api = HfApi() -if branch_name != "main": - try: - api.create_branch( - repo_id=repo_name, - repo_type="model", - branch=branch_name, - ) - except: - print(f"branch {branch_name} already exists, try again...") -print(f"to upload: {files}") -for file in files: - print(f"Uploading {file} to branch {branch_name}...") - api.upload_file( - path_or_fileobj=os.path.join(converted_ckpt, file), - path_in_repo=file, - repo_id=repo_name, - repo_type="model", - commit_message=f"Upload {file}", - revision=branch_name, - ) - print(f"Successfully uploaded {file} !") From 975d8f8460901d4a5fa5b8273e3ad6511f4b1e5b Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Tue, 13 Feb 2024 18:55:40 +0000 Subject: [PATCH 43/64] cleanup: use fp32 rope (non-fused) from main --- megatron/model/positional_embeddings.py | 36 +++++++++++++++++-------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/megatron/model/positional_embeddings.py b/megatron/model/positional_embeddings.py index 28e64fc5f..16cd22ad8 100644 --- a/megatron/model/positional_embeddings.py +++ b/megatron/model/positional_embeddings.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -38,17 +38,24 @@ def forward(self, x, seq_dim=1): class RotaryEmbedding(torch.nn.Module): def __init__(self, dim, max_seq_len, base=10000, precision=torch.half): super().__init__() + inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim)) + self.register_buffer("inv_freq", inv_freq) + self.seq_len_cached = None + self.cos_cached = None + self.sin_cached = None self.precision = precision self.max_seq_len = max_seq_len self.base = base self.dim = dim - + # precompute cos_cached, sin_cached in fp32 - cos_cached, sin_cached, inv_freq = self._prepare_cache(max_seq_len, precision, base) + cos_cached, sin_cached, inv_freq = self._prepare_cache( + max_seq_len, precision, base + ) self.register_buffer("inv_freq", inv_freq) self.cos_cached = cos_cached - self.sin_cached = sin_cached + self.sin_cached = sin_cached def _prepare_cache(self, seq_len, precision, base): # precompute cos_cached, sin_cached in fp32 @@ -60,19 +67,26 @@ def _prepare_cache(self, seq_len, precision, base): cos_cached = emb.cos()[:, None, None, :] sin_cached = emb.sin()[:, None, None, :] - - return cos_cached.to(precision), sin_cached.to(precision), inv_freq.to(precision) - def forward(self, x, seq_dim=1, seq_len=None): + return ( + cos_cached.to(precision), + sin_cached.to(precision), + inv_freq.to(precision), + ) + + def forward(self, x, seq_dim=0, seq_len=None): if seq_len is None: seq_len = x.shape[seq_dim] + assert seq_len <= self.max_seq_len + if seq_len != self.max_seq_len: - cos_new, sin_new, _ = self._prepare_cache(seq_len, self.precision, self.base) - # print(f"##############\nseq_len!=max_seq_len\ninput: {x.shape}\ncos_new: {cos_new.shape}\nsin_new: {sin_new.shape}\nseq_dim:{seq_dim}") - return cos_new.to(x.device), sin_new.to(x.device) + # y, z, _ = self._prepare_cache(seq_len, self.precision, self.base) + return ( + self.cos_cached[:seq_len, ...].to(x.device), + self.sin_cached[:seq_len, 
...].to(x.device), + ) else: - # print(f"##############\nseq_len==max_seq_len\ninput: {x.shape}\ncos_cached: {self.cos_cached.shape}\nsin_cached: {self.cos_cached.shape}") return self.cos_cached.to(x.device), self.sin_cached.to(x.device) From 23b7577e6487e253326e27045540d14cb5facd56 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Tue, 13 Feb 2024 19:08:15 +0000 Subject: [PATCH 44/64] cleanup: no longer block out GQA codepaths in conversion scripts --- tools/ckpts/convert_neox_to_hf.py | 11 ----------- tools/ckpts/convert_raw_llama_weights_to_neox.py | 6 ------ 2 files changed, 17 deletions(-) diff --git a/tools/ckpts/convert_neox_to_hf.py b/tools/ckpts/convert_neox_to_hf.py index a9cd4277e..6ad67f208 100644 --- a/tools/ckpts/convert_neox_to_hf.py +++ b/tools/ckpts/convert_neox_to_hf.py @@ -246,11 +246,6 @@ def __init__(self, neox_config): } ) - if args["num_attention_heads"] != args["num_key_value_heads"]: - assert ( - False - ), "Got num_key_value_heads != num_attention_heads, but Grouped-Query Attention is not yet supported by NeoX." - if architecture == "mistral": # mistral-specific options args.update( @@ -671,12 +666,6 @@ def main(input_args=None, overwrite_values=None): "mistral", ], f"expected --architecture to be one of 'neox', 'mistral', 'llama', but got '{args.architecture}' !" - if args.architecture == "mistral": - # Mistral Support Coming Soon - assert ( - False - ), "Got num_key_value_heads != num_attention_heads, but Grouped-Query Attention is not yet supported by NeoX." - with open(args.config_file) as f: loaded_config = yaml.full_load(f) if overwrite_values: diff --git a/tools/ckpts/convert_raw_llama_weights_to_neox.py b/tools/ckpts/convert_raw_llama_weights_to_neox.py index 6fc9f8a7e..9d6440cff 100644 --- a/tools/ckpts/convert_raw_llama_weights_to_neox.py +++ b/tools/ckpts/convert_raw_llama_weights_to_neox.py @@ -73,9 +73,6 @@ def convert_model_pipeline( num_layers = params["n_layers"] num_heads = params["n_heads"] if "n_kv_heads" in params: - assert ( - False - ), "Found `n_kv_heads` != `n_heads` in checkpoint config. However, Grouped-Query Attention is not yet supported by NeoX" num_kv_heads = params["n_kv_heads"] else: num_kv_heads = num_heads @@ -333,9 +330,6 @@ def convert_model_sequential( num_layers = params["n_layers"] num_heads = params["n_heads"] if "n_kv_heads" in params: - assert ( - False - ), "Found `n_kv_heads` != `n_heads` in checkpoint config. 
However, Grouped-Query Attention is not yet supported by NeoX" num_kv_heads = params["n_kv_heads"] else: num_kv_heads = num_heads From 54135b4ee0dc4a6b7194fb736ccc2679e135c883 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Wed, 14 Feb 2024 15:25:17 +0000 Subject: [PATCH 45/64] cleanup: gqa code a bit --- megatron/model/transformer.py | 72 ++++++++++++++++------------------- 1 file changed, 33 insertions(+), 39 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index b8be8ddf7..8161c7134 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -53,8 +53,10 @@ """ We use the following notation throughout this file: h: hidden size n: number of attention heads + kv: number of key or value heads p: number of model parallel partitions np: n/p + kvp: kv/p hp: h/p hn: h/n b: batch size @@ -295,21 +297,25 @@ def __init__( self.pos_emb = neox_args.pos_emb self.sliding_window_width = neox_args.sliding_window_width - self.attention_type = neox_args.attention_type - if self.attention_type != "multihead": + if ( + not neox_args.num_kv_heads + or neox_args.num_kv_heads == neox_args.num_attention_heads + ): + self.gqa = False + else: + self.gqa = True + if self.gqa: self.num_kv_heads_per_partition = mpu.divide( neox_args.num_kv_heads, world_size - ) # TODO: we want to clone single-kv heads across ranks... + ) # we do not yet clone KV heads in MQA across TP ranks... self.kv_hidden_size = ( neox_args.num_kv_heads * self.hidden_size_per_attention_head - ) + ) # how large the total hidden dim for each of K and V is else: - self.num_kv_heads_per_partition = ( - self.num_attention_heads_per_partition - ) # None - self.kv_hidden_size = neox_args.hidden_size # None + self.num_kv_heads_per_partition = self.num_attention_heads_per_partition + self.kv_hidden_size = neox_args.hidden_size - if self.attention_type == "multihead": + if not self.gqa: # Strided linear layer. self.query_key_value = mpu.ColumnParallelLinear( neox_args=neox_args, @@ -320,6 +326,7 @@ def __init__( bias=neox_args.use_bias_in_attn_linear, ) else: + # QKV proj is smaller if we are using GQA / MQA self.query_key_value = mpu.ColumnParallelLinear( neox_args=neox_args, input_size=neox_args.hidden_size, @@ -374,6 +381,10 @@ def __init__( self.attention_type = neox_args.attention_config[layer_number] self.use_flash_attention = self.attention_type == "flash" self.sparse = self.attention_type not in ("global", "flash") + + if self.gqa: + assert (not self.sparse) and (self.use_flash_attention) + if self.sparse: self.sparse_attn = configure_sparse_attention( neox_args, @@ -576,9 +587,6 @@ def flash_attention(self, query_layer, key_layer, value_layer): output_size[0], output_size[2], output_size[1], -1 ) - # print(key_layer.shape) - # print(value_layer.shape) - if not self.training: q_shape = query_layer.shape k_shape = key_layer.shape @@ -667,7 +675,9 @@ def forward(self, hidden_states, attention_mask, layer_past=None): # Query, Key, and Value # ===================== - if self.attention_type == "multihead": + if not self.gqa: + # QKV projection for MHA. + # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] mixed_x_layer, _ = self.query_key_value(hidden_states) @@ -683,12 +693,16 @@ def forward(self, hidden_states, attention_mask, layer_past=None): mixed_x_layer, 3 ) else: - # Attention heads [sq, b, h] --> [sq, b, (np + 2 * num. (query / num. kv)) * hn)] + # QKV projection and separation for GQA, + # where KV projections may be smaller than Q projection. 
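# A shape-only sketch (not part of this patch) of the reshape-and-split performed below,
# using toy sizes: np = 8 query heads and kvp = 2 KV heads per partition, hn = 64.
import torch
sq, b, np_, kvp, hn = 5, 3, 8, 2, 64
fused = torch.randn(sq, b, (np_ + 2 * kvp) * hn)           # output of the fused QKV projection
fused = fused.reshape(sq, b, np_, hn + 2 * (kvp * hn) // np_)
q, k, v = torch.split(fused, [hn, (kvp * hn) // np_, (kvp * hn) // np_], dim=3)
k = k.reshape(sq, b, kvp, hn)                              # regroup the KV slices into kvp heads
v = v.reshape(sq, b, kvp, hn)
assert q.shape == (sq, b, np_, hn) and k.shape == (sq, b, kvp, hn) == v.shape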
+ # we + + # Attention heads [sq, b, h] --> [sq, b, ((np + 2 * kvp) * hn)] mixed_x_layer, _ = self.query_key_value(hidden_states) - # TODO: instead split here into [sq, b, np * hn], 2 [sq, b, np/kv_ratio * hn] and then reshape? - # TODO: check equivalence (in the multihead case(?)) # TODO: refactor this out into an mpu.utils fn like split_tensor_along_last_dim + + # [sq, b, ((np + 2 * kvp) * hn)] --> [sq, b, np, (hn * (1 + 2 * (kvp / np)))] mixed_x_layer = mixed_x_layer.reshape( ( mixed_x_layer.shape[0], @@ -707,6 +721,8 @@ def forward(self, hidden_states, attention_mask, layer_past=None): ), ) ) + + # [sq, b, np, (hn * (1 + 2 * (kvp / np)))] --> 1 x [sq, b, np, hn] , 2 x [sq, b, np, (hn * (kvp / np))] (query_layer, key_layer, value_layer) = [ x.contiguous() for x in torch.split( @@ -732,21 +748,6 @@ def forward(self, hidden_states, attention_mask, layer_past=None): ) ] - # [sq, b, (np * (1 + 2 * num. (query / num. kv)) * hn)] --> [sq, b, np, (1 + 2 * nq / nkv) * hn] - # new_tensor_shape = mixed_x_layer.size()[:-1] + ( - # self.num_attention_heads_per_partition + ???, - # self.hidden_size_per_attention_head, - - # [sq, b, np * hn] --> [sq, b, np, hn] - new_query_shape = ( - query_layer.size(0), - query_layer.size(1), - self.num_attention_heads_per_partition, - self.hidden_size_per_attention_head, - ) - - query_layer = query_layer.view(*new_query_shape) - new_kv_shape = ( key_layer.size(0), key_layer.size(1), @@ -757,14 +758,7 @@ def forward(self, hidden_states, attention_mask, layer_past=None): key_layer = key_layer.view(*new_kv_shape) value_layer = value_layer.view(*new_kv_shape) - - # mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) - - ## [sq, b, np, 3 * hn - # (query_layer, key_layer, value_layer) = mpu.split_tensor_along_last_dim( - # mixed_x_layer, 3 - # ) - + # QK Normalization https://arxiv.org/abs/2302.05442 if self.use_qk_layernorm: query_layer = self.qk_layernorm(query_layer) From 594d926cbd4ef5f392d9985dc72967408d20fe07 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Wed, 14 Feb 2024 15:27:00 +0000 Subject: [PATCH 46/64] add llama2, llemma configs --- configs/llama2/13B.yml | 26 +++++++++ configs/llama2/70B.yml | 31 ++++++++++ configs/llama2/7B.yml | 26 +++++++++ configs/llama2/codellama_34B.yml | 30 ++++++++++ configs/llama2/codellama_7B.yml | 29 ++++++++++ configs/llemma/34B.yml | 97 ++++++++++++++++++++++++++++++++ configs/llemma/7B.yml | 96 +++++++++++++++++++++++++++++++ 7 files changed, 335 insertions(+) create mode 100644 configs/llama2/13B.yml create mode 100644 configs/llama2/70B.yml create mode 100644 configs/llama2/7B.yml create mode 100644 configs/llama2/codellama_34B.yml create mode 100644 configs/llama2/codellama_7B.yml create mode 100644 configs/llemma/34B.yml create mode 100644 configs/llemma/7B.yml diff --git a/configs/llama2/13B.yml b/configs/llama2/13B.yml new file mode 100644 index 000000000..75ce220a2 --- /dev/null +++ b/configs/llama2/13B.yml @@ -0,0 +1,26 @@ +{ + "pipe_parallel_size": 1, + "model_parallel_size": 2, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 40, + "hidden_size": 5120, + "num_attention_heads": 40, + "seq_length": 2048, + "max_position_embeddings": 2048, + "pos_emb": "rotary", + "rotary_pct": 1, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-5, + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "mlp_type": 
"llama", + "activation": "silu", +} diff --git a/configs/llama2/70B.yml b/configs/llama2/70B.yml new file mode 100644 index 000000000..286a1b568 --- /dev/null +++ b/configs/llama2/70B.yml @@ -0,0 +1,31 @@ +{ + "pipe_parallel_size": 1, + "model_parallel_size": 1, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 80, + "hidden_size": 8192, + "intermediate_size": 28672, + "num_attention_heads": 64, + "num_kv_heads": 8, + "seq_length": 4096, + "max_position_embeddings": 4096, + "pos_emb": "rotary", + "rotary_pct": 1, + "rotary_emb_base": 1000000, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-5, + + "attention_config": [[["flash"], 48]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "mlp_type": "llama", + "activation": "silu", +} diff --git a/configs/llama2/7B.yml b/configs/llama2/7B.yml new file mode 100644 index 000000000..6a5c97e64 --- /dev/null +++ b/configs/llama2/7B.yml @@ -0,0 +1,26 @@ +{ + "pipe_parallel_size": 1, + "model_parallel_size": 1, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 32, + "hidden_size": 4096, + "num_attention_heads": 32, + "seq_length": 4096, + "max_position_embeddings": 4096, + "pos_emb": "rotary", + "rotary_pct": 1, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-5, + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "mlp_type": "llama", + "activation": "silu", +} diff --git a/configs/llama2/codellama_34B.yml b/configs/llama2/codellama_34B.yml new file mode 100644 index 000000000..67129de0d --- /dev/null +++ b/configs/llama2/codellama_34B.yml @@ -0,0 +1,30 @@ +{ + "pipe_parallel_size": 1, + "model_parallel_size": 1, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 48, + "hidden_size": 8192, + "num_attention_heads": 64, + "num_kv_heads": 8, + "seq_length": 4096, + "max_position_embeddings": 4096, + "pos_emb": "rotary", + "rotary_pct": 1, + "rotary_emb_base": 1000000, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-5, + + "attention_config": [[["flash"], 48]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "mlp_type": "llama", + "activation": "silu", +} diff --git a/configs/llama2/codellama_7B.yml b/configs/llama2/codellama_7B.yml new file mode 100644 index 000000000..40e50a427 --- /dev/null +++ b/configs/llama2/codellama_7B.yml @@ -0,0 +1,29 @@ +{ + "pipe_parallel_size": 1, + "model_parallel_size": 1, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 32, + "hidden_size": 4096, + "num_attention_heads": 32, + "seq_length": 4096, + "max_position_embeddings": 4096, + "pos_emb": "rotary", + "rotary_pct": 1, + "rotary_emb_base": 1000000, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-5, + + "attention_config": [[["flash"], 32]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "mlp_type": 
"llama", + "activation": "silu", +} diff --git a/configs/llemma/34B.yml b/configs/llemma/34B.yml new file mode 100644 index 000000000..8e6ed74e2 --- /dev/null +++ b/configs/llemma/34B.yml @@ -0,0 +1,97 @@ +{ + "pipe_parallel_size": 0, + "model_parallel_size": 8, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 48, + "hidden_size": 8192, + "num_attention_heads": 64, + "attention_type": "groupedquery", + "num_kv_heads": 8, + "seq_length": 4096, + "max_position_embeddings": 4096, + "pos_emb": "rotary", + "rotary_pct": 1, + "rotary_emb_base": 1000000, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-5, + + "attention_config": [[["flash"], 48]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "mlp_type": "llama", + "activation": "silu", + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00005, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1260000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1260000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + # trained on 256 gpus + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 16, + "data_impl": "mmap", + + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + "precision": "bfloat16", + "fp32_allreduce": true, + "bf16": { + "enabled": true + }, + "data_types": { + "grad_accum_dtype": "fp32" + }, + + "train_iters": 12000, + "lr_decay_iters": 12000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "min_lr": 1.65e-6, + "warmup": 0.042, # warmup for ~500 iters + "checkpoint_factor": 250, + "eval_interval": 250, + "eval_iters": 25, + + "log_interval": 1, + "steps_per_print": 1, + "wall_clock_breakdown": true, + + "tokenizer_type": "SPMTokenizer", + #"vocab-file": # use 'tokenizer.model' from Meta CodeLlama download + + # "load": "" # set to same as "save" to resume from intermediate finetuning step + #"load": MP=8 CodeLlama-34B checkpoint, converted from Meta CodeLlama download. + # When resuming from mid-finetuning run, change "load" to the same as save location. 
+ "finetune": true, # set to false once resuming from intermediate finetuning step +} diff --git a/configs/llemma/7B.yml b/configs/llemma/7B.yml new file mode 100644 index 000000000..e589fb65a --- /dev/null +++ b/configs/llemma/7B.yml @@ -0,0 +1,96 @@ +{ + "pipe_parallel_size": 0, + "model_parallel_size": 2, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 32, + "hidden_size": 4096, + "num_attention_heads": 32, + "seq_length": 4096, + "max_position_embeddings": 4096, + "pos_emb": "rotary", + "rotary_pct": 1, + "rotary_emb_base": 10000, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-5, + + "attention_config": [[["flash"], 32]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "mlp_type": "llama", + "activation": "silu", + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0001, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1260000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1260000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + # trained on 256 gpus + "train_micro_batch_size_per_gpu": 4, + "gradient_accumulation_steps": 2, + "data_impl": "mmap", + + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + "precision": "bfloat16", + "fp32_allreduce": true, + "bf16": { + "enabled": true + }, + "data_types": { + "grad_accum_dtype": "fp32" + }, + + "train_iters": 48000, + "lr_decay_iters": 48000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "decay_lr_to": 0.033, + "warmup_iters": 500, + "checkpoint_factor": 500, + "eval_interval": 250, + "eval_iters": 50, + + "log_interval": 1, + "steps_per_print": 1, + "wall_clock_breakdown": true, + + "tokenizer_type": "SPMTokenizer", + "vocab-file": "codellama/tokenizer.model", # use tokenizer.model from Meta CodeLlama download + + "save": "/path/to/save/llema-replication", + #"load": "", # once run is started, to restart from intermediate ckpt use "load" = "save" + "load": "/path/to/converted/codellama_7b_weights_with_mp2", + + "finetune": true, # set to false once resuming from intermediate finetuning step +} From 0827bb847694914441b9573ba23086a1cd818f7f Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Wed, 21 Feb 2024 19:18:03 +0000 Subject: [PATCH 47/64] add non-flash GQA ; refactor modeling code --- megatron/model/transformer.py | 176 +++++++++++++++++++++------------- 1 file changed, 109 insertions(+), 67 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 8161c7134..1fb862fc4 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -295,6 +295,18 @@ def __init__( neox_args.num_attention_heads, world_size ) self.pos_emb = neox_args.pos_emb + + self.use_qk_layernorm = neox_args.use_qk_layernorm + if self.use_qk_layernorm: + norm, eps = get_norm(neox_args) + self.qk_layernorm = norm( + [ + self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head, + ], + eps=eps, + ) + self.sliding_window_width = neox_args.sliding_window_width if ( @@ -383,7 +395,7 @@ def __init__( self.sparse = 
self.attention_type not in ("global", "flash") if self.gqa: - assert (not self.sparse) and (self.use_flash_attention) + assert not self.sparse if self.sparse: self.sparse_attn = configure_sparse_attention( @@ -454,13 +466,11 @@ def attention( query_layer.size(0), key_layer.size(0), ) - # [sq, b, np, hn] -> [sq, b * np, hn] query_layer = query_layer.view( output_size[2], output_size[0] * output_size[1], -1 ) key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1) - # preallocating result tensor: [b * np, sq, sk] matmul_result = torch.empty( output_size[0] * output_size[1], @@ -481,7 +491,6 @@ def attention( # change view to [b, np, sq, sk] attention_scores = matmul_result.view(*output_size) - # ================================================== # Update attention mask for inference. [b, np, sq, sk] # ================================================== @@ -667,6 +676,97 @@ def sparse_attention(self, query_layer, key_layer, value_layer, attention_mask): query_layer, key_layer, value_layer, attn_mask=attn_mask, rpe=rpe ) + def gqa_project(self, hidden_states, attention_mask, layer_past=None): + # QKV projection and separation into separate Q/K/V layers for GQA, + # where KV projections may be smaller than Q projection. + # the logic for this is explained in comments of this function + # detailing the intermediate sizes of tensors at each reshape. + + # Attention heads [sq, b, h] --> [sq, b, ((np + 2 * kvp) * hn)] + mixed_x_layer, _ = self.query_key_value(hidden_states) + + # TODO: refactor this out into an mpu.utils fn like split_tensor_along_last_dim + + # [sq, b, ((np + 2 * kvp) * hn)] --> [sq, b, np, (hn * (1 + 2 * (kvp / np)))] + mixed_x_layer = mixed_x_layer.reshape( + ( + mixed_x_layer.shape[0], + mixed_x_layer.shape[1], + self.num_attention_heads_per_partition, + int( + self.hidden_size_per_attention_head + * ( + 1 + + 2 + * ( + self.num_kv_heads_per_partition + / self.num_attention_heads_per_partition + ) + ) + ), + ) + ) + + # [sq, b, np, (hn * (1 + 2 * (kvp / np)))] --> 1 x [sq, b, np, hn] , 2 x [sq, b, np, (hn * (kvp / np))] + (query_layer, key_layer, value_layer) = [ + x.contiguous() + for x in torch.split( + mixed_x_layer, + [ + self.hidden_size_per_attention_head, + int( + ( + self.num_kv_heads_per_partition + / self.num_attention_heads_per_partition + ) + * self.hidden_size_per_attention_head + ), + int( + ( + self.num_kv_heads_per_partition + / self.num_attention_heads_per_partition + ) + * self.hidden_size_per_attention_head + ), + ], + dim=mixed_x_layer.dim() - 1, + ) + ] + + # reshape K/V to proper output shape (last dim = head dim again) + # 2 x [sq, b, np, (hn * (kvp / np))] --> 2 x [sq, b, kvp, hn] + new_kv_shape = ( + key_layer.size(0), + key_layer.size(1), + self.num_kv_heads_per_partition, + self.hidden_size_per_attention_head, + ) + + key_layer = key_layer.view(*new_kv_shape) + + value_layer = value_layer.view(*new_kv_shape) + + # TODO: if not use_flash, repeat_interleave on key/value layers + if not self.use_flash_attention: + key_layer = torch.repeat_interleave( + key_layer, + repeats=int( + self.num_attention_heads_per_partition + // self.num_kv_heads_per_partition + ), + dim=2, + ) + value_layer = torch.repeat_interleave( + value_layer, + repeats=int( + self.num_attention_heads_per_partition + // self.num_kv_heads_per_partition + ), + dim=2, + ) + + return query_layer, key_layer, value_layer + def forward(self, hidden_states, attention_mask, layer_past=None): # hidden_states: [sq, b, h] @@ -693,72 +793,14 @@ def forward(self, hidden_states, 
attention_mask, layer_past=None): mixed_x_layer, 3 ) else: - # QKV projection and separation for GQA, - # where KV projections may be smaller than Q projection. - # we - - # Attention heads [sq, b, h] --> [sq, b, ((np + 2 * kvp) * hn)] - mixed_x_layer, _ = self.query_key_value(hidden_states) + # Grouped Query Attention (GQA) - specific logic for performing QKV proj + # and separating out Q, K, and V outputs. - # TODO: refactor this out into an mpu.utils fn like split_tensor_along_last_dim - - # [sq, b, ((np + 2 * kvp) * hn)] --> [sq, b, np, (hn * (1 + 2 * (kvp / np)))] - mixed_x_layer = mixed_x_layer.reshape( - ( - mixed_x_layer.shape[0], - mixed_x_layer.shape[1], - self.num_attention_heads_per_partition, - int( - self.hidden_size_per_attention_head - * ( - 1 - + 2 - * ( - self.num_kv_heads_per_partition - / self.num_attention_heads_per_partition - ) - ) - ), - ) - ) - - # [sq, b, np, (hn * (1 + 2 * (kvp / np)))] --> 1 x [sq, b, np, hn] , 2 x [sq, b, np, (hn * (kvp / np))] - (query_layer, key_layer, value_layer) = [ - x.contiguous() - for x in torch.split( - mixed_x_layer, - [ - self.hidden_size_per_attention_head, - int( - ( - self.num_kv_heads_per_partition - / self.num_attention_heads_per_partition - ) - * self.hidden_size_per_attention_head - ), - int( - ( - self.num_kv_heads_per_partition - / self.num_attention_heads_per_partition - ) - * self.hidden_size_per_attention_head - ), - ], - dim=mixed_x_layer.dim() - 1, - ) - ] - - new_kv_shape = ( - key_layer.size(0), - key_layer.size(1), - self.num_kv_heads_per_partition, - self.hidden_size_per_attention_head, + # output shapes: 1 x [sq, b, np, hn], 2 x [sq, b, kvp, hn] if using flash + query_layer, key_layer, value_layer = self.gqa_project( + hidden_states, attention_mask, layer_past=layer_past ) - key_layer = key_layer.view(*new_kv_shape) - - value_layer = value_layer.view(*new_kv_shape) - # QK Normalization https://arxiv.org/abs/2302.05442 if self.use_qk_layernorm: query_layer = self.qk_layernorm(query_layer) From 558bdd8d03eb12a86a8b3bdcbfdafd74844a0939 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Wed, 21 Feb 2024 19:21:39 +0000 Subject: [PATCH 48/64] clean up mistral config for commit --- configs/mistral/7B.yml | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/configs/mistral/7B.yml b/configs/mistral/7B.yml index a6e01c2d8..67b7c6a52 100644 --- a/configs/mistral/7B.yml +++ b/configs/mistral/7B.yml @@ -1,6 +1,6 @@ { - "pipe_parallel_size": 0, - "model_parallel_size": 4, + "pipe_parallel_size": 1, + "model_parallel_size": 1, "make_vocab_size_divisible_by": 1, # model settings @@ -8,7 +8,6 @@ "hidden_size": 4096, "intermediate_size": 14336, "num_attention_heads": 32, - "attention_type": "groupedquery", "num_kv_heads": 8, "seq_length": 4096, "sliding_window_width": 4096, @@ -22,8 +21,10 @@ "norm": "rmsnorm", "rms_norm_epsilon": 1.0e-5, - # Currently, we only support Grouped Query Attention - # When Flash attention is enabled. + # Grouped Query Attention is supported for both default ("global") + # and Flash attention. However, we highly recommend the use of Flash attention + # to get FLOP + runtime speedups when using GQA, + # and sliding window attention is currently only supported by Flash attention. 
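+  # (GQA is enabled here by the "num_kv_heads": 8 setting above; leaving
+  # num_kv_heads unset, or equal to num_attention_heads, falls back to
+  # standard multi-head attention.)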
"attention_config": [[["flash"], 32]], "scaled_upper_triang_masked_softmax_fusion": true, @@ -34,6 +35,6 @@ "activation": "silu", "tokenizer_type": "SPMTokenizer", - "vocab-file": "/weka/hailey/mistral-support-neox/mistral-7B-v0.1/tokenizer.model", # use tokenizer.model from Meta CodeLlama download + #"vocab-file": ".../mistral-7B-v0.1/tokenizer.model", # use tokenizer.model from Mistral-7B-v0.1 direct download -} \ No newline at end of file +} From 726935fab36399bacdea827685ba53e1ac9bc17d Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Wed, 21 Feb 2024 19:22:49 +0000 Subject: [PATCH 49/64] further cleanup configs dir --- configs/data_mixture.yml | 6 -- configs/llemma-34b-config.yml | 108 ---------------------------------- configs/llemma-7b-config.yml | 105 --------------------------------- configs/llemma_34b.yml | 108 ---------------------------------- configs/llemma_7b.yml | 105 --------------------------------- 5 files changed, 432 deletions(-) delete mode 100644 configs/data_mixture.yml delete mode 100644 configs/llemma-34b-config.yml delete mode 100644 configs/llemma-7b-config.yml delete mode 100644 configs/llemma_34b.yml delete mode 100644 configs/llemma_7b.yml diff --git a/configs/data_mixture.yml b/configs/data_mixture.yml deleted file mode 100644 index 637ece59c..000000000 --- a/configs/data_mixture.yml +++ /dev/null @@ -1,6 +0,0 @@ -{ - "train-data-paths": ["/path/to/proof-pile_llama_tokenizer/train/arxiv-rp/arxiv-rp_text_document", "/path/to/open-web-math-v1.2_llama/train/open-web-math/open-web-math_text_document", "/path/to/proof-pile_llama_tokenizer/code-with-proofsteps_llama/train/code-with-proofsteps/code-with-proofsteps_text_document", "/path/to/proof-pile_llama_tokenizer/train/pile-sample/pile-sample_text_document", "/path/to/code-rp_llama_tokenizer/train/code-rp/code-rp_text_document"], - "train-data-weights": [2, 4, 1, 0.147368, 0.221053], - "valid-data-paths": ["/path/to/proof-pile_llama_tokenizer/validation/arxiv-rp/arxiv-rp_text_document", "/path/to/open-web-math-v1.2_llama/validation/open-web-math/open-web-math_text_document", "/path/to/proof-pile_llama_tokenizer/validation/code-with-proofsteps/code-with-proofsteps_text_document"], - "test-data-paths": ["/path/to/proof-pile_llama_tokenizer/test/arxiv-rp/arxiv-rp_text_document", "/path/to/open-web-math-v1.2_llama/test/open-web-math/open-web-math_text_document", "/path/to/proof-pile_llama_tokenizer/code-with-proofsteps_llama/test/code-with-proofsteps/code-with-proofsteps_text_document"], -} diff --git a/configs/llemma-34b-config.yml b/configs/llemma-34b-config.yml deleted file mode 100644 index 46e1396cc..000000000 --- a/configs/llemma-34b-config.yml +++ /dev/null @@ -1,108 +0,0 @@ -{ - "pipe_parallel_size": 0, - "model_parallel_size": 8, - "make_vocab_size_divisible_by": 1, - - # model settings - "num_layers": 48, - "hidden_size": 8192, - "num_attention_heads": 64, - "attention_type": "groupedquery", - "num_kv_heads": 8, - "seq_length": 4096, - "max_position_embeddings": 4096, - "pos_emb": "rotary", - "rotary_pct": 1, - "rotary_emb_base": 1000000, - "no_weight_tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - "norm": "rmsnorm", - "rms_norm_epsilon": 1.0e-5, - - "attention_config": [[["flash"], 48]], - - "scaled_upper_triang_masked_softmax_fusion": true, - "bias_gelu_fusion": false, - "use_bias_in_norms": false, - "use_bias_in_attn_linear": false, - "mlp_type": "llama", - "activation": "silu", - - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00005, - "betas": [0.9, 
0.95], - "eps": 1.0e-8 - } - }, - - "zero_optimization": { - "stage": 1, - "allgather_partitions": true, - "allgather_bucket_size": 1260000000, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 1260000000, - "contiguous_gradients": true, - "cpu_offload": false - }, - - "train_micro_batch_size_per_gpu": 2, - "gradient_accumulation_steps": 16, - "data_impl": "mmap", - - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - "precision": "bfloat16", - "fp32_allreduce": true, - "bf16": { - "enabled": true - }, - "data_types": { - "grad_accum_dtype": "fp32" - }, - - "train_iters": 12000, - "lr_decay_iters": 12000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "decay_lr_to": 0.033, - "warmup_iters": 500, - "checkpoint_factor": 250, - "eval_interval": 250, - "eval_iters": 25, - - "log_interval": 1, - "steps_per_print": 1, - "wall_clock_breakdown": true, - - "tokenizer_type": "SPMTokenizer", - "vocab-file": "codellama/tokenizer.model", # use tokenizer.model from Meta CodeLlama download - - "save": "/fsx/proj-mathlm/saved-weights/34b_1epoch", - # "load": "" # set to same as "save" to resume from intermediate finetuning step - "load": "/path/to/converted/codellama_34b_weights_with_mp8", - - "finetune": true, # set to false once resuming from intermediate finetuning step - "checkpoint_validation_with_forward_pass": true, - - - "use_wandb": true, - "wandb_group": "llemma_34b_replication", - "wandb_project": "your-project-name", - "wandb_team": "your-teamname-here", - "wandb_host": "https://api.wandb.ai", - - "launcher": "slurm", - "deepspeed_slurm": true -} diff --git a/configs/llemma-7b-config.yml b/configs/llemma-7b-config.yml deleted file mode 100644 index fa61af90f..000000000 --- a/configs/llemma-7b-config.yml +++ /dev/null @@ -1,105 +0,0 @@ -{ - "pipe_parallel_size": 0, - "model_parallel_size": 2, - "make_vocab_size_divisible_by": 1, - - # model settings - "num_layers": 32, - "hidden_size": 4096, - "num_attention_heads": 32, - "seq_length": 4096, - "max_position_embeddings": 4096, - "pos_emb": "rotary", - "rotary_pct": 1, - "rotary_emb_base": 10000, - "no_weight_tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - "norm": "rmsnorm", - "rms_norm_epsilon": 1.0e-5, - - "attention_config": [[["flash"], 32]], - - "scaled_upper_triang_masked_softmax_fusion": true, - "bias_gelu_fusion": false, - "use_bias_in_norms": false, - "use_bias_in_attn_linear": false, - "mlp_type": "llama", - "activation": "silu", - - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0001, - "betas": [0.9, 0.95], - "eps": 1.0e-8 - } - }, - - "zero_optimization": { - "stage": 1, - "allgather_partitions": true, - "allgather_bucket_size": 1260000000, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 1260000000, - "contiguous_gradients": true, - "cpu_offload": false - }, - - "train_micro_batch_size_per_gpu": 4, - "gradient_accumulation_steps": 2, - "data_impl": "mmap", - - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - "precision": "bfloat16", - "fp32_allreduce": true, - "bf16": { - "enabled": true - }, - "data_types": { - "grad_accum_dtype": "fp32" - }, - - 
"train_iters": 48000, - "lr_decay_iters": 48000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "decay_lr_to": 0.033, - "warmup_iters": 500, - "checkpoint_factor": 500, - "eval_interval": 250, - "eval_iters": 50, - - "log_interval": 1, - "steps_per_print": 1, - "wall_clock_breakdown": true, - - "tokenizer_type": "SPMTokenizer", - "vocab-file": "codellama/tokenizer.model", # use tokenizer.model from Meta CodeLlama download - - "save": "/path/to/save/llema-replication", - #"load": "", # once run is started, to restart from intermediate ckpt use "load" = "save" - "load": "/path/to/converted/codellama_7b_weights_with_mp2", - - "finetune": true, # set to false once resuming from intermediate finetuning step - "checkpoint_validation_with_forward_pass": true, - - "use_wandb": true, - "wandb_group": "llemma_7b_replication", - "wandb_project": "your-project-name", - "wandb_team": "your-teamname-here", - "wandb_host": "https://api.wandb.ai", - - "launcher": "slurm", - "deepspeed_slurm": true -} diff --git a/configs/llemma_34b.yml b/configs/llemma_34b.yml deleted file mode 100644 index 46e1396cc..000000000 --- a/configs/llemma_34b.yml +++ /dev/null @@ -1,108 +0,0 @@ -{ - "pipe_parallel_size": 0, - "model_parallel_size": 8, - "make_vocab_size_divisible_by": 1, - - # model settings - "num_layers": 48, - "hidden_size": 8192, - "num_attention_heads": 64, - "attention_type": "groupedquery", - "num_kv_heads": 8, - "seq_length": 4096, - "max_position_embeddings": 4096, - "pos_emb": "rotary", - "rotary_pct": 1, - "rotary_emb_base": 1000000, - "no_weight_tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - "norm": "rmsnorm", - "rms_norm_epsilon": 1.0e-5, - - "attention_config": [[["flash"], 48]], - - "scaled_upper_triang_masked_softmax_fusion": true, - "bias_gelu_fusion": false, - "use_bias_in_norms": false, - "use_bias_in_attn_linear": false, - "mlp_type": "llama", - "activation": "silu", - - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00005, - "betas": [0.9, 0.95], - "eps": 1.0e-8 - } - }, - - "zero_optimization": { - "stage": 1, - "allgather_partitions": true, - "allgather_bucket_size": 1260000000, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 1260000000, - "contiguous_gradients": true, - "cpu_offload": false - }, - - "train_micro_batch_size_per_gpu": 2, - "gradient_accumulation_steps": 16, - "data_impl": "mmap", - - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - "precision": "bfloat16", - "fp32_allreduce": true, - "bf16": { - "enabled": true - }, - "data_types": { - "grad_accum_dtype": "fp32" - }, - - "train_iters": 12000, - "lr_decay_iters": 12000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "decay_lr_to": 0.033, - "warmup_iters": 500, - "checkpoint_factor": 250, - "eval_interval": 250, - "eval_iters": 25, - - "log_interval": 1, - "steps_per_print": 1, - "wall_clock_breakdown": true, - - "tokenizer_type": "SPMTokenizer", - "vocab-file": "codellama/tokenizer.model", # use tokenizer.model from Meta CodeLlama download - - "save": "/fsx/proj-mathlm/saved-weights/34b_1epoch", - # "load": "" # set to same as "save" to resume from intermediate finetuning step - "load": "/path/to/converted/codellama_34b_weights_with_mp8", - - "finetune": true, # set to false once resuming from intermediate finetuning step - 
"checkpoint_validation_with_forward_pass": true, - - - "use_wandb": true, - "wandb_group": "llemma_34b_replication", - "wandb_project": "your-project-name", - "wandb_team": "your-teamname-here", - "wandb_host": "https://api.wandb.ai", - - "launcher": "slurm", - "deepspeed_slurm": true -} diff --git a/configs/llemma_7b.yml b/configs/llemma_7b.yml deleted file mode 100644 index fa61af90f..000000000 --- a/configs/llemma_7b.yml +++ /dev/null @@ -1,105 +0,0 @@ -{ - "pipe_parallel_size": 0, - "model_parallel_size": 2, - "make_vocab_size_divisible_by": 1, - - # model settings - "num_layers": 32, - "hidden_size": 4096, - "num_attention_heads": 32, - "seq_length": 4096, - "max_position_embeddings": 4096, - "pos_emb": "rotary", - "rotary_pct": 1, - "rotary_emb_base": 10000, - "no_weight_tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - "norm": "rmsnorm", - "rms_norm_epsilon": 1.0e-5, - - "attention_config": [[["flash"], 32]], - - "scaled_upper_triang_masked_softmax_fusion": true, - "bias_gelu_fusion": false, - "use_bias_in_norms": false, - "use_bias_in_attn_linear": false, - "mlp_type": "llama", - "activation": "silu", - - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0001, - "betas": [0.9, 0.95], - "eps": 1.0e-8 - } - }, - - "zero_optimization": { - "stage": 1, - "allgather_partitions": true, - "allgather_bucket_size": 1260000000, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 1260000000, - "contiguous_gradients": true, - "cpu_offload": false - }, - - "train_micro_batch_size_per_gpu": 4, - "gradient_accumulation_steps": 2, - "data_impl": "mmap", - - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - "precision": "bfloat16", - "fp32_allreduce": true, - "bf16": { - "enabled": true - }, - "data_types": { - "grad_accum_dtype": "fp32" - }, - - "train_iters": 48000, - "lr_decay_iters": 48000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "decay_lr_to": 0.033, - "warmup_iters": 500, - "checkpoint_factor": 500, - "eval_interval": 250, - "eval_iters": 50, - - "log_interval": 1, - "steps_per_print": 1, - "wall_clock_breakdown": true, - - "tokenizer_type": "SPMTokenizer", - "vocab-file": "codellama/tokenizer.model", # use tokenizer.model from Meta CodeLlama download - - "save": "/path/to/save/llema-replication", - #"load": "", # once run is started, to restart from intermediate ckpt use "load" = "save" - "load": "/path/to/converted/codellama_7b_weights_with_mp2", - - "finetune": true, # set to false once resuming from intermediate finetuning step - "checkpoint_validation_with_forward_pass": true, - - "use_wandb": true, - "wandb_group": "llemma_7b_replication", - "wandb_project": "your-project-name", - "wandb_team": "your-teamname-here", - "wandb_host": "https://api.wandb.ai", - - "launcher": "slurm", - "deepspeed_slurm": true -} From 4cec22313f882091d3106b110db06fbdde82920a Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Wed, 21 Feb 2024 19:23:23 +0000 Subject: [PATCH 50/64] remove slurm script from llemma --- 34b_launch_script.sh | 38 -------------------------------------- 1 file changed, 38 deletions(-) delete mode 100644 34b_launch_script.sh diff --git a/34b_launch_script.sh b/34b_launch_script.sh deleted file mode 100644 index 2dd0166b8..000000000 --- a/34b_launch_script.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash -#... 
your SLURM arguments here -#SBATCH --nodes=32 -#SBATCH --ntasks-per-node=8 -#SBATCH --cpus-per-task=12 -#SBATCH --gres=gpu:8 -#SBATCH --output=34b_replication_%j.out -#SBATCH --error=34b_replication_%j.out -#SBATCH --exclusive -#SBATCH --open-mode=append -#SBATCH --requeue - -# setup the conda environment and paths to NCCL install. -source /path/to/conda_setup_script.sh - -ds_report - -# set distributed env variable flags such as NCCL_DEBUG here - -export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"` -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=12802 -export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l` - -# Move to the gpt-neox install -TRAIN_PATH=/path/to/gpt-neox -cd $TRAIN_PATH - -# Write the hostfile for this job here -# Should write to a hostfile that contains lines of format ` slots=` -bash /helper/script/write_hostfile.sh -export DLTS_HOSTFILE=path/to/hostfile/hosts_$SLURM_JOBID - - -# launch distributed job. If using `"deepspeed_slurm": true` and `"launcher": "slurm"` on a SLURM cluster, -# then NeoX will handle the creation of a distributed run across 256 gpus. -python $TRAIN_PATH/deepy.py $TRAIN_PATH/train.py \ - --conf_dir /path/to/math-lm/pretraining llemma_34b.yml data_mixture.yml From eca632d50afb9c3c68c5fa6daf621f6e6387f833 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Wed, 21 Feb 2024 19:29:23 +0000 Subject: [PATCH 51/64] update seqlen params for codellama, llemma configs --- configs/llama2/13B.yml | 4 ++-- configs/llama2/codellama_34B.yml | 6 ++++-- configs/llama2/codellama_7B.yml | 6 ++++-- configs/llemma/34B.yml | 4 ++++ configs/llemma/7B.yml | 4 ++++ 5 files changed, 18 insertions(+), 6 deletions(-) diff --git a/configs/llama2/13B.yml b/configs/llama2/13B.yml index 75ce220a2..973b8bea4 100644 --- a/configs/llama2/13B.yml +++ b/configs/llama2/13B.yml @@ -7,8 +7,8 @@ "num_layers": 40, "hidden_size": 5120, "num_attention_heads": 40, - "seq_length": 2048, - "max_position_embeddings": 2048, + "seq_length": 4096, + "max_position_embeddings": 4096, "pos_emb": "rotary", "rotary_pct": 1, "no_weight_tying": true, diff --git a/configs/llama2/codellama_34B.yml b/configs/llama2/codellama_34B.yml index 67129de0d..88e9afaf6 100644 --- a/configs/llama2/codellama_34B.yml +++ b/configs/llama2/codellama_34B.yml @@ -8,8 +8,10 @@ "hidden_size": 8192, "num_attention_heads": 64, "num_kv_heads": 8, - "seq_length": 4096, - "max_position_embeddings": 4096, + # Codellama was uptrained on 16k token sequence lengths + # with rotary_emb_base adjusted to 1_000_000. + "seq_length": 16384, + "max_position_embeddings": 16384, "pos_emb": "rotary", "rotary_pct": 1, "rotary_emb_base": 1000000, diff --git a/configs/llama2/codellama_7B.yml b/configs/llama2/codellama_7B.yml index 40e50a427..be123ebee 100644 --- a/configs/llama2/codellama_7B.yml +++ b/configs/llama2/codellama_7B.yml @@ -7,8 +7,10 @@ "num_layers": 32, "hidden_size": 4096, "num_attention_heads": 32, - "seq_length": 4096, - "max_position_embeddings": 4096, + # Codellama was uptrained on 16k token sequence lengths + # with rotary_emb_base adjusted to 1_000_000. 
+ "seq_length": 16384, + "max_position_embeddings": 16384, "pos_emb": "rotary", "rotary_pct": 1, "rotary_emb_base": 1000000, diff --git a/configs/llemma/34B.yml b/configs/llemma/34B.yml index 8e6ed74e2..bd72d7e23 100644 --- a/configs/llemma/34B.yml +++ b/configs/llemma/34B.yml @@ -9,6 +9,10 @@ "num_attention_heads": 64, "attention_type": "groupedquery", "num_kv_heads": 8, + # NB: These rotary embedding and sequence length parameters + # May differ from CodeLlama configs. They match what we used for + # Llemma continued pretraining. See https://arxiv.org/abs/2310.10631 + # For detailed discussion "seq_length": 4096, "max_position_embeddings": 4096, "pos_emb": "rotary", diff --git a/configs/llemma/7B.yml b/configs/llemma/7B.yml index e589fb65a..fb72c8c18 100644 --- a/configs/llemma/7B.yml +++ b/configs/llemma/7B.yml @@ -7,6 +7,10 @@ "num_layers": 32, "hidden_size": 4096, "num_attention_heads": 32, + # NB: These rotary embedding and sequence length parameters + # May differ from CodeLlama configs. They match what we used for + # Llemma continued pretraining. See https://arxiv.org/abs/2310.10631 + # For detailed discussion "seq_length": 4096, "max_position_embeddings": 4096, "pos_emb": "rotary", From b07e63a51f6233e7ee7af30edc5df0e9a5d00935 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Wed, 21 Feb 2024 19:50:19 +0000 Subject: [PATCH 52/64] add more comments to GQA code, and make reshapes more readable --- megatron/model/transformer.py | 78 +++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 36 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 1fb862fc4..5846fc791 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -682,29 +682,51 @@ def gqa_project(self, hidden_states, attention_mask, layer_past=None): # the logic for this is explained in comments of this function # detailing the intermediate sizes of tensors at each reshape. - # Attention heads [sq, b, h] --> [sq, b, ((np + 2 * kvp) * hn)] + # pass through projection: [sq, b, h] --> [sq, b, ((np + 2 * kvp) * hn)] mixed_x_layer, _ = self.query_key_value(hidden_states) - # TODO: refactor this out into an mpu.utils fn like split_tensor_along_last_dim + # First: reshape so we have seqlen, batch, and num. query heads each as separate dims + # Final dim is not exactly head dim: the first (head dim) dims are query heads, + # The last (head dim * ratio of kv to q heads) each are the "k/v heads" + # (right now we treat like we have same num. heads, but smaller head dim) # [sq, b, ((np + 2 * kvp) * hn)] --> [sq, b, np, (hn * (1 + 2 * (kvp / np)))] - mixed_x_layer = mixed_x_layer.reshape( - ( - mixed_x_layer.shape[0], - mixed_x_layer.shape[1], - self.num_attention_heads_per_partition, - int( - self.hidden_size_per_attention_head + new_qkv_shape = ( + mixed_x_layer.shape[0], + mixed_x_layer.shape[1], + self.num_attention_heads_per_partition, + int( + self.hidden_size_per_attention_head + * ( + 1 + + 2 * ( - 1 - + 2 - * ( - self.num_kv_heads_per_partition - / self.num_attention_heads_per_partition - ) + self.num_kv_heads_per_partition + / self.num_attention_heads_per_partition ) - ), - ) + ) + ), + ) + mixed_x_layer = mixed_x_layer.reshape(*new_qkv_shape) + + # Next: split our fake head dim. 
(last dim) so that the first (head dim) dimensions go to Q, + # the last smaller 2 * (head dim * kv to q head ratio) each divided between K and V separately + split_sizes = ( + self.hidden_size_per_attention_head, + int( + ( + self.num_kv_heads_per_partition + / self.num_attention_heads_per_partition + ) + * self.hidden_size_per_attention_head + ), + int( + ( + self.num_kv_heads_per_partition + / self.num_attention_heads_per_partition + ) + * self.hidden_size_per_attention_head + ), ) # [sq, b, np, (hn * (1 + 2 * (kvp / np)))] --> 1 x [sq, b, np, hn] , 2 x [sq, b, np, (hn * (kvp / np))] @@ -712,28 +734,12 @@ def gqa_project(self, hidden_states, attention_mask, layer_past=None): x.contiguous() for x in torch.split( mixed_x_layer, - [ - self.hidden_size_per_attention_head, - int( - ( - self.num_kv_heads_per_partition - / self.num_attention_heads_per_partition - ) - * self.hidden_size_per_attention_head - ), - int( - ( - self.num_kv_heads_per_partition - / self.num_attention_heads_per_partition - ) - * self.hidden_size_per_attention_head - ), - ], + split_sizes, dim=mixed_x_layer.dim() - 1, ) ] - # reshape K/V to proper output shape (last dim = head dim again) + # reshape K/V to proper output shape (last dim = correct full "real" head size again) # 2 x [sq, b, np, (hn * (kvp / np))] --> 2 x [sq, b, kvp, hn] new_kv_shape = ( key_layer.size(0), @@ -746,7 +752,7 @@ def gqa_project(self, hidden_states, attention_mask, layer_past=None): value_layer = value_layer.view(*new_kv_shape) - # TODO: if not use_flash, repeat_interleave on key/value layers + # if not using Flash attention, we repeat K/V heads to match Q head counts if not self.use_flash_attention: key_layer = torch.repeat_interleave( key_layer, From f0dcf175efb60f146040e2ff798d77ec389d5e45 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Thu, 22 Feb 2024 19:11:14 +0000 Subject: [PATCH 53/64] make inv_freq non-persistent --- megatron/model/positional_embeddings.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/model/positional_embeddings.py b/megatron/model/positional_embeddings.py index 16cd22ad8..c4605669d 100644 --- a/megatron/model/positional_embeddings.py +++ b/megatron/model/positional_embeddings.py @@ -39,7 +39,7 @@ class RotaryEmbedding(torch.nn.Module): def __init__(self, dim, max_seq_len, base=10000, precision=torch.half): super().__init__() inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim)) - self.register_buffer("inv_freq", inv_freq) + self.register_buffer("inv_freq", inv_freq, persistent=False) self.seq_len_cached = None self.cos_cached = None self.sin_cached = None @@ -53,7 +53,7 @@ def __init__(self, dim, max_seq_len, base=10000, precision=torch.half): max_seq_len, precision, base ) - self.register_buffer("inv_freq", inv_freq) + self.register_buffer("inv_freq", inv_freq, persistent=False) self.cos_cached = cos_cached self.sin_cached = sin_cached From 95afe82f9c825295538d6b6978be74bd50a97363 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Thu, 22 Feb 2024 19:20:40 +0000 Subject: [PATCH 54/64] actually, just ensure mistral has inv_freqs as a persistent buffer --- megatron/model/positional_embeddings.py | 4 +-- .../convert_raw_llama_weights_to_neox.py | 28 +++++++++++-------- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/megatron/model/positional_embeddings.py b/megatron/model/positional_embeddings.py index c4605669d..16cd22ad8 100644 --- a/megatron/model/positional_embeddings.py +++ b/megatron/model/positional_embeddings.py @@ -39,7 +39,7 @@ class 
RotaryEmbedding(torch.nn.Module): def __init__(self, dim, max_seq_len, base=10000, precision=torch.half): super().__init__() inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) + self.register_buffer("inv_freq", inv_freq) self.seq_len_cached = None self.cos_cached = None self.sin_cached = None @@ -53,7 +53,7 @@ def __init__(self, dim, max_seq_len, base=10000, precision=torch.half): max_seq_len, precision, base ) - self.register_buffer("inv_freq", inv_freq, persistent=False) + self.register_buffer("inv_freq", inv_freq) self.cos_cached = cos_cached self.sin_cached = sin_cached diff --git a/tools/ckpts/convert_raw_llama_weights_to_neox.py b/tools/ckpts/convert_raw_llama_weights_to_neox.py index 9d6440cff..a28f1d0fb 100644 --- a/tools/ckpts/convert_raw_llama_weights_to_neox.py +++ b/tools/ckpts/convert_raw_llama_weights_to_neox.py @@ -162,7 +162,13 @@ def permute_rotary(w): rope_freqs = loaded[0]["layers.0.attention.inner_attention.rope.freqs"] helper.del_loaded("layers.0.attention.inner_attention.rope.freqs") elif "mistral" in model_size: - rope_freqs = None + # mistral does not include rope freqs in the distributed checkpoint, unlike llama. + # rather than making this buffer always non-persistent on the NeoX side, + # just create and save it for Mistral. + base = 10000.0 + rope_freqs = 1.0 / ( + base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head) + ) else: rope_freqs = loaded[0]["rope.freqs"] helper.del_loaded("rope.freqs") @@ -287,11 +293,7 @@ def permute_rotary(w): # Duplicated layers "input_layernorm.scale": input_layernorm, "post_attention_layernorm.scale": post_attention_layernorm, - **( - {"attention.rotary_emb.inv_freq": rope_freqs} - if "mistral" not in model_size - else {} - ), + "attention.rotary_emb.inv_freq": rope_freqs, }, layer_i=layer_i + 2, rank=out_rank, @@ -411,7 +413,13 @@ def permute_rotary(w): rope_freqs = loaded[0]["layers.0.attention.inner_attention.rope.freqs"] helper.del_loaded("layers.0.attention.inner_attention.rope.freqs") elif "mistral" in model_size: - rope_freqs = None + # mistral does not include rope freqs in the distributed checkpoint, unlike llama. + # rather than making this buffer always non-persistent on the NeoX side, + # just create and save it for Mistral. 
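+            # (This reproduces the default RotaryEmbedding buffer on the NeoX side,
+            #  inv_freq = 1 / base ** (2i / dims_per_head) with base 10000, so the
+            #  converted checkpoint contains the buffer that NeoX expects to load.)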
+ base = 10000.0 + rope_freqs = 1.0 / ( + base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head) + ) else: rope_freqs = loaded[0]["rope.freqs"] helper.del_loaded("rope.freqs") @@ -538,11 +546,7 @@ def permute_rotary(w): # Duplicated layers "input_layernorm.scale": input_layernorm, "post_attention_layernorm.scale": post_attention_layernorm, - **( - {"attention.rotary_emb.inv_freq": rope_freqs} - if "mistral" not in model_size - else {} - ), + "attention.rotary_emb.inv_freq": rope_freqs, }, layer_i=layer_i + 2, rank=out_rank, From 5cfe8ee70f26118cc5b12c7321658700327c6079 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Thu, 22 Feb 2024 19:21:50 +0000 Subject: [PATCH 55/64] non-flash GQA works, so ensure arguments.py permits it --- megatron/neox_arguments/arguments.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py index 1b43160af..71a3f2203 100644 --- a/megatron/neox_arguments/arguments.py +++ b/megatron/neox_arguments/arguments.py @@ -1070,8 +1070,9 @@ def calculate_derived(self): not self.sparsity_config ), "Sparse attention not compatible with GQA or MQA" assert all( - attn_type == "flash" for attn_type in self.attention_config - ), "GQA / MQA currently only compatible with Flash Attention 2.0" + (attn_type == "flash") or (attn_type == "global") + for attn_type in self.attention_config + ), "GQA / MQA currently only compatible with Flash or standard global Attention" assert ( self.num_kv_heads % self.model_parallel_size == 0 ), "Number of KV heads must be at least model_parallel_size for now!" From 627a2878a8d7f2e36ffe3648498c5268aa18c95a Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Thu, 22 Feb 2024 19:45:13 +0000 Subject: [PATCH 56/64] no longer use our own copies of flash attention interface functions --- megatron/model/flash_attention.py | 831 ------------------------------ megatron/model/transformer.py | 19 +- 2 files changed, 8 insertions(+), 842 deletions(-) delete mode 100644 megatron/model/flash_attention.py diff --git a/megatron/model/flash_attention.py b/megatron/model/flash_attention.py deleted file mode 100644 index d446f0a51..000000000 --- a/megatron/model/flash_attention.py +++ /dev/null @@ -1,831 +0,0 @@ -# Based on: https://github.com/HazyResearch/flash-attention/blob/4a6eaa9f27df6fff7ffb2c24e894938a687dd870/flash_attn/flash_attn_interface.py - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from flash_attn import flash_attn_triton -import flash_attn_2_cuda as flash_attn_cuda # For flash_attn version 2.1.1 - - -def flash_attn_unpadded_unpacked_func_triton( - q, k, v, bias=None, causal=False, softmax_scale=None -): - return flash_attn_triton.flash_attn_func(q, k, v, bias, causal, softmax_scale) - - -def _flash_attn_forward_cuda( - q, - k, - v, - out, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale, - causal, - return_softmax, - num_splits=0, - generator=None, -): - """ - num_splits: how much to parallelize over the seqlen_q dimension. num_splits=0 means - it will be set by an internal heuristic. We're exposing num_splits mostly for benchmarking. - Don't change it unless you know what you're doing. 
- """ - softmax_lse, *rest = flash_attn_cuda.fwd( - q, - k, - v, - out, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale, - False, - causal, - return_softmax, - num_splits, - generator, - ) - # if out.isnan().any() or softmax_lse.isnan().any(): - # breakpoint() - S_dmask = rest[0] if return_softmax else None - return out, softmax_lse, S_dmask - - -def _flash_attn_backward_cuda( - dout, - q, - k, - v, - out, - softmax_lse, - dq, - dk, - dv, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale, - causal, - num_splits=0, - generator=None, -): - """ - num_splits: whether to parallelize over the seqlen_k dimension (num_splits > 1) or - not (num_splits = 1). num_splits=0 means it will be set by an internal heuristic. - Any value above 1 will call the same kernel (i.e. num_splits=2 would call the same kernel - as num_splits=3), so effectively the choices are 0, 1, and 2. - This hyperparameter can be tuned for performance, but default value (heuristic) should work fine. - """ - _, _, _, softmax_d = flash_attn_cuda.bwd( - dout, - q, - k, - v, - out, - softmax_lse, - dq, - dk, - dv, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale, - False, - causal, - num_splits, - generator, - ) - # if dk.isnan().any() or dk.isnan().any() or dv.isnan().any() or softmax_d.isnan().any(): - # breakpoint() - return dq, dk, dv, softmax_d - - -class FlashAttnQKVPackedFunc(torch.autograd.Function): - @staticmethod - def forward( - ctx, - qkv, - cu_seqlens, - max_seqlen, - dropout_p, - softmax_scale, - causal, - return_softmax, - ): - # Save rng_state because the backward pass will regenerate the dropout mask - rng_state = torch.cuda.get_rng_state() if dropout_p > 0 else None - if softmax_scale is None: - softmax_scale = qkv.shape[-1] ** (-0.5) - out, softmax_lse, S_dmask = _flash_attn_forward_cuda( - qkv[:, 0], - qkv[:, 1], - qkv[:, 2], - torch.empty_like(qkv[:, 0]), - cu_seqlens, - cu_seqlens, - max_seqlen, - max_seqlen, - dropout_p, - softmax_scale, - causal=causal, - return_softmax=return_softmax, - ) - ctx.save_for_backward(qkv, out, softmax_lse, cu_seqlens, rng_state) - ctx.dropout_p = dropout_p - ctx.max_seqlen = max_seqlen - ctx.softmax_scale = softmax_scale - ctx.causal = causal - return out if not return_softmax else (out, softmax_lse, S_dmask) - - @staticmethod - def backward(ctx, dout, *args): - qkv, out, softmax_lse, cu_seqlens, rng_state = ctx.saved_tensors - if rng_state is not None: - cur_rng_state = torch.cuda.get_rng_state() - torch.cuda.set_rng_state(rng_state) - dqkv = torch.empty_like(qkv) - _flash_attn_backward_cuda( - dout, - qkv[:, 0], - qkv[:, 1], - qkv[:, 2], - out, - softmax_lse, - dqkv[:, 0], - dqkv[:, 1], - dqkv[:, 2], - cu_seqlens, - cu_seqlens, - ctx.max_seqlen, - ctx.max_seqlen, - ctx.dropout_p, - ctx.softmax_scale, - ctx.causal, - ) - if rng_state is not None: - torch.cuda.set_rng_state(cur_rng_state) - return dqkv, None, None, None, None, None, None - - -def flash_attn_unpadded_qkvpacked_func_cuda( - qkv, - cu_seqlens, - max_seqlen, - dropout_p, - softmax_scale=None, - causal=False, - return_attn_probs=False, -): - return FlashAttnQKVPackedFunc.apply( - qkv, cu_seqlens, max_seqlen, dropout_p, softmax_scale, causal, return_attn_probs - ) - - -class FlashAttnKVPackedFunc(torch.autograd.Function): - @staticmethod - def forward( - ctx, - q, - kv, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale, - causal, - return_softmax, 
- ): - # Save rng_state because the backward pass will regenerate the dropout mask - rng_state = torch.cuda.get_rng_state() if dropout_p > 0 else None - if softmax_scale is None: - softmax_scale = q.shape[-1] ** (-0.5) - out, softmax_lse, S_dmask = _flash_attn_forward_cuda( - q, - kv[:, 0], - kv[:, 1], - torch.empty_like(q), - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale, - causal=causal, - return_softmax=return_softmax, - ) - ctx.save_for_backward( - q, kv, out, softmax_lse, cu_seqlens_q, cu_seqlens_k, rng_state - ) - ctx.dropout_p = dropout_p - ctx.max_seqlen_q = max_seqlen_q - ctx.max_seqlen_k = max_seqlen_k - ctx.softmax_scale = softmax_scale - ctx.causal = causal - return out if not return_softmax else (out, softmax_lse, S_dmask) - - @staticmethod - def backward(ctx, dout, *args): - ( - q, - kv, - out, - softmax_lse, - cu_seqlens_q, - cu_seqlens_k, - rng_state, - ) = ctx.saved_tensors - if rng_state is not None: - cur_rng_state = torch.cuda.get_rng_state() - torch.cuda.set_rng_state(rng_state) - dq = torch.empty_like(q) - dkv = torch.empty_like(kv) - _flash_attn_backward_cuda( - dout, - q, - kv[:, 0], - kv[:, 1], - out, - softmax_lse, - dq, - dkv[:, 0], - dkv[:, 1], - cu_seqlens_q, - cu_seqlens_k, - ctx.max_seqlen_q, - ctx.max_seqlen_k, - ctx.dropout_p, - ctx.softmax_scale, - ctx.causal, - ) - if rng_state is not None: - torch.cuda.set_rng_state(cur_rng_state) - return dq, dkv, None, None, None, None, None, None, None, None - - -def flash_attn_unpadded_kvpacked_func_cuda( - q, - kv, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale=None, - causal=False, - return_attn_probs=False, -): - """dropout_p should be set to 0.0 during evaluation - Arguments: - q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch. - kv: (total_k, 2, nheads, headdim), where total_k = total number of key tokens in the batch. - cu_seqlens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths - of the sequences in the batch, used to index into q. - cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths - of the sequences in the batch, used to index into kv. - max_seqlen_q: int. Maximum query sequence length in the batch. - max_seqlen_k: int. Maximum key sequence length in the batch. - dropout_p: float. Dropout probability. - softmax_scale: float. The scaling of QK^T before applying softmax. - Default to 1 / sqrt(headdim). - causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling). - return_attn_probs: bool. Whether to return the attention probabilities. This option is for - testing only. The returned probabilities are not guaranteed to be correct - (they might not have the right scaling). - Return: - out: (total, nheads, headdim). - softmax_lse [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen). The - logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax - normalization factor). - S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen). - The output of softmax (possibly with different scaling). It also encodes the dropout - pattern (negative means that location was dropped, nonnegative means it was kept). 
- """ - return FlashAttnKVPackedFunc.apply( - q, - kv, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale, - causal, - return_attn_probs, - ) - - -class FlashAttnFunc(torch.autograd.Function): - @staticmethod - def forward( - ctx, - q, - k, - v, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale, - causal, - return_softmax, - ): - # Save rng_state because the backward pass will regenerate the dropout mask - rng_state = torch.cuda.get_rng_state() if dropout_p > 0 else None - if softmax_scale is None: - softmax_scale = q.shape[-1] ** (-0.5) - out, softmax_lse, S_dmask = _flash_attn_forward_cuda( - q, - k, - v, - torch.empty_like(q), - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale, - causal=causal, - return_softmax=return_softmax, - ) - ctx.save_for_backward( - q, k, v, out, softmax_lse, cu_seqlens_q, cu_seqlens_k, rng_state - ) - ctx.dropout_p = dropout_p - ctx.max_seqlen_q = max_seqlen_q - ctx.max_seqlen_k = max_seqlen_k - ctx.softmax_scale = softmax_scale - ctx.causal = causal - return out if not return_softmax else (out, softmax_lse, S_dmask) - - @staticmethod - def backward(ctx, dout, *args): - ( - q, - k, - v, - out, - softmax_lse, - cu_seqlens_q, - cu_seqlens_k, - rng_state, - ) = ctx.saved_tensors - if rng_state is not None: - cur_rng_state = torch.cuda.get_rng_state() - torch.cuda.set_rng_state(rng_state) - dq, dk, dv = torch.empty_like(q), torch.empty_like(k), torch.empty_like(v) - _flash_attn_backward_cuda( - dout, - q, - k, - v, - out, - softmax_lse, - dq, - dk, - dv, - cu_seqlens_q, - cu_seqlens_k, - ctx.max_seqlen_q, - ctx.max_seqlen_k, - ctx.dropout_p, - ctx.softmax_scale, - ctx.causal, - ) - if rng_state is not None: - torch.cuda.set_rng_state(cur_rng_state) - return dq, dk, dv, None, None, None, None, None, None, None, None - - -def flash_attn_unpadded_func_cuda( - q, - k, - v, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale=None, - causal=False, - return_attn_probs=False, -): - """dropout_p should be set to 0.0 during evaluation - Arguments: - q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch. - k: (total_k, nheads, headdim), where total_k = total number of key tokens in the batch. - v: (total_k, nheads, headdim), where total_k = total number of key tokens in the batch. - cu_seqlens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths - of the sequences in the batch, used to index into q. - cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths - of the sequences in the batch, used to index into kv. - max_seqlen_q: int. Maximum query sequence length in the batch. - max_seqlen_k: int. Maximum key sequence length in the batch. - dropout_p: float. Dropout probability. - softmax_scale: float. The scaling of QK^T before applying softmax. - Default to 1 / sqrt(headdim). - causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling). - return_attn_probs: bool. Whether to return the attention probabilities. This option is for - testing only. The returned probabilities are not guaranteed to be correct - (they might not have the right scaling). - Return: - out: (total, nheads, headdim). - softmax_lse [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen). The - logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax - normalization factor). 
- S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen). - The output of softmax (possibly with different scaling). It also encodes the dropout - pattern (negative means that location was dropped, nonnegative means it was kept). - """ - return FlashAttnFunc.apply( - q, - k, - v, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale, - causal, - return_attn_probs, - ) - - -# For flash-attention 2 integration -def _flash_attn_varlen_forward( - q, - k, - v, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale, - causal, - return_softmax, -): - maybe_contiguous = lambda x: x.contiguous() if x.stride(-1) != 1 else x - q, k, v = [maybe_contiguous(x) for x in (q, k, v)] - ( - out, - q, - k, - v, - out_padded, - softmax_lse, - S_dmask, - rng_state, - ) = flash_attn_cuda.varlen_fwd( - q, - k, - v, - None, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale, - False, - causal, - return_softmax, - None, - ) - # if out.isnan().any() or softmax_lse.isnan().any(): - # breakpoint() - return out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state - - -def _flash_attn_varlen_backward( - dout, - q, - k, - v, - out, - softmax_lse, - dq, - dk, - dv, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale, - causal, - rng_state=None, -): - maybe_contiguous = lambda x: x.contiguous() if x.stride(-1) != 1 else x - # dq, dk, dv are allocated by us so they should already be contiguous - dout, q, k, v, out = [maybe_contiguous(x) for x in (dout, q, k, v, out)] - dq, dk, dv, softmax_d, = flash_attn_cuda.varlen_bwd( - dout, - q, - k, - v, - out, - softmax_lse, - dq, - dk, - dv, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale, - False, - causal, - None, - rng_state, - ) - # if dk.isnan().any() or dk.isnan().any() or dv.isnan().any() or softmax_d.isnan().any(): - # breakpoint() - return dq, dk, dv, softmax_d - - -class FlashAttnVarlenQKVPackedFunc(torch.autograd.Function): - @staticmethod - def forward( - ctx, - qkv, - cu_seqlens, - max_seqlen, - dropout_p, - softmax_scale, - causal, - return_softmax, - ): - if softmax_scale is None: - softmax_scale = qkv.shape[-1] ** (-0.5) - ( - out, - q, - k, - v, - out_padded, - softmax_lse, - S_dmask, - rng_state, - ) = _flash_attn_varlen_forward( - qkv[:, 0], - qkv[:, 1], - qkv[:, 2], - cu_seqlens, - cu_seqlens, - max_seqlen, - max_seqlen, - dropout_p, - softmax_scale, - causal=causal, - return_softmax=return_softmax and dropout_p > 0, - ) - ctx.save_for_backward(q, k, v, out_padded, softmax_lse, cu_seqlens, rng_state) - ctx.dropout_p = dropout_p - ctx.max_seqlen = max_seqlen - ctx.softmax_scale = softmax_scale - ctx.causal = causal - return out if not return_softmax else (out, softmax_lse, S_dmask) - - @staticmethod - def backward(ctx, dout, *args): - q, k, v, out, softmax_lse, cu_seqlens, rng_state = ctx.saved_tensors - qkv_shape = q.shape[:-2] + (3, *q.shape[-2:]) - dqkv = torch.empty(qkv_shape, dtype=q.dtype, device=q.device) - _flash_attn_varlen_backward( - dout, - q, - k, - v, - out, - softmax_lse, - dqkv[:, 0], - dqkv[:, 1], - dqkv[:, 2], - cu_seqlens, - cu_seqlens, - ctx.max_seqlen, - ctx.max_seqlen, - ctx.dropout_p, - ctx.softmax_scale, - ctx.causal, - rng_state=rng_state, - ) - dqkv = dqkv[..., : dout.shape[-1]] # We could have padded the head dimension - return dqkv, None, None, None, None, None, None - - -def 
flash_attn_varlen_qkvpacked_func( - qkv, - cu_seqlens, - max_seqlen, - dropout_p=0.0, - softmax_scale=None, - causal=False, - return_attn_probs=False, -): - """dropout_p should be set to 0.0 during evaluation - If Q, K, V are already stacked into 1 tensor, this function will be faster than - calling flash_attn_varlen_func on Q, K, V since the backward pass avoids explicit concatenation - of the gradients of Q, K, V. - For multi-query and grouped-query attention (MQA/GQA), please see - flash_attn_varlen_kvpacked_func and flash_attn_varlen_func. - - Arguments: - qkv: (total, 3, nheads, headdim), where total = total number of tokens in the batch. - cu_seqlens: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths - of the sequences in the batch, used to index into qkv. - max_seqlen: int. Maximum sequence length in the batch. - dropout_p: float. Dropout probability. - softmax_scale: float. The scaling of QK^T before applying softmax. - Default to 1 / sqrt(headdim). - causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling). - return_attn_probs: bool. Whether to return the attention probabilities. This option is for - testing only. The returned probabilities are not guaranteed to be correct - (they might not have the right scaling). - Return: - out: (total, nheads, headdim). - softmax_lse [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen). The - logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax - normalization factor). - S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen). - The output of softmax (possibly with different scaling). It also encodes the dropout - pattern (negative means that location was dropped, nonnegative means it was kept). 
- """ - return FlashAttnVarlenQKVPackedFunc.apply( - qkv, cu_seqlens, max_seqlen, dropout_p, softmax_scale, causal, return_attn_probs - ) - - -class FlashAttnVarlenKVPackedFunc(torch.autograd.Function): - @staticmethod - def forward( - ctx, - q, - kv, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale, - causal, - return_softmax, - ): - if softmax_scale is None: - softmax_scale = q.shape[-1] ** (-0.5) - ( - out, - q, - k, - v, - out_padded, - softmax_lse, - S_dmask, - rng_state, - ) = _flash_attn_varlen_forward( - q, - kv[:, 0], - kv[:, 1], - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale, - causal=causal, - return_softmax=return_softmax and dropout_p > 0, - ) - ctx.save_for_backward( - q, k, v, out_padded, softmax_lse, cu_seqlens_q, cu_seqlens_k, rng_state - ) - ctx.dropout_p = dropout_p - ctx.max_seqlen_q = max_seqlen_q - ctx.max_seqlen_k = max_seqlen_k - ctx.softmax_scale = softmax_scale - ctx.causal = causal - return out if not return_softmax else (out, softmax_lse, S_dmask) - - @staticmethod - def backward(ctx, dout, *args): - ( - q, - k, - v, - out, - softmax_lse, - cu_seqlens_q, - cu_seqlens_k, - rng_state, - ) = ctx.saved_tensors - dq = torch.empty_like(q) - kv_shape = k.shape[:-2] + (2, *k.shape[-2:]) - dkv = torch.empty(kv_shape, dtype=k.dtype, device=k.device) - _flash_attn_varlen_backward( - dout, - q, - k, - v, - out, - softmax_lse, - dq, - dkv[:, 0], - dkv[:, 1], - cu_seqlens_q, - cu_seqlens_k, - ctx.max_seqlen_q, - ctx.max_seqlen_k, - ctx.dropout_p, - ctx.softmax_scale, - ctx.causal, - rng_state=rng_state, - ) - dq = dq[..., : dout.shape[-1]] # We could have padded the head dimension - dkv = dkv[..., : dout.shape[-1]] - return dq, dkv, None, None, None, None, None, None, None, None - - -def flash_attn_varlen_kvpacked_func( - q, - kv, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p=0.0, - softmax_scale=None, - causal=False, - return_attn_probs=False, -): - """dropout_p should be set to 0.0 during evaluation - If K, V are already stacked into 1 tensor, this function will be faster than - calling flash_attn_func on Q, K, V since the backward pass avoids explicit concatenation - of the gradients of K, V. - Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads - than Q. Note that the number of heads in Q must be divisible by the number of heads in KV. - For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attention to head - 0 of K, V, and head 3, 4, 5 of Q will attention to head 1 of K, V. - - If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix. - For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is: - 1 1 1 1 0 - 1 1 1 1 1 - If seqlen_q = 5 and seqlen_k = 2, the causal mask is: - 0 0 - 0 0 - 0 0 - 1 0 - 1 1 - If the row of the mask is all zero, the output will be zero. - - Arguments: - q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch. - kv: (total_k, 2, nheads_k, headdim), where total_k = total number of key tokens in the batch. - cu_seqlens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths - of the sequences in the batch, used to index into q. - cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths - of the sequences in the batch, used to index into kv. - max_seqlen_q: int. Maximum query sequence length in the batch. 
- max_seqlen_k: int. Maximum key sequence length in the batch. - dropout_p: float. Dropout probability. - softmax_scale: float. The scaling of QK^T before applying softmax. - Default to 1 / sqrt(headdim). - causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling). - return_attn_probs: bool. Whether to return the attention probabilities. This option is for - testing only. The returned probabilities are not guaranteed to be correct - (they might not have the right scaling). - Return: - out: (total, nheads, headdim). - softmax_lse [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen). The - logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax - normalization factor). - S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen). - The output of softmax (possibly with different scaling). It also encodes the dropout - pattern (negative means that location was dropped, nonnegative means it was kept). - """ - return FlashAttnVarlenKVPackedFunc.apply( - q, - kv, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale, - causal, - return_attn_probs, - ) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 5846fc791..85cfb6e2d 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -406,24 +406,21 @@ def __init__( ) else: if self.use_flash_attention: - from megatron.model.flash_attention import ( - # flash_attn_unpadded_qkvpacked_func_cuda, - # flash_attn_unpadded_kvpacked_func_cuda, - # Change of function names going from flash attention 1 -> flash attention 2 - flash_attn_varlen_qkvpacked_func, - flash_attn_varlen_kvpacked_func, - flash_attn_unpadded_unpacked_func_triton, - ) + # we now use Flash Attention 2's provided interface. + # TODO: we no longer need to use flash_triton_fn since flash cuda supports alibi. + # consider adding OpenAI's more recent Flash-2 Triton kernel in future + # from https://github.com/openai/triton/blob/main/python/tutorials/06-fused-attention.py from flash_attn.flash_attn_interface import ( flash_attn_func, flash_attn_varlen_func, ) + from flash_attn.flash_attn_triton import ( + flash_attn_func as flash_attn_unpadded_unpacked_func_triton, + ) self.flash_triton_fn = flash_attn_unpadded_unpacked_func_triton self.flash_qkv_fn = flash_attn_func - self.flash_varlen_qkv_fn = ( - flash_attn_varlen_func # TODO: use neox's flash attention interface - ) + self.flash_varlen_qkv_fn = flash_attn_varlen_func else: self.scale_mask_softmax = FusedScaleMaskSoftmax( input_in_fp16=self.fp16, From 63c2fbedffa2fcf0562100d6eb2c49b36ba0cce8 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Thu, 22 Feb 2024 19:47:32 +0000 Subject: [PATCH 57/64] remove unused mpu util fn --- megatron/mpu/utils.py | 45 ++----------------------------------------- 1 file changed, 2 insertions(+), 43 deletions(-) diff --git a/megatron/mpu/utils.py b/megatron/mpu/utils.py index 79b728edc..cb12f2569 100644 --- a/megatron/mpu/utils.py +++ b/megatron/mpu/utils.py @@ -1,7 +1,7 @@ -# Copyright (c) 2024, EleutherAI +# Copyright (c) 2021, EleutherAI # This file is based on code by the authors denoted below and has been modified from its original version. # -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -52,47 +52,6 @@ def split_tensor_along_last_dim(tensor, num_partitions, contiguous_split_chunks= return tensor_list -def split_reorder_and_stack_separate_qkv(q, k, v, num_mp_ranks, dim=0, contiguous_qkv_chunks=False): - """ - Splits separate q, k, v matrices e.g. from huggingface into chunks of size=mp config, then interleaves them so that each worker - gets its packed qkv matrices appropriately before stacking them. - Idea (example for GQA): - q = [q1, q2, q3, q4] - k = [k1, k2] - v = [v1, v2] - 1) Split: First split into mp chunks, assuming mp=2 we get [[q1, q2], [q3, q4]], [[k1], [k2]], [[v1], [v2]] - 2) Reorder: Then group relevant qkv for each mp rank: [q1, q2, k1, v1], [q3, q4, k2, v2] - 3) Stack: Consolidate into single qkv: [q1, q2, k1, v1, q3, q4, k2, v2] - That way when the qkv gets loaded on each rank we avoid [q1, q2, q3, q4] on one rank, [k1, k2, v1, v2] on the other, which would - be misinterpreted in transformer.py as q3 being a key tensor, q4 being a value tensor, etc. - - Relying on the assert happening when mpu.divide gets called when initialising the neox transformer; note this will need to be updated - if the q, k, v behaviour of transformers.py is changed. - - To perform a simple test on the case num_mp_ranks=2: - m = 2 - A = torch.ones((8,2)) - B = torch.ones((8,2))*2 - C = torch.ones((8,2))*3 - D = torch.cat([torch.cat((x, y, z), dim=0) for x, y, z in zip(torch.chunk(A, chunks=m, dim=0), - torch.chunk(B, chunks=m, dim=0), - torch.chunk(C, chunks=m, dim=0))], - dim=0) - """ - def conditional_contiguous(tensor, contiguous_qkv_chunks): - if contiguous_qkv_chunks: - return tensor.contiguous() - else: - return tensor - return torch.cat( - [ - conditional_contiguous(torch.cat((x, y, z), dim=dim), contiguous_qkv_chunks) - for x, y, z in zip(torch.chunk(q, chunks=num_mp_ranks, dim=dim), - torch.chunk(k, chunks=num_mp_ranks, dim=dim), - torch.chunk(v, chunks=num_mp_ranks, dim=dim)) - ], - dim=dim - ) class VocabUtility: """Split the vocabulary into `world_size` chunks amd return the From e7684926e467aafcab3abb6db9d329c7e3715849 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Thu, 22 Feb 2024 19:49:02 +0000 Subject: [PATCH 58/64] delete unused config file --- configs/mistral_7b.yml | 121 ----------------------------------------- 1 file changed, 121 deletions(-) delete mode 100644 configs/mistral_7b.yml diff --git a/configs/mistral_7b.yml b/configs/mistral_7b.yml deleted file mode 100644 index c2dbf409e..000000000 --- a/configs/mistral_7b.yml +++ /dev/null @@ -1,121 +0,0 @@ -{ - "pipe_parallel_size": 0, - "model_parallel_size": 4, - # "make_vocab_size_divisible_by": 1, - - # model settings - "num_layers": 32, - "hidden_size": 4096, - "intermediate_size": 14336, - "num_attention_heads": 32, - "attention_type": "groupedquery", - "num_kv_heads": 8, - "seq_length": 4096, - "sliding_window_width": 4096, - "max_position_embeddings": 131072, - "pos_emb": "rotary", - "rotary_pct": 1, - "rotary_emb_base": 10000, - "no_weight_tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - "norm": "rmsnorm", - "rms_norm_epsilon": 1.0e-6, - - "attention_config": [[["flash"], 32]], - - "scaled_upper_triang_masked_softmax_fusion": true, - "bias_gelu_fusion": false, - "use_bias_in_norms": false, - "use_bias_in_attn_linear": false, - "mlp_type": "llama", - "activation": "silu", - "init_method_std": 0.02, - - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00005, - "betas": [0.9, 0.95], - "eps": 1.0e-8 - } - }, - - "zero_optimization": { - "stage": 0, - 
"allgather_partitions": true, - "allgather_bucket_size": 1260000000, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 1260000000, - "contiguous_gradients": true, - "cpu_offload": false - }, - - # "train_batch_size": 4, - "train_micro_batch_size_per_gpu": 2, - "gradient_accumulation_steps": 16, - "data_impl": "mmap", - - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - "precision": "bfloat16", - "fp32_allreduce": true, - "bf16": { - "enabled": true - }, - "data_types": { - "grad_accum_dtype": "fp32" - }, - - "train_iters": 12000, - "lr_decay_iters": 12000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "decay_lr_to": 0.033, - "warmup_iters": 500, - "checkpoint_factor": 250, - "eval_interval": 250, - "eval_iters": 25, - - "log_interval": 1, - "steps_per_print": 1, - "wall_clock_breakdown": true, - - "tokenizer_type": "SPMTokenizer", - "vocab-file": "/path/to/mistral-7B-v0.1/tokenizer.model", # use tokenizer.model from Meta CodeLlama download - - "save": "checkpoints/test", - # "load": "" # set to same as "save" to resume from intermediate finetuning step - "load": "checkpoints/neox-converted", - -# Set these 2 (finetune and validation with fw pass) to false during conversion - "finetune": false, # set to false once resuming from intermediate finetuning step - "checkpoint_validation_with_forward_pass": false, - - - "use_wandb": false, - "wandb_group": "llemma_34b_replication", - "wandb_project": "your-project-name", - "wandb_team": "your-teamname-here", - "wandb_host": "https://api.wandb.ai", - - "launcher": "slurm", - "deepspeed_slurm": true, - - # Special stability ? - "no_ssh_check": true, - "comment": "neox", - - "train-data-paths": ["data/enwik8/enwik8_text_document"], - "valid-data-paths": ["data/enwik8/enwik8_text_document"], - "test-data-paths": ["data/enwik8/enwik8_text_document"], -} From caa440db5d2501de035d616a8c7a7219d5290408 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Thu, 22 Feb 2024 19:50:44 +0000 Subject: [PATCH 59/64] fix diff on mpu/utils.py --- megatron/mpu/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/mpu/utils.py b/megatron/mpu/utils.py index cb12f2569..13941dc29 100644 --- a/megatron/mpu/utils.py +++ b/megatron/mpu/utils.py @@ -1,7 +1,7 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # This file is based on code by the authors denoted below and has been modified from its original version. # -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
From 74fde9868688ccfb38b2e6efc72f620598d24c5d Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Thu, 22 Feb 2024 20:49:51 +0000 Subject: [PATCH 60/64] remove slurm scripts that won't be in this PR --- convert_hf_to_sequential_mistral.py | 766 ---------------------------- mistral_7b_convert.sh | 64 --- 2 files changed, 830 deletions(-) delete mode 100644 convert_hf_to_sequential_mistral.py delete mode 100644 mistral_7b_convert.sh diff --git a/convert_hf_to_sequential_mistral.py b/convert_hf_to_sequential_mistral.py deleted file mode 100644 index 032738654..000000000 --- a/convert_hf_to_sequential_mistral.py +++ /dev/null @@ -1,766 +0,0 @@ -import sys -import os -import copy -import deepspeed - -# import time - -import argparse -import torch - -import numpy as np - -from functools import reduce -from transformers import AutoModelForCausalLM, AutoConfig - -sys.path.append( - os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)) -) -from megatron.neox_arguments import NeoXArgs -from megatron.training import get_model, get_optimizer, get_learning_rate_scheduler -from megatron.initialize import initialize_megatron -from megatron import mpu -from megatron.checkpointing import load_checkpoint, save_checkpoint -from megatron.mpu.utils import split_reorder_and_stack_separate_qkv - -# from megatron.utils import ( -# Timers, -# init_wandb, -# ) - -""" -A script for converting publicly available Huggingface (HF) checkpoints NeoX format. - -Note that this script requires access to corresponding config files for equivalent NeoX models to those found in Hugging face. - -Example usage: (Converts the 70M Pythia model to NeoX format) -================================================================ -OMPI_COMM_WORLD_RANK=0 CUDA_VISIBLE_DEVICES=0 python tools/ckpts/convert_hf_to_sequential.py \ - --hf-model-name pythia-70m-v0 \ - --revision 143000 \ - --output-dir checkpoints/neox_converted/pythia/70m \ - --cache-dir checkpoints/HF \ - --config configs/pythia/70M.yml configs/local_setup.yml \ - --test - - -For multi-gpu support we must initialize deepspeed: -NOTE: This requires manually changing the arguments below. 
-================================================================ -CUDA_VISIBLE_DEVICES=0,1,2,3 python ./deepy.py tools/ckpts/convert_hf_to_sequential.py \ - -d configs pythia/70M.yml local_setup.yml -""" - -MULTI_GPU_ARGS = " ".join( - [ - "--hf-model-name mistralai/Mistral-7B-v0.1", - "--output-dir checkpoints/neox-converted/", - "--cache-dir checkpoints/HF", - "--config configs/mistral_7b.yml", - "--test", - ] -) - -def reshape_qkv_helper(heads, num_heads, hidden_size, num_mp_ranks, num_q_heads): - assert hidden_size >= num_q_heads - assert num_q_heads >= num_mp_ranks - assert int(hidden_size//num_q_heads * (num_heads/num_q_heads)) > 0 - heads = heads.reshape(num_heads, hidden_size//num_q_heads, hidden_size) - heads = heads.reshape(num_mp_ranks, num_q_heads//num_mp_ranks, int(hidden_size//num_q_heads * (num_heads/num_q_heads)), hidden_size) - return heads - -def handle_qkv(hf_layer_sd, num_mp_ranks, hf_config): - hidden_size = hf_config.hidden_size - num_attention_heads = hf_config.num_attention_heads - num_key_value_heads = hf_config.num_key_value_heads - q = reshape_qkv_helper(hf_layer_sd["self_attn.q_proj.weight"], num_attention_heads, hidden_size, num_mp_ranks, num_attention_heads) - k = reshape_qkv_helper(hf_layer_sd["self_attn.k_proj.weight"], num_key_value_heads, hidden_size, num_mp_ranks, num_attention_heads) - v = reshape_qkv_helper(hf_layer_sd["self_attn.v_proj.weight"], num_key_value_heads, hidden_size, num_mp_ranks, num_attention_heads) - - return torch.cat([q, k, v], dim=2).reshape(num_mp_ranks, (hidden_size + int(2 * (hidden_size * num_key_value_heads/num_attention_heads)))//num_mp_ranks, hidden_size) - # return torch.cat([hf_layer_sd["self_attn.q_proj.weight"], hf_layer_sd["self_attn.k_proj.weight"], hf_layer_sd["self_attn.v_proj.weight"]], dim=0) - -def convert_hf_to_sequential(hf_model, seq_state_dict, num_mp_ranks): - """Converts the weights of a HuggingFace model to neox 2.0 format. 
- - :param hf_model: the huggingface model - :param seq_state_dict: the state dict of the equivalent neox model - - returns the updated sequential state dict - """ - - num_layers = hf_model.config.num_hidden_layers - hf_config = hf_model.config - - # Embedding is layer idx 0 - seq_state_dict[ - "sequential.0.word_embeddings.weight" - ] = hf_model.model.embed_tokens.state_dict()["weight"] - - for layer_hf in range(num_layers): - # offset by 2 - layer_seq = layer_hf + 2 - - # get layer from hf model - hf_layer = hf_model.model.layers[layer_hf] - hf_layer_sd = hf_layer.state_dict() - - - seq_state_dict[f"sequential.{layer_seq}.attention.query_key_value.weight"] \ - = handle_qkv(hf_layer_sd, num_mp_ranks, hf_config) - # = split_reorder_and_stack_separate_qkv(hf_layer_sd["self_attn.q_proj.weight"], - # hf_layer_sd["self_attn.k_proj.weight"], - # hf_layer_sd["self_attn.v_proj.weight"], - # num_mp_ranks=num_mp_ranks, - # dim=0) - seq_state_dict[f"sequential.{layer_seq}.attention.dense.weight"] = hf_layer_sd["self_attn.o_proj.weight"] - seq_state_dict[f"sequential.{layer_seq}.mlp.w1.weight"] = hf_layer_sd["mlp.gate_proj.weight"] - seq_state_dict[f"sequential.{layer_seq}.mlp.w3.weight"] = hf_layer_sd["mlp.up_proj.weight"] - seq_state_dict[f"sequential.{layer_seq}.mlp.w2.weight"] = hf_layer_sd["mlp.down_proj.weight"] - seq_state_dict[f"sequential.{layer_seq}.input_layernorm.scale"] = hf_layer_sd["input_layernorm.weight"] - seq_state_dict[f"sequential.{layer_seq}.post_attention_layernorm.scale"] = hf_layer_sd["post_attention_layernorm.weight"] -# - - - -# Conversion table for Mistral 7b 0.1 -# FOR PARTS LOADED THROUGH hf_model.model -# 'embed_tokens.weight' -> '0.word_embeddings.weight' -# 'layers.0.self_attn.q_proj.weight' -> '2.attention.query_key_value.weight' -# 'layers.0.self_attn.k_proj.weight' -> -# 'layers.0.self_attn.v_proj.weight' -> -# 'layers.0.self_attn.o_proj.weight' -> '2.attention.dense.weight' -# 'layers.0.mlp.gate_proj.weight' -> '2.mlp.w1.weight' -# 'layers.0.mlp.up_proj.weight' -> '2.mlp.w3.weight' -# 'layers.0.mlp.down_proj.weight' -> '2.mlp.w2.weight' -# 'layers.0.input_layernorm.weight' -> '2.input_layernorm.scale' -# 'layers.0.post_attention_layernorm.weight' -> '2.post_attention_layernorm.scale' -# 'norm.weight' -> '.norm.scale' -# FOR PARTS LOADED THROUGH hf_model.lm_head -# -> '.final_linear.weight' - - - # Load final layer norm - layer_seq = num_layers + 3 - seq_state_dict[ - f"sequential.{layer_seq}.norm.scale" - ] = hf_model.model.norm.state_dict()["weight"] - - # output embedding / LM head - layer_seq += 1 - seq_state_dict[ - f"sequential.{layer_seq}.final_linear.weight" - ] = hf_model.lm_head.state_dict()["weight"] - - -def shard_sequential_mp(num_mp_ranks, sequential): - """Shards the sequential model into model parallel ranks. 
- - :param num_mp_ranks: the number of model parallel ranks - :param sequential: the state dict of the sequential model at mp=1 - - returns a dict of state dicts for each mp rank - """ - ranks = {x: dict() for x in range(num_mp_ranks)} - for k, v in sequential.items(): - if reduce( - np.logical_or, - [ - x in k - for x in [ - "layernorm", - "rotary_emb", - "mlp.w2.bias", - "norm.scale", - "attention.dense.bias", - ] - ], - ): - # no splitting - for x in range(num_mp_ranks): - ranks[x][k] = v - else: - print(k) - print(v.shape) - if len(v.shape) == 1: - size_per_rank = v.shape[0] / num_mp_ranks - if size_per_rank % 128 != 0.0: - padded_size = (128 - (size_per_rank % 128)) + size_per_rank - size_diff = int((padded_size * 4) - v.shape[max_]) - zero_pad = torch.zeros((size_diff)) - v = torch.cat([v, zero_pad], dim=max_) - else: - padded_size = size_per_rank - - assert size_per_rank % 1.0 == 0.0 - assert padded_size % 1.0 == 0.0 - - padded_size = int(padded_size) - size_per_rank = int(size_per_rank) - - print("size_per_rank 1", size_per_rank) - print("padded_size 1", padded_size) - for x in range(num_mp_ranks): - if size_per_rank != padded_size: - # need to pad - ranks[x][k] = v[padded_size * x : padded_size * (x + 1)] - else: - ranks[x][k] = v[size_per_rank * x : size_per_rank * (x + 1)] - - elif len(v.shape) == 2: - - if reduce( - np.logical_or, - [ - x in k - for x in [ - "attention.dense.weight", - "mlp.w2.weight", - ] - ], - ): # column parallel - max_, min_ = 1, 0 - elif reduce( - np.logical_or, - [ - x in k - for x in [ - "attention.query_key_value.weight", - "attention.query_key_value.bias", - "word_embeddings.weight", - "final_linear.weight", - "mlp.w1", - "mlp.w3", - ] - ], - ): - # row parallel - max_, min_ = 0, 1 - else: - raise Exception("Unknown weight to shard: {}".format(k)) - - size_per_rank = v.shape[max_] / num_mp_ranks - if size_per_rank % 128 != 0.0: - padded_size = (128 - (size_per_rank % 128)) + size_per_rank - size_diff = int((padded_size * num_mp_ranks) - v.shape[max_]) - - assert ( - size_diff > 0 - ), "[ERROR] size diff is negative: {} for size_per_rank: {}, k:{}, shape:{}, padded_size:{}".format( - size_diff, size_per_rank, k, v.shape, padded_size - ) - - zero_pad = ( - torch.zeros((size_diff, v.shape[min_])) - if max_ == 0 - else torch.zeros((v.shape[min_], size_diff)) - ) - - v = torch.cat([v, zero_pad], dim=max_) - else: - padded_size = size_per_rank - - assert size_per_rank % 1.0 == 0.0 - assert padded_size % 1.0 == 0.0 - - padded_size = int(padded_size) - size_per_rank = int(size_per_rank) - - print("size_per_rank 2", size_per_rank) - print("padded_size 2", padded_size) - for x in range(num_mp_ranks): - if size_per_rank != padded_size: - # need to pad - ranks[x][k] = ( - v[padded_size * x : padded_size * (x + 1), :] - if max_ == 0 - else v[:, padded_size * x : padded_size * (x + 1)] - ) - else: - ranks[x][k] = ( - v[size_per_rank * x : size_per_rank * (x + 1), ...] 
- if max_ == 0 - else v[:, size_per_rank * x : size_per_rank * (x + 1)] - ) - - elif len(v.shape) == 3: - if reduce( - np.logical_or, - [ - x in k - for x in [ - "attention.query_key_value.weight", - ] - ], - ): - # row parallel (indices start at 1 since idx 0 is mp rank) - max_, min_ = 1, 2 - else: - raise Exception("Unknown weight to shard: {}".format(k)) - - # size_per_rank = v.shape[max_] - # if size_per_rank % 128 != 0.0: - # padded_size = (128 - (size_per_rank % 128)) + size_per_rank - # size_diff = int((padded_size * num_mp_ranks) - v.shape[max_]) - - # assert ( - # size_diff > 0 - # ), "[ERROR] size diff is negative: {} for size_per_rank: {}, k:{}, shape:{}, padded_size:{}".format( - # size_diff, size_per_rank, k, v.shape, padded_size - # ) - - # zero_pad = ( - # torch.zeros((v.shape[0], size_diff, v.shape[min_])) - # if max_ == 0 - # else torch.zeros((v.shape[min_], size_diff)) - # ) - - # v = torch.cat([v, zero_pad], dim=max_) - # else: - # padded_size = size_per_rank - - # assert size_per_rank % 1.0 == 0.0 - # assert padded_size % 1.0 == 0.0 - - # padded_size = int(padded_size) - # size_per_rank = int(size_per_rank) - - # print("size_per_rank 2", size_per_rank) - # print("padded_size 2", padded_size) - for x in range(num_mp_ranks): - ranks[x][k] = v[x, :, :] - else: - raise NotImplementedError() - - return ranks - - -def replace_sharded_seq(mp_checkpoints, mp_sharded_seq): - """replaces the values within checkpointed configs with those - from the sharded sequential object.""" - - for mp_idx, shard in mp_sharded_seq.items(): - mp_key = f"mp_rank_{mp_idx:02}_model_states.pt" - - # use for loop instead of direct assignment - # to check for compatibility - for k, v in mp_checkpoints[mp_key]["module"].items(): - try: - mp_checkpoints[mp_key]["module"][k] = shard[k] - except KeyError: - if reduce( - np.logical_or, - [ - x in k - for x in [ - "rotary_emb", - ] - ], - ): - continue - print("ERROR key:{} not found in shard.".format(k)) - - -def shard_pp(sequential, mp_rank, num_layers): - """Shards the model into layers. - - :param sequential: the state dict of the sequential model at mp=1 - :param mp_rank: the model parallel rank of the layers - - returns a dict of state dicts for each layer - """ - suffix = f"-model_{mp_rank:02}-model_states.pt" - - layers_seq = dict() - layers_seq[f"layer_00" + suffix] = { - "word_embeddings.weight": sequential[f"sequential.0.word_embeddings.weight"] - } - layers_seq[f"layer_{num_layers+3:02}" + suffix] = { - "norm.scale": sequential[f"sequential.{num_layers+3}.norm.scale"], - } - - layers_seq[f"layer_{num_layers+4:02}" + suffix] = { - "final_linear.weight": sequential[ - f"sequential.{num_layers+4}.final_linear.weight" - ] - } - - for layer in range(2, num_layers + 2): - layer_keys = [x for x in sequential if ".{}.".format(layer) in x] - layers_seq[f"layer_{layer:02}" + suffix] = { - k.split(".{}.".format(layer))[1]: sequential[k] for k in layer_keys - } - - return layers_seq - - -def shard_pp_mp(num_mp_ranks, sequential, num_layers): - """Shards the model into layers and model parallel ranks. 
- - :param num_mp_ranks: the number of model parallel ranks - :param sequential: the state dict of the sequential model at mp=1 - :param num_layers: the number of layers in the model - - returns a dict of state dicts for each layer for each model parallel rank - """ - mp_sharded = shard_sequential_mp(num_mp_ranks=num_mp_ranks, sequential=sequential) - - layers_pp_mp = {} - for mp_rank, d in mp_sharded.items(): - layers_pp_mp.update( - shard_pp(sequential=d, mp_rank=mp_rank, num_layers=num_layers) - ) - return layers_pp_mp - - -def convert(hf_model, ckpt_dir, output_dir): - """Converts a huggingface model to a NeoX checkpoint for different - model parallel and pipeline parallel settings degrees. - - :param hf_model: the huggingface model - :param ckpt_dir: the directory containing the NeoX checkpoint - :param output_dir: the directory to save the converted checkpoint - returns None - """ - - os.listdir(ckpt_dir) - - ckpts, layers = {}, {} - for x in os.listdir(ckpt_dir): - if x.startswith("mp_rank"): - ckpts[x] = torch.load(os.path.join(ckpt_dir, x)) - elif x.startswith("layer"): - layers[x] = torch.load(os.path.join(ckpt_dir, x)) - - assert len(layers) + len(ckpts) > 0, "No checkpoints found in {}".format(ckpt_dir) - - os.makedirs(output_dir, exist_ok=True) - seq_state_dict = dict() - convert_hf_to_sequential(hf_model, seq_state_dict, num_mp_ranks=len(ckpts)) - - if len(ckpts) == 1 and len(layers) == 0: - # pp=0, mp=1 - key = list(ckpts.keys())[0] - ckpts[key]["module"] = seq_state_dict - to_save = ckpts - - elif len(ckpts) > 1 and len(layers) == 0: - # pp=0, mp>1 - sharded_seq = shard_sequential_mp( - num_mp_ranks=len(ckpts), sequential=seq_state_dict - ) - replace_sharded_seq(mp_checkpoints=ckpts, mp_sharded_seq=sharded_seq) - to_save = ckpts - - elif len(ckpts) == 1 and len(layers) > 1: - # pp>0, mp==1 - to_save = shard_pp( - sequential=seq_state_dict, - mp_rank=0, - num_layers=hf_model.config.num_hidden_layers, - ) - - elif len(ckpts) > 1 and len(layers) > 1: - # pp>0, mp>1 - to_save = shard_pp_mp( - num_mp_ranks=len(ckpts), - sequential=seq_state_dict, - num_layers=hf_model.config.num_hidden_layers, - ) - - else: - raise NotImplementedError( - "Not implemented for len(ckpts)={} and len(layers)={}".format( - len(ckpts), len(layers) - ) - ) - - for k, v in to_save.items(): - print("saving {}...".format(os.path.join(output_dir, k))) - torch.save(v, os.path.join(ckpt_dir, k)) - - # copy the checkpoint to the output_dir - print("rm -r {}/*".format(output_dir)) - os.system("rm -r {}/*".format(output_dir)) - os.makedirs(output_dir, exist_ok=True) - print("cp -r {} {}".format(os.path.join(ckpt_dir, "*"), output_dir)) - os.system("cp -r {} {}".format(os.path.join(ckpt_dir, "*"), output_dir)) - - # set latest file within the output_dir - latest_file = os.path.join("/".join(output_dir.split("/")[:-1]), "latest") - os.system("rm -r " + latest_file) - with open(latest_file, "w") as f: - f.write(output_dir.split("/")[-1]) - - -def consume_neox_args2(args_parsed, overwrite_values=None): - """ - Deepspeed launcher needs to pass the arguments for `pretrain_gpt2.py` across to all machines. - - In order not to have any problems with different configs being mismatched across machines, we instead read the .yaml configuration file from the main rank, - then serialize the arguments to a dictionary, which the deepspeed launcher broadcasts to all machines (`--megatron_config`). - - We then instantiate a new NeoXArgs from the dictionary (`.from_dict`). 
This should ensure args are never inconsistent across machines. - """ - - with open(args_parsed.megatron_config) as jsonfile: - megatron_config = json.load(jsonfile) - if args_parsed.deepspeed_config is not None: - overwrite_values = NeoXArgs.set_up_autotuning( - args_parsed.deepspeed_config, overwrite_values - ) - if overwrite_values is not None: - megatron_config.update(overwrite_values) - return NeoXArgs.from_dict(args_dict=megatron_config) - - -def get_non_existing_dir(tmp_dir): - while os.path.exists(tmp_dir): - tmp_dir = os.path.join(tmp_dir, "tmp_dir") - return tmp_dir - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Convert a Hugging Face GPT-NeoX model back to a sequential model compatible with GPT-NeoX training." - ) - parser.add_argument( - "--revision", - type=int, - default=143000, - help="Revision or step of the Pythia model to convert.", - ) - parser.add_argument( - "--output-dir", - type=str, - help="Path to save the converted GPT-NeoX model checkpoint.", - ) - parser.add_argument( - "--config", - nargs="*", - default=[], - help="Path to the config file for the equivalent NeoX model." - ) - parser.add_argument( - "--test", - action="store_true", - help="If set, will run a test to ensure the conversion was successful." - ) - parser.add_argument( - "--download-only", - action="store_true", - help="If set, script will only download the model and not convert it." - ) - - parser.add_argument( - "--ckpt-tmp-dir", - default="tmp/convert_ckpt_tmp_dir/JOBID_" + os.environ["SLURM_JOB_ID"], - help="Directory to store cached hugging face checkpoints. [WARNING: MUST BE VISIBLE TO ALL RANKS]" - ) - parser.add_argument( - "--hf-model-name", - type=str, - help="Name of the hugging face model to download from EleutherAI/{hf-model-name}.}" - ) - - parser.add_argument( - "--cache-dir", - default="checkpoints/HF", - help="Directory to store cached hugging face checkpoints." 
- ) - try: - if int(os.environ['WORLD_SIZE']) > 1: - args = parser.parse_args(MULTI_GPU_ARGS.split(" ")) - else: - args = parser.parse_args() - except KeyError: - args = parser.parse_args() - - - tmp_cache_dir = get_non_existing_dir(args.ckpt_tmp_dir) - - - if args.download_only: - hf_model = AutoModelForCausalLM.from_pretrained( - f"{args.hf_model_name}", - cache_dir=os.path.join( - args.cache_dir, f"{args.hf_model_name}" - ), - ).half() - exit(0) - else: - print("======================================================================") - print( - "Warning the following script will delete files within {}".format( - args.output_dir - ) - ) - print( - "Warning the following script will delete this directory {}".format( - tmp_cache_dir - ) - ) - print("======================================================================") - # time.sleep(5) - - if int(os.environ.get('OMPI_COMM_WORLD_SIZE',1)) > 1: - neox_args = consume_neox_args2(args2) - else: - neox_args = NeoXArgs.from_ymls(args.config) - neox_args.configure_distributed_args() - neox_args.build_tokenizer() - neox_args.initialize_tensorboard_writer() - - # setup logging and timers - # init_wandb(neox_args=neox_args) - # timers = Timers( - # use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer - # ) - initialize_megatron(neox_args=neox_args) - - torch.distributed.barrier() - - model = get_model(neox_args=neox_args, use_cache=True) - - torch.distributed.barrier() - - optimizer, param_groups = get_optimizer(model=model, neox_args=neox_args) - lr_scheduler = get_learning_rate_scheduler(optimizer=optimizer, neox_args=neox_args) - - model, optimizer, _, lr_scheduler = deepspeed.initialize( - model=model, - optimizer=optimizer, - # args=neox_args, - lr_scheduler=lr_scheduler, - dist_init_required=False, - model_parameters=None, - config_params=neox_args.deepspeed_config, - mpu=mpu if not neox_args.is_pipe_parallel else None, - ) - - - if os.environ.get("RANK", "1") == "0": - os.makedirs(f"{tmp_cache_dir}", exist_ok=True) - - torch.distributed.barrier() - neox_args.save = tmp_cache_dir - - save_checkpoint( - neox_args=neox_args, - iteration=0, - model=model, - optimizer=optimizer, - lr_scheduler=lr_scheduler, - ) - print(os.listdir(f"{tmp_cache_dir}")) - ckpt_dir = os.path.join(tmp_cache_dir, "global_step0") - - - if torch.distributed.get_rank() == 0: - # config = AutoConfig.from_pretrained( - # f"{args.hf_model_name}", - # cache_dir=os.path.join( - # args.cache_dir,f"{args.hf_model_name}" - # ), - # ) - # does not change the weights, but is needed to align logits - # config.update({'hidden_act':'gelu_fast'}) - hf_model = AutoModelForCausalLM.from_pretrained( - f"{args.hf_model_name}", - #config=config, - cache_dir=os.path.join( - args.cache_dir,f"{args.hf_model_name}" - ), - ).half() - print("==========================================") - print("Loaded Hugging Face model successfully!") - print("==========================================") - - # if os.environ.get("RANK", "1") == '0': - # hf_param_names_and_sizes = [[param_name, param_weight.size()] for param_name, param_weight in hf_model.model.named_parameters()] - # neox_param_names_and_sizes = [[param_name, param_weight.size()] for param_name, param_weight in model.sequential.named_parameters()] - # print("\n\n\n\n\n\n HF param names ", hf_param_names_and_sizes) - # print("Neox param names ", neox_param_names_and_sizes, "\n\n\n\n\n\n") - # torch.distributed.barrier() - # exit(0) - - convert(hf_model, ckpt_dir=ckpt_dir, output_dir=args.output_dir) - - if 
os.environ.get("RANK", "1") == '0': - # cleanup temp dir - os.system(f"rm -r {tmp_cache_dir}") - - torch.distributed.barrier() - - #verify the conversion can be loaded - neox_args.load = "/".join(args.output_dir.split("/")[:-1]) - print(os.environ["RANK"], neox_args.load) - neox_args.finetune=True - load_checkpoint( - neox_args=neox_args, - model=model, - optimizer=optimizer, - lr_scheduler=lr_scheduler, - iteration=None, - ) - print("==========================================") - print("Converted checkpoint successfully loaded!") - print("==========================================") - - - if args.test and torch.distributed.get_world_size() == 1: - # only implemented for world size 1 - - with torch.no_grad(): - # torch.backends.cudnn.benchmark = False - # torch.use_deterministic_algorithms(True) #setting the CUBLAS_WORKSPACE_CONFIG=:4096:8 environment variable is required for this to work (tested for A6000) - model.eval() - hf_model.eval() - - b = 10 - seq_len = 32 - inputs = torch.randint(0, 50304, (b, seq_len), dtype=torch.long).cuda() - mask = ( - (torch.triu(torch.ones(seq_len, seq_len)) != 1).transpose(0, 1).cuda() - ) - pos_ids = torch.arange(0, seq_len).unsqueeze(0).cuda() - - torch.manual_seed(0) - outputs_neox = model.cuda()( - (inputs, pos_ids, mask.unsqueeze(0).unsqueeze(0)), neox_args=neox_args - ) - - torch.manual_seed(0) - outputs = hf_model.cuda()(input_ids=inputs) - - print("HF logits .sum(): ", outputs.logits.to(torch.float32).sum()) - print("NeoX logits .sum(): ", outputs_neox.to(torch.float32).sum()) - - print( - "\nLogit comparison summary for {} sequences of length {}:".format( - b, seq_len - ) - ) - print("=============================================================") - for i in range(b): - abs_diff = ( - outputs.logits[i, ...].to(torch.float32) - - outputs_neox[i, ...].to(torch.float32) - ).abs() - print( - "[Random sequence {}] (hflogits - neoxlogits).abs() -- mean: {:.5f}\tmax: {:.5f}\tmin: {:.5f}\tmedian: {:.5f}".format( - i, - abs_diff.mean(), - abs_diff.max(), - abs_diff.min(), - abs_diff.median(), - ) - ) - - elif args.test: - print( - "[INFO] Checkpoint conversion logit test not implemented for distributed world_size > 1. 
Current world_size: {}".format( - torch.distributed.get_world_size() - ) - ) - diff --git a/mistral_7b_convert.sh b/mistral_7b_convert.sh deleted file mode 100644 index 3e4d20568..000000000 --- a/mistral_7b_convert.sh +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=convert_mistral_7B -#SBATCH --partition=a40x # Make sure you need this -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=8 -#SBATCH --cpus-per-task=12 # Number of cores per tasks -#SBATCH --gres=gpu:8 # Number of gpus -#SBATCH --output=convert_mistral_7B_%x_%j.out # Set this dir where you want slurm outs to go -#SBATCH --error=convert_mistral_7B_%x_%j.out # Set this dir where you want slurm outs to go -#SBATCH --exclusive # Turn off node sharing -#SBATCH --account=neox -#SBATCH --open-mode=append -#SBATCH --requeue -#SBATCH --time=0-00:12:00 - - -# set up the environment using a setup script -source ~/setup.sh - -ds_report - -export NCCL_DEBUG=INFO -export NCCL_TREE_THRESHOLD=0 -export NCCL_PROTO=simple -# Network issues without the following two NCCL vars set; See https://github.com/NVIDIA/nccl/issues/676 -export NCCL_IBEXT_DISABLE=1 -export NCCL_SOCKET_IFNAME=^docker0,lo - -export FI_EFA_FORK_SAFE=1 -export FI_EFA_USE_DEVICE_RDMA=1 # use for p4dn -export FI_EFA_ENABLE_SHM_TRANSFER=0 -export FI_PROVIDER=efa -export FI_EFA_TX_MIN_CREDITS=64 - -export PYTHONFAULTHANDLER=1 - -export OMPI_MCA_mtl_base_verbose=1 -export OMPI_MCA_btl="^openib" - -export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"` -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=12802 -export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l` - -# Hide duplicated errors using this hack - will be properly fixed in pt-1.12 -export TORCHELASTIC_ERROR_FILE=$TRAIN_PATH/tmp/torch-elastic-error.json -export TORCH_EXTENSIONS_DIR=./extensions/ - -# Move to the gpt-neox install -TRAIN_PATH=/weka/$(whoami)/gpt-neox -cd $TRAIN_PATH - -# Write the hostfile for this job -export MASTER_ADDR=$(echo $MASTER_ADDR | cut -d '-' -f 2- | tr '-' '.') -bash ~/write_ip_hostfile.sh -export DLTS_HOSTFILE=/weka/$(whoami)/hostfiles/hosts_$SLURM_JOBID - - - -python ./deepy.py convert_hf_to_sequential_mistral.py \ - -d configs mistral_7b.yml - -# python $TRAIN_PATH/deepy.py $TRAIN_PATH/train.py \ -# --conf_dir configs/ mistral_7b.yml \ No newline at end of file From e7d1282e25691cb43a3d858cd75345e04b254a5c Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Thu, 22 Feb 2024 20:52:09 +0000 Subject: [PATCH 61/64] run pre-commit --- configs/neox_arguments.md | 7 +++---- megatron/neox_arguments/neox_args.py | 8 ++++---- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 7df94988d..9aef84bcf 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -976,7 +976,7 @@ Text Generation arguments - **prompt_end**: str - Default = + Default = a single prompt's end. Defaults to newline @@ -1018,7 +1018,7 @@ Text Generation arguments - **eval_results_prefix**: str - Default = + Default = prefix to which to save evaluation results - final fp will be {eval_results_prefix}_eval_results_yy-mm-dd-HH-MM.json @@ -1762,7 +1762,7 @@ Args for deepspeed config Default = None - + @@ -2062,4 +2062,3 @@ Args for deepspeed runner (deepspeed.launcher.runner). Default = None Adds a `--account` to the DeepSpeed launch command. In DeeperSpeed this is passed on to the SlurmLauncher as well. Sometimes necessary for cluster rules, or so I've heard. 
- diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index a8e4b33da..8c3a6fb18 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -114,12 +114,12 @@ class NeoXArgsModel(NeoXArgsTemplate): num_attention_heads: int = None """ - Number of transformer attention heads. - + Number of transformer attention heads. + If num_kv_heads is set, will control only number of query heads. """ - num_kv_heads: int = None + num_kv_heads: int = None """ Number of transformer key/value attention heads. @@ -848,7 +848,7 @@ class NeoXArgsTraining(NeoXArgsTemplate): s3_chunk_size: int = 104_857_600 """ The number of bytes in each file chunk when uploading to s3. Defaults to 100MiB. - """ + """ config_files: dict = None """ From ace0e94309cf865a410cf32a8b63b6a12eb8df4b Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Thu, 22 Feb 2024 21:06:05 +0000 Subject: [PATCH 62/64] update tests for conversion scripts --- tests/unit/test_format_conversion_scripts.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/tests/unit/test_format_conversion_scripts.py b/tests/unit/test_format_conversion_scripts.py index 9c78c962d..e0801434c 100644 --- a/tests/unit/test_format_conversion_scripts.py +++ b/tests/unit/test_format_conversion_scripts.py @@ -1,12 +1,9 @@ import pytest -from tools.ckpts import convert_sequential_to_hf +from tools.ckpts import convert_neox_to_hf from tests.common import simulate_deepy_env, save_random_model from megatron.neox_arguments.neox_args import NeoXArgsTokenizer -# Test is failing; possibly we resolve by using the word embedding weights in the 'layer_00-model_00-model_states.pt' file? -@pytest.mark.xfail( - reason="Failing to find 'word_embeddings.weight' in state_dict['module'] from the 'mp_rank_00_model_states.pt' file" -) + def test_gpt_neox_to_huggingface(monkeypatch, tmpdir, tmp_path): # Generate random GPT-NEOX model, check we can convert to hf format model_dir = str(tmpdir) @@ -24,6 +21,4 @@ def test_gpt_neox_to_huggingface(monkeypatch, tmpdir, tmp_path): model_dir, ] overwrite_values = {"tokenizer_type": NeoXArgsTokenizer.tokenizer_type} - convert_sequential_to_hf.main( - input_args=script_args, overwrite_values=overwrite_values - ) + convert_neox_to_hf.main(input_args=script_args, overwrite_values=overwrite_values) From 157ec4770a485d927aff6c2b722051c96233ee8f Mon Sep 17 00:00:00 2001 From: Quentin Anthony Date: Thu, 22 Feb 2024 21:35:14 -0800 Subject: [PATCH 63/64] add flash version check for sliding window --- megatron/neox_arguments/arguments.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py index 71a3f2203..5c7df65cc 100644 --- a/megatron/neox_arguments/arguments.py +++ b/megatron/neox_arguments/arguments.py @@ -21,6 +21,8 @@ import copy import torch import argparse +from pkg_resources import packaging +from importlib.metadata import version from dataclasses import dataclass from typing import List, Dict @@ -388,6 +390,13 @@ def consume_deepy_args(cls, input_args=None): neox_args.wandb_group += "_" + wandb.util.generate_id() + + if neox_args.sliding_window_width is not None: + _flash_version = packaging.version.Version(version("flash-attn")) + assert ( + _flash_version >= packaging.version.Version("2.0.0") + ), f"Flash-Attention version ({str(_flash_version)}) must be >= 2.0.0 to support sliding window attention." 
+ neox_args.print() return neox_args From db9947ef8d6e44e7c1b3595a9e31841bab3c8660 Mon Sep 17 00:00:00 2001 From: Quentin Anthony Date: Thu, 22 Feb 2024 21:36:30 -0800 Subject: [PATCH 64/64] pre-commit --- megatron/neox_arguments/arguments.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py index 5c7df65cc..7bca420cd 100644 --- a/megatron/neox_arguments/arguments.py +++ b/megatron/neox_arguments/arguments.py @@ -390,12 +390,11 @@ def consume_deepy_args(cls, input_args=None): neox_args.wandb_group += "_" + wandb.util.generate_id() - if neox_args.sliding_window_width is not None: _flash_version = packaging.version.Version(version("flash-attn")) - assert ( - _flash_version >= packaging.version.Version("2.0.0") - ), f"Flash-Attention version ({str(_flash_version)}) must be >= 2.0.0 to support sliding window attention." + assert _flash_version >= packaging.version.Version( + "2.0.0" + ), f"Flash-Attention version ({str(_flash_version)}) must be >= 2.0.0 to support sliding window attention." neox_args.print()