diff --git a/evaluation/benchmark_throughput.py b/evaluation/benchmark_throughput.py new file mode 100644 index 00000000..b3938d26 --- /dev/null +++ b/evaluation/benchmark_throughput.py @@ -0,0 +1,318 @@ +"""Benchmark offline inference throughput.""" +import argparse +import json +import random +import time +from typing import List, Optional, Tuple + +import torch +from transformers import (AutoModelForCausalLM, AutoTokenizer, + PreTrainedTokenizerBase) +from tqdm import tqdm + + +def sample_requests( + dataset_path: str, + num_requests: int, + tokenizer: PreTrainedTokenizerBase, + fixed_output_len: Optional[int], +) -> List[Tuple[str, int, int]]: + if fixed_output_len is not None and fixed_output_len < 4: + raise ValueError("output_len too small") + + # Load the dataset. + with open(dataset_path) as f: + dataset = json.load(f) + # Filter out the conversations with less than 2 turns. + dataset = [data for data in dataset if len(data["conversations"]) >= 2] + # Only keep the first two turns of each conversation. + dataset = [(data["conversations"][0]["value"], + data["conversations"][1]["value"]) for data in dataset] + + # Tokenize the prompts and completions. + prompts = [prompt for prompt, _ in dataset] + prompt_token_ids = tokenizer(prompts).input_ids + completions = [completion for _, completion in dataset] + completion_token_ids = tokenizer(completions).input_ids + tokenized_dataset = [] + for i in range(len(dataset)): + output_len = len(completion_token_ids[i]) + if fixed_output_len is not None: + output_len = fixed_output_len + tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len)) + + # Filter out too long sequences. + filtered_dataset: List[Tuple[str, int, int]] = [] + for prompt, prompt_token_ids, output_len in tokenized_dataset: + prompt_len = len(prompt_token_ids) + if prompt_len < 4 or output_len < 4: + # Prune too short sequences. + continue + if prompt_len > 1024 or prompt_len + output_len > 2048: + # Prune too long sequences. + continue + filtered_dataset.append((prompt, prompt_len, output_len)) + + # Sample the requests. + sampled_requests = random.sample(filtered_dataset, num_requests) + return sampled_requests + + +def run_vllm( + requests: List[Tuple[str, int, int]], + model: str, + tokenizer: str, + quantization: Optional[str], + tensor_parallel_size: int, + seed: int, + n: int, + use_beam_search: bool, + trust_remote_code: bool, + dtype: str, + max_model_len: Optional[int], + enforce_eager: bool, +) -> float: + from vllm import LLM, SamplingParams + llm = LLM( + model=model, + tokenizer=tokenizer, + quantization=quantization, + tensor_parallel_size=tensor_parallel_size, + seed=seed, + trust_remote_code=trust_remote_code, + dtype=dtype, + max_model_len=max_model_len, + enforce_eager=enforce_eager, + ) + + # Add the requests to the engine. + for prompt, _, output_len in requests: + sampling_params = SamplingParams( + n=n, + temperature=0.0 if use_beam_search else 1.0, + top_p=1.0, + use_beam_search=use_beam_search, + ignore_eos=True, + max_tokens=output_len, + ) + # FIXME(woosuk): Do not use internal method. + llm._add_request( + prompt=prompt, + prompt_token_ids=None, + sampling_params=sampling_params, + ) + + start = time.perf_counter() + # FIXME(woosuk): Do not use internal method. 
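+    # _run_engine() drains every request queued above with continuous
+    # batching, so the timed region covers token generation only; engine
+    # start-up and request queueing are excluded from the throughput number.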
+ llm._run_engine(use_tqdm=True) + end = time.perf_counter() + return end - start + + +def run_hf( + requests: List[Tuple[str, int, int]], + model: str, + tokenizer: PreTrainedTokenizerBase, + n: int, + use_beam_search: bool, + max_batch_size: int, + trust_remote_code: bool, +) -> float: + assert not use_beam_search + llm = AutoModelForCausalLM.from_pretrained( + model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code) + #if llm.config.model_type == "llama": + # To enable padding in the HF backend. + tokenizer.pad_token = tokenizer.eos_token + llm = llm.cuda() + + pbar = tqdm(total=len(requests)) + start = time.perf_counter() + batch: List[str] = [] + max_prompt_len = 0 + max_output_len = 0 + for i in range(len(requests)): + prompt, prompt_len, output_len = requests[i] + # Add the prompt to the batch. + batch.append(prompt) + max_prompt_len = max(max_prompt_len, prompt_len) + max_output_len = max(max_output_len, output_len) + if len(batch) < max_batch_size and i != len(requests) - 1: + # Check if we can add more requests to the batch. + _, next_prompt_len, next_output_len = requests[i + 1] + if (max(max_prompt_len, next_prompt_len) + + max(max_output_len, next_output_len)) <= 2048: + # We can add more requests to the batch. + continue + + # Generate the sequences. + input_ids = tokenizer(batch, return_tensors="pt", + padding=True).input_ids + llm_outputs = llm.generate( + input_ids=input_ids.cuda(), + do_sample=not use_beam_search, + num_return_sequences=n, + temperature=1.0, + top_p=1.0, + use_cache=True, + max_new_tokens=max_output_len, + ) + # Include the decoding time. + tokenizer.batch_decode(llm_outputs, skip_special_tokens=True) + pbar.update(len(batch)) + + # Clear the batch. + batch = [] + max_prompt_len = 0 + max_output_len = 0 + end = time.perf_counter() + return end - start + + +def run_mii( + requests: List[Tuple[str, int, int]], + model: str, + tensor_parallel_size: int, + output_len: int, +) -> float: + from mii import pipeline + llm = pipeline(model, tensor_parallel=tensor_parallel_size) + prompts = [prompt for prompt, _, _ in requests] + + start = time.perf_counter() + llm(prompts, max_new_tokens=output_len) + end = time.perf_counter() + return end - start + + +def main(args: argparse.Namespace): + print(args) + random.seed(args.seed) + + # Sample the requests. + tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer, trust_remote_code=args.trust_remote_code) + if args.dataset is None: + # Synthesize a prompt with the given input length. 
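+        # Assumption: "hi" maps to roughly one token per repetition for
+        # common BPE tokenizers, so this yields a prompt of approximately
+        # args.input_len tokens once the BOS token is counted.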
+ prompt = "hi" * (args.input_len - 1) + requests = [(prompt, args.input_len, args.output_len) + for _ in range(args.num_prompts)] + else: + requests = sample_requests(args.dataset, args.num_prompts, tokenizer, + args.output_len) + + if args.backend == "vllm": + elapsed_time = run_vllm(requests, args.model, args.tokenizer, + args.quantization, args.tensor_parallel_size, + args.seed, args.n, args.use_beam_search, + args.trust_remote_code, args.dtype, + args.max_model_len, args.enforce_eager) + elif args.backend == "hf": + assert args.tensor_parallel_size == 1 + elapsed_time = run_hf(requests, args.model, tokenizer, args.n, + args.use_beam_search, args.hf_max_batch_size, + args.trust_remote_code) + elif args.backend == "mii": + elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size, + args.output_len) + else: + raise ValueError(f"Unknown backend: {args.backend}") + total_num_tokens = sum(prompt_len + output_len + for _, prompt_len, output_len in requests) + print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " + f"{total_num_tokens / elapsed_time:.2f} tokens/s") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Benchmark the throughput.") + parser.add_argument("--backend", + type=str, + choices=["vllm", "hf", "mii"], + default="vllm") + parser.add_argument("--dataset", + type=str, + default=None, + help="Path to the dataset.") + parser.add_argument("--input-len", + type=int, + default=None, + help="Input prompt length for each request") + parser.add_argument("--output-len", + type=int, + default=None, + help="Output length for each request. Overrides the " + "output length from the dataset.") + parser.add_argument("--model", type=str, default="facebook/opt-125m") + parser.add_argument("--tokenizer", type=str, default=None) + parser.add_argument('--quantization', + '-q', + choices=['awq', 'gptq', 'squeezellm', None], + default=None) + parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1) + parser.add_argument("--n", + type=int, + default=1, + help="Number of generated sequences per prompt.") + parser.add_argument("--use-beam-search", action="store_true") + parser.add_argument("--num-prompts", + type=int, + default=1000, + help="Number of prompts to process.") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--hf-max-batch-size", + type=int, + default=None, + help="Maximum batch size for HF backend.") + parser.add_argument('--trust-remote-code', + action='store_true', + help='trust remote code from huggingface') + parser.add_argument( + '--max-model-len', + type=int, + default=None, + help='Maximum length of a sequence (including prompt and output). ' + 'If None, will be derived from the model.') + parser.add_argument( + '--dtype', + type=str, + default='auto', + choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], + help='data type for model weights and activations. 
' + 'The "auto" option will use FP16 precision ' + 'for FP32 and FP16 models, and BF16 precision ' + 'for BF16 models.') + parser.add_argument("--enforce-eager", + action="store_true", + help="enforce eager execution") + args = parser.parse_args() + if args.tokenizer is None: + args.tokenizer = args.model + if args.dataset is None: + assert args.input_len is not None + assert args.output_len is not None + else: + assert args.input_len is None + + if args.backend == "vllm": + if args.hf_max_batch_size is not None: + raise ValueError("HF max batch size is only for HF backend.") + elif args.backend == "hf": + if args.hf_max_batch_size is None: + raise ValueError("HF max batch size is required for HF backend.") + if args.quantization is not None: + raise ValueError("Quantization is only for vLLM backend.") + elif args.backend == "mii": + if args.dtype != "auto": + raise ValueError("dtype must be auto for MII backend.") + if args.n != 1: + raise ValueError("n must be 1 for MII backend.") + if args.use_beam_search: + raise ValueError("Beam search is not supported for MII backend.") + if args.quantization is not None: + raise ValueError("Quantization is only for vLLM backend.") + if args.hf_max_batch_size is not None: + raise ValueError("HF max batch size is only for HF backend.") + if args.tokenizer != args.model: + raise ValueError("Tokenizer must be the same as the model for MII " + "backend.") + main(args) diff --git a/evaluation/eval.sh b/evaluation/eval.sh new file mode 100644 index 00000000..b53c327b --- /dev/null +++ b/evaluation/eval.sh @@ -0,0 +1,100 @@ +#!/bin/bash + +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=56 +#SBATCH --mem=0 +#SBATCH --partition=standard-g +#SBATCH --time=0-01:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_465000670 +#SBATCH --output=eval-logs/%j.out +#SBATCH --error=eval-logs/%j.err + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +# LUMI setup +# module load LUMI/22.08 partition/G singularity-bindings/system-cpeGNU-22.08-noglibc # singularity-bindings is BROKEN +module load LUMI/22.08 partition/G + +# These replace the module load of singularity-bindings +local_libfabric_version=1.15.2.0 +local_craympich_version=8.1.27 +export SINGULARITYENV_LD_LIBRARY_PATH="/lib64:/opt/cray/pe/mpich/$local_craympich_version/ofi/gnu/9.1/lib-abi-mpich:/opt/cray/pe/lib64:/opt/cray/pe:/opt/cray/libfabric/$local_libfabric_version/lib64:/usr/lib64:/opt/cray/pe/gcc-libs:${SINGULARITYENV_LD_LIBRARY_PATH}" +export SINGULARITY_BIND="/opt/cray,/usr/lib64/libbrotlidec.so.1,/usr/lib64/libbrotlicommon.so.1,/usr/lib64/libnl-3.so.200,/usr/lib64/libnl-route-3.so.200,/usr/lib64/libcxi.so.1,/usr/lib64/libcurl.so.4,/usr/lib64/libnghttp2.so.14,/usr/lib64/libidn2.so.0,/usr/lib64/libssh.so.4,/usr/lib64/libpsl.so.5,/usr/lib64/libssl.so.1.1,/usr/lib64/libcrypto.so.1.1,/usr/lib64/libgssapi_krb5.so.2,/usr/lib64/libldap_r-2.4.so.2,/usr/lib64/liblber-2.4.so.2,/usr/lib64/libjson-c.so.3,/usr/lib64/libunistring.so.2,/usr/lib64/libkrb5.so.3,/usr/lib64/libk5crypto.so.3,/usr/lib64/libkrb5support.so.0,/usr/lib64/libsasl2.so.3,/usr/lib64/libkeyutils.so.1,/var/spool/slurmd/mpi_cray_shasta,/usr/lib64/libzstd.so.1,/lib64/libselinux.so.1,/usr/lib64/libpcre.so.1,${SINGULARITY_BIND}" + +# These are some more custom exports +export SINGULARITY_BIND=/users/larsenra/aws-ofi-rccl/install:/opt/aws-ofi-rccl,/usr/lib64/libjitterentropy.so.3,${SINGULARITY_BIND} 
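+# aws-ofi-rccl is the libfabric (OFI) plugin for RCCL; bind-mounting a local
+# build into the container lets multi-GPU collectives run over LUMI's
+# Slingshot interconnect instead of falling back to slower TCP transports.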
+export SINGULARITYENV_LD_LIBRARY_PATH=/opt/ompi/lib:${EBROOTAWSMINOFIMINRCCL}/lib:/opt/cray/xpmem/2.5.2-2.4_3.47__gd0f7936.shasta/lib64:/opt/aws-ofi-rccl/lib:${SINGULARITYENV_LD_LIBRARY_PATH} +export SINGULARITY_BIND=$(echo $SINGULARITY_BIND | sed 's|,/usr/lib64/libssh.so.4||g') # do not bind host libssh which is built against a wrong libssl for some reason +export LC_ALL=C +export HF_DATASETS_CACHE="/scratch/project_465000670/.cache/huggingface" +export HF_HOME="/scratch/project_465000670/.cache/huggingface" + +# values for distributed setup +GPUS_PER_NODE=$SLURM_GPUS_PER_NODE +NNODES=$SLURM_NNODES +export NODE_RANK=$SLURM_NODEID +export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) +export MASTER_PORT=9999 +export WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +# compilers in the container +export CC=gcc-11 +export CXX=g++-11 + +CONTAINER="/project/project_465000670/pytorch_rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1.sif" + +SING_BIND="/scratch/project_465000670" + +# hold separate logs for easier debugging +rm -rf separate-logs +mkdir -p separate-logs + +set -exuo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +CHECKPOINT_PATH=checkpoints + +GIT_ROOT=$(git rev-parse --show-toplevel) +PATH_TO_SCRIPTS="scripts/lumi" +cd ${GIT_ROOT} # ensure that we are in the git root for remaining paths to work +CMD=" \ + llm-foundry/scripts/train/train.py \ + ${PATH_TO_SCRIPTS}/yamls/continue-mistral-7b.yaml + " + +# Bind masks from Samuel (TODO: unused for now since composer handles process spawning, but might help performance to use this) +c=fe + +# Bind mask for one thread per core +BIND_MASK_1="0x${c}000000000000,0x${c}00000000000000,0x${c}0000,0x${c}000000,0x${c},0x${c}00,0x${c}00000000,0x${c}0000000000" + +# Bind mask for two threads per core +BIND_MASK_2="0x${c}00000000000000${c}000000000000,0x${c}00000000000000${c}00000000000000,0x${c}00000000000000${c}0000,0x${c}00000000000000${c}000000,0x${c}00000000000000${c},0x${c}00000000000000${c}00,0x${c}00000000000000${c}00000000,0x${c}00000000000000${c}0000000000" + +BIND_MASK="$BIND_MASK_1" +#echo "Using --cpu-bind=mask_cpu:$BIND_MASK" + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# --cpu-bind=mask_cpu:$BIND_MASK \ +srun \ + --label \ + singularity exec -B "$SING_BIND" "$CONTAINER" \ + /scratch/project_465000670/danish-foundation-models/scripts/lumi/mosaic_in_container.sh \ + $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/evaluation/make_venv.sh b/evaluation/make_venv.sh new file mode 100755 index 00000000..42b1b3e4 --- /dev/null +++ b/evaluation/make_venv.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# Important: should be run in the `rocm/pytorch`` container +set -euxo pipefail +export LC_ALL=C + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +GIT_ROOT=$(git rev-parse --show-toplevel) + +cd ${SCRIPT_DIR} +rm -rf .venv +python3 -m venv .venv +source .venv/bin/activate + +pip install --upgrade pip +pip install packaging cmake # build requirements + +# Install pytorch +pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm5.7 + +# Install flash attention +TMP_DIR=$(mktemp -d) +git clone --recurse-submodules https://github.com/ROCmSoftwarePlatform/flash-attention ${TMP_DIR} +cd ${TMP_DIR} +export GPU_ARCHS="gfx90a" # for MI250X on LUMI +export MAX_JOBS=8 # this install breaks on dev nodes (memory?), so install on a login node while being nice +python3 
setup.py install + +# Install vllm +TMP_DIR=$(mktemp -d) +git clone https://github.com/vllm-project/vllm.git ${TMP_DIR} +cd ${TMP_DIR} +pip install xformers==0.0.23 --no-deps # this step is from the vllm docs +bash patch_xformers.rocm.sh # so is this +pip install -U -r requirements-rocm.txt +python setup.py install + +# Install scandeval +pip install scandeval \ No newline at end of file diff --git a/llm-foundry b/llm-foundry new file mode 160000 index 00000000..8d5beb78 --- /dev/null +++ b/llm-foundry @@ -0,0 +1 @@ +Subproject commit 8d5beb78be7e6449f5e0b4b8252f046ac5e584f9 diff --git a/training/continue_mistral_mosaic.sh b/training/continue_mistral_mosaic.sh index 6f8474b5..bbe20059 100755 --- a/training/continue_mistral_mosaic.sh +++ b/training/continue_mistral_mosaic.sh @@ -1,18 +1,20 @@ #!/bin/bash ##SBATCH --exclude=nid006865,nid005613,nid005988 -#SBATCH --nodes=2 +#SBATCH --nodes=4 #SBATCH --ntasks-per-node=1 #SBATCH --cpus-per-task=56 #SBATCH --mem=0 #SBATCH --partition=standard-g -#SBATCH --time=0-01:00:00 +#SBATCH --time=0-24:00:00 #SBATCH --gpus-per-node=mi250:8 #SBATCH --exclusive=user #SBATCH --hint=nomultithread #SBATCH --account=project_465000670 #SBATCH --output=logs/%j.out #SBATCH --error=logs/%j.err +#SBATCH --mail-type=ALL +#SBATCH --mail-user=rasmus.larsen@alexandra.dk # if run without sbatch, invoke here if [ -z $SLURM_JOB_ID ]; then @@ -98,4 +100,4 @@ srun \ /scratch/project_465000670/danish-foundation-models/scripts/lumi/mosaic_in_container.sh \ $CMD -echo "END $SLURM_JOBID: $(date)" \ No newline at end of file +echo "END $SLURM_JOBID: $(date)" diff --git a/training/make_venv.sh b/training/make_venv.sh index 7553e9cb..980a58fa 100755 --- a/training/make_venv.sh +++ b/training/make_venv.sh @@ -27,4 +27,4 @@ export GPU_ARCHS="gfx90a" # export PYTHON_SITE_PACKAGES=$(python -c 'import site; print(site.getsitepackages()[0])') # this is for older versions of pytorch # patch "${PYTHON_SITE_PACKAGES}/torch/utils/hipify/hipify_python.py" hipify_patch.patch -python3 setup.py install \ No newline at end of file +python3 setup.py install diff --git a/training/yamls/continue-mistral-7b.yaml b/training/yamls/continue-mistral-7b.yaml index a0486f1b..cf747711 100644 --- a/training/yamls/continue-mistral-7b.yaml +++ b/training/yamls/continue-mistral-7b.yaml @@ -125,7 +125,7 @@ loggers: wandb: {} # Checkpoint to local filesystem or remote object store -save_interval: 100ba # 2M tokens per batch = 200M tokens per checkpoint -save_num_checkpoints_to_keep: 5 # cleans up checkpoints saved to DISK +save_interval: 100ba # 4M tokens per batch = 400M tokens per checkpoint +save_num_checkpoints_to_keep: 10 # cleans up checkpoints saved to DISK save_folder: ./{run_name}/checkpoints # save_folder: s3://my-bucket/my-folder/{run_name}/checkpoints
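For reference, a minimal usage sketch for the new benchmark script (not part of the patch). The model name and dataset path are placeholders, and the command assumes the virtual environment created by evaluation/make_venv.sh is active inside the ROCm PyTorch container:

    python3 evaluation/benchmark_throughput.py \
        --backend vllm \
        --model mistralai/Mistral-7B-v0.1 \
        --dataset /path/to/sharegpt.json \
        --num-prompts 200 \
        --output-len 256 \
        --tensor-parallel-size 1

The reported throughput counts prompt plus generated tokens, as computed in main().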