diff --git a/evaluation/benchmark_throughput.py b/evaluation/benchmark_throughput.py new file mode 100644 index 00000000..b3938d26 --- /dev/null +++ b/evaluation/benchmark_throughput.py @@ -0,0 +1,318 @@ +"""Benchmark offline inference throughput.""" +import argparse +import json +import random +import time +from typing import List, Optional, Tuple + +import torch +from transformers import (AutoModelForCausalLM, AutoTokenizer, + PreTrainedTokenizerBase) +from tqdm import tqdm + + +def sample_requests( + dataset_path: str, + num_requests: int, + tokenizer: PreTrainedTokenizerBase, + fixed_output_len: Optional[int], +) -> List[Tuple[str, int, int]]: + if fixed_output_len is not None and fixed_output_len < 4: + raise ValueError("output_len too small") + + # Load the dataset. + with open(dataset_path) as f: + dataset = json.load(f) + # Filter out the conversations with less than 2 turns. + dataset = [data for data in dataset if len(data["conversations"]) >= 2] + # Only keep the first two turns of each conversation. + dataset = [(data["conversations"][0]["value"], + data["conversations"][1]["value"]) for data in dataset] + + # Tokenize the prompts and completions. + prompts = [prompt for prompt, _ in dataset] + prompt_token_ids = tokenizer(prompts).input_ids + completions = [completion for _, completion in dataset] + completion_token_ids = tokenizer(completions).input_ids + tokenized_dataset = [] + for i in range(len(dataset)): + output_len = len(completion_token_ids[i]) + if fixed_output_len is not None: + output_len = fixed_output_len + tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len)) + + # Filter out too long sequences. + filtered_dataset: List[Tuple[str, int, int]] = [] + for prompt, prompt_token_ids, output_len in tokenized_dataset: + prompt_len = len(prompt_token_ids) + if prompt_len < 4 or output_len < 4: + # Prune too short sequences. + continue + if prompt_len > 1024 or prompt_len + output_len > 2048: + # Prune too long sequences. + continue + filtered_dataset.append((prompt, prompt_len, output_len)) + + # Sample the requests. + sampled_requests = random.sample(filtered_dataset, num_requests) + return sampled_requests + + +def run_vllm( + requests: List[Tuple[str, int, int]], + model: str, + tokenizer: str, + quantization: Optional[str], + tensor_parallel_size: int, + seed: int, + n: int, + use_beam_search: bool, + trust_remote_code: bool, + dtype: str, + max_model_len: Optional[int], + enforce_eager: bool, +) -> float: + from vllm import LLM, SamplingParams + llm = LLM( + model=model, + tokenizer=tokenizer, + quantization=quantization, + tensor_parallel_size=tensor_parallel_size, + seed=seed, + trust_remote_code=trust_remote_code, + dtype=dtype, + max_model_len=max_model_len, + enforce_eager=enforce_eager, + ) + + # Add the requests to the engine. + for prompt, _, output_len in requests: + sampling_params = SamplingParams( + n=n, + temperature=0.0 if use_beam_search else 1.0, + top_p=1.0, + use_beam_search=use_beam_search, + ignore_eos=True, + max_tokens=output_len, + ) + # FIXME(woosuk): Do not use internal method. + llm._add_request( + prompt=prompt, + prompt_token_ids=None, + sampling_params=sampling_params, + ) + + start = time.perf_counter() + # FIXME(woosuk): Do not use internal method. 
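+    # _run_engine() drains every request queued above with continuous
+    # batching, so the timed region covers token generation only; engine
+    # start-up and request queueing are excluded from the throughput number.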
+ llm._run_engine(use_tqdm=True) + end = time.perf_counter() + return end - start + + +def run_hf( + requests: List[Tuple[str, int, int]], + model: str, + tokenizer: PreTrainedTokenizerBase, + n: int, + use_beam_search: bool, + max_batch_size: int, + trust_remote_code: bool, +) -> float: + assert not use_beam_search + llm = AutoModelForCausalLM.from_pretrained( + model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code) + #if llm.config.model_type == "llama": + # To enable padding in the HF backend. + tokenizer.pad_token = tokenizer.eos_token + llm = llm.cuda() + + pbar = tqdm(total=len(requests)) + start = time.perf_counter() + batch: List[str] = [] + max_prompt_len = 0 + max_output_len = 0 + for i in range(len(requests)): + prompt, prompt_len, output_len = requests[i] + # Add the prompt to the batch. + batch.append(prompt) + max_prompt_len = max(max_prompt_len, prompt_len) + max_output_len = max(max_output_len, output_len) + if len(batch) < max_batch_size and i != len(requests) - 1: + # Check if we can add more requests to the batch. + _, next_prompt_len, next_output_len = requests[i + 1] + if (max(max_prompt_len, next_prompt_len) + + max(max_output_len, next_output_len)) <= 2048: + # We can add more requests to the batch. + continue + + # Generate the sequences. + input_ids = tokenizer(batch, return_tensors="pt", + padding=True).input_ids + llm_outputs = llm.generate( + input_ids=input_ids.cuda(), + do_sample=not use_beam_search, + num_return_sequences=n, + temperature=1.0, + top_p=1.0, + use_cache=True, + max_new_tokens=max_output_len, + ) + # Include the decoding time. + tokenizer.batch_decode(llm_outputs, skip_special_tokens=True) + pbar.update(len(batch)) + + # Clear the batch. + batch = [] + max_prompt_len = 0 + max_output_len = 0 + end = time.perf_counter() + return end - start + + +def run_mii( + requests: List[Tuple[str, int, int]], + model: str, + tensor_parallel_size: int, + output_len: int, +) -> float: + from mii import pipeline + llm = pipeline(model, tensor_parallel=tensor_parallel_size) + prompts = [prompt for prompt, _, _ in requests] + + start = time.perf_counter() + llm(prompts, max_new_tokens=output_len) + end = time.perf_counter() + return end - start + + +def main(args: argparse.Namespace): + print(args) + random.seed(args.seed) + + # Sample the requests. + tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer, trust_remote_code=args.trust_remote_code) + if args.dataset is None: + # Synthesize a prompt with the given input length. 
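+        # Assumption: "hi" maps to roughly one token per repetition for
+        # common BPE tokenizers, so this yields a prompt of approximately
+        # args.input_len tokens once the BOS token is counted.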
+ prompt = "hi" * (args.input_len - 1) + requests = [(prompt, args.input_len, args.output_len) + for _ in range(args.num_prompts)] + else: + requests = sample_requests(args.dataset, args.num_prompts, tokenizer, + args.output_len) + + if args.backend == "vllm": + elapsed_time = run_vllm(requests, args.model, args.tokenizer, + args.quantization, args.tensor_parallel_size, + args.seed, args.n, args.use_beam_search, + args.trust_remote_code, args.dtype, + args.max_model_len, args.enforce_eager) + elif args.backend == "hf": + assert args.tensor_parallel_size == 1 + elapsed_time = run_hf(requests, args.model, tokenizer, args.n, + args.use_beam_search, args.hf_max_batch_size, + args.trust_remote_code) + elif args.backend == "mii": + elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size, + args.output_len) + else: + raise ValueError(f"Unknown backend: {args.backend}") + total_num_tokens = sum(prompt_len + output_len + for _, prompt_len, output_len in requests) + print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " + f"{total_num_tokens / elapsed_time:.2f} tokens/s") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Benchmark the throughput.") + parser.add_argument("--backend", + type=str, + choices=["vllm", "hf", "mii"], + default="vllm") + parser.add_argument("--dataset", + type=str, + default=None, + help="Path to the dataset.") + parser.add_argument("--input-len", + type=int, + default=None, + help="Input prompt length for each request") + parser.add_argument("--output-len", + type=int, + default=None, + help="Output length for each request. Overrides the " + "output length from the dataset.") + parser.add_argument("--model", type=str, default="facebook/opt-125m") + parser.add_argument("--tokenizer", type=str, default=None) + parser.add_argument('--quantization', + '-q', + choices=['awq', 'gptq', 'squeezellm', None], + default=None) + parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1) + parser.add_argument("--n", + type=int, + default=1, + help="Number of generated sequences per prompt.") + parser.add_argument("--use-beam-search", action="store_true") + parser.add_argument("--num-prompts", + type=int, + default=1000, + help="Number of prompts to process.") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--hf-max-batch-size", + type=int, + default=None, + help="Maximum batch size for HF backend.") + parser.add_argument('--trust-remote-code', + action='store_true', + help='trust remote code from huggingface') + parser.add_argument( + '--max-model-len', + type=int, + default=None, + help='Maximum length of a sequence (including prompt and output). ' + 'If None, will be derived from the model.') + parser.add_argument( + '--dtype', + type=str, + default='auto', + choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], + help='data type for model weights and activations. 
' + 'The "auto" option will use FP16 precision ' + 'for FP32 and FP16 models, and BF16 precision ' + 'for BF16 models.') + parser.add_argument("--enforce-eager", + action="store_true", + help="enforce eager execution") + args = parser.parse_args() + if args.tokenizer is None: + args.tokenizer = args.model + if args.dataset is None: + assert args.input_len is not None + assert args.output_len is not None + else: + assert args.input_len is None + + if args.backend == "vllm": + if args.hf_max_batch_size is not None: + raise ValueError("HF max batch size is only for HF backend.") + elif args.backend == "hf": + if args.hf_max_batch_size is None: + raise ValueError("HF max batch size is required for HF backend.") + if args.quantization is not None: + raise ValueError("Quantization is only for vLLM backend.") + elif args.backend == "mii": + if args.dtype != "auto": + raise ValueError("dtype must be auto for MII backend.") + if args.n != 1: + raise ValueError("n must be 1 for MII backend.") + if args.use_beam_search: + raise ValueError("Beam search is not supported for MII backend.") + if args.quantization is not None: + raise ValueError("Quantization is only for vLLM backend.") + if args.hf_max_batch_size is not None: + raise ValueError("HF max batch size is only for HF backend.") + if args.tokenizer != args.model: + raise ValueError("Tokenizer must be the same as the model for MII " + "backend.") + main(args) diff --git a/evaluation/eval.sh b/evaluation/eval.sh new file mode 100644 index 00000000..b53c327b --- /dev/null +++ b/evaluation/eval.sh @@ -0,0 +1,100 @@ +#!/bin/bash + +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=56 +#SBATCH --mem=0 +#SBATCH --partition=standard-g +#SBATCH --time=0-01:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_465000670 +#SBATCH --output=eval-logs/%j.out +#SBATCH --error=eval-logs/%j.err + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +# LUMI setup +# module load LUMI/22.08 partition/G singularity-bindings/system-cpeGNU-22.08-noglibc # singularity-bindings is BROKEN +module load LUMI/22.08 partition/G + +# These replace the module load of singularity-bindings +local_libfabric_version=1.15.2.0 +local_craympich_version=8.1.27 +export SINGULARITYENV_LD_LIBRARY_PATH="/lib64:/opt/cray/pe/mpich/$local_craympich_version/ofi/gnu/9.1/lib-abi-mpich:/opt/cray/pe/lib64:/opt/cray/pe:/opt/cray/libfabric/$local_libfabric_version/lib64:/usr/lib64:/opt/cray/pe/gcc-libs:${SINGULARITYENV_LD_LIBRARY_PATH}" +export SINGULARITY_BIND="/opt/cray,/usr/lib64/libbrotlidec.so.1,/usr/lib64/libbrotlicommon.so.1,/usr/lib64/libnl-3.so.200,/usr/lib64/libnl-route-3.so.200,/usr/lib64/libcxi.so.1,/usr/lib64/libcurl.so.4,/usr/lib64/libnghttp2.so.14,/usr/lib64/libidn2.so.0,/usr/lib64/libssh.so.4,/usr/lib64/libpsl.so.5,/usr/lib64/libssl.so.1.1,/usr/lib64/libcrypto.so.1.1,/usr/lib64/libgssapi_krb5.so.2,/usr/lib64/libldap_r-2.4.so.2,/usr/lib64/liblber-2.4.so.2,/usr/lib64/libjson-c.so.3,/usr/lib64/libunistring.so.2,/usr/lib64/libkrb5.so.3,/usr/lib64/libk5crypto.so.3,/usr/lib64/libkrb5support.so.0,/usr/lib64/libsasl2.so.3,/usr/lib64/libkeyutils.so.1,/var/spool/slurmd/mpi_cray_shasta,/usr/lib64/libzstd.so.1,/lib64/libselinux.so.1,/usr/lib64/libpcre.so.1,${SINGULARITY_BIND}" + +# These are some more custom exports +export SINGULARITY_BIND=/users/larsenra/aws-ofi-rccl/install:/opt/aws-ofi-rccl,/usr/lib64/libjitterentropy.so.3,${SINGULARITY_BIND} 
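+# aws-ofi-rccl is the libfabric (OFI) plugin for RCCL; bind-mounting a local
+# build into the container lets multi-GPU collectives run over LUMI's
+# Slingshot interconnect instead of falling back to slower TCP transports.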
+export SINGULARITYENV_LD_LIBRARY_PATH=/opt/ompi/lib:${EBROOTAWSMINOFIMINRCCL}/lib:/opt/cray/xpmem/2.5.2-2.4_3.47__gd0f7936.shasta/lib64:/opt/aws-ofi-rccl/lib:${SINGULARITYENV_LD_LIBRARY_PATH} +export SINGULARITY_BIND=$(echo $SINGULARITY_BIND | sed 's|,/usr/lib64/libssh.so.4||g') # do not bind host libssh which is built against a wrong libssl for some reason +export LC_ALL=C +export HF_DATASETS_CACHE="/scratch/project_465000670/.cache/huggingface" +export HF_HOME="/scratch/project_465000670/.cache/huggingface" + +# values for distributed setup +GPUS_PER_NODE=$SLURM_GPUS_PER_NODE +NNODES=$SLURM_NNODES +export NODE_RANK=$SLURM_NODEID +export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) +export MASTER_PORT=9999 +export WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +# compilers in the container +export CC=gcc-11 +export CXX=g++-11 + +CONTAINER="/project/project_465000670/pytorch_rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1.sif" + +SING_BIND="/scratch/project_465000670" + +# hold separate logs for easier debugging +rm -rf separate-logs +mkdir -p separate-logs + +set -exuo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +CHECKPOINT_PATH=checkpoints + +GIT_ROOT=$(git rev-parse --show-toplevel) +PATH_TO_SCRIPTS="scripts/lumi" +cd ${GIT_ROOT} # ensure that we are in the git root for remaining paths to work +CMD=" \ + llm-foundry/scripts/train/train.py \ + ${PATH_TO_SCRIPTS}/yamls/continue-mistral-7b.yaml + " + +# Bind masks from Samuel (TODO: unused for now since composer handles process spawning, but might help performance to use this) +c=fe + +# Bind mask for one thread per core +BIND_MASK_1="0x${c}000000000000,0x${c}00000000000000,0x${c}0000,0x${c}000000,0x${c},0x${c}00,0x${c}00000000,0x${c}0000000000" + +# Bind mask for two threads per core +BIND_MASK_2="0x${c}00000000000000${c}000000000000,0x${c}00000000000000${c}00000000000000,0x${c}00000000000000${c}0000,0x${c}00000000000000${c}000000,0x${c}00000000000000${c},0x${c}00000000000000${c}00,0x${c}00000000000000${c}00000000,0x${c}00000000000000${c}0000000000" + +BIND_MASK="$BIND_MASK_1" +#echo "Using --cpu-bind=mask_cpu:$BIND_MASK" + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# --cpu-bind=mask_cpu:$BIND_MASK \ +srun \ + --label \ + singularity exec -B "$SING_BIND" "$CONTAINER" \ + /scratch/project_465000670/danish-foundation-models/scripts/lumi/mosaic_in_container.sh \ + $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/evaluation/make_venv.sh b/evaluation/make_venv.sh new file mode 100755 index 00000000..42b1b3e4 --- /dev/null +++ b/evaluation/make_venv.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# Important: should be run in the `rocm/pytorch`` container +set -euxo pipefail +export LC_ALL=C + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +GIT_ROOT=$(git rev-parse --show-toplevel) + +cd ${SCRIPT_DIR} +rm -rf .venv +python3 -m venv .venv +source .venv/bin/activate + +pip install --upgrade pip +pip install packaging cmake # build requirements + +# Install pytorch +pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm5.7 + +# Install flash attention +TMP_DIR=$(mktemp -d) +git clone --recurse-submodules https://github.com/ROCmSoftwarePlatform/flash-attention ${TMP_DIR} +cd ${TMP_DIR} +export GPU_ARCHS="gfx90a" # for MI250X on LUMI +export MAX_JOBS=8 # this install breaks on dev nodes (memory?), so install on a login node while being nice +python3 
setup.py install + +# Install vllm +TMP_DIR=$(mktemp -d) +git clone https://github.com/vllm-project/vllm.git ${TMP_DIR} +cd ${TMP_DIR} +pip install xformers==0.0.23 --no-deps # this step is from the vllm docs +bash patch_xformers.rocm.sh # so is this +pip install -U -r requirements-rocm.txt +python setup.py install + +# Install scandeval +pip install scandeval \ No newline at end of file diff --git a/llm-foundry b/llm-foundry new file mode 160000 index 00000000..8d5beb78 --- /dev/null +++ b/llm-foundry @@ -0,0 +1 @@ +Subproject commit 8d5beb78be7e6449f5e0b4b8252f046ac5e584f9 diff --git a/training/continue_mistral_mosaic.sh b/training/continue_mistral_mosaic.sh index 6f8474b5..bbe20059 100755 --- a/training/continue_mistral_mosaic.sh +++ b/training/continue_mistral_mosaic.sh @@ -1,18 +1,20 @@ #!/bin/bash ##SBATCH --exclude=nid006865,nid005613,nid005988 -#SBATCH --nodes=2 +#SBATCH --nodes=4 #SBATCH --ntasks-per-node=1 #SBATCH --cpus-per-task=56 #SBATCH --mem=0 #SBATCH --partition=standard-g -#SBATCH --time=0-01:00:00 +#SBATCH --time=0-24:00:00 #SBATCH --gpus-per-node=mi250:8 #SBATCH --exclusive=user #SBATCH --hint=nomultithread #SBATCH --account=project_465000670 #SBATCH --output=logs/%j.out #SBATCH --error=logs/%j.err +#SBATCH --mail-type=ALL +#SBATCH --mail-user=rasmus.larsen@alexandra.dk # if run without sbatch, invoke here if [ -z $SLURM_JOB_ID ]; then @@ -98,4 +100,4 @@ srun \ /scratch/project_465000670/danish-foundation-models/scripts/lumi/mosaic_in_container.sh \ $CMD -echo "END $SLURM_JOBID: $(date)" \ No newline at end of file +echo "END $SLURM_JOBID: $(date)" diff --git a/training/make_venv.sh b/training/make_venv.sh index 7553e9cb..980a58fa 100755 --- a/training/make_venv.sh +++ b/training/make_venv.sh @@ -27,4 +27,4 @@ export GPU_ARCHS="gfx90a" # export PYTHON_SITE_PACKAGES=$(python -c 'import site; print(site.getsitepackages()[0])') # this is for older versions of pytorch # patch "${PYTHON_SITE_PACKAGES}/torch/utils/hipify/hipify_python.py" hipify_patch.patch -python3 setup.py install \ No newline at end of file +python3 setup.py install diff --git a/training/yamls/continue-mistral-7b.yaml b/training/yamls/continue-mistral-7b.yaml index a0486f1b..cf747711 100644 --- a/training/yamls/continue-mistral-7b.yaml +++ b/training/yamls/continue-mistral-7b.yaml @@ -125,7 +125,7 @@ loggers: wandb: {} # Checkpoint to local filesystem or remote object store -save_interval: 100ba # 2M tokens per batch = 200M tokens per checkpoint -save_num_checkpoints_to_keep: 5 # cleans up checkpoints saved to DISK +save_interval: 100ba # 4M tokens per batch = 400M tokens per checkpoint +save_num_checkpoints_to_keep: 10 # cleans up checkpoints saved to DISK save_folder: ./{run_name}/checkpoints # save_folder: s3://my-bucket/my-folder/{run_name}/checkpoints
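For reference, a minimal usage sketch for the new benchmark script (not part of the patch). The model name and dataset path are placeholders, and the command assumes the virtual environment created by evaluation/make_venv.sh is active inside the ROCm PyTorch container:

    python3 evaluation/benchmark_throughput.py \
        --backend vllm \
        --model mistralai/Mistral-7B-v0.1 \
        --dataset /path/to/sharegpt.json \
        --num-prompts 200 \
        --output-len 256 \
        --tensor-parallel-size 1

The reported throughput counts prompt plus generated tokens, as computed in main().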