Report the inference benchmark of models of different sizes (#794)
* update test scripts for models with different sizes

* update

* only test after tuning gemm

* chmod +x

* fix typo

* benchmark on a100

* fix typo

* fix typo

* per-token latency percentile in profile_throughput

* fix

* fix

* rename

* make the script accept parameters

* minor fix

* indent

* reformat table

* change to 3000

* minor fix
lvhan028 authored Dec 6, 2023
1 parent 5b9e454 commit ebe90bc
Showing 9 changed files with 556 additions and 49 deletions.
92 changes: 92 additions & 0 deletions benchmark/benchmark_13b.sh
@@ -0,0 +1,92 @@
#!/bin/bash
if [ -z "$1" ]
then
echo "Error. Please input the model path of llama2-13b model"
exit 1
fi

workspace_dir=$(dirname $(realpath "$0"))

tp=1
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}"/workspace/"${model_foldername}"

# convert
lmdeploy convert llama2 ${model_path} --dst-path ${turbomind_model_path} --tp ${tp}
if [ $? != 0 ]
then
    exit 1
fi

# update recommended config to config.ini
config_path=${turbomind_model_path}/triton_models/weights/config.ini

apt-get update
apt-get install crudini -y

crudini --set ${config_path} llama max_context_token_num 4
crudini --set ${config_path} llama cache_chunk_size -1
crudini --set ${config_path} llama cache_max_entry_count 500
crudini --set ${config_path} llama max_batch_size 128
# end of update config

cd ${workspace_dir}

# download dataset
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

benchmark_rpm () {
    output_path=$1
    mkdir -p "${output_path}"

    batches=(64 128)
    for batch in "${batches[@]}"
    do
        for i in {1..3}
        do
            python3 profile_throughput.py \
                ShareGPT_V3_unfiltered_cleaned_split.json \
                ${turbomind_model_path} \
                --concurrency "$batch" \
                --num_prompts 3000 \
                --csv ${output_path}/rpm_localhost_batch_"${batch}"_"${i}"th.csv
        done
    done
}

benchmark_generation () {
    output_path=$1
    mkdir -p "${output_path}"

    python3 profile_generation.py \
        ${turbomind_model_path} \
        --concurrency 1 16 32 64 \
        --csv ${output_path}/generation.csv
}

################################# BENCHMARK AFTER TUNING GEMM #################################
# tune gemm
head_num=$(crudini --get "${config_path}" llama head_num)
size_per_head=$(crudini --get "${config_path}" llama size_per_head)
vocab_size=$(crudini --get "${config_path}" llama vocab_size)
inter_size=$(crudini --get "${config_path}" llama inter_size)
tensor_para_size=$(crudini --get "${config_path}" llama tensor_para_size)
max_batch_size=$(crudini --get "${config_path}" llama max_batch_size)

echo $head_num, $size_per_head, $vocab_size, $inter_size, $tensor_para_size, $max_batch_size

python3 -m lmdeploy.turbomind.generate_gemm_config \
    --head_num ${head_num} \
    --size_per_head ${size_per_head} \
    --vocab_size ${vocab_size} \
    --inter_size ${inter_size} \
    --tensor_para_size ${tensor_para_size} \
    --max_batch_size ${max_batch_size}

output_path="${workspace_dir}"/output/"${model_foldername}"-tunned-gemm-tp"${tp}"
# benchmark request throughput and static inference
benchmark_rpm ${output_path}
benchmark_generation ${output_path}

mv gemm_config.in ${output_path}
92 changes: 92 additions & 0 deletions benchmark/benchmark_20b.sh
@@ -0,0 +1,92 @@
#!/bin/bash
if [ -z "$1" ]
then
echo "Error. Please input the model path of internlm-20b model"
exit 1
fi

workspace_dir=$(dirname $(realpath "$0"))

tp=2
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}"/workspace/"${model_foldername}"

# convert
lmdeploy convert internlm-20b ${model_path} --dst-path ${turbomind_model_path} --tp ${tp}
if [ $? != 0 ]
then
    exit 1
fi

# update recommended config to config.ini
config_path=${turbomind_model_path}/triton_models/weights/config.ini

apt-get update
apt-get install crudini -y

crudini --set ${config_path} llama max_context_token_num 4
crudini --set ${config_path} llama cache_chunk_size -1
crudini --set ${config_path} llama cache_max_entry_count 700
crudini --set ${config_path} llama max_batch_size 128
# end of update config

cd ${workspace_dir}

# download dataset
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

benchmark_rpm () {
    output_path=$1
    mkdir -p "${output_path}"

    batches=(64 128)
    for batch in "${batches[@]}"
    do
        for i in {1..3}
        do
            python3 profile_throughput.py \
                ShareGPT_V3_unfiltered_cleaned_split.json \
                ${turbomind_model_path} \
                --concurrency "$batch" \
                --num_prompts 3000 \
                --csv ${output_path}/rpm_localhost_batch_"${batch}"_"${i}"th.csv
        done
    done
}

benchmark_generation () {
    output_path=$1
    mkdir -p "${output_path}"

    python3 profile_generation.py \
        ${turbomind_model_path} \
        --concurrency 1 16 32 64 \
        --csv ${output_path}/generation.csv
}

################################# BENCHMARK AFTER TUNING GEMM #################################
# tune gemm
head_num=$(crudini --get "${config_path}" llama head_num)
size_per_head=$(crudini --get "${config_path}" llama size_per_head)
vocab_size=$(crudini --get "${config_path}" llama vocab_size)
inter_size=$(crudini --get "${config_path}" llama inter_size)
tensor_para_size=$(crudini --get "${config_path}" llama tensor_para_size)
max_batch_size=$(crudini --get "${config_path}" llama max_batch_size)

echo $head_num, $size_per_head, $vocab_size, $inter_size, $tensor_para_size, $max_batch_size

python3 -m lmdeploy.turbomind.generate_gemm_config \
    --head_num ${head_num} \
    --size_per_head ${size_per_head} \
    --vocab_size ${vocab_size} \
    --inter_size ${inter_size} \
    --tensor_para_size ${tensor_para_size} \
    --max_batch_size ${max_batch_size}

output_path="${workspace_dir}"/output/"${model_foldername}"-tunned-gemm-tp"${tp}"
# benchmark request throughput and static inference
benchmark_rpm ${output_path}
benchmark_generation ${output_path}

cp gemm_config.in ${output_path}
71 changes: 71 additions & 0 deletions benchmark/benchmark_70b.sh
@@ -0,0 +1,71 @@
#!/bin/bash
if [ -z "$1" ]
then
echo "Error. Please input the model path of llama2-70b model"
exit 1
fi

workspace_dir=$(dirname $(realpath "$0"))

tp=4
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}"/workspace/"${model_foldername}"

# convert
lmdeploy convert llama2 ${model_path} --dst-path ${turbomind_model_path} --tp ${tp}
if [ $? != 0 ]
then
    exit 1
fi

# update recommended config to config.ini
config_path=${turbomind_model_path}/triton_models/weights/config.ini

apt-get update
apt-get install crudini -y

crudini --set ${config_path} llama max_context_token_num 4
crudini --set ${config_path} llama cache_chunk_size -1
crudini --set ${config_path} llama cache_max_entry_count 4000
crudini --set ${config_path} llama max_batch_size 256
# end of update config

cd ${workspace_dir}

# download dataset
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

benchmark_rpm () {
    output_path=$1
    mkdir -p "${output_path}"

    batches=(64 128 256)
    for batch in "${batches[@]}"
    do
        for i in {1..3}
        do
            python3 profile_throughput.py \
                ShareGPT_V3_unfiltered_cleaned_split.json \
                ${turbomind_model_path} \
                --concurrency "$batch" \
                --num_prompts 3000 \
                --csv ${output_path}/rpm_localhost_batch_"${batch}"_"${i}"th.csv
        done
    done
}

benchmark_generation () {
    output_path=$1
    mkdir -p "${output_path}"

    python3 profile_generation.py \
        ${turbomind_model_path} \
        --concurrency 1 64 128 256 \
        --csv ${output_path}/generation.csv
}

output_path="${workspace_dir}"/output/"${model_foldername}"-tp"${tp}"
# benchmark request throughput and static inference
benchmark_rpm ${output_path}
benchmark_generation ${output_path}
93 changes: 93 additions & 0 deletions benchmark/benchmark_7b.sh
@@ -0,0 +1,93 @@
#!/bin/bash
if [ -z "$1" ]
then
echo "Error. Please input the model path of llama2-7b model"
exit 1
fi

workspace_dir=$(dirname $(realpath "$0"))

tp=1
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}"/workspace/"${model_foldername}"

# convert
lmdeploy convert llama2 ${model_path} --dst-path ${turbomind_model_path} --tp ${tp}
if [ $? != 0 ]
then
    exit 1
fi

# update recommended config to config.ini
config_path=${turbomind_model_path}/triton_models/weights/config.ini

apt-get update
apt-get install crudini -y

crudini --set ${config_path} llama max_context_token_num 4
crudini --set ${config_path} llama cache_chunk_size -1
crudini --set ${config_path} llama cache_max_entry_count 1000
crudini --set ${config_path} llama max_batch_size 128
# end of update config

cd ${workspace_dir}

# download dataset
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

benchmark_rpm () {
    output_path=$1
    mkdir -p "${output_path}"

    batches=(64 128)
    for batch in "${batches[@]}"
    do
        for i in {1..3}
        do
            python3 profile_throughput.py \
                ShareGPT_V3_unfiltered_cleaned_split.json \
                ${turbomind_model_path} \
                --concurrency "$batch" \
                --num_prompts 3000 \
                --csv ${output_path}/rpm_localhost_batch_"${batch}"_"${i}"th.csv
        done
    done
}

benchmark_generation () {
    output_path=$1
    mkdir -p "${output_path}"

    python3 profile_generation.py \
        ${turbomind_model_path} \
        --concurrency 1 16 32 64 \
        --csv ${output_path}/generation.csv
}

################################# BENCHMARK AFTER TUNING GEMM #################################
output_path="${workspace_dir}"/output/"${model_foldername}"-tunned-gemm-tp"${tp}"

# tune gemm
head_num=$(crudini --get "${config_path}" llama head_num)
size_per_head=$(crudini --get "${config_path}" llama size_per_head)
vocab_size=$(crudini --get "${config_path}" llama vocab_size)
inter_size=$(crudini --get "${config_path}" llama inter_size)
tensor_para_size=$(crudini --get "${config_path}" llama tensor_para_size)
max_batch_size=$(crudini --get "${config_path}" llama max_batch_size)

echo $head_num, $size_per_head, $vocab_size, $inter_size, $tensor_para_size, $max_batch_size

python3 -m lmdeploy.turbomind.generate_gemm_config \
    --head_num ${head_num} \
    --size_per_head ${size_per_head} \
    --vocab_size ${vocab_size} \
    --inter_size ${inter_size} \
    --tensor_para_size ${tensor_para_size} \
    --max_batch_size ${max_batch_size}

# benchmark request throughput and static inference
benchmark_rpm ${output_path}
benchmark_generation ${output_path}

mv gemm_config.in ${output_path}
10 changes: 5 additions & 5 deletions benchmark/profile_generation.py
@@ -29,14 +29,14 @@ def infer(model, session_id: int, input_ids: List, output_seqlen: int,
     for _ in range(test_round):
         token_latency_stats = [0] * (output_seqlen + 1)
         prev = time.perf_counter()
-        n_pre_token = 0
+        n_prev_token = 0
         """
         The iterator provided by `stream_infer` denotes the number of generated tokens so far,
         which is represented by the variable `n_token`.
         Please note that `n_token` is not a continuous value. In other words, during the iteration,
         its value might be 5, 7, 8, 16, and so on, rather than 1, 2, 3, 4, etc.
         So, it is quite difficult to get the latency of each generated token.
-        As a work-around, we set the latency `new-prev` of each iteration to the first token of
+        As a work-around, we set the latency `now-prev` of each iteration to the first token of
         the new generated tokens, and leave the latency of the rest tokens being 0.
         For example, in the first iteration, 5 tokens are generated.
         The time elapsing in this iteration `now-prev` is set to the latency of first token of
@@ -54,9 +54,9 @@ def infer(model, session_id: int, input_ids: List, output_seqlen: int,
                                             temperature=temperature):
             _, n_token = outputs[0]
             now = time.perf_counter()
-            if n_pre_token != n_token:
-                token_latency_stats[n_pre_token] = np.round(now - prev, 3)
-                n_pre_token = n_token
+            if n_prev_token != n_token:
+                token_latency_stats[n_prev_token] = np.round(now - prev, 3)
+                n_prev_token = n_token
             prev = now
         if session_id == 1:
             pbar.update(1)
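
The docstring above explains the work-around: `stream_infer` reports a cumulative token count that can jump by several tokens per iteration, so each iteration's elapsed time is assigned to the first newly generated token while the remaining tokens of that iteration keep a latency of 0. Below is a minimal, self-contained sketch of that bookkeeping; `fake_stream` is a hypothetical stand-in for the real `stream_infer` iterator, and the percentile line at the end is only illustrative, not the exact statistic reported by the benchmark scripts.

import time

import numpy as np


def fake_stream(total_tokens=16, step=5, delay=0.01):
    # Hypothetical stand-in for `stream_infer`: yields the cumulative token
    # count, which may jump by several tokens per iteration (e.g. 5, 10, 15, 16).
    n = 0
    while n < total_tokens:
        time.sleep(delay)
        n = min(n + step, total_tokens)
        yield n


output_seqlen = 16
token_latency_stats = [0] * (output_seqlen + 1)
prev = time.perf_counter()
n_prev_token = 0
for n_token in fake_stream(output_seqlen):
    now = time.perf_counter()
    if n_prev_token != n_token:
        # attribute this iteration's latency to the first new token;
        # the other tokens generated in the same iteration keep latency 0
        token_latency_stats[n_prev_token] = np.round(now - prev, 3)
        n_prev_token = n_token
    prev = now

# illustrative per-token latency percentiles (zeros included)
print([np.round(np.percentile(token_latency_stats, q), 3) for q in (50, 75, 95, 99)])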