Commit: Report the inference benchmark of models of different sizes (#794)
* update test scripts for models with different sizes
* update
* only test after tuning gemm
* chmod +x
* fix typo
* benchmark on a100
* fix typo
* fix typo
* per-token latency percentile in profile_throughput
* fix
* fix
* rename
* make the script accept parameters
* minor fix
* indent
* reformat table
* change to 3000
* minor fix
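For context, each script below takes the path of the model checkpoint as its only argument. A minimal usage sketch (the script file names here are hypothetical, since the truncated listing does not show them):

# hypothetical file name; pass the directory holding the model weights
chmod +x benchmark_llama2_13b.sh
./benchmark_llama2_13b.sh /path/to/llama2-13b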
Showing 9 changed files with 556 additions and 49 deletions. Four of the new benchmark scripts are reproduced below (the rest of the diff failed to load).
llama2-13b benchmark script (new file):
#!/bin/bash
if [ -z "$1" ]
then
    echo "Error. Please input the path of the llama2-13b model."
    exit 1
fi

workspace_dir=$(dirname "$(realpath "$0")")

tp=1
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}/workspace/${model_foldername}"

# convert the model to turbomind's format
lmdeploy convert llama2 "${model_path}" --dst-path "${turbomind_model_path}" --tp ${tp}
if [ $? != 0 ]
then
    exit 1
fi

# apply the recommended settings to config.ini
config_path="${turbomind_model_path}/triton_models/weights/config.ini"

apt-get update
apt-get install crudini -y

crudini --set "${config_path}" llama max_context_token_num 4
crudini --set "${config_path}" llama cache_chunk_size -1
crudini --set "${config_path}" llama cache_max_entry_count 500
crudini --set "${config_path}" llama max_batch_size 128
# end of config update

cd "${workspace_dir}" || exit 1

# download the test dataset
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

benchmark_rpm () {
    output_path=$1
    mkdir -p "${output_path}"

    batches=(64 128)
    for batch in "${batches[@]}"
    do
        for i in {1..3}
        do
            python3 profile_throughput.py \
                ShareGPT_V3_unfiltered_cleaned_split.json \
                "${turbomind_model_path}" \
                --concurrency "$batch" \
                --num_prompts 3000 \
                --csv "${output_path}"/rpm_localhost_batch_"${batch}"_"${i}"th.csv
        done
    done
}

benchmark_generation () {
    output_path=$1
    mkdir -p "${output_path}"

    python3 profile_generation.py \
        "${turbomind_model_path}" \
        --concurrency 1 16 32 64 \
        --csv "${output_path}"/generation.csv
}

################################# BENCHMARK AFTER TUNING GEMM #################################
# read the model's geometry back from config.ini for gemm tuning
head_num=$(crudini --get "${config_path}" llama head_num)
size_per_head=$(crudini --get "${config_path}" llama size_per_head)
vocab_size=$(crudini --get "${config_path}" llama vocab_size)
inter_size=$(crudini --get "${config_path}" llama inter_size)
tensor_para_size=$(crudini --get "${config_path}" llama tensor_para_size)
max_batch_size=$(crudini --get "${config_path}" llama max_batch_size)

echo "$head_num, $size_per_head, $vocab_size, $inter_size, $tensor_para_size, $max_batch_size"

python3 -m lmdeploy.turbomind.generate_gemm_config \
    --head_num "${head_num}" \
    --size_per_head "${size_per_head}" \
    --vocab_size "${vocab_size}" \
    --inter_size "${inter_size}" \
    --tensor_para_size "${tensor_para_size}" \
    --max_batch_size "${max_batch_size}"

output_path="${workspace_dir}/output/${model_foldername}-tuned-gemm-tp${tp}"
# benchmark request throughput and static inference
benchmark_rpm "${output_path}"
benchmark_generation "${output_path}"

mv gemm_config.in "${output_path}"
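A note on the final mv: generate_gemm_config writes its result to gemm_config.in in the current working directory, and the benchmark run appears to pick it up from there, which is why the script tunes first, benchmarks, and only then archives the file. Under that assumption, a previously tuned configuration can be restored before profiling again; a sketch that reuses the script's variables:

# assumption: TurboMind reads gemm_config.in from the working directory
cp "${output_path}"/gemm_config.in .
python3 profile_generation.py \
    "${turbomind_model_path}" \
    --concurrency 1 16 32 64 \
    --csv rerun_generation.csv
rm gemm_config.in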
internlm-20b benchmark script (new file):
#!/bin/bash
if [ -z "$1" ]
then
    echo "Error. Please input the path of the internlm-20b model."
    exit 1
fi

workspace_dir=$(dirname "$(realpath "$0")")

tp=2
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}/workspace/${model_foldername}"

# convert the model to turbomind's format
lmdeploy convert internlm-20b "${model_path}" --dst-path "${turbomind_model_path}" --tp ${tp}
if [ $? != 0 ]
then
    exit 1
fi

# apply the recommended settings to config.ini
config_path="${turbomind_model_path}/triton_models/weights/config.ini"

apt-get update
apt-get install crudini -y

crudini --set "${config_path}" llama max_context_token_num 4
crudini --set "${config_path}" llama cache_chunk_size -1
crudini --set "${config_path}" llama cache_max_entry_count 700
crudini --set "${config_path}" llama max_batch_size 128
# end of config update

cd "${workspace_dir}" || exit 1

# download the test dataset
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

benchmark_rpm () {
    output_path=$1
    mkdir -p "${output_path}"

    batches=(64 128)
    for batch in "${batches[@]}"
    do
        for i in {1..3}
        do
            python3 profile_throughput.py \
                ShareGPT_V3_unfiltered_cleaned_split.json \
                "${turbomind_model_path}" \
                --concurrency "$batch" \
                --num_prompts 3000 \
                --csv "${output_path}"/rpm_localhost_batch_"${batch}"_"${i}"th.csv
        done
    done
}

benchmark_generation () {
    output_path=$1
    mkdir -p "${output_path}"

    python3 profile_generation.py \
        "${turbomind_model_path}" \
        --concurrency 1 16 32 64 \
        --csv "${output_path}"/generation.csv
}

################################# BENCHMARK AFTER TUNING GEMM #################################
# read the model's geometry back from config.ini for gemm tuning
head_num=$(crudini --get "${config_path}" llama head_num)
size_per_head=$(crudini --get "${config_path}" llama size_per_head)
vocab_size=$(crudini --get "${config_path}" llama vocab_size)
inter_size=$(crudini --get "${config_path}" llama inter_size)
tensor_para_size=$(crudini --get "${config_path}" llama tensor_para_size)
max_batch_size=$(crudini --get "${config_path}" llama max_batch_size)

echo "$head_num, $size_per_head, $vocab_size, $inter_size, $tensor_para_size, $max_batch_size"

python3 -m lmdeploy.turbomind.generate_gemm_config \
    --head_num "${head_num}" \
    --size_per_head "${size_per_head}" \
    --vocab_size "${vocab_size}" \
    --inter_size "${inter_size}" \
    --tensor_para_size "${tensor_para_size}" \
    --max_batch_size "${max_batch_size}"

output_path="${workspace_dir}/output/${model_foldername}-tuned-gemm-tp${tp}"
# benchmark request throughput and static inference
benchmark_rpm "${output_path}"
benchmark_generation "${output_path}"

cp gemm_config.in "${output_path}"
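Since benchmark_rpm writes three CSVs per batch size, averaging across the repeats is a natural post-processing step. A sketch with awk, assuming each CSV has a header row and that the metric of interest sits in the last column (adjust the field index to profile_throughput.py's actual layout):

# average the last CSV column across the three repeats of each batch size
for batch in 64 128; do
    awk -F, -v b="$batch" 'FNR > 1 { sum += $NF; n++ }
        END { if (n) printf "batch %s: %.2f\n", b, sum / n }' \
        "${output_path}"/rpm_localhost_batch_"${batch}"_*th.csv
done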
llama2-70b benchmark script (new file):
#!/bin/bash
if [ -z "$1" ]
then
    echo "Error. Please input the path of the llama2-70b model."
    exit 1
fi

workspace_dir=$(dirname "$(realpath "$0")")

tp=4
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}/workspace/${model_foldername}"

# convert the model to turbomind's format
lmdeploy convert llama2 "${model_path}" --dst-path "${turbomind_model_path}" --tp ${tp}
if [ $? != 0 ]
then
    exit 1
fi

# apply the recommended settings to config.ini
config_path="${turbomind_model_path}/triton_models/weights/config.ini"

apt-get update
apt-get install crudini -y

crudini --set "${config_path}" llama max_context_token_num 4
crudini --set "${config_path}" llama cache_chunk_size -1
crudini --set "${config_path}" llama cache_max_entry_count 4000
crudini --set "${config_path}" llama max_batch_size 256
# end of config update

cd "${workspace_dir}" || exit 1

# download the test dataset
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

benchmark_rpm () {
    output_path=$1
    mkdir -p "${output_path}"

    batches=(64 128 256)
    for batch in "${batches[@]}"
    do
        for i in {1..3}
        do
            python3 profile_throughput.py \
                ShareGPT_V3_unfiltered_cleaned_split.json \
                "${turbomind_model_path}" \
                --concurrency "$batch" \
                --num_prompts 3000 \
                --csv "${output_path}"/rpm_localhost_batch_"${batch}"_"${i}"th.csv
        done
    done
}

benchmark_generation () {
    output_path=$1
    mkdir -p "${output_path}"

    python3 profile_generation.py \
        "${turbomind_model_path}" \
        --concurrency 1 64 128 256 \
        --csv "${output_path}"/generation.csv
}

output_path="${workspace_dir}/output/${model_foldername}-tp${tp}"
# benchmark request throughput and static inference
benchmark_rpm "${output_path}"
benchmark_generation "${output_path}"
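Unlike the other three scripts, this one stops after the untuned benchmark and never invokes generate_gemm_config. The config edits can still be spot-checked with crudini, which the script already installs; a minimal sketch:

crudini --get "${config_path}" llama cache_max_entry_count   # expect 4000
crudini --get "${config_path}" llama max_batch_size          # expect 256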
llama2-7b benchmark script (new file):
#!/bin/bash
if [ -z "$1" ]
then
    echo "Error. Please input the path of the llama2-7b model."
    exit 1
fi

workspace_dir=$(dirname "$(realpath "$0")")

tp=1
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}/workspace/${model_foldername}"

# convert the model to turbomind's format
lmdeploy convert llama2 "${model_path}" --dst-path "${turbomind_model_path}" --tp ${tp}
if [ $? != 0 ]
then
    exit 1
fi

# apply the recommended settings to config.ini
config_path="${turbomind_model_path}/triton_models/weights/config.ini"

apt-get update
apt-get install crudini -y

crudini --set "${config_path}" llama max_context_token_num 4
crudini --set "${config_path}" llama cache_chunk_size -1
crudini --set "${config_path}" llama cache_max_entry_count 1000
crudini --set "${config_path}" llama max_batch_size 128
# end of config update

cd "${workspace_dir}" || exit 1

# download the test dataset
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

benchmark_rpm () {
    output_path=$1
    mkdir -p "${output_path}"

    batches=(64 128)
    for batch in "${batches[@]}"
    do
        for i in {1..3}
        do
            python3 profile_throughput.py \
                ShareGPT_V3_unfiltered_cleaned_split.json \
                "${turbomind_model_path}" \
                --concurrency "$batch" \
                --num_prompts 3000 \
                --csv "${output_path}"/rpm_localhost_batch_"${batch}"_"${i}"th.csv
        done
    done
}

benchmark_generation () {
    output_path=$1
    mkdir -p "${output_path}"

    python3 profile_generation.py \
        "${turbomind_model_path}" \
        --concurrency 1 16 32 64 \
        --csv "${output_path}"/generation.csv
}

################################# BENCHMARK AFTER TUNING GEMM #################################
output_path="${workspace_dir}/output/${model_foldername}-tuned-gemm-tp${tp}"

# read the model's geometry back from config.ini for gemm tuning
head_num=$(crudini --get "${config_path}" llama head_num)
size_per_head=$(crudini --get "${config_path}" llama size_per_head)
vocab_size=$(crudini --get "${config_path}" llama vocab_size)
inter_size=$(crudini --get "${config_path}" llama inter_size)
tensor_para_size=$(crudini --get "${config_path}" llama tensor_para_size)
max_batch_size=$(crudini --get "${config_path}" llama max_batch_size)

echo "$head_num, $size_per_head, $vocab_size, $inter_size, $tensor_para_size, $max_batch_size"

python3 -m lmdeploy.turbomind.generate_gemm_config \
    --head_num "${head_num}" \
    --size_per_head "${size_per_head}" \
    --vocab_size "${vocab_size}" \
    --inter_size "${inter_size}" \
    --tensor_para_size "${tensor_para_size}" \
    --max_batch_size "${max_batch_size}"

# benchmark request throughput and static inference
benchmark_rpm "${output_path}"
benchmark_generation "${output_path}"

mv gemm_config.in "${output_path}"
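Taken together, the four scripts differ mainly in tensor parallelism and KV-cache sizing: llama2-7b and llama2-13b run with tp=1 (cache_max_entry_count 1000 and 500 respectively), internlm-20b with tp=2 (700), and llama2-70b with tp=4 (4000, with max_batch_size raised to 256); only the llama2-70b script skips the gemm-tuning stage.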