Merge remote-tracking branch 'origin/main' into vl2
Showing 21 changed files with 837 additions and 259 deletions.
@@ -0,0 +1,92 @@
#!/bin/bash
if [ -z "$1" ]
then
    echo "Error. Please provide the path of the llama2-13b model."
    exit 1
fi

workspace_dir=$(dirname "$(realpath "$0")")

tp=1
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}/workspace/${model_foldername}"

# convert the model to turbomind format
lmdeploy convert llama2 "${model_path}" --dst-path "${turbomind_model_path}" --tp ${tp}
if [ $? -ne 0 ]
then
    exit 1
fi

# update recommended config to config.ini
config_path="${turbomind_model_path}/triton_models/weights/config.ini"

apt-get update
apt-get install -y crudini

crudini --set "${config_path}" llama max_context_token_num 4
crudini --set "${config_path}" llama cache_chunk_size -1
crudini --set "${config_path}" llama cache_max_entry_count 500
crudini --set "${config_path}" llama max_batch_size 128
# end of update config
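
# note: crudini --set writes each value as a "key = value" entry under the
# [llama] section of config.ini, creating the key if it does not yet exist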

cd "${workspace_dir}" || exit 1

# download the ShareGPT dataset used as benchmark input
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
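
# benchmark_rpm replays ShareGPT prompts through profile_throughput.py to
# measure request throughput; benchmark_generation runs profile_generation.py
# to measure static inference speed at fixed concurrency levels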
benchmark_rpm () {
    output_path=$1
    mkdir -p "${output_path}"

    batches=(64 128)
    for batch in "${batches[@]}"
    do
        for i in {1..3}
        do
            python3 profile_throughput.py \
                ShareGPT_V3_unfiltered_cleaned_split.json \
                "${turbomind_model_path}" \
                --concurrency "$batch" \
                --num_prompts 3000 \
                --csv "${output_path}/rpm_localhost_batch_${batch}_${i}th.csv"
        done
    done
}

benchmark_generation () {
    output_path=$1
    mkdir -p "${output_path}"

    python3 profile_generation.py \
        "${turbomind_model_path}" \
        --concurrency 1 16 32 64 \
        --csv "${output_path}/generation.csv"
}

################################# BENCHMARK AFTER TUNING GEMM #################################
# read the model geometry back from config.ini for gemm tuning
head_num=$(crudini --get "${config_path}" llama head_num)
size_per_head=$(crudini --get "${config_path}" llama size_per_head)
vocab_size=$(crudini --get "${config_path}" llama vocab_size)
inter_size=$(crudini --get "${config_path}" llama inter_size)
tensor_para_size=$(crudini --get "${config_path}" llama tensor_para_size)
max_batch_size=$(crudini --get "${config_path}" llama max_batch_size)

echo "$head_num, $size_per_head, $vocab_size, $inter_size, $tensor_para_size, $max_batch_size"

python3 -m lmdeploy.turbomind.generate_gemm_config \
    --head_num "${head_num}" \
    --size_per_head "${size_per_head}" \
    --vocab_size "${vocab_size}" \
    --inter_size "${inter_size}" \
    --tensor_para_size "${tensor_para_size}" \
    --max_batch_size "${max_batch_size}"
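
# generate_gemm_config writes gemm_config.in to the current working directory;
# it is moved into the output folder once the benchmarks finish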

output_path="${workspace_dir}/output/${model_foldername}-tuned-gemm-tp${tp}"
# benchmark request throughput and static inference
benchmark_rpm "${output_path}"
benchmark_generation "${output_path}"

mv gemm_config.in "${output_path}"
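
A minimal invocation sketch, assuming the script above is saved as benchmark_llama2_13b.sh (a hypothetical name; this diff view does not show file names) and the llama2-13b checkpoint is already on disk:

    # convert, tune gemm, and benchmark in one pass
    bash benchmark_llama2_13b.sh /models/llama2-13b
    # CSVs land under <script dir>/output/llama2-13b-tuned-gemm-tp1/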
@@ -0,0 +1,92 @@
#!/bin/bash
if [ -z "$1" ]
then
    echo "Error. Please provide the path of the internlm-20b model."
    exit 1
fi
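
# same pipeline as the llama2-13b script, but with tp=2, the internlm-20b
# converter, cache_max_entry_count=700, and gemm_config.in copied rather than moved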

workspace_dir=$(dirname "$(realpath "$0")")

tp=2
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}/workspace/${model_foldername}"

# convert the model to turbomind format
lmdeploy convert internlm-20b "${model_path}" --dst-path "${turbomind_model_path}" --tp ${tp}
if [ $? -ne 0 ]
then
    exit 1
fi

# update recommended config to config.ini
config_path="${turbomind_model_path}/triton_models/weights/config.ini"

apt-get update
apt-get install -y crudini

crudini --set "${config_path}" llama max_context_token_num 4
crudini --set "${config_path}" llama cache_chunk_size -1
crudini --set "${config_path}" llama cache_max_entry_count 700
crudini --set "${config_path}" llama max_batch_size 128
# end of update config

cd "${workspace_dir}" || exit 1

# download the ShareGPT dataset used as benchmark input
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

benchmark_rpm () {
    output_path=$1
    mkdir -p "${output_path}"

    batches=(64 128)
    for batch in "${batches[@]}"
    do
        for i in {1..3}
        do
            python3 profile_throughput.py \
                ShareGPT_V3_unfiltered_cleaned_split.json \
                "${turbomind_model_path}" \
                --concurrency "$batch" \
                --num_prompts 3000 \
                --csv "${output_path}/rpm_localhost_batch_${batch}_${i}th.csv"
        done
    done
}

benchmark_generation () {
    output_path=$1
    mkdir -p "${output_path}"

    python3 profile_generation.py \
        "${turbomind_model_path}" \
        --concurrency 1 16 32 64 \
        --csv "${output_path}/generation.csv"
}

################################# BENCHMARK AFTER TUNING GEMM #################################
# read the model geometry back from config.ini for gemm tuning
head_num=$(crudini --get "${config_path}" llama head_num)
size_per_head=$(crudini --get "${config_path}" llama size_per_head)
vocab_size=$(crudini --get "${config_path}" llama vocab_size)
inter_size=$(crudini --get "${config_path}" llama inter_size)
tensor_para_size=$(crudini --get "${config_path}" llama tensor_para_size)
max_batch_size=$(crudini --get "${config_path}" llama max_batch_size)

echo "$head_num, $size_per_head, $vocab_size, $inter_size, $tensor_para_size, $max_batch_size"

python3 -m lmdeploy.turbomind.generate_gemm_config \
    --head_num "${head_num}" \
    --size_per_head "${size_per_head}" \
    --vocab_size "${vocab_size}" \
    --inter_size "${inter_size}" \
    --tensor_para_size "${tensor_para_size}" \
    --max_batch_size "${max_batch_size}"

output_path="${workspace_dir}/output/${model_foldername}-tuned-gemm-tp${tp}"
# benchmark request throughput and static inference
benchmark_rpm "${output_path}"
benchmark_generation "${output_path}"

cp gemm_config.in "${output_path}"
@@ -0,0 +1,71 @@
#!/bin/bash
if [ -z "$1" ]
then
    echo "Error. Please provide the path of the llama2-70b model."
    exit 1
fi
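
# 70b variant: tp=4, batch sizes up to 256, a larger kv-cache budget, and no
# gemm tuning step before benchmarking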

workspace_dir=$(dirname "$(realpath "$0")")

tp=4
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}/workspace/${model_foldername}"

# convert the model to turbomind format
lmdeploy convert llama2 "${model_path}" --dst-path "${turbomind_model_path}" --tp ${tp}
if [ $? -ne 0 ]
then
    exit 1
fi

# update recommended config to config.ini
config_path="${turbomind_model_path}/triton_models/weights/config.ini"

apt-get update
apt-get install -y crudini

crudini --set "${config_path}" llama max_context_token_num 4
crudini --set "${config_path}" llama cache_chunk_size -1
crudini --set "${config_path}" llama cache_max_entry_count 4000
crudini --set "${config_path}" llama max_batch_size 256
# end of update config

cd "${workspace_dir}" || exit 1

# download the ShareGPT dataset used as benchmark input
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

benchmark_rpm () {
    output_path=$1
    mkdir -p "${output_path}"

    batches=(64 128 256)
    for batch in "${batches[@]}"
    do
        for i in {1..3}
        do
            python3 profile_throughput.py \
                ShareGPT_V3_unfiltered_cleaned_split.json \
                "${turbomind_model_path}" \
                --concurrency "$batch" \
                --num_prompts 3000 \
                --csv "${output_path}/rpm_localhost_batch_${batch}_${i}th.csv"
        done
    done
}

benchmark_generation () {
    output_path=$1
    mkdir -p "${output_path}"

    python3 profile_generation.py \
        "${turbomind_model_path}" \
        --concurrency 1 64 128 256 \
        --csv "${output_path}/generation.csv"
}

output_path="${workspace_dir}/output/${model_foldername}-tp${tp}"
# benchmark request throughput and static inference
benchmark_rpm "${output_path}"
benchmark_generation "${output_path}"
@@ -0,0 +1,93 @@
#!/bin/bash
if [ -z "$1" ]
then
    echo "Error. Please provide the path of the llama2-7b model."
    exit 1
fi
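
# 7b variant: same flow as the llama2-13b script, with tp=1 and
# cache_max_entry_count=1000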

workspace_dir=$(dirname "$(realpath "$0")")

tp=1
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}/workspace/${model_foldername}"

# convert the model to turbomind format
lmdeploy convert llama2 "${model_path}" --dst-path "${turbomind_model_path}" --tp ${tp}
if [ $? -ne 0 ]
then
    exit 1
fi

# update recommended config to config.ini
config_path="${turbomind_model_path}/triton_models/weights/config.ini"

apt-get update
apt-get install -y crudini

crudini --set "${config_path}" llama max_context_token_num 4
crudini --set "${config_path}" llama cache_chunk_size -1
crudini --set "${config_path}" llama cache_max_entry_count 1000
crudini --set "${config_path}" llama max_batch_size 128
# end of update config

cd "${workspace_dir}" || exit 1

# download the ShareGPT dataset used as benchmark input
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

benchmark_rpm () {
    output_path=$1
    mkdir -p "${output_path}"

    batches=(64 128)
    for batch in "${batches[@]}"
    do
        for i in {1..3}
        do
            python3 profile_throughput.py \
                ShareGPT_V3_unfiltered_cleaned_split.json \
                "${turbomind_model_path}" \
                --concurrency "$batch" \
                --num_prompts 3000 \
                --csv "${output_path}/rpm_localhost_batch_${batch}_${i}th.csv"
        done
    done
}

benchmark_generation () {
    output_path=$1
    mkdir -p "${output_path}"

    python3 profile_generation.py \
        "${turbomind_model_path}" \
        --concurrency 1 16 32 64 \
        --csv "${output_path}/generation.csv"
}

################################# BENCHMARK AFTER TUNING GEMM #################################
output_path="${workspace_dir}/output/${model_foldername}-tuned-gemm-tp${tp}"

# read the model geometry back from config.ini for gemm tuning
head_num=$(crudini --get "${config_path}" llama head_num)
size_per_head=$(crudini --get "${config_path}" llama size_per_head)
vocab_size=$(crudini --get "${config_path}" llama vocab_size)
inter_size=$(crudini --get "${config_path}" llama inter_size)
tensor_para_size=$(crudini --get "${config_path}" llama tensor_para_size)
max_batch_size=$(crudini --get "${config_path}" llama max_batch_size)

echo "$head_num, $size_per_head, $vocab_size, $inter_size, $tensor_para_size, $max_batch_size"

python3 -m lmdeploy.turbomind.generate_gemm_config \
    --head_num "${head_num}" \
    --size_per_head "${size_per_head}" \
    --vocab_size "${vocab_size}" \
    --inter_size "${inter_size}" \
    --tensor_para_size "${tensor_para_size}" \
    --max_batch_size "${max_batch_size}"

# benchmark request throughput and static inference
benchmark_rpm "${output_path}"
benchmark_generation "${output_path}"

mv gemm_config.in "${output_path}"