Report the inference benchmark of models of different sizes (#794)
* update test scripts for models with different sizes

* update

* only test after tuning gemm

* chmod +x

* fix typo

* benchmark on a100

* fix typo

* fix typo

* per-token latency percentile in profile_throughput

* fix

* fix

* rename

* make the script accept parameters

* minor fix

* indent

* reformat table

* change to 3000

* minor fix
lvhan028 authored Dec 6, 2023
1 parent 5b9e454 commit ebe90bc
Showing 9 changed files with 556 additions and 49 deletions.
92 changes: 92 additions & 0 deletions benchmark/benchmark_13b.sh
@@ -0,0 +1,92 @@
#!/bin/bash
if [ -z "$1" ]
then
echo "Error. Please input the model path of llama2-13b model"
exit 1
fi

workspace_dir=$(dirname $(realpath "$0"))

tp=1
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}"/workspace/"${model_foldername}"

# convert
lmdeploy convert llama2 ${model_path} --dst-path ${turbomind_model_path} --tp ${tp}
if [ $? != 0 ]
then
    exit 1
fi

# update recommended config to config.ini
config_path=${turbomind_model_path}/triton_models/weights/config.ini

apt-get update
apt-get install crudini -y

crudini --set ${config_path} llama max_context_token_num 4
crudini --set ${config_path} llama cache_chunk_size -1
crudini --set ${config_path} llama cache_max_entry_count 500
crudini --set ${config_path} llama max_batch_size 128
# end of update config

cd ${workspace_dir}

# download dataset
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

benchmark_rpm () {
    output_path=$1
    mkdir -p "${output_path}"

    batches=(64 128)
    for batch in "${batches[@]}"
    do
        for i in {1..3}
        do
            python3 profile_throughput.py \
                ShareGPT_V3_unfiltered_cleaned_split.json \
                ${turbomind_model_path} \
                --concurrency "$batch" \
                --num_prompts 3000 \
                --csv ${output_path}/rpm_localhost_batch_"${batch}"_"${i}"th.csv
        done
    done
}

benchmark_generation () {
    output_path=$1
    mkdir -p "${output_path}"

    python3 profile_generation.py \
        ${turbomind_model_path} \
        --concurrency 1 16 32 64 \
        --csv ${output_path}/generation.csv
}

################################# BENCHMARK AFTER TUNING GEMM #################################
# tune gemm
head_num=$(crudini --get "${config_path}" llama head_num)
size_per_head=$(crudini --get "${config_path}" llama size_per_head)
vocab_size=$(crudini --get "${config_path}" llama vocab_size)
inter_size=$(crudini --get "${config_path}" llama inter_size)
tensor_para_size=$(crudini --get "${config_path}" llama tensor_para_size)
max_batch_size=$(crudini --get "${config_path}" llama max_batch_size)

echo $head_num, $size_per_head, $vocab_size, $inter_size, $tensor_para_size, $max_batch_size

python3 -m lmdeploy.turbomind.generate_gemm_config \
    --head_num ${head_num} \
    --size_per_head ${size_per_head} \
    --vocab_size ${vocab_size} \
    --inter_size ${inter_size} \
    --tensor_para_size ${tensor_para_size} \
    --max_batch_size ${max_batch_size}

output_path="${workspace_dir}"/output/"${model_foldername}"-tunned-gemm-tp"${tp}"
# benchmark request throughput and static inference
benchmark_rpm ${output_path}
benchmark_generation ${output_path}

mv gemm_config.in ${output_path}
92 changes: 92 additions & 0 deletions benchmark/benchmark_20b.sh
@@ -0,0 +1,92 @@
#!/bin/bash
if [ -z "$1" ]
then
echo "Error. Please input the model path of internlm-20b model"
exit 1
fi

workspace_dir=$(dirname $(realpath "$0"))

tp=2
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}"/workspace/"${model_foldername}"

# convert
lmdeploy convert internlm-20b ${model_path} --dst-path ${turbomind_model_path} --tp ${tp}
if [ $? != 0 ]
then
    exit 1
fi

# update recommended config to config.ini
config_path=${turbomind_model_path}/triton_models/weights/config.ini

apt-get update
apt-get install crudini -y

crudini --set ${config_path} llama max_context_token_num 4
crudini --set ${config_path} llama cache_chunk_size -1
crudini --set ${config_path} llama cache_max_entry_count 700
crudini --set ${config_path} llama max_batch_size 128
# end of update config

cd ${workspace_dir}

# download dataset
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

benchmark_rpm () {
    output_path=$1
    mkdir -p "${output_path}"

    batches=(64 128)
    for batch in "${batches[@]}"
    do
        for i in {1..3}
        do
            python3 profile_throughput.py \
                ShareGPT_V3_unfiltered_cleaned_split.json \
                ${turbomind_model_path} \
                --concurrency "$batch" \
                --num_prompts 3000 \
                --csv ${output_path}/rpm_localhost_batch_"${batch}"_"${i}"th.csv
        done
    done
}

benchmark_generation () {
    output_path=$1
    mkdir -p "${output_path}"

    python3 profile_generation.py \
        ${turbomind_model_path} \
        --concurrency 1 16 32 64 \
        --csv ${output_path}/generation.csv
}

################################# BENCHMARK AFTER TUNING GEMM #################################
# tune gemm
head_num=$(crudini --get "${config_path}" llama head_num)
size_per_head=$(crudini --get "${config_path}" llama size_per_head)
vocab_size=$(crudini --get "${config_path}" llama vocab_size)
inter_size=$(crudini --get "${config_path}" llama inter_size)
tensor_para_size=$(crudini --get "${config_path}" llama tensor_para_size)
max_batch_size=$(crudini --get "${config_path}" llama max_batch_size)

echo $head_num, $size_per_head, $vocab_size, $inter_size, $tensor_para_size, $max_batch_size

python3 -m lmdeploy.turbomind.generate_gemm_config \
    --head_num ${head_num} \
    --size_per_head ${size_per_head} \
    --vocab_size ${vocab_size} \
    --inter_size ${inter_size} \
    --tensor_para_size ${tensor_para_size} \
    --max_batch_size ${max_batch_size}

output_path="${workspace_dir}"/output/"${model_foldername}"-tunned-gemm-tp"${tp}"
# benchmark request throughput and static inference
benchmark_rpm ${output_path}
benchmark_generation ${output_path}

cp gemm_config.in ${output_path}
71 changes: 71 additions & 0 deletions benchmark/benchmark_70b.sh
@@ -0,0 +1,71 @@
#!/bin/bash
if [ -z "$1" ]
then
echo "Error. Please input the model path of llama2-70b model"
exit 1
fi

workspace_dir=$(dirname $(realpath "$0"))

tp=4
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}"/workspace/"${model_foldername}"

# convert
lmdeploy convert llama2 ${model_path} --dst-path ${turbomind_model_path} --tp ${tp}
if [ $? != 0 ]
then
    exit 1
fi

# update recommended config to config.ini
config_path=${turbomind_model_path}/triton_models/weights/config.ini

apt-get update
apt-get install crudini -y

crudini --set ${config_path} llama max_context_token_num 4
crudini --set ${config_path} llama cache_chunk_size -1
crudini --set ${config_path} llama cache_max_entry_count 4000
crudini --set ${config_path} llama max_batch_size 256
# end of update config

cd ${workspace_dir}

# download dataset
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

benchmark_rpm () {
    output_path=$1
    mkdir -p "${output_path}"

    batches=(64 128 256)
    for batch in "${batches[@]}"
    do
        for i in {1..3}
        do
            python3 profile_throughput.py \
                ShareGPT_V3_unfiltered_cleaned_split.json \
                ${turbomind_model_path} \
                --concurrency "$batch" \
                --num_prompts 3000 \
                --csv ${output_path}/rpm_localhost_batch_"${batch}"_"${i}"th.csv
        done
    done
}

benchmark_generation () {
    output_path=$1
    mkdir -p "${output_path}"

    python3 profile_generation.py \
        ${turbomind_model_path} \
        --concurrency 1 64 128 256 \
        --csv ${output_path}/generation.csv
}

output_path="${workspace_dir}"/output/"${model_foldername}"-tp"${tp}"
# benchmark request throughput and static inference
benchmark_rpm ${output_path}
benchmark_generation ${output_path}
93 changes: 93 additions & 0 deletions benchmark/benchmark_7b.sh
@@ -0,0 +1,93 @@
#!/bin/bash
if [ -z "$1" ]
then
echo "Error. Please input the model path of llama2-7b model"
exit 1
fi

workspace_dir=$(dirname $(realpath "$0"))

tp=1
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}"/workspace/"${model_foldername}"

# convert
lmdeploy convert llama2 ${model_path} --dst-path ${turbomind_model_path} --tp ${tp}
if [ $? != 0 ]
then
    exit 1
fi

# update recommended config to config.ini
config_path=${turbomind_model_path}/triton_models/weights/config.ini

apt-get update
apt-get install crudini -y

crudini --set ${config_path} llama max_context_token_num 4
crudini --set ${config_path} llama cache_chunk_size -1
crudini --set ${config_path} llama cache_max_entry_count 1000
crudini --set ${config_path} llama max_batch_size 128
# end of update config

cd ${workspace_dir}

# download dataset
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

benchmark_rpm () {
    output_path=$1
    mkdir -p "${output_path}"

    batches=(64 128)
    for batch in "${batches[@]}"
    do
        for i in {1..3}
        do
            python3 profile_throughput.py \
                ShareGPT_V3_unfiltered_cleaned_split.json \
                ${turbomind_model_path} \
                --concurrency "$batch" \
                --num_prompts 3000 \
                --csv ${output_path}/rpm_localhost_batch_"${batch}"_"${i}"th.csv
        done
    done
}

benchmark_generation () {
    output_path=$1
    mkdir -p "${output_path}"

    python3 profile_generation.py \
        ${turbomind_model_path} \
        --concurrency 1 16 32 64 \
        --csv ${output_path}/generation.csv
}

################################# BENCHMARK AFTER TUNING GEMM #################################
output_path="${workspace_dir}"/output/"${model_foldername}"-tunned-gemm-tp"${tp}"

# tune gemm
head_num=$(crudini --get "${config_path}" llama head_num)
size_per_head=$(crudini --get "${config_path}" llama size_per_head)
vocab_size=$(crudini --get "${config_path}" llama vocab_size)
inter_size=$(crudini --get "${config_path}" llama inter_size)
tensor_para_size=$(crudini --get "${config_path}" llama tensor_para_size)
max_batch_size=$(crudini --get "${config_path}" llama max_batch_size)

echo $head_num, $size_per_head, $vocab_size, $inter_size, $tensor_para_size, $max_batch_size

python3 -m lmdeploy.turbomind.generate_gemm_config \
    --head_num ${head_num} \
    --size_per_head ${size_per_head} \
    --vocab_size ${vocab_size} \
    --inter_size ${inter_size} \
    --tensor_para_size ${tensor_para_size} \
    --max_batch_size ${max_batch_size}

# benchmark request throughput and static inference
benchmark_rpm ${output_path}
benchmark_generation ${output_path}

mv gemm_config.in ${output_path}
10 changes: 5 additions & 5 deletions benchmark/profile_generation.py
@@ -29,14 +29,14 @@ def infer(model, session_id: int, input_ids: List, output_seqlen: int,
     for _ in range(test_round):
         token_latency_stats = [0] * (output_seqlen + 1)
         prev = time.perf_counter()
-        n_pre_token = 0
+        n_prev_token = 0
         """
         The iterator provided by `stream_infer` denotes the number of generated tokens so far,
         which is represented by the variable `n_token`.
         Please note that `n_token` is not a continuous value. In other words, during the iteration,
         its value might be 5, 7, 8, 16, and so on, rather than 1, 2, 3, 4, etc.
         So, it is quite difficult to get the latency of each generated token.
-        As a work-around, we set the latency `new-prev` of each iteration to the first token of
+        As a work-around, we set the latency `now-prev` of each iteration to the first token of
         the new generated tokens, and leave the latency of the rest tokens being 0.
         For example, in the first iteration, 5 tokens are generated.
         The time elapsing in this iteration `now-prev` is set to the latency of first token of
@@ -54,9 +54,9 @@ def infer(model, session_id: int, input_ids: List, output_seqlen: int,
                                             temperature=temperature):
             _, n_token = outputs[0]
             now = time.perf_counter()
-            if n_pre_token != n_token:
-                token_latency_stats[n_pre_token] = np.round(now - prev, 3)
-                n_pre_token = n_token
+            if n_prev_token != n_token:
+                token_latency_stats[n_prev_token] = np.round(now - prev, 3)
+                n_prev_token = n_token
             prev = now
         if session_id == 1:
             pbar.update(1)
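
The docstring above explains the work-around: `stream_infer` reports a cumulative token count that can jump by several tokens per iteration, so each iteration's elapsed time is assigned to the first newly generated token while the remaining tokens of that iteration keep a latency of 0. Below is a minimal, self-contained sketch of that bookkeeping; `fake_stream` is a hypothetical stand-in for the real `stream_infer` iterator, and the percentile line at the end is only illustrative, not the exact statistic reported by the benchmark scripts.

import time

import numpy as np


def fake_stream(total_tokens=16, step=5, delay=0.01):
    # Hypothetical stand-in for `stream_infer`: yields the cumulative token
    # count, which may jump by several tokens per iteration (e.g. 5, 10, 15, 16).
    n = 0
    while n < total_tokens:
        time.sleep(delay)
        n = min(n + step, total_tokens)
        yield n


output_seqlen = 16
token_latency_stats = [0] * (output_seqlen + 1)
prev = time.perf_counter()
n_prev_token = 0
for n_token in fake_stream(output_seqlen):
    now = time.perf_counter()
    if n_prev_token != n_token:
        # attribute this iteration's latency to the first new token;
        # the other tokens generated in the same iteration keep latency 0
        token_latency_stats[n_prev_token] = np.round(now - prev, 3)
        n_prev_token = n_token
    prev = now

# illustrative per-token latency percentiles (zeros included)
print([np.round(np.percentile(token_latency_stats, q), 3) for q in (50, 75, 95, 99)])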