Merge remote-tracking branch 'origin/main' into vl2
Showing 21 changed files with 837 additions and 259 deletions.
@@ -0,0 +1,92 @@
#!/bin/bash
if [ -z "$1" ]
then
    echo "Error. Please provide the path of the llama2-13b model."
    exit 1
fi

workspace_dir=$(dirname "$(realpath "$0")")

tp=1
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}/workspace/${model_foldername}"

# convert the model to turbomind format
lmdeploy convert llama2 "${model_path}" --dst-path "${turbomind_model_path}" --tp ${tp}
if [ $? -ne 0 ]
then
    exit 1
fi

# update recommended config to config.ini
config_path="${turbomind_model_path}/triton_models/weights/config.ini"

apt-get update
apt-get install -y crudini

crudini --set "${config_path}" llama max_context_token_num 4
crudini --set "${config_path}" llama cache_chunk_size -1
crudini --set "${config_path}" llama cache_max_entry_count 500
crudini --set "${config_path}" llama max_batch_size 128
# end of update config
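
# note: crudini --set writes each value as a "key = value" entry under the
# [llama] section of config.ini, creating the key if it does not yet exist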

cd "${workspace_dir}" || exit 1

# download the ShareGPT dataset used as benchmark input
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
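
# benchmark_rpm replays ShareGPT prompts through profile_throughput.py to
# measure request throughput; benchmark_generation runs profile_generation.py
# to measure static inference speed at fixed concurrency levels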
benchmark_rpm () {
    output_path=$1
    mkdir -p "${output_path}"

    batches=(64 128)
    for batch in "${batches[@]}"
    do
        for i in {1..3}
        do
            python3 profile_throughput.py \
                ShareGPT_V3_unfiltered_cleaned_split.json \
                "${turbomind_model_path}" \
                --concurrency "$batch" \
                --num_prompts 3000 \
                --csv "${output_path}/rpm_localhost_batch_${batch}_${i}th.csv"
        done
    done
}

benchmark_generation () {
    output_path=$1
    mkdir -p "${output_path}"

    python3 profile_generation.py \
        "${turbomind_model_path}" \
        --concurrency 1 16 32 64 \
        --csv "${output_path}/generation.csv"
}

################################# BENCHMARK AFTER TUNING GEMM #################################
# read the model geometry back from config.ini for gemm tuning
head_num=$(crudini --get "${config_path}" llama head_num)
size_per_head=$(crudini --get "${config_path}" llama size_per_head)
vocab_size=$(crudini --get "${config_path}" llama vocab_size)
inter_size=$(crudini --get "${config_path}" llama inter_size)
tensor_para_size=$(crudini --get "${config_path}" llama tensor_para_size)
max_batch_size=$(crudini --get "${config_path}" llama max_batch_size)

echo "$head_num, $size_per_head, $vocab_size, $inter_size, $tensor_para_size, $max_batch_size"

python3 -m lmdeploy.turbomind.generate_gemm_config \
    --head_num "${head_num}" \
    --size_per_head "${size_per_head}" \
    --vocab_size "${vocab_size}" \
    --inter_size "${inter_size}" \
    --tensor_para_size "${tensor_para_size}" \
    --max_batch_size "${max_batch_size}"
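
# generate_gemm_config writes gemm_config.in to the current working directory;
# it is moved into the output folder once the benchmarks finish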

output_path="${workspace_dir}/output/${model_foldername}-tuned-gemm-tp${tp}"
# benchmark request throughput and static inference
benchmark_rpm "${output_path}"
benchmark_generation "${output_path}"

mv gemm_config.in "${output_path}"
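
A minimal invocation sketch, assuming the script above is saved as benchmark_llama2_13b.sh (a hypothetical name; this diff view does not show file names) and the llama2-13b checkpoint is already on disk:

    # convert, tune gemm, and benchmark in one pass
    bash benchmark_llama2_13b.sh /models/llama2-13b
    # CSVs land under <script dir>/output/llama2-13b-tuned-gemm-tp1/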
@@ -0,0 +1,92 @@
#!/bin/bash
if [ -z "$1" ]
then
    echo "Error. Please provide the path of the internlm-20b model."
    exit 1
fi
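
# same pipeline as the llama2-13b script, but with tp=2, the internlm-20b
# converter, cache_max_entry_count=700, and gemm_config.in copied rather than moved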

workspace_dir=$(dirname "$(realpath "$0")")

tp=2
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}/workspace/${model_foldername}"

# convert the model to turbomind format
lmdeploy convert internlm-20b "${model_path}" --dst-path "${turbomind_model_path}" --tp ${tp}
if [ $? -ne 0 ]
then
    exit 1
fi

# update recommended config to config.ini
config_path="${turbomind_model_path}/triton_models/weights/config.ini"

apt-get update
apt-get install -y crudini

crudini --set "${config_path}" llama max_context_token_num 4
crudini --set "${config_path}" llama cache_chunk_size -1
crudini --set "${config_path}" llama cache_max_entry_count 700
crudini --set "${config_path}" llama max_batch_size 128
# end of update config

cd "${workspace_dir}" || exit 1

# download the ShareGPT dataset used as benchmark input
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

benchmark_rpm () {
    output_path=$1
    mkdir -p "${output_path}"

    batches=(64 128)
    for batch in "${batches[@]}"
    do
        for i in {1..3}
        do
            python3 profile_throughput.py \
                ShareGPT_V3_unfiltered_cleaned_split.json \
                "${turbomind_model_path}" \
                --concurrency "$batch" \
                --num_prompts 3000 \
                --csv "${output_path}/rpm_localhost_batch_${batch}_${i}th.csv"
        done
    done
}

benchmark_generation () {
    output_path=$1
    mkdir -p "${output_path}"

    python3 profile_generation.py \
        "${turbomind_model_path}" \
        --concurrency 1 16 32 64 \
        --csv "${output_path}/generation.csv"
}

################################# BENCHMARK AFTER TUNING GEMM #################################
# read the model geometry back from config.ini for gemm tuning
head_num=$(crudini --get "${config_path}" llama head_num)
size_per_head=$(crudini --get "${config_path}" llama size_per_head)
vocab_size=$(crudini --get "${config_path}" llama vocab_size)
inter_size=$(crudini --get "${config_path}" llama inter_size)
tensor_para_size=$(crudini --get "${config_path}" llama tensor_para_size)
max_batch_size=$(crudini --get "${config_path}" llama max_batch_size)

echo "$head_num, $size_per_head, $vocab_size, $inter_size, $tensor_para_size, $max_batch_size"

python3 -m lmdeploy.turbomind.generate_gemm_config \
    --head_num "${head_num}" \
    --size_per_head "${size_per_head}" \
    --vocab_size "${vocab_size}" \
    --inter_size "${inter_size}" \
    --tensor_para_size "${tensor_para_size}" \
    --max_batch_size "${max_batch_size}"

output_path="${workspace_dir}/output/${model_foldername}-tuned-gemm-tp${tp}"
# benchmark request throughput and static inference
benchmark_rpm "${output_path}"
benchmark_generation "${output_path}"

cp gemm_config.in "${output_path}"
@@ -0,0 +1,71 @@
#!/bin/bash
if [ -z "$1" ]
then
    echo "Error. Please provide the path of the llama2-70b model."
    exit 1
fi
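
# 70b variant: tp=4, batch sizes up to 256, a larger kv-cache budget, and no
# gemm tuning step before benchmarking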

workspace_dir=$(dirname "$(realpath "$0")")

tp=4
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}/workspace/${model_foldername}"

# convert the model to turbomind format
lmdeploy convert llama2 "${model_path}" --dst-path "${turbomind_model_path}" --tp ${tp}
if [ $? -ne 0 ]
then
    exit 1
fi

# update recommended config to config.ini
config_path="${turbomind_model_path}/triton_models/weights/config.ini"

apt-get update
apt-get install -y crudini

crudini --set "${config_path}" llama max_context_token_num 4
crudini --set "${config_path}" llama cache_chunk_size -1
crudini --set "${config_path}" llama cache_max_entry_count 4000
crudini --set "${config_path}" llama max_batch_size 256
# end of update config

cd "${workspace_dir}" || exit 1

# download the ShareGPT dataset used as benchmark input
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

benchmark_rpm () {
    output_path=$1
    mkdir -p "${output_path}"

    batches=(64 128 256)
    for batch in "${batches[@]}"
    do
        for i in {1..3}
        do
            python3 profile_throughput.py \
                ShareGPT_V3_unfiltered_cleaned_split.json \
                "${turbomind_model_path}" \
                --concurrency "$batch" \
                --num_prompts 3000 \
                --csv "${output_path}/rpm_localhost_batch_${batch}_${i}th.csv"
        done
    done
}

benchmark_generation () {
    output_path=$1
    mkdir -p "${output_path}"

    python3 profile_generation.py \
        "${turbomind_model_path}" \
        --concurrency 1 64 128 256 \
        --csv "${output_path}/generation.csv"
}

output_path="${workspace_dir}/output/${model_foldername}-tp${tp}"
# benchmark request throughput and static inference
benchmark_rpm "${output_path}"
benchmark_generation "${output_path}"
@@ -0,0 +1,93 @@
#!/bin/bash
if [ -z "$1" ]
then
    echo "Error. Please provide the path of the llama2-7b model."
    exit 1
fi
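
# 7b variant: same flow as the llama2-13b script, with tp=1 and
# cache_max_entry_count=1000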

workspace_dir=$(dirname "$(realpath "$0")")

tp=1
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}/workspace/${model_foldername}"

# convert the model to turbomind format
lmdeploy convert llama2 "${model_path}" --dst-path "${turbomind_model_path}" --tp ${tp}
if [ $? -ne 0 ]
then
    exit 1
fi

# update recommended config to config.ini
config_path="${turbomind_model_path}/triton_models/weights/config.ini"

apt-get update
apt-get install -y crudini

crudini --set "${config_path}" llama max_context_token_num 4
crudini --set "${config_path}" llama cache_chunk_size -1
crudini --set "${config_path}" llama cache_max_entry_count 1000
crudini --set "${config_path}" llama max_batch_size 128
# end of update config

cd "${workspace_dir}" || exit 1

# download the ShareGPT dataset used as benchmark input
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

benchmark_rpm () {
    output_path=$1
    mkdir -p "${output_path}"

    batches=(64 128)
    for batch in "${batches[@]}"
    do
        for i in {1..3}
        do
            python3 profile_throughput.py \
                ShareGPT_V3_unfiltered_cleaned_split.json \
                "${turbomind_model_path}" \
                --concurrency "$batch" \
                --num_prompts 3000 \
                --csv "${output_path}/rpm_localhost_batch_${batch}_${i}th.csv"
        done
    done
}

benchmark_generation () {
    output_path=$1
    mkdir -p "${output_path}"

    python3 profile_generation.py \
        "${turbomind_model_path}" \
        --concurrency 1 16 32 64 \
        --csv "${output_path}/generation.csv"
}

################################# BENCHMARK AFTER TUNING GEMM #################################
output_path="${workspace_dir}/output/${model_foldername}-tuned-gemm-tp${tp}"

# read the model geometry back from config.ini for gemm tuning
head_num=$(crudini --get "${config_path}" llama head_num)
size_per_head=$(crudini --get "${config_path}" llama size_per_head)
vocab_size=$(crudini --get "${config_path}" llama vocab_size)
inter_size=$(crudini --get "${config_path}" llama inter_size)
tensor_para_size=$(crudini --get "${config_path}" llama tensor_para_size)
max_batch_size=$(crudini --get "${config_path}" llama max_batch_size)

echo "$head_num, $size_per_head, $vocab_size, $inter_size, $tensor_para_size, $max_batch_size"

python3 -m lmdeploy.turbomind.generate_gemm_config \
    --head_num "${head_num}" \
    --size_per_head "${size_per_head}" \
    --vocab_size "${vocab_size}" \
    --inter_size "${inter_size}" \
    --tensor_para_size "${tensor_para_size}" \
    --max_batch_size "${max_batch_size}"

# benchmark request throughput and static inference
benchmark_rpm "${output_path}"
benchmark_generation "${output_path}"

mv gemm_config.in "${output_path}"