Skip to content

Commit 8258be5

Browse files
[ci] refactor longtext benchmark (#4087)
* update
* update
* update
* Update Docker tag from cuda12.4 to cuda12.8
* update
* update
* update
* update
* update
* add longtext benchmark into workflow
* update
* update
* update
* update
* update
* fix
* update
* update
* update
* update
* update
* update
* add ascend config
* update
* update
* update
* update timeout and ascend config
* update
* update

---------

Co-authored-by: littlegy <[email protected]>
1 parent e3708ef commit 8258be5

27 files changed

+560
-242
lines changed

.github/scripts/action_tools.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -229,7 +229,7 @@ def generate_benchmark_report(report_path: str):
229229
for f in csv_files:
230230
df = pd.read_csv(f)
231231
merged_df = pd.concat([merged_df, df], ignore_index=True)
232-
if 'throughput' in backend_subfolder:
232+
if 'throughput' in backend_subfolder or 'longtext' in backend_subfolder:
233233
merged_df = merged_df.sort_values(by=merged_df.columns[1])
234234

235235
grouped_df = merged_df.groupby(merged_df.columns[1])

.github/workflows/api_eval.yml

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,8 @@ on:
3737
env:
3838
HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache
3939
HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai
40-
OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }}
4140
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
42-
REPORT_DIR: /nvme/qa_test_models/test-reports/${{ github.run_id }}
41+
REPORT_DIR: /nvme/qa_test_models/evaluation-reports/allure_report/${{ github.run_id }}
4342
COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy
4443
FAIL_CONFIG: '--lf'
4544
TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ github.run_id }}
@@ -62,8 +61,8 @@ jobs:
6261
env:
6362
PYTHON_VERSION: ${{ matrix.pyver }}
6463
PLAT_NAME: manylinux2014_x86_64
65-
DOCKER_TAG: cuda12.4
66-
OUTPUT_FOLDER: cuda12.4_dist_${{ github.run_id }}
64+
DOCKER_TAG: cuda12.8
65+
OUTPUT_FOLDER: cuda12.8_dist_${{ github.run_id }}
6766
steps:
6867
- name: Checkout repository
6968
uses: actions/checkout@v3
@@ -98,18 +97,17 @@ jobs:
9897
matrix:
9998
backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}}
10099
container:
101-
image: openmmlab/lmdeploy:latest-cu12
100+
image: openmmlab/lmdeploy:latest-cu12.8
102101
options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
103102
volumes:
104103
- /nvme/github-actions/pip-cache:/root/.cache/pip
105104
- /nvme/github-actions/packages:/root/packages
106105
- /nvme/github-actions/resources:/root/resources
107106
- /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports
108107
- /nvme/qa_test_models:/nvme/qa_test_models
109-
- /mnt/shared:/mnt/shared
110-
- /mnt/bigdisk:/mnt/bigdisk
108+
- /nvme/huggingface_hub:/nvme/huggingface_hub
109+
- /mnt/121:/mnt/121
111110
- /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
112-
- /mnt/187:/mnt/187
113111
steps:
114112
- name: Create and change to _wk directory
115113
run: |
@@ -138,9 +136,8 @@ jobs:
138136
run: |
139137
python3 -m pip list
140138
lmdeploy check_env
141-
rm -rf allure-results
142-
mkdir -p ${{ env.REPORT_DIR }}/.pytest_cache
143-
ln -s ${{ env.REPORT_DIR }}/.pytest_cache autotest
139+
mkdir ${{env.REPORT_DIR}} -p
140+
echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt
144141
- name: Setup paths for evaluation
145142
if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind')
146143
run: |
@@ -161,5 +158,6 @@ jobs:
161158
- name: Clear workspace
162159
if: always()
163160
run: |
161+
echo "status=done" >> ${{env.REPORT_DIR}}/status.txt
164162
export workdir=$(pwd)
165163
rm -rf $workdir/*

.github/workflows/api_eval_h800.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ env:
3939
HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai
4040
OUTPUT_FOLDER: cuda12.8_dist_${{ github.run_id }}
4141
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
42-
REPORT_DIR: /nvme/qa_test_models/test-reports/${{ github.run_id }}
42+
REPORT_DIR: /nvme/qa_test_models/evaluation-reports/allure_report/${{ github.run_id }}
4343
COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy
4444
FAIL_CONFIG: '--lf'
4545
TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ github.run_id }}
@@ -142,9 +142,8 @@ jobs:
142142
run: |
143143
python3 -m pip list
144144
lmdeploy check_env
145-
rm -rf allure-results
146-
mkdir -p ${{ env.REPORT_DIR }}/.pytest_cache
147-
ln -s ${{ env.REPORT_DIR }}/.pytest_cache autotest
145+
mkdir ${{env.REPORT_DIR}} -p
146+
echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt
148147
- name: Setup paths for evaluation
149148
if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind')
150149
run: |
@@ -165,5 +164,6 @@ jobs:
165164
- name: Clear workspace
166165
if: always()
167166
run: |
167+
echo "status=done" >> ${{env.REPORT_DIR}}/status.txt
168168
export workdir=$(pwd)
169169
rm -rf $workdir/*

.github/workflows/benchmark.yml

Lines changed: 64 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,9 @@ on:
1515
default: 'main'
1616
benchmark_type:
1717
required: true
18-
description: 'Set benchmark type. Default is "["generation", "throughput", "api_server"]"'
18+
description: 'Set benchmark type. Default is "["longtext", "throughput", "api_server"]"'
1919
type: string
20-
default: "['apiserver', 'throughput']"
20+
default: "['apiserver', 'throughput', 'longtext']"
2121
offline_mode:
2222
required: true
2323
description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself'
@@ -27,10 +27,12 @@ on:
2727
env:
2828
HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache
2929
HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai
30-
OUTPUT_FOLDER: cuda12.4_dist_${{ github.run_id }}
30+
OUTPUT_FOLDER: cuda12.8_dist_${{ github.run_id }}
3131
REPORT_DIR: /nvme/qa_test_models/benchmark-reports/${{ github.run_id }}
32+
ALLURE_REPORT_DIR: /nvme/qa_test_models/benchmark-reports/allure_report/${{ github.run_id }}
33+
TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ github.run_id }}
34+
OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy
3235
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
33-
FAIL_CONFIG: ${{ github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}}
3436

3537
jobs:
3638
linux-build:
@@ -42,7 +44,7 @@ jobs:
4244
env:
4345
PYTHON_VERSION: ${{ matrix.pyver }}
4446
PLAT_NAME: manylinux2014_x86_64
45-
DOCKER_TAG: cuda12.4
47+
DOCKER_TAG: cuda12.8
4648
steps:
4749
- name: Checkout repository
4850
uses: actions/checkout@v3
@@ -67,25 +69,16 @@ jobs:
6769
retention-days: 1
6870
name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }}
6971

70-
71-
benchmark:
72+
download_pkgs:
7273
needs: linux-build
73-
if: ${{github.event_name == 'schedule' || !cancelled()}}
74+
if: ${{!cancelled()}}
7475
runs-on: [self-hosted, linux-a100]
75-
strategy:
76-
fail-fast: false
77-
matrix:
78-
benchmark_type: ${{fromJSON(github.event.inputs.benchmark_type)}}
79-
timeout-minutes: 480
76+
timeout-minutes: 50
8077
container:
8178
image: openmmlab/lmdeploy:latest-cu12.8
8279
options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
8380
volumes:
84-
- /nvme/github-actions/pip-cache:/root/.cache/pip
85-
- /nvme/github-actions/packages:/root/packages
8681
- /nvme/qa_test_models:/nvme/qa_test_models
87-
- /mnt/shared:/mnt/shared
88-
- /mnt/bigdisk:/mnt/bigdisk
8982
- /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
9083
steps:
9184
- name: Clone repository
@@ -94,42 +87,82 @@ jobs:
9487
with:
9588
repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }}
9689
ref: ${{github.event.inputs.repo_ref || 'main'}}
90+
- name: Copy repository
91+
if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
92+
run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}}
9793
- name: Copy repository - offline
9894
if: ${{inputs.offline_mode}}
99-
run: cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/. .
95+
run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. ${{env.TEST_CODE_PATH}}
10096
- name: Download Artifacts
10197
if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
10298
uses: actions/download-artifact@v4
10399
with:
104100
name: my-artifact-${{ github.run_id }}-py310
101+
- name: Copy Artifacts
102+
if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
103+
run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}}
104+
- name: Copy Artifacts - offline
105+
if: ${{inputs.offline_mode}}
106+
run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp ${{env.OFFLINE_CODE_PATH}}/lmdeploy-*.whl ${{env.TEST_CODE_PATH}}
107+
- name: Mark as start
108+
run: |
109+
mkdir ${{env.REPORT_DIR}} -p
110+
echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt
111+
112+
benchmark:
113+
needs: download_pkgs
114+
if: ${{github.event_name == 'schedule' || !cancelled()}}
115+
runs-on: [self-hosted, linux-a100]
116+
strategy:
117+
fail-fast: false
118+
matrix:
119+
benchmark_type: ${{fromJSON(github.event.inputs.benchmark_type)}}
120+
gpu_num: ['gpu_num_1', 'gpu_num_2', 'gpu_num_4', 'gpu_num_8']
121+
include:
122+
- n: 8
123+
gpu_num: gpu_num_1
124+
- n: 4
125+
gpu_num: gpu_num_2
126+
- n: 2
127+
gpu_num: gpu_num_4
128+
- n: 1
129+
gpu_num: gpu_num_8
130+
timeout-minutes: 480
131+
container:
132+
image: openmmlab/lmdeploy:latest-cu12
133+
options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
134+
volumes:
135+
- /nvme/github-actions/pip-cache:/root/.cache/pip
136+
- /nvme/github-actions/packages:/root/packages
137+
- /nvme/qa_test_models:/nvme/qa_test_models
138+
- /nvme/huggingface_hub:/nvme/huggingface_hub
139+
- /mnt/121:/mnt/121
140+
- /mnt/bigdisk:/mnt/bigdisk
141+
- /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
142+
steps:
143+
- name: Copy repository and Artifacts
144+
run: |
145+
cp -r ${{env.TEST_CODE_PATH}}/. .
146+
mkdir ${{env.REPORT_DIR}} -p
147+
echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt
105148
- name: Install lmdeploy - dependency
106149
run: |
107150
python3 -m pip install -r /nvme/qa_test_models/offline_pkg/requirements.txt
108151
- name: Install lmdeploy
109-
if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
110152
run: |
111153
python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps
112154
python3 -m pip install -r requirements/test.txt
113-
- name: Install lmdeploy - offline
114-
if: ${{inputs.offline_mode}}
115-
run: |
116-
python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl --no-deps
117-
python3 -m pip install -r requirements/test.txt
118155
- name: Check env
119156
run: |
120157
python3 -m pip list
121158
lmdeploy check_env
122-
mkdir ${{env.REPORT_DIR}}/allure-results/.pytest_cache -p
123-
ln -s ${{env.REPORT_DIR}}/allure-results/.pytest_cache autotest
124159
- name: Run other benchmark
125160
run: |
126-
pytest autotest/benchmark/test_${{matrix.benchmark_type}}_performance.py -n 8 --run_id ${{ github.run_id }} -m 'gpu_num_1 and not pr_test' ${{env.FAIL_CONFIG}} --alluredir=${{env.REPORT_DIR}}/allure-results || true
127-
pytest autotest/benchmark/test_${{matrix.benchmark_type}}_performance.py -n 4 --run_id ${{ github.run_id }} -m 'gpu_num_2 and not pr_test' ${{env.FAIL_CONFIG}} --alluredir=${{env.REPORT_DIR}}/allure-results || true
128-
pytest autotest/benchmark/test_${{matrix.benchmark_type}}_performance.py -n 2 --run_id ${{ github.run_id }} -m 'gpu_num_4 and not pr_test' ${{env.FAIL_CONFIG}} --alluredir=${{env.REPORT_DIR}}/allure-results || true
129-
pytest autotest/benchmark/test_${{matrix.benchmark_type}}_performance.py --run_id ${{ github.run_id }} -m 'gpu_num_8 and not pr_test' ${{env.FAIL_CONFIG}} --alluredir=${{env.REPORT_DIR}}/allure-results
161+
pytest autotest/benchmark/test_${{matrix.benchmark_type}}_performance.py -n ${{matrix.n}} --run_id ${{ github.run_id }} -m '${{matrix.gpu_num}} and not pr_test' --alluredir=${{env.ALLURE_REPORT_DIR}}
130162
- name: Clear workfile
131163
if: always()
132164
run: |
165+
echo "status=done" >> ${{env.REPORT_DIR}}/status.txt
133166
chmod -R 777 $REPORT_DIR
134167
export workdir=$(pwd)
135168
cd ..
@@ -157,5 +190,6 @@ jobs:
157190
ref: ${{github.event.inputs.repo_ref || 'main'}}
158191
- name: Get overview
159192
run: |
193+
echo "status=done" >> ${{env.REPORT_DIR}}/status.txt
160194
pip install pandas fire mmengine
161195
python3 .github/scripts/action_tools.py generate_benchmark_report $REPORT_DIR

0 commit comments

Comments (0)