# WIP server: bench: init #103
# Workflow file for this run
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Benchmark
name: Benchmark

# Triggers:
#   - manual dispatch, choosing which Azure GPU runner series to use
#   - pushes to master and pull requests that touch build files, sources,
#     this workflow or the server bench scripts
#   - a nightly scheduled run
on:
  workflow_dispatch:
    inputs:
      gpu-series:
        description: 'Azure GPU series to run with'
        required: true
        type: choice
        options:
          - Standard_NC4as_T4_v3
          - Standard_NC24ads_A100_v4
          - Standard_NC80adis_H100_v5
  push:
    branches:
      - master
    paths:
      - '.github/workflows/bench.yml'
      - '**/CMakeLists.txt'
      - '**/Makefile'
      - '**/*.h'
      - '**/*.hpp'
      - '**/*.c'
      - '**/*.cpp'
      - '**/*.cu'
      - '**/*.swift'
      - '**/*.m'
      - 'examples/server/bench/**.*'
  pull_request:
    types: [opened, synchronize, reopened]
    # Same path filter as the push trigger above; keep the two lists in sync.
    paths:
      - '.github/workflows/bench.yml'
      - '**/CMakeLists.txt'
      - '**/Makefile'
      - '**/*.h'
      - '**/*.hpp'
      - '**/*.c'
      - '**/*.cpp'
      - '**/*.cu'
      - '**/*.swift'
      - '**/*.m'
      - 'examples/server/bench/**.*'
  schedule:
    # Every day at 02:04 (GitHub evaluates cron schedules in UTC).
    - cron: '04 2 * * *'
# Only one run per workflow and ref at a time: a newer run cancels the one
# still in progress for the same ref.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
  # Builds the CUDA server, runs the k6 benchmark scenario against it with
  # bench.py, then publishes the results as artifacts, a commit status and
  # (on pull requests) a PR comment.
  bench-server-baseline:
    runs-on: Standard_NC4as_T4_v3
    env:
      # FIXME: found no way to avoid duplicating the runs-on label here.
      RUNNER_LABEL: Standard_NC4as_T4_v3
    # Run for: a manual dispatch targeting this runner, the schedule, any pull
    # request, or a push to master. The push payload has no `push` object, so
    # the push case is detected via the event name and ref.
    if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || (github.event_name == 'push' && github.ref == 'refs/heads/master') }}
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v3
        with:
          # Full history rather than a shallow clone.
          fetch-depth: 0

      - name: Install python env
        id: pipenv
        run: |
          cd examples/server/bench
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt

      - name: Prometheus
        id: install_prometheus
        run: |
          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
          tar xzf prometheus*.tar.gz --strip-components=1
          ./prometheus --config.file=examples/server/bench/prometheus.yml &
          # Block until Prometheus accepts connections on its port.
          while ! nc -z localhost 9090; do
            sleep 0.1
          done

      - name: Install k6
        id: k6_installation
        run: |
          cd examples/server/bench
          wget --quiet https://github.com/grafana/k6/releases/download/v0.49.0/k6-v0.49.0-linux-amd64.tar.gz
          tar xzf k6*.tar.gz --strip-components=1

      - name: Build
        id: cmake_build
        run: |
          set -eux
          mkdir build
          cd build
          cmake .. \
            -DLLAMA_NATIVE=OFF \
            -DLLAMA_BUILD_SERVER=ON \
            -DLLAMA_CURL=ON \
            -DLLAMA_CUBLAS=ON \
            -DCUDAToolkit_ROOT=/usr/local/cuda \
            -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
            -DCMAKE_CUDA_ARCHITECTURES=75 \
            -DLLAMA_FATAL_WARNINGS=OFF \
            -DLLAMA_ALL_WARNINGS=OFF \
            -DCMAKE_BUILD_TYPE=Release;
          cmake --build . --config Release -j $(nproc) --target server

      - name: Download the dataset
        id: download_dataset
        run: |
          cd examples/server/bench
          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

      - name: Server bench
        id: server_bench
        env:
          # The PR head branch name is chosen by the contributor, so it is
          # handed to the script through the environment instead of being
          # interpolated into the shell source (script-injection hardening).
          BENCH_BRANCH: ${{ github.head_ref || github.ref_name }}
        run: |
          set -eux
          cd examples/server/bench
          source venv/bin/activate
          BENCH_K6_BIN_PATH=./k6 python bench.py \
            --runner-label ${{ env.RUNNER_LABEL }} \
            --name ${{ github.job }} \
            --branch "$BENCH_BRANCH" \
            --commit ${{ github.sha }} \
            --scenario script.js \
            --duration 1m \
            --hf-repo ggml-org/models \
            --hf-file phi-2/ggml-model-q4_0.gguf \
            --model-path-prefix /models \
            --parallel 8 \
            -ngl 33 \
            --batch-size 2048 \
            --ubatch-size 256 \
            --ctx-size 16384 \
            --n-prompts 1000 \
            --max-prompt-tokens 1024 \
            --max-tokens 2048
          # Export the results file written by the benchmark run to later steps.
          cat results.github.env >> "$GITHUB_ENV"

      - name: Extract mermaid
        id: server_bench_mermaid
        env:
          PROMPT_TOKENS_SECONDS: ${{ env.PROMPT_TOKENS_SECONDS }}
          PREDICTED_TOKENS_SECONDS: ${{ env.PREDICTED_TOKENS_SECONDS }}
          KV_CACHE_USAGE_RATIO: ${{ env.KV_CACHE_USAGE_RATIO }}
          REQUESTS_PROCESSING: ${{ env.REQUESTS_PROCESSING }}
        run: |
          set -eux
          # Each value uses "<br>" as a line-break placeholder. Re-export it as
          # <NAME>_ with real newlines; multi-line values must be written to
          # GITHUB_ENV with the NAME<<DELIMITER form, not NAME=value.
          newline=$'\n'
          for metric in PROMPT_TOKENS_SECONDS PREDICTED_TOKENS_SECONDS KV_CACHE_USAGE_RATIO REQUESTS_PROCESSING; do
            value="${!metric}"
            {
              echo "${metric}_<<__BENCH_MERMAID_EOF__"
              echo "${value//<br>/$newline}"
              echo "__BENCH_MERMAID_EOF__"
            } >> "$GITHUB_ENV"
          done

      - uses: actions/upload-artifact@v4
        with:
          name: benchmark-results
          compression-level: 9
          path: |
            examples/server/bench/*.png
            examples/server/bench/*.json
            examples/server/bench/*.log

      - name: Commit status
        uses: Sibz/github-status-action@v1
        with:
          authToken: ${{secrets.GITHUB_TOKEN}}
          # On pull requests github.sha is the temporary merge commit; attach
          # the status to the PR head commit so it is visible on the PR.
          sha: ${{ github.event.pull_request.head.sha || github.sha }}
          context: bench-server-baseline
          description: |
            ${{ env.BENCH_RESULTS }}
          state: 'success'

      - name: Upload benchmark images
        uses: devicons/public-upload-to-imgur@v2.2.2
        # Best effort: a failed image upload must not fail the benchmark job.
        continue-on-error: true
        id: imgur_step
        with:
          client_id: ${{secrets.IMGUR_CLIENT_ID}}
          path: |
            examples/server/bench/prompt_tokens_seconds.png
            examples/server/bench/predicted_tokens_seconds.png
            examples/server/bench/kv_cache_usage_ratio.png
            examples/server/bench/requests_processing.png

      - name: Comment PR
        uses: mshick/add-pr-comment@v2
        id: comment_pr
        if: ${{ github.event_name == 'pull_request' }}
        # Best effort: commenting must not fail the benchmark job.
        continue-on-error: true
        with:
          message-id: bench-${{ github.job }}-${{ env.RUNNER_LABEL }}
          # Image URLs are indexed in the order the files are listed in the
          # imgur upload step above.
          message: |
            📈 **llama.cpp server** benchmark for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
            - ${{ env.BENCH_GRAPH_TITLE }}
            - ${{ env.BENCH_GRAPH_YLABEL }}
            <p align="center">
            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" alt="prompt_tokens_seconds" />
            ${{ env.PROMPT_TOKENS_SECONDS_ }}
            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" alt="predicted_tokens_seconds"/>
            ${{ env.PREDICTED_TOKENS_SECONDS_ }}
            </p>
            <details>
            <summary>Details</summary>
            <p align="center">
            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" alt="kv_cache_usage_ratio" />
            ${{ env.KV_CACHE_USAGE_RATIO_ }}
            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" alt="requests_processing"/>
            ${{ env.REQUESTS_PROCESSING_ }}
            </p>
            </details>