diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 746f1ec603dc6..c8e2f5f8263dc 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -19,11 +19,10 @@ on:
   push:
     branches:
       - master
-      - hp/server/bench/workflow # FIXME remove
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
+    paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
   pull_request:
     types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
+    paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
   schedule:
     - cron: '04 2 * * *'

@@ -36,7 +35,7 @@ jobs:
     runs-on: Standard_NC4as_T4_v3
     env:
       RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
-    if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request != '' || github.event.push.ref == 'refs/heads/master' }}
+    #if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.event.push.ref == 'refs/heads/master' }}
     steps:
       - name: Clone
         id: checkout
@@ -119,7 +118,7 @@ jobs:
             --n-prompts 1000 \
             --max-prompt-tokens 1024 \
             --max-tokens 2048
-
+
           cat results.github.env >> $GITHUB_ENV

       # - name: Comment PR
@@ -134,9 +133,10 @@ jobs:
       - name: Commit status
         uses: Sibz/github-status-action@v1
         with:
+          authToken: ${{secrets.GITHUB_TOKEN}}
           context: ${{ github.job }}
           description: |
-            $BENCH_RESULTS
+            ${{ env.BENCH_RESULTS }}
           state: 'success'

       - name: Upload results
diff --git a/examples/server/bench/bench.py b/examples/server/bench/bench.py
new file mode 100644
index 0000000000000..6fc42de527c57
--- /dev/null
+++ b/examples/server/bench/bench.py
@@ -0,0 +1,269 @@
+import argparse
+import base64
+import json
+import os
+import re
+import signal
+import socket
+import subprocess
+import sys
+import threading
+import time
+import traceback
+from contextlib import closing
+from datetime import datetime
+
+import matplotlib.pyplot as plt
+import requests
+
+
+def main(args_in: list[str] | None = None) -> None:
+    parser = argparse.ArgumentParser(description="Start server benchmark scenario")
+    parser.add_argument("--name", type=str, help="Bench name", required=True)
+    parser.add_argument("--runner-label", type=str, help="Runner label", required=True)
+    parser.add_argument("--branch", type=str, help="Branch name", default="detached")
+    parser.add_argument("--commit", type=str, help="Commit name", default="dirty")
+    parser.add_argument("--host", type=str, help="Server listen host", default="0.0.0.0")
+    parser.add_argument("--port", type=int, help="Server listen port", default="8080")
+    parser.add_argument("--model-path-prefix", type=str, help="Prefix where to store the model files", default="models")
+    parser.add_argument("--n-prompts", type=int,
+                        help="SERVER_BENCH_N_PROMPTS: total prompts to randomly select in the benchmark", required=True)
+    parser.add_argument("--max-prompt-tokens", type=int,
+                        help="SERVER_BENCH_MAX_PROMPT_TOKENS: maximum prompt tokens to filter out in the dataset",
+                        required=True)
+    parser.add_argument("--max-tokens", type=int,
+                        help="SERVER_BENCH_MAX_CONTEXT: maximum context size of the completions request to filter out in the dataset: prompt + predicted tokens",
+                        required=True)
+    parser.add_argument("--hf-repo", type=str, help="Hugging Face model repository", required=True)
+    parser.add_argument("--hf-file", type=str, help="Hugging Face model file", required=True)
+    parser.add_argument("-ngl", "--n-gpu-layers", type=int, help="number of layers to offload to the GPU for computation", required=True)
+    parser.add_argument("--ctx-size", type=int, help="Set the size of the prompt context", required=True)
+    parser.add_argument("--parallel", type=int, help="Set the number of slots for processing requests", required=True)
+    parser.add_argument("--batch-size", type=int, help="Set the batch size for prompt processing", required=True)
+    parser.add_argument("--ubatch-size", type=int, help="physical maximum batch size", required=True)
+    parser.add_argument("--scenario", type=str, help="Scenario to run", required=True)
+    parser.add_argument("--duration", type=str, help="Bench scenario duration", required=True)
+
+    args = parser.parse_args(args_in)
+
+    start_time = time.time()
+
+    # Start the server and performance scenario
+    try:
+        server_process = start_server(args)
+    except Exception:
+        print("bench: server start error :")
+        traceback.print_exc(file=sys.stdout)
+        sys.exit(1)
+
+    # start the benchmark
+    bench_results_str = ""
+    try:
+        start_benchmark(args)
+
+        with open("results.github.env", 'w') as github_env:
+            # parse output
+            with open('k6-results.json', 'r') as bench_results:
+                # Load JSON data from file
+                data = json.load(bench_results)
+                for metric_name in data['metrics']:
+                    for metric_metric in data['metrics'][metric_name]:
+                        value = data['metrics'][metric_name][metric_metric]
+                        if isinstance(value, float):
+                            github_env.write(
+                                f"{escape_metric_name(metric_name)}_{escape_metric_name(metric_metric)}={round(value, 2)}\n")
+                token_seconds = data['metrics']['llamacpp_tokens_second']['avg']
+                bench_results_str = json.dumps(data)
+
+    except Exception:
+        print("bench: error :")
+        traceback.print_exc(file=sys.stdout)
+
+    # Stop the server
+    if server_process:
+        try:
+            print(f"bench: shutting down server pid={server_process.pid} ...")
+            if os.name == 'nt':
+                interrupt = signal.CTRL_C_EVENT
+            else:
+                interrupt = signal.SIGINT
+            server_process.send_signal(interrupt)
+            server_process.wait(0.5)
+
+        except subprocess.TimeoutExpired:
+            print(f"server still alive after 500ms, force-killing pid={server_process.pid} ...")
+            server_process.kill()  # SIGKILL
+            server_process.wait()
+
+        while is_server_listening(args.host, args.port):
+            time.sleep(0.1)
+
+    # Prometheus
+    end_time = time.time()
+    image_data = []
+    pr_comment = f"tk/s={round(token_seconds, 2)}"
+    if is_server_listening("0.0.0.0", 9090):
+        metrics = ['prompt_tokens_seconds', 'predicted_tokens_seconds',
+                   'kv_cache_usage_ratio', 'requests_processing', 'requests_deferred']
+
+        for metric in metrics:
+            resp = requests.get(f"http://localhost:9090/api/v1/query_range",
+                                params={'query': 'llamacpp:' + metric, 'start': start_time, 'end': end_time, 'step': 2})
+            if resp.status_code != 200:
+                print(f"bench: unable to extract prometheus metric {metric}: {resp.text}")
+            else:
+                metric_data = resp.json()
+                values = metric_data['data']['result'][0]['values']
+                timestamps, metric_values = zip(*values)
+                metric_values = [float(value) for value in metric_values]
+                timestamps = [datetime.fromtimestamp(int(ts)) for ts in timestamps]
+                plt.figure(figsize=(16, 10), dpi=80)
+                plt.plot(timestamps, metric_values, label=metric)
+                plt.xticks(rotation=0, fontsize=14, horizontalalignment='center', alpha=.7)
+                plt.yticks(fontsize=12, alpha=.7)
+
+                plt.title(f"{args.name} on {args.runner_label}\n"
+                          f"duration={args.duration} {round(token_seconds, 2)}tk/s\n"
+                          f"branch={args.branch} commit={args.commit}",
+                          fontsize=14, wrap=True)
+                plt.grid(axis='both', alpha=.3)
+                plt.ylabel(f"llamacpp:{metric}", fontsize=14)
+                plt.xlabel(f"hf-repo={args.hf_repo} hf-file={args.hf_file}\n"
+                           f"parallel={args.parallel} ctx-size={args.ctx_size} ngl={args.n_gpu_layers} batch-size={args.batch_size} ubatch-size={args.ubatch_size}\n"
+                           f" pp={args.max_prompt_tokens} pp+tg={args.max_tokens}", fontsize=14, wrap=True)
+                plt.gcf().autofmt_xdate()
+
+                # Remove borders
+                plt.gca().spines["top"].set_alpha(0.0)
+                plt.gca().spines["bottom"].set_alpha(0.3)
+                plt.gca().spines["right"].set_alpha(0.0)
+                plt.gca().spines["left"].set_alpha(0.3)
+
+                # Save the plot as a PNG image
+                plt.savefig(f'{metric}.png')
+                plt.close()
+                with open(f'{metric}.png', "rb") as image_file:
+                    encoded_string = base64.b64encode(image_file.read()).decode()
+                    image_data.append(f"data:image/png;base64,{encoded_string}")
+        # pr_comment = f"""
+        # llama.cpp server benchmark results for {args.name} on {args.runner_label}: {round(token_seconds, 2)}tk/s
+        # <p align="center">
+        # <img src="{image_data[0]}" alt="prompt_tokens_seconds" />
+        # <img src="{image_data[1]}" alt="predicted_tokens_seconds" />
+        # </p>
+        # <details>
+        # <summary>Details</summary>
+        # <p align="center">
+        # <img src="{image_data[2]}" alt="kv_cache_usage_ratio" />
+        # <img src="{image_data[3]}" alt="requests_processing" />
+        # <img src="{image_data[4]}" alt="requests_deferred" />
+        # </p>
+        # </details>
+        # """
+
+    with open("results.github.env", 'a') as github_env:
+        github_env.write(f"BENCH_RESULTS='{bench_results_str}'")
+
+
+def start_benchmark(args):
+    k6_path = 'k6'
+    if 'BENCH_K6_BIN_PATH' in os.environ:
+        k6_path = os.environ['BENCH_K6_BIN_PATH']
+    k6_args = [
+        'run', args.scenario,
+        '--no-color',
+    ]
+    k6_args.extend(['--duration', args.duration])
+    k6_args.extend(['--iterations', args.n_prompts])
+    k6_args.extend(['--vus', args.parallel])
+    k6_args.extend(['--summary-export', 'k6-results.json'])
+    args = f"SERVER_BENCH_N_PROMPTS={args.n_prompts} SERVER_BENCH_MAX_PROMPT_TOKENS={args.max_prompt_tokens} SERVER_BENCH_MAX_CONTEXT={args.max_tokens} "
+    args = args + ' '.join([str(arg) for arg in [k6_path, *k6_args]])
+    print(f"bench: starting k6 with: {args}")
+    k6_completed = subprocess.run(args, shell=True, stdout=sys.stdout, stderr=sys.stderr)
+    if k6_completed.returncode != 0:
+        raise Exception("bench: unable to run k6")
+
+
+def start_server(args):
+    server_process = start_server_background(args)
+
+    attempts = 0
+    max_attempts = 20
+    if 'GITHUB_ACTIONS' in os.environ:
+        max_attempts *= 2
+
+    while not is_server_listening(args.host, args.port):
+        attempts += 1
+        if attempts > max_attempts:
+            assert False, "server not started"
+        print(f"bench: waiting for server to start ...")
+        time.sleep(0.5)
+
+    print("bench: server started.")
+    return server_process
+
+
+def start_server_background(args):
+    # Start the server
+    server_path = '../../../build/bin/server'
+    if 'LLAMA_SERVER_BIN_PATH' in os.environ:
+        server_path = os.environ['LLAMA_SERVER_BIN_PATH']
+    server_args = [
+        '--host', args.host,
+        '--port', args.port,
+    ]
+    model_file = args.model_path_prefix + os.path.sep + args.hf_file
+    model_dir = os.path.dirname(model_file)
+    if not os.path.exists(model_dir):
+        os.makedirs(model_dir)
+    server_args.extend(['--model', model_file])
+    server_args.extend(['--hf-repo', args.hf_repo])
+    server_args.extend(['--hf-file', args.hf_file])
+    server_args.extend(['--n-gpu-layers', args.n_gpu_layers])
+    server_args.extend(['--ctx-size', args.ctx_size])
+    server_args.extend(['--parallel', args.parallel])
+    server_args.extend(['--batch-size', args.batch_size])
+    server_args.extend(['--ubatch-size', args.ubatch_size])
+    server_args.extend(['--n-predict', args.max_tokens * 2])
+    server_args.extend(['--defrag-thold', "0.1"])
+    server_args.append('--cont-batching')
+    server_args.append('--metrics')
+    server_args.extend(['--log-format', "text"])
+    args = [str(arg) for arg in [server_path, *server_args]]
+    print(f"bench: starting server with: {' '.join(args)}")
+    pkwargs = {
+        'stdout': subprocess.PIPE,
+        'stderr': subprocess.PIPE
+    }
+    server_process = subprocess.Popen(
+        args,
+        **pkwargs)
+
+    def server_log(in_stream, out_stream):
+        for line in iter(in_stream.readline, b''):
+            print(line.decode('utf-8'), end='', file=out_stream)
+
+    thread_stdout = threading.Thread(target=server_log, args=(server_process.stdout, sys.stdout))
+    thread_stdout.start()
+    thread_stderr = threading.Thread(target=server_log, args=(server_process.stderr, sys.stderr))
+    thread_stderr.start()
+
+    return server_process
+
+
+def is_server_listening(server_fqdn, server_port):
+    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
+        result = sock.connect_ex((server_fqdn, server_port))
+        _is_server_listening = result == 0
+        if _is_server_listening:
+            print(f"server is listening on {server_fqdn}:{server_port}...")
+        return _is_server_listening
+
+
+def escape_metric_name(metric_name):
+    return re.sub('[^A-Z0-9]', '_', metric_name.upper())
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/server/bench/prometheus.yml b/examples/server/bench/prometheus.yml
new file mode 100644
index 0000000000000..b15ee52443fe8
--- /dev/null
+++ b/examples/server/bench/prometheus.yml
@@ -0,0 +1,9 @@
+global:
+  scrape_interval: 10s
+  external_labels:
+    llamacpp: 'server'
+
+scrape_configs:
+  - job_name: 'llama.cpp server'
+    static_configs:
+      - targets: ['localhost:8080']
diff --git a/examples/server/bench/requirements.txt b/examples/server/bench/requirements.txt
new file mode 100644
index 0000000000000..66ed226eda6f0
--- /dev/null
+++ b/examples/server/bench/requirements.txt
@@ -0,0 +1,2 @@
+matplotlib
+requests
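
Note (not part of the patch): because main() accepts an explicit argument list, the driver above can also be exercised from Python directly. The sketch below is illustrative only; apart from --n-prompts/--max-prompt-tokens/--max-tokens, which mirror the workflow step in this diff, every value (bench name, scenario file, model repo/file, sizes) is a placeholder, and it assumes a built server binary and a k6 binary reachable via LLAMA_SERVER_BIN_PATH / BENCH_K6_BIN_PATH or the script's default paths.

    # hypothetical local run, executed from examples/server/bench/
    import bench

    bench.main([
        "--name", "local-smoke-test",      # placeholder bench name
        "--runner-label", "local",         # placeholder runner label
        "--scenario", "script.js",         # placeholder k6 scenario file
        "--duration", "2m",                # placeholder k6 duration
        "--hf-repo", "<org>/<repo>",       # placeholder Hugging Face repo
        "--hf-file", "<model>.gguf",       # placeholder GGUF file
        "--n-gpu-layers", "33",            # placeholder offload count
        "--ctx-size", "8192",              # placeholder context size
        "--parallel", "8",                 # placeholder number of slots
        "--batch-size", "2048",            # placeholder batch size
        "--ubatch-size", "256",            # placeholder micro-batch size
        "--n-prompts", "1000",             # values below match the workflow step above
        "--max-prompt-tokens", "1024",
        "--max-tokens", "2048",
    ])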