forked from ggerganov/llama.cpp

Commit: showing 4 changed files with 286 additions and 6 deletions.
@@ -0,0 +1,269 @@
import argparse
import base64
import json
import os
import re
import signal
import socket
import subprocess
import sys
import threading
import time
import traceback
from contextlib import closing
from datetime import datetime

import matplotlib.pyplot as plt
import requests


def main(args_in: list[str] | None = None) -> None:
    parser = argparse.ArgumentParser(description="Start server benchmark scenario")
    parser.add_argument("--name", type=str, help="Bench name", required=True)
    parser.add_argument("--runner-label", type=str, help="Runner label", required=True)
    parser.add_argument("--branch", type=str, help="Branch name", default="detached")
    parser.add_argument("--commit", type=str, help="Commit name", default="dirty")
    parser.add_argument("--host", type=str, help="Server listen host", default="0.0.0.0")
    parser.add_argument("--port", type=int, help="Server listen port", default=8080)
    parser.add_argument("--model-path-prefix", type=str, help="Prefix where to store the model files", default="models")
    parser.add_argument("--n-prompts", type=int,
                        help="SERVER_BENCH_N_PROMPTS: total prompts to randomly select in the benchmark", required=True)
    parser.add_argument("--max-prompt-tokens", type=int,
                        help="SERVER_BENCH_MAX_PROMPT_TOKENS: maximum prompt tokens to filter out in the dataset",
                        required=True)
    parser.add_argument("--max-tokens", type=int,
                        help="SERVER_BENCH_MAX_CONTEXT: maximum context size of the completions request to filter out in the dataset: prompt + predicted tokens",
                        required=True)
    parser.add_argument("--hf-repo", type=str, help="Hugging Face model repository", required=True)
    parser.add_argument("--hf-file", type=str, help="Hugging Face model file", required=True)
    parser.add_argument("-ngl", "--n-gpu-layers", type=int, help="Number of model layers to offload to the GPU", required=True)
    parser.add_argument("--ctx-size", type=int, help="Size of the prompt context", required=True)
    parser.add_argument("--parallel", type=int, help="Number of server slots for processing requests", required=True)
    parser.add_argument("--batch-size", type=int, help="Batch size for prompt processing", required=True)
    parser.add_argument("--ubatch-size", type=int, help="Physical maximum batch size", required=True)
    parser.add_argument("--scenario", type=str, help="Scenario to run", required=True)
    parser.add_argument("--duration", type=str, help="Bench scenario duration", required=True)

    args = parser.parse_args(args_in)

    start_time = time.time()

    # Start the server and the performance scenario
    try:
        server_process = start_server(args)
    except Exception:
        print("bench: server start error :")
        traceback.print_exc(file=sys.stdout)
        sys.exit(1)

    # Start the benchmark
    bench_results_str = ""
    token_seconds = 0.0  # default so the summary below survives a failed benchmark run
    try:
        start_benchmark(args)

        with open("results.github.env", 'w') as github_env:
            # Parse the k6 summary and export every float metric as a GitHub env variable
            with open('k6-results.json', 'r') as bench_results:
                data = json.load(bench_results)
                for metric_name in data['metrics']:
                    for metric_metric in data['metrics'][metric_name]:
                        value = data['metrics'][metric_name][metric_metric]
                        if isinstance(value, float):
                            github_env.write(
                                f"{escape_metric_name(metric_name)}_{escape_metric_name(metric_metric)}={round(value, 2)}\n")
                token_seconds = data['metrics']['llamacpp_tokens_second']['avg']
                bench_results_str = json.dumps(data)

    except Exception:
        print("bench: error :")
        traceback.print_exc(file=sys.stdout)

    # Stop the server
    if server_process:
        try:
            print(f"bench: shutting down server pid={server_process.pid} ...")
            if os.name == 'nt':
                interrupt = signal.CTRL_C_EVENT
            else:
                interrupt = signal.SIGINT
            server_process.send_signal(interrupt)
            server_process.wait(0.5)

        except subprocess.TimeoutExpired:
            print(f"server still alive after 500ms, force-killing pid={server_process.pid} ...")
            server_process.kill()  # SIGKILL
            server_process.wait()

    while is_server_listening(args.host, args.port):
        time.sleep(0.1)

    # Prometheus
    end_time = time.time()
    image_data = []
    pr_comment = f"tk/s={round(token_seconds, 2)}"  # currently unused; see the commented block below
    if is_server_listening("0.0.0.0", 9090):
        metrics = ['prompt_tokens_seconds', 'predicted_tokens_seconds',
                   'kv_cache_usage_ratio', 'requests_processing', 'requests_deferred']

        for metric in metrics:
            resp = requests.get("http://localhost:9090/api/v1/query_range",
                                params={'query': 'llamacpp:' + metric, 'start': start_time, 'end': end_time, 'step': 2})
            if resp.status_code != 200:
                print(f"bench: unable to extract prometheus metric {metric}: {resp.text}")
            else:
                metric_data = resp.json()
                values = metric_data['data']['result'][0]['values']
                timestamps, metric_values = zip(*values)
                metric_values = [float(value) for value in metric_values]
                timestamps = [datetime.fromtimestamp(int(ts)) for ts in timestamps]
                plt.figure(figsize=(16, 10), dpi=80)
                plt.plot(timestamps, metric_values, label=metric)
                plt.xticks(rotation=0, fontsize=14, horizontalalignment='center', alpha=.7)
                plt.yticks(fontsize=12, alpha=.7)

                plt.title(f"{args.name} on {args.runner_label}\n"
                          f"duration={args.duration} {round(token_seconds, 2)}tk/s\n"
                          f"branch={args.branch} commit={args.commit}",
                          fontsize=14, wrap=True)
                plt.grid(axis='both', alpha=.3)
                plt.ylabel(f"llamacpp:{metric}", fontsize=14)
                plt.xlabel(f"hf-repo={args.hf_repo} hf-file={args.hf_file}\n"
                           f"parallel={args.parallel} ctx-size={args.ctx_size} ngl={args.n_gpu_layers} batch-size={args.batch_size} ubatch-size={args.ubatch_size}\n"
                           f" pp={args.max_prompt_tokens} pp+tg={args.max_tokens}", fontsize=14, wrap=True)
                plt.gcf().autofmt_xdate()

                # Remove borders
                plt.gca().spines["top"].set_alpha(0.0)
                plt.gca().spines["bottom"].set_alpha(0.3)
                plt.gca().spines["right"].set_alpha(0.0)
                plt.gca().spines["left"].set_alpha(0.3)

                # Save the plot as a PNG image
                plt.savefig(f'{metric}.png')
                plt.close()
                with open(f'{metric}.png', "rb") as image_file:
                    encoded_string = base64.b64encode(image_file.read()).decode()
                    image_data.append(f"data:image/png;base64,{encoded_string}")

        # pr_comment = f"""
        # llama.cpp server benchmark results for {args.name} on {args.runner_label}: {round(token_seconds, 2)}tk/s
        # <p align="center">
        #     <img src="{image_data[0]}" alt="prompt_tokens_seconds" />
        #     <img src="{image_data[1]}" alt="predicted_tokens_seconds"/>
        # </p>
        # <details>
        # <summary>Details</summary>
        # <p align="center">
        #     <img src="{image_data[2]}" alt="kv_cache_usage_ratio" />
        #     <img src="{image_data[3]}" alt="requests_processing"/>
        #     <img src="{image_data[4]}" alt="requests_deferred"/>
        # </p>
        # </details>
        # """

    with open("results.github.env", 'a') as github_env:
        github_env.write(f"BENCH_RESULTS='{bench_results_str}'")


def start_benchmark(args):
    k6_path = 'k6'
    if 'BENCH_K6_BIN_PATH' in os.environ:
        k6_path = os.environ['BENCH_K6_BIN_PATH']
    k6_args = [
        'run', args.scenario,
        '--no-color',
    ]
    k6_args.extend(['--duration', args.duration])
    k6_args.extend(['--iterations', args.n_prompts])
    k6_args.extend(['--vus', args.parallel])
    k6_args.extend(['--summary-export', 'k6-results.json'])
    # Pass the dataset filters to the k6 scenario through environment variables
    k6_command = f"SERVER_BENCH_N_PROMPTS={args.n_prompts} SERVER_BENCH_MAX_PROMPT_TOKENS={args.max_prompt_tokens} SERVER_BENCH_MAX_CONTEXT={args.max_tokens} "
    k6_command += ' '.join([str(arg) for arg in [k6_path, *k6_args]])
    print(f"bench: starting k6 with: {k6_command}")
    k6_completed = subprocess.run(k6_command, shell=True, stdout=sys.stdout, stderr=sys.stderr)
    if k6_completed.returncode != 0:
        raise Exception("bench: unable to run k6")


def start_server(args):
    server_process = start_server_background(args)

    attempts = 0
    max_attempts = 20
    if 'GITHUB_ACTIONS' in os.environ:
        max_attempts *= 2

    while not is_server_listening(args.host, args.port):
        attempts += 1
        if attempts > max_attempts:
            assert False, "server not started"
        print("bench: waiting for server to start ...")
        time.sleep(0.5)

    print("bench: server started.")
    return server_process


def start_server_background(args):
    # Start the server
    server_path = '../../../build/bin/server'
    if 'LLAMA_SERVER_BIN_PATH' in os.environ:
        server_path = os.environ['LLAMA_SERVER_BIN_PATH']
    server_args = [
        '--host', args.host,
        '--port', args.port,
    ]
    model_file = args.model_path_prefix + os.path.sep + args.hf_file
    model_dir = os.path.dirname(model_file)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    server_args.extend(['--model', model_file])
    server_args.extend(['--hf-repo', args.hf_repo])
    server_args.extend(['--hf-file', args.hf_file])
    server_args.extend(['--n-gpu-layers', args.n_gpu_layers])
    server_args.extend(['--ctx-size', args.ctx_size])
    server_args.extend(['--parallel', args.parallel])
    server_args.extend(['--batch-size', args.batch_size])
    server_args.extend(['--ubatch-size', args.ubatch_size])
    server_args.extend(['--n-predict', args.max_tokens * 2])
    server_args.extend(['--defrag-thold', "0.1"])
    server_args.append('--cont-batching')
    server_args.append('--metrics')
    server_args.extend(['--log-format', "text"])
    server_command = [str(arg) for arg in [server_path, *server_args]]
    print(f"bench: starting server with: {' '.join(server_command)}")
    pkwargs = {
        'stdout': subprocess.PIPE,
        'stderr': subprocess.PIPE
    }
    server_process = subprocess.Popen(
        server_command,
        **pkwargs)

    # Forward the server logs to this process' stdout/stderr from background threads
    def server_log(in_stream, out_stream):
        for line in iter(in_stream.readline, b''):
            print(line.decode('utf-8'), end='', file=out_stream)

    thread_stdout = threading.Thread(target=server_log, args=(server_process.stdout, sys.stdout))
    thread_stdout.start()
    thread_stderr = threading.Thread(target=server_log, args=(server_process.stderr, sys.stderr))
    thread_stderr.start()

    return server_process


def is_server_listening(server_fqdn, server_port):
    # A TCP connect that succeeds (connect_ex returning 0) means the port is open
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
        result = sock.connect_ex((server_fqdn, server_port))
        _is_server_listening = result == 0
        if _is_server_listening:
            print(f"server is listening on {server_fqdn}:{server_port}...")
        return _is_server_listening


def escape_metric_name(metric_name):
    # e.g. 'kv_cache_usage_ratio' -> 'KV_CACHE_USAGE_RATIO'
    return re.sub('[^A-Z0-9]', '_', metric_name.upper())


if __name__ == '__main__':
    main()
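
Since main() accepts an optional argument list, the script can be driven directly from Python as well as from the command line. The call below is a hypothetical usage sketch: the module name, the model repository/file, and every numeric value are illustrative placeholders, not values taken from this commit.

# Hypothetical usage sketch; assumes the file above is importable as bench.py.
from bench import main

main([
    "--name", "phi2",
    "--runner-label", "local",
    "--n-prompts", "480",
    "--max-prompt-tokens", "1024",
    "--max-tokens", "2048",
    "--hf-repo", "TheBloke/phi-2-GGUF",
    "--hf-file", "phi-2.Q4_K_M.gguf",
    "--n-gpu-layers", "33",
    "--ctx-size", "16384",
    "--parallel", "8",
    "--batch-size", "2048",
    "--ubatch-size", "256",
    "--scenario", "script.js",
    "--duration", "10m",
])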
@@ -0,0 +1,9 @@
global:
  scrape_interval: 10s
  external_labels:
    llamacpp: 'server'

scrape_configs:
  - job_name: 'llama.cpp server'
    static_configs:
      - targets: ['localhost:8080']
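
With this configuration, Prometheus scrapes the llama.cpp server's metrics endpoint on localhost:8080 every 10 seconds, and the bench script reads the stored series back through the query_range HTTP API. The snippet below is a minimal sketch for inspecting one of those series by hand; it assumes a local Prometheus on its default port 9090 and a server started with --metrics, as in the script above.

import time

import requests

# Pull one llama.cpp server metric over the last minute; 'llamacpp:' is the
# prefix the bench script uses when querying Prometheus.
end = time.time()
resp = requests.get("http://localhost:9090/api/v1/query_range",
                    params={'query': 'llamacpp:prompt_tokens_seconds',
                            'start': end - 60, 'end': end, 'step': 2})
resp.raise_for_status()
for ts, value in resp.json()['data']['result'][0]['values']:
    print(ts, value)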
@@ -0,0 +1,2 @@
matplotlib
requests