Allow benchmark to write json output #801

Merged: 3 commits, merged on Sep 7, 2024

Changes from all commits:
@@ -7,6 +7,7 @@

import argparse
import asyncio
from datetime import datetime
import json
import random
import time
@@ -266,6 +267,42 @@ async def benchmark(
await asyncio.gather(*tasks)


def save_json_results(args: argparse.Namespace, benchmark_result):
# dimensions values are strings
dimensions_json = {}
# metrics values are numerical
metrics_json = {}

# Setup
current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
dimensions_json["date"] = current_dt
dimensions_json["backend"] = args.backend
dimensions_json["model_id"] = args.model
dimensions_json["tokenizer_id"] = args.tokenizer
if args.additional_metadata_metrics_to_save is not None:
dimensions_json = {
**dimensions_json,
**json.loads(args.additional_metadata_metrics_to_save),
}
metrics_json["num_prompts"] = args.num_prompts

# Traffic
metrics_json["request_rate"] = args.request_rate
metrics_json = {**metrics_json, **benchmark_result}

final_json = {}
final_json["metrics"] = metrics_json
final_json["dimensions"] = dimensions_json

# Save to file
base_model_id = args.model.split("/")[-1]
file_name = (
f"{args.backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
)
with open(file_name, "w", encoding="utf-8") as outfile:
json.dump(final_json, outfile)
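
For reference, a minimal sketch (not part of this PR) of reading one of these files back, assuming at least one results file written by save_json_results exists in the working directory; the glob pattern just follows the file-name scheme above.

import glob
import json
import os

# Pick the most recently written results file.
latest = max(glob.glob("*qps-*.json"), key=os.path.getmtime)
with open(latest, encoding="utf-8") as f:
    result = json.load(f)

# "dimensions" holds string-valued metadata, "metrics" holds numeric values.
print(result["dimensions"]["model_id"], result["metrics"]["benchmark_time"])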


def main(args: argparse.Namespace):
print(args)
random.seed(args.seed)
@@ -305,24 +342,32 @@ def main(args: argparse.Namespace):
args.model,
)
)
benchmark_result = {}
benchmark_end_time = time.time()
benchmark_time = benchmark_end_time - benchmark_start_time
print(f"Total time: {benchmark_time:.2f} s")
print(f"Requests/min: {60 * args.num_prompts / benchmark_time:.2f}")
benchmark_result['benchmark_time'] = benchmark_time

total_output_tokens = np.sum([output_len for _, output_len, _ in
REQUEST_LATENCY])
output_tokens_per_min = 60 * total_output_tokens / benchmark_time
print(f"Output_tokens/min: {output_tokens_per_min:.2f}")
benchmark_result['total_output_token'] = int(total_output_tokens)
benchmark_result['output_tokens_per_min'] = output_tokens_per_min

total_input_tokens = np.sum([prompt_len for prompt_len, _, _ in
REQUEST_LATENCY])
input_tokens_per_min = 60 * total_input_tokens / benchmark_time
print(f"Input_tokens/min: {input_tokens_per_min:.2f}")
benchmark_result['total_input_tokens'] = int(total_input_tokens)
benchmark_result['input_tokens_per_min'] = input_tokens_per_min

total_tokens = total_input_tokens + total_output_tokens
tokens_per_min = 60 * total_tokens / benchmark_time
print(f"Tokens/min: {tokens_per_min:.2f}")
benchmark_result['total_tokens'] = int(total_tokens)
benchmark_result['tokens_per_min'] = tokens_per_min

if args.machine_cost:
print(
@@ -336,6 +381,7 @@ def main(args: argparse.Namespace):
"Average seconds/request (includes waiting time on server):"
f" {avg_latency:.2f}"
)
benchmark_result['avg_latency'] = avg_latency

avg_per_token_latency = np.mean([
latency / (prompt_len + output_len)
@@ -345,6 +391,7 @@ def main(args: argparse.Namespace):
"Average milliseconds/token (includes waiting time on server):"
f" {1000 * avg_per_token_latency:.2f}"
)
benchmark_result['avg_per_token_latency'] = avg_per_token_latency

avg_per_output_token_latency = np.mean(
[latency / output_len for _, output_len, latency in REQUEST_LATENCY]
@@ -353,6 +400,7 @@ def main(args: argparse.Namespace):
"Average milliseconds/output_token (includes waiting time on server):"
f" {1000 * avg_per_output_token_latency:.2f}"
)
benchmark_result['avg_per_output_token_latency'] = avg_per_output_token_latency

avg_input_len = np.mean(
[prompt_len for prompt_len, _, _ in REQUEST_LATENCY]
@@ -361,6 +409,7 @@ def main(args: argparse.Namespace):
"Average input length:"
f" {avg_input_len:.2f}"
)
benchmark_result['avg_input_len'] = avg_input_len

avg_output_len = np.mean(
[output_len for _, output_len, _ in REQUEST_LATENCY]
@@ -369,6 +418,10 @@ def main(args: argparse.Namespace):
"Average output length:"
f" {avg_output_len:.2f}"
)
benchmark_result['avg_output_len'] = avg_output_len

if args.save_json_results:
save_json_results(args, benchmark_result)
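
For orientation (not part of the diff), the payload written when --save-json-results is set ends up with the shape sketched below; the key names follow the code above, and the values are placeholders rather than real results.

expected_payload = {
    "dimensions": {
        "date": "...",          # run timestamp, %Y%m%d-%H%M%S
        "backend": "...",
        "model_id": "...",
        "tokenizer_id": "...",
        # plus any keys passed via --additional-metadata-metrics-to-save
    },
    "metrics": {
        "num_prompts": ...,
        "request_rate": ...,
        "benchmark_time": ...,
        "total_output_token": ...,
        "output_tokens_per_min": ...,
        "total_input_tokens": ...,
        "input_tokens_per_min": ...,
        "total_tokens": ...,
        "tokens_per_min": ...,
        "avg_latency": ...,
        "avg_per_token_latency": ...,
        "avg_per_output_token_latency": ...,
        "avg_input_len": ...,
        "avg_output_len": ...,
    },
}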


if __name__ == "__main__":
@@ -479,6 +532,18 @@ def main(args: argparse.Namespace):
" and max_output_length."
),
)
parser.add_argument(
"--save-json-results",
action="store_true",
help="Whether to save benchmark results to a json file.",
)
parser.add_argument(
"--additional-metadata-metrics-to-save",
type=str,
help=(
"Additional metadata about the workload. Should be a dictionary in"
" the form of a string."
),
)
cmd_args = parser.parse_args()
main(cmd_args)
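
A note on the new metadata flag: its value has to survive the json.loads call in save_json_results, so it must be a JSON object serialized as a string. A small sketch of building such a value (the metadata keys here are hypothetical, not defined by the PR):

import json

# Hypothetical metadata; these keys are merged into the "dimensions" section.
extra_metadata = json.dumps({"cluster": "dev", "accelerator": "L4"})
# On the command line this would be passed roughly as:
#   --save-json-results --additional-metadata-metrics-to-save '{"cluster": "dev", "accelerator": "L4"}'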