Skip to content

Commit

Permalink
add benchmark test script
Browse files Browse the repository at this point in the history
Signed-off-by: lvliang-intel <[email protected]>
  • Loading branch information
lvliang-intel committed May 24, 2024
1 parent ad79fb5 commit 0acf3d4
Show file tree
Hide file tree
Showing 3 changed files with 257 additions and 0 deletions.
Empty file added GenAIEval/benchmark/__init__.py
Empty file.
65 changes: 65 additions & 0 deletions GenAIEval/benchmark/chatqna_benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import argparse
import concurrent.futures
import numpy
import json
import random
import requests
import time


# Latency in seconds of each completed request; send_request appends to this
# list and main reads it to compute the percentiles.
response_times = []

def extract_qText(json_data):
    """Build a JSON request body from a randomly chosen entry of ``data.json``.

    Args:
        json_data: Unused; kept so existing callers keep working.

    Returns:
        A JSON string with the ``model`` and ``messages`` keys, or ``None``
        when ``data.json`` holds invalid JSON, has no entries, or the chosen
        entry has no ``qText`` key.

    Raises:
        OSError: If ``data.json`` cannot be opened (deliberately not handled,
            so a missing dataset fails loudly).
    """
    try:
        # The context manager closes the handle even if parsing fails.
        with open('data.json') as file:
            data = json.load(file)
        # random.choice raises IndexError on an empty sequence, which is
        # handled below; randint(0, -1) would raise an uncaught ValueError.
        question = random.choice(data)["qText"]
    except (json.JSONDecodeError, KeyError, IndexError):
        return None
    return json.dumps({
        "model": "Intel/neural-chat-7b-v3-3",
        "messages": question,
    })

def send_request(url, json_data):
    """POST *json_data* to *url* and record how long the request took.

    The elapsed time in seconds is appended to the module-level
    ``response_times`` list, and the request body, status code and response
    text are printed.

    Args:
        url: Endpoint that receives the POST request.
        json_data: Request body, sent as-is with a JSON content type.
    """
    headers = {'Content-Type': 'application/json'}
    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    response = requests.post(url, data=json_data, headers=headers)
    elapsed = time.perf_counter() - start_time
    # Appending mutates the shared list in place; no ``global`` is required.
    response_times.append(elapsed)
    print(f"Question: {json_data} Response: {response.status_code} - {response.text}")

def calculate_p50(latencies):
    """Return the median (P50) of *latencies* without modifying the input.

    Args:
        latencies: Iterable of numeric latency samples.

    Returns:
        The middle value for an odd number of samples, or the mean of the two
        middle values for an even number.

    Raises:
        IndexError: If *latencies* is empty.
    """
    # Sort a copy so the caller's list keeps its original order.
    ordered = sorted(latencies)
    middle, is_odd = divmod(len(ordered), 2)
    if is_odd:
        return ordered[middle]
    return (ordered[middle] + ordered[middle - 1]) / 2

def main(url, json_data, concurrency):
    """Send ``concurrency * 2`` requests concurrently and print P50/P99 latency.

    Args:
        url: Endpoint passed to ``send_request`` for every request.
        json_data: Forwarded to ``extract_qText`` when building each request
            body.
        concurrency: Number of worker threads; twice as many requests are
            submitted in total.
    """
    total_requests = concurrency * 2
    failures = 0
    with concurrent.futures.ThreadPoolExecutor(max_workers=concurrency) as executor:
        futures = [
            executor.submit(send_request, url, extract_qText(json_data))
            for _ in range(total_requests)
        ]
        for future in concurrent.futures.as_completed(futures):
            # An exception raised in a worker is stored on its future; report
            # it here, otherwise it would be dropped without a trace.
            error = future.exception()
            if error is not None:
                failures += 1
                print(f"Request failed: {error!r}")

    print(f"Total Requests: {total_requests}")
    if failures:
        print(f"Failed Requests: {failures}")

    # Only completed requests record a latency, so there may be nothing to
    # summarise when every request failed.
    if not response_times:
        print("No requests completed; latency percentiles are unavailable.")
        return

    # Calculate the P50 (median)
    p50 = numpy.percentile(response_times, 50)
    print("P50 latency is ", p50, "s")

    # Calculate the P99
    p99 = numpy.percentile(response_times, 99)
    print("P99 latency is ", p99, "s")


if __name__ == "__main__":
    # Command-line entry point: collect the benchmark settings and run it.
    arg_parser = argparse.ArgumentParser(
        description="Concurrent client to send POST requests"
    )
    arg_parser.add_argument(
        "--backend_url",
        type=str,
        default="http://localhost:12345",
        help="Service URL to send requests to",
    )
    arg_parser.add_argument(
        "--json_data",
        type=str,
        default='{"inputs":"Which NFL team won the Super Bowl in the 2010 season?","parameters":{"do_sample": true}}',
        help="JSON data to send",
    )
    arg_parser.add_argument(
        "--concurrency",
        type=int,
        default=100,
        help="Concurrency level",
    )
    cli_args = arg_parser.parse_args()
    main(cli_args.backend_url, cli_args.json_data, cli_args.concurrency)

Loading

0 comments on commit 0acf3d4

Please sign in to comment.