diff --git a/benchmarks/benchmark/tools/locust-load-inference/README.md b/benchmarks/benchmark/tools/locust-load-inference/README.md
index 043125d3e..f36de1daf 100644
--- a/benchmarks/benchmark/tools/locust-load-inference/README.md
+++ b/benchmarks/benchmark/tools/locust-load-inference/README.md
@@ -10,12 +10,14 @@
- [Step 4: create and give service account access to write to output gcs bucket](#step-4-create-and-give-service-account-access-to-write-to-output-gcs-bucket)
- [Step 5: create artifact repository for automated Locust docker build](#step-5-create-artifact-repository-for-automated-locust-docker-build)
- [Step 6: create and configure terraform.tfvars](#step-6-create-and-configure-terraformtfvars)
- - [\[optional\] set-up credentials config with kubeconfig](#optional-set-up-credentials-config-with-kubeconfig)
+ - [optional: set-up credentials config with kubeconfig](#optional-set-up-credentials-config-with-kubeconfig)
+ - [optional: set up secret token in secret manager](#optional-set-up-secret-token-in-secret-manager)
- [Step 7: login to gcloud](#step-7-login-to-gcloud)
- [Step 8: terraform initialize, plan and apply](#step-8-terraform-initialize-plan-and-apply)
- [Step 9: start an end to end benchmark](#step-9-start-an-end-to-end-benchmark)
- [option 1: initiate a single end to end Locust benchmark run via curl command](#option-1-initiate-a-single-end-to-end-locust-benchmark-run-via-curl-command)
- [option 2: interactive benchmark with locust web ui](#option-2-interactive-benchmark-with-locust-web-ui)
+ - [writing custom metrics](#writing-custom-metrics)
- [Additional Tips](#additional-tips)
- [Variables](#variables)
@@ -142,6 +144,18 @@ credentials_config = {
}
```
+#### [optional] set up secret token in Secret Manager
+
+A model may require a security token to access it. For example, Llama2 from HuggingFace is a gated model that requires a [user access token](https://huggingface.co/docs/hub/en/security-tokens). If the model you want to run does not require this, skip this step.
+
+If you followed the steps from `.../../infra/`, Secret Manager and the user access token should already be set up. Alternatively, you can create a Kubernetes Secret to store your Hugging Face CLI token. You can do this from the command line with kubectl:
+```bash
+kubectl create secret generic huggingface-secret --from-literal=token='************'
+```
+
+This command creates a new Secret named `huggingface-secret` with a key `token` that contains your Hugging Face CLI token.
+Note that for production or shared environments, passing access tokens directly as command-line literals is not advisable.
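+For example, instead of passing the token as a literal, you can read it from a local file (the file name below is illustrative; the file should contain only the token):
+```bash
+kubectl create secret generic huggingface-secret --from-file=token=./hf_token.txt
+```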
+
### Step 7: login to gcloud
Run the following gcloud command for authorization:
@@ -208,6 +222,17 @@ In a web browser, visit the following website:
```
http://$LOCUST_SERVICE_IP:8089
```
+#### writing custom metrics
+
+If the variable `enable_custom_metrics` is set to `true`, the custom metrics collected by the Locust master are available at the following endpoints:
+* While the test is running:
+```
+http://$LOCUST_SERVICE_IP:8089/custom_metrics/custom_metrics.csv
+```
+* After a test ends:
+```
+http://$LOCUST_SERVICE_IP:8089/custom_metrics/custom_metrics_final.csv
+```
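+
+The CSV can also be fetched from the command line, for example (assuming `LOCUST_SERVICE_IP` is set as described above):
+```bash
+curl -o custom_metrics.csv "http://$LOCUST_SERVICE_IP:8089/custom_metrics/custom_metrics.csv"
+```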
### Additional Tips
@@ -237,4 +262,6 @@ To change the benchmark configuration, you will have to rerun terraform destroy
| [sax\_model](#input\_sax\_model) | Benchmark server configuration for sax model. Only required if framework is sax. | `string` | `""` | no |
| [tokenizer](#input\_tokenizer) | Benchmark server configuration for tokenizer. | `string` | `"tiiuae/falcon-7b"` | yes |
| [use\_beam\_search](#input\_use\_beam\_search) | Benchmark server configuration for use beam search. | `bool` | `false` | no |
+| [enable\_custom\_metrics](#input\_enable\_custom\_metrics) | Whether to collect custom metrics, such as the number of tokens sent and received per request. | `bool` | `false` | no |
+| [huggingface\_secret](#input\_huggingface\_secret) | Name of the Kubernetes Secret that holds the Hugging Face token. | `string` | `"huggingface-secret"` | no |
+| [csv\_upload\_frequency](#input\_csv\_upload\_frequency) | How frequently, in seconds, to write the custom metrics CSV when custom metrics are enabled. | `number` | `10` | no |
diff --git a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/custom_metric_aggregator.py b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/custom_metric_aggregator.py
new file mode 100644
index 000000000..773be423a
--- /dev/null
+++ b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/custom_metric_aggregator.py
@@ -0,0 +1,42 @@
+import csv
+import datetime
+import logging
+
+class TokenMetricCollector:
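+    """Aggregates per-request token counts, latencies, and success/failure counts reported by Locust workers."""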
+ def __init__(self):
+ self.tokens_sent = []
+ self.tokens_received = []
+ self.test_time = []
+ self.success_count = 0
+ self.failure_count = 0
+
+
+ def add_metric(self, sent, received, test_time, request_successful_bool):
+        if request_successful_bool == 1:
+ self.tokens_sent.append(sent)
+ self.tokens_received.append(received)
+ self.test_time.append(test_time)
+ self.success_count += 1
+ else:
+ self.failure_count += 1
+
+
+ def calculate_average_tokens(self):
+ avg_sent = sum(self.tokens_sent) / len(self.tokens_sent) if self.tokens_sent else 0
+ avg_received = sum(self.tokens_received) / len(self.tokens_received) if self.tokens_received else 0
+        avg_test_time = sum(self.test_time) / len(self.test_time) if self.test_time else 0
+
+ return avg_sent, avg_received, avg_test_time
+
+    def write_to_csv(self, file_path='custom_metrics.csv'):
+ avg_sent, avg_received, avg_test_time = self.calculate_average_tokens()
+ with open(file_path, mode='w', newline='') as file:
+ writer = csv.writer(file)
+ writer.writerow(['Metric', 'Average Value'])
+ writer.writerow(['# of Successful Req', self.success_count])
+ writer.writerow(['# of Failed Req', self.failure_count])
+ writer.writerow(['Avg Tokens Sent Per Req', avg_sent])
+ writer.writerow(['Avg Tokens Received Per Req', avg_received])
+ writer.writerow(['Avg Test Time', avg_test_time])
+ writer.writerow(['Timestamp', datetime.datetime.now()])
+
diff --git a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/requirements.txt b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/requirements.txt
index 6620a44e9..296c79c88 100644
--- a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/requirements.txt
+++ b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/requirements.txt
@@ -31,4 +31,4 @@ Werkzeug==2.3.8
zipp==3.8.0
zope.event==4.5.0
zope.interface==5.4.0
-TensorFlow >= 2.0
+TensorFlow >= 2.0
\ No newline at end of file
diff --git a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/run.sh b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/run.sh
index 9d7d8586f..b405ba557 100644
--- a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/run.sh
+++ b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/run.sh
@@ -21,6 +21,7 @@ LOCUST_MODE=${LOCUST_MODE:-standalone}
if [[ "$LOCUST_MODE" = "master" ]]; then
LOCUS_OPTS="$LOCUS_OPTS --master --stop-timeout 300"
elif [[ "$LOCUST_MODE" = "worker" ]]; then
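+    # Log in to Hugging Face so the worker can download gated tokenizers (e.g. Llama2)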
+    huggingface-cli login --token "$HUGGINGFACE_TOKEN"
FILTER_PROMPTS="python /locust-tasks/load_data.py"
FILTER_PROMPTS_OPTS="--gcs_path=$GCS_PATH --tokenizer=$TOKENIZER --max_prompt_len=$MAX_PROMPT_LEN --max_num_prompts=$MAX_NUM_PROMPTS"
echo "$FILTER_PROMPTS $FILTER_PROMPTS_OPTS"
diff --git a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py
index 99643a757..d85840488 100644
--- a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py
+++ b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py
@@ -14,15 +14,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+import json
import logging
+import os
import random
-
+import threading
+import time
+from locust import web # Import the web module from Locust
+from flask import send_from_directory
+from typing import Callable, List
from locust import FastHttpUser, task, events
from locust.runners import MasterRunner
+from transformers import AutoTokenizer, PreTrainedTokenizerBase
-logging.basicConfig(level=logging.INFO)
+from custom_metric_aggregator import TokenMetricCollector
+metric_collector = TokenMetricCollector()
+logging.basicConfig(level=logging.INFO)
def load_test_prompts():
"""Loads test prompts from a local file location."""
@@ -31,8 +40,9 @@ def load_test_prompts():
return test_data
-def generate_request(model_params, prompt):
+def generate_request(prompt):
"""Generates request for given model server"""
+ global model_params
backend = model_params["backend"]
best_of = model_params["best_of"]
output_len = model_params["max_output_len"]
@@ -89,6 +99,28 @@ def generate_request(model_params, prompt):
raise ValueError(f"Unknown backend: {backend}")
return pload
+def get_token_count(prompt, resp):
+ """Get number of tokens to prompt and resp using the tokenizer"""
+ global tokenizer
+ backend = model_params["backend"]
+
+ number_of_input_tokens = len(tokenizer.encode(prompt))
+ number_of_output_tokens = 0
+
+ if backend == "vllm":
+ number_of_output_tokens = 0 # to be added
+ elif backend == "tgi":
+ number_of_output_tokens = 0 # to be added
+ elif backend == "tensorrt_llm_triton":
+ resp_dict = json.loads(resp.content.decode('utf-8'))
+ number_of_output_tokens = len(tokenizer.encode(resp_dict['text_output']))
+ elif backend == "sax":
+ number_of_output_tokens = 0 # to be added
+ else:
+ raise ValueError(f"Unknown backend: {backend}")
+ return number_of_input_tokens, number_of_output_tokens
+
+
class BenchmarkUser(FastHttpUser):
weight = 1
@@ -97,7 +129,9 @@ class BenchmarkUser(FastHttpUser):
def lm_generate(self):
global test_data
global model_params
+ global tokenizer
+
if not test_data:
logging.error("No test data configured.")
logging.error("Stopping the runner")
@@ -106,15 +140,81 @@ def lm_generate(self):
prompt = test_data[random.randrange(0, len(test_data))]
- request = generate_request(model_params, prompt)
+ request = generate_request(prompt)
headers = {"User-Agent": "Benchmark Client", "Connection": "close"}
logging.info(f"Sending request: {request}")
+ test_start_time = time.time()
with self.client.post("/generate", headers=headers, json=request, catch_response=True) as resp:
- if resp.status_code != 200:
- # Locust considers response code < 400 as success, if not 200 mark as otherwise.
- resp.failure("Got unexpected response")
- logging.error(
- f"request {request} failed with: {resp.status_code}")
+ if resp.status_code == 200:
+ self.handle_successful_response(prompt, resp, test_start_time)
+ else:
+ self.handle_failed_response(request, resp)
+
+
+    def handle_successful_response(self, prompt, response, start_time):
+        global model_params
+        if model_params['enable_custom_metrics'] == 'true':
+            test_time = time.time() - start_time
+            request_successful_bool = 1
+            tokens_sent, tokens_received = get_token_count(prompt, response)
+
+ logging.info(f'sending to master: metric_update: {[tokens_sent, tokens_received, test_time, request_successful_bool]}')
+ self.environment.runner.send_message("metric_update", [tokens_sent, tokens_received, test_time, request_successful_bool])
+
+ def handle_failed_response(self, request, response):
+ global model_params
+ response.failure("Got unexpected response")
+ logging.error(f"request {request} failed with: {response.status_code}")
+ if model_params['enable_custom_metrics'] == 'true':
+ tokens_sent = -1
+ tokens_received = -1
+ test_time = -1
+ request_successful_bool = 0
+
+ logging.info(f'sending to master: metric_update: {[tokens_sent, tokens_received, test_time, request_successful_bool]}')
+ self.environment.runner.send_message("metric_update", [tokens_sent, tokens_received, test_time, request_successful_bool])
+
+
+"""
+methods for the locust master to write custom metrics
+"""
+def collect_metrics(msg, **_kwargs):
+ """locust master collects the metrics emitted by the locust workers and updates the metric_collector object"""
+ sent = msg.data[0]
+ received = msg.data[1]
+ test_time = msg.data[2]
+ request_successful_bool = msg.data[3]
+    logging.info(f'received from worker {msg.data}')
+ metric_collector.add_metric(sent, received, test_time, request_successful_bool)
+
+def periodically_write_metrics(environment):
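+    """Writes the collected metrics to CSV and reschedules itself every csv_upload_frequency seconds."""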
+ metric_collector.write_to_csv()
+ threading.Timer(environment.parsed_options.csv_upload_frequency, periodically_write_metrics, args=(environment,)).start()
+
+def setup_periodic_metrics_writer(environment, **_kwargs):
+ """locust master periodically writes the collected metrics to csv"""
+ periodically_write_metrics(environment)
+
+def setup_custom_route(environment, **_kwargs):
+ """Sets up custom routes in the locust master for serving CSV files."""
+    directory = os.getcwd()  # write_to_csv uses relative paths, so the CSVs land in the current working directory
+
+    @environment.web_ui.app.route("/custom_metrics/<filename>")
+ def custom_metrics(filename):
+ if filename not in ['custom_metrics.csv', 'custom_metrics_final.csv']:
+ return "File not found.", 404 # Basic validation to prevent unauthorized file access
+ return send_from_directory(directory, filename, as_attachment=True)
+
+@events.test_stop.add_listener
+def on_test_stop(environment, **kwargs):
+ """on test stop the locust master writes the output to custom_metrics_final and resets the metric_collector for next tests"""
+ if isinstance(environment.runner, MasterRunner) and environment.parsed_options.enable_custom_metrics == 'true':
+        logging.info('writing final custom metrics and resetting the collector')
+        metric_collector.write_to_csv('custom_metrics_final.csv')
+        metric_collector.__init__()  # reset the collected metrics for the next test run
+        metric_collector.write_to_csv()
+
+
@events.init_command_line_parser.add_listener
@@ -129,6 +229,12 @@ def _(parser):
include_in_web_ui=True, default="", help="Required for sax backend. Used only for sax backend. Model name to send request to at API server for SAX model server.")
parser.add_argument("--use_beam_search", action="store_true", env_var="USE_BEAM_SEARCH",
include_in_web_ui=True, help="Whether to use beam search instead of sampling.")
+ parser.add_argument("--tokenizer", type=str, env_var="TOKENIZER",
+ include_in_web_ui=False, default="", help="Tokenizer to use for token calculations")
+ parser.add_argument("--enable_custom_metrics", type=str, env_var="ENABLE_CUSTOM_METRICS",
+                        include_in_web_ui=True, default="false", help="Enable collection of custom metrics")
+ parser.add_argument("--csv_upload_frequency", type=int, env_var="CSV_UPLOAD_FREQUENCY",
+ include_in_web_ui=True, default=10, help="upload custom metrics every X seconds")
@events.init.add_listener
@@ -136,6 +242,10 @@ def _(environment, **kwargs):
if not isinstance(environment.runner, MasterRunner):
global model_params
global test_data
+ global metric_collector
+ global tokenizer
+
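+        # the tokenizer is used by get_token_count to compute per-request token counts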
+ tokenizer = AutoTokenizer.from_pretrained(environment.parsed_options.tokenizer)
logging.info(
"Loading test prompts from locust-tasks/filtered_prompts.txt.")
@@ -152,6 +262,15 @@ def _(environment, **kwargs):
"max_output_len": environment.parsed_options.max_output_len,
"sax_model": environment.parsed_options.sax_model,
"use_beam_search": environment.parsed_options.use_beam_search,
+ "tokenizer": environment.parsed_options.tokenizer,
+ "enable_custom_metrics" : environment.parsed_options.enable_custom_metrics,
+ "csv_upload_frequency" : environment.parsed_options.csv_upload_frequency,
}
logging.info(
f"Using the following benchmark parameters:\n {model_params}")
+
+ elif environment.parsed_options.enable_custom_metrics == 'true':
+ # code to setup the locust master to write custom metrics
+ setup_periodic_metrics_writer(environment)
+ setup_custom_route(environment)
+ environment.runner.register_message("metric_update", collect_metrics)
diff --git a/benchmarks/benchmark/tools/locust-load-inference/main.tf b/benchmarks/benchmark/tools/locust-load-inference/main.tf
index 9002e99ab..6fab0603b 100644
--- a/benchmarks/benchmark/tools/locust-load-inference/main.tf
+++ b/benchmarks/benchmark/tools/locust-load-inference/main.tf
@@ -41,6 +41,9 @@ locals {
sax_model = var.sax_model
tokenizer = var.tokenizer
use_beam_search = var.use_beam_search
+ enable_custom_metrics = var.enable_custom_metrics
+ huggingface_secret = var.huggingface_secret
+ csv_upload_frequency = var.csv_upload_frequency
})) : data]
])
}
diff --git a/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-master-controller.yaml.tpl b/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-master-controller.yaml.tpl
index 64be13170..bcef25201 100644
--- a/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-master-controller.yaml.tpl
+++ b/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-master-controller.yaml.tpl
@@ -26,6 +26,8 @@ spec:
value: http://${inference_server_service}
- name: BACKEND
value: ${inference_server_framework}
+ - name: ENABLE_CUSTOM_METRICS
+ value: ${enable_custom_metrics}
ports:
- name: loc-master-web
containerPort: 8089
diff --git a/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-worker-controller.yaml.tpl b/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-worker-controller.yaml.tpl
index e14fcb61f..d8a09dab5 100644
--- a/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-worker-controller.yaml.tpl
+++ b/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-worker-controller.yaml.tpl
@@ -44,3 +44,12 @@ spec:
value: ${tokenizer}
- name: USE_BEAM_SEARCH
value: ${use_beam_search}
+ - name: ENABLE_CUSTOM_METRICS
+ value: ${enable_custom_metrics}
+ - name: CSV_UPLOAD_FREQUENCY
+ value: ${csv_upload_frequency}
+ - name: HUGGINGFACE_TOKEN
+ valueFrom:
+ secretKeyRef:
+              name: ${huggingface_secret} # Kubernetes Secret that holds the Hugging Face token
+ key: token
diff --git a/benchmarks/benchmark/tools/locust-load-inference/variables.tf b/benchmarks/benchmark/tools/locust-load-inference/variables.tf
index 920013b31..e8000c7f5 100644
--- a/benchmarks/benchmark/tools/locust-load-inference/variables.tf
+++ b/benchmarks/benchmark/tools/locust-load-inference/variables.tf
@@ -189,4 +189,24 @@ variable "run_test_automatically" {
description = "Run the test after deployment"
type = bool
default = false
+}
+
+variable "enable_custom_metrics" {
+ description = "enable custom metric output in Locust"
+ type = bool
+ default = false
+}
+
+variable "huggingface_secret" {
+ description = "name of the kubectl huggingface secret token"
+ type = string
+ nullable = true
+ default = "huggingface-secret"
+}
+
+variable "csv_upload_frequency" {
+ description = "how frequently, in seconds, to upload csv if custom metrics is turned on"
+ type = number
+ nullable = true
+ default = 10
}
\ No newline at end of file