diff --git a/benchmarks/benchmark/tools/locust-load-inference/README.md b/benchmarks/benchmark/tools/locust-load-inference/README.md
index 043125d3e..f36de1daf 100644
--- a/benchmarks/benchmark/tools/locust-load-inference/README.md
+++ b/benchmarks/benchmark/tools/locust-load-inference/README.md
@@ -10,12 +10,14 @@
 - [Step 4: create and give service account access to write to output gcs bucket](#step-4-create-and-give-service-account-access-to-write-to-output-gcs-bucket)
 - [Step 5: create artifact repository for automated Locust docker build](#step-5-create-artifact-repository-for-automated-locust-docker-build)
 - [Step 6: create and configure terraform.tfvars](#step-6-create-and-configure-terraformtfvars)
-  - [\[optional\] set-up credentials config with kubeconfig](#optional-set-up-credentials-config-with-kubeconfig)
+  - [optional: set-up credentials config with kubeconfig](#optional-set-up-credentials-config-with-kubeconfig)
+  - [optional: set up secret token in secret manager](#optional-set-up-secret-token-in-secret-manager)
 - [Step 7: login to gcloud](#step-7-login-to-gcloud)
 - [Step 8: terraform initialize, plan and apply](#step-8-terraform-initialize-plan-and-apply)
 - [Step 9: start an end to end benchmark](#step-9-start-an-end-to-end-benchmark)
   - [option 1: initiate a single end to end Locust benchmark run via curl command](#option-1-initiate-a-single-end-to-end-locust-benchmark-run-via-curl-command)
   - [option 2: interactive benchmark with locust web ui](#option-2-interactive-benchmark-with-locust-web-ui)
+    - [writing custom metrics](#writing-custom-metrics)
 - [Additional Tips](#additional-tips)
 - [Variables](#variables)
@@ -142,6 +144,18 @@ credentials_config = {
 }
 ```
 
+#### [optional] set up secret token in Secret Manager
+
+A model may require a security token to access it. For example, Llama2 from HuggingFace is a gated model that requires a [user access token](https://huggingface.co/docs/hub/en/security-tokens). If the model you want to run does not require this, skip this step.
+
+If you followed the steps in `.../../infra/`, Secret Manager and the user access token should already be set up. Alternatively, you can create a Kubernetes Secret to store your Hugging Face CLI token. You can do this from the command line with kubectl:
+```bash
+kubectl create secret generic huggingface-secret --from-literal=token='************'
+```
+
+This command creates a Secret named `huggingface-secret` with a key `token` that contains your Hugging Face CLI token.
+Note that for production or shared environments, passing user access tokens directly as command-line literals is not advisable.
+
 ### Step 7: login to gcloud
 
 Run the following gcloud command for authorization:
@@ -208,6 +222,17 @@ In a web browser, visit the following website:
 ```
 http://$LOCUST_SERVICE_IP:8089
 ```
+#### writing custom metrics
+
+If the variable `enable_custom_metrics` is set to `true`, the custom metrics collected by the Locust master are available at the following endpoints:
+* While the test is running:
+```
+http://$LOCUST_SERVICE_IP:8089/custom_metrics/custom_metrics.csv
+```
+* After a test ends:
+```
+http://$LOCUST_SERVICE_IP:8089/custom_metrics/custom_metrics_final.csv
+```
 
 ### Additional Tips
 
@@ -237,4 +262,6 @@ To change the benchmark configuration, you will have to rerun terraform destroy
 | [sax\_model](#input\_sax\_model) | Benchmark server configuration for sax model. Only required if framework is sax. | `string` | `""` | no |
 | [tokenizer](#input\_tokenizer) | Benchmark server configuration for tokenizer. | `string` | `"tiiuae/falcon-7b"` | yes |
 | [use\_beam\_search](#input\_use\_beam\_search) | Benchmark server configuration for use beam search. | `bool` | `false` | no |
+| [enable\_custom\_metrics](#input\_enable\_custom\_metrics) | Whether to collect custom metrics, such as the number of tokens sent and received. | `bool` | `false` | no |
+| [huggingface\_secret](#input\_huggingface\_secret) | Name of the Kubernetes Secret that holds the Hugging Face token. | `string` | `"huggingface-secret"` | no |
diff --git a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/custom_metric_aggregator.py b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/custom_metric_aggregator.py
new file mode 100644
index 000000000..773be423a
--- /dev/null
+++ b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/custom_metric_aggregator.py
@@ -0,0 +1,42 @@
+import csv
+import datetime
+import logging
+
+
+class TokenMetricCollector:
+    def __init__(self):
+        self.tokens_sent = []
+        self.tokens_received = []
+        self.test_time = []
+        self.success_count = 0
+        self.failure_count = 0
+
+    def add_metric(self, sent, received, test_time, request_successful_bool):
+        if request_successful_bool == 1:
+            self.tokens_sent.append(sent)
+            self.tokens_received.append(received)
+            self.test_time.append(test_time)
+            self.success_count += 1
+        else:
+            self.failure_count += 1
+
+    def calculate_average_tokens(self):
+        avg_sent = sum(self.tokens_sent) / len(self.tokens_sent) if self.tokens_sent else 0
+        avg_received = sum(self.tokens_received) / len(self.tokens_received) if self.tokens_received else 0
+        avg_test_time = sum(self.test_time) / len(self.test_time) if self.test_time else 0
+
+        return avg_sent, avg_received, avg_test_time
+
+    def write_to_csv(self, file_path='custom_metrics.csv'):
+        avg_sent, avg_received, avg_test_time = self.calculate_average_tokens()
+        with open(file_path, mode='w', newline='') as file:
+            writer = csv.writer(file)
+            writer.writerow(['Metric', 'Average Value'])
+            writer.writerow(['# of Successful Req', self.success_count])
+            writer.writerow(['# of Failed Req', self.failure_count])
+            writer.writerow(['Avg Tokens Sent Per Req', avg_sent])
+            writer.writerow(['Avg Tokens Received Per Req', avg_received])
+            writer.writerow(['Avg Test Time', avg_test_time])
+            writer.writerow(['Timestamp', datetime.datetime.now()])
+
diff --git a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/requirements.txt b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/requirements.txt
index 6620a44e9..296c79c88 100644
--- a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/requirements.txt
+++ b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/requirements.txt
@@ -31,4 +31,4 @@ Werkzeug==2.3.8
 zipp==3.8.0
 zope.event==4.5.0
 zope.interface==5.4.0
-TensorFlow >= 2.0
+TensorFlow >= 2.0
\ No newline at end of file
diff --git a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/run.sh b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/run.sh
index 9d7d8586f..b405ba557 100644
--- a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/run.sh
+++ b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/run.sh
@@ -21,6 +21,7 @@ LOCUST_MODE=${LOCUST_MODE:-standalone}
 if [[ "$LOCUST_MODE" = "master" ]]; then
     LOCUS_OPTS="$LOCUS_OPTS --master --stop-timeout 300"
 elif [[ "$LOCUST_MODE" = "worker" ]]; then
+    huggingface-cli login --token "$HUGGINGFACE_TOKEN"
     FILTER_PROMPTS="python /locust-tasks/load_data.py"
     FILTER_PROMPTS_OPTS="--gcs_path=$GCS_PATH --tokenizer=$TOKENIZER --max_prompt_len=$MAX_PROMPT_LEN --max_num_prompts=$MAX_NUM_PROMPTS"
     echo "$FILTER_PROMPTS $FILTER_PROMPTS_OPTS"
diff --git a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py
index 99643a757..d85840488 100644
--- a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py
+++ b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py
@@ -14,15 +14,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import json
 import logging
+import os
 import random
-
+import threading
+import time
+from locust import web  # Import the web module from Locust
+from flask import send_from_directory
+from typing import Callable, List
 from locust import FastHttpUser, task, events
 from locust.runners import MasterRunner
+from transformers import AutoTokenizer, PreTrainedTokenizerBase
 
-logging.basicConfig(level=logging.INFO)
+from custom_metric_aggregator import TokenMetricCollector
+metric_collector = TokenMetricCollector()
 
+logging.basicConfig(level=logging.INFO)
 
 def load_test_prompts():
     """Loads test prompts from a local file location."""
@@ -31,8 +40,9 @@
     return test_data
 
 
-def generate_request(model_params, prompt):
+def generate_request(prompt):
     """Generates request for given model server"""
+    global model_params
     backend = model_params["backend"]
     best_of = model_params["best_of"]
     output_len = model_params["max_output_len"]
@@ -89,6 +99,28 @@
         raise ValueError(f"Unknown backend: {backend}")
     return pload
 
 
+def get_token_count(prompt, resp):
+    """Get the number of tokens in the prompt and the response using the tokenizer."""
+    global tokenizer
+    backend = model_params["backend"]
+
+    number_of_input_tokens = len(tokenizer.encode(prompt))
+    number_of_output_tokens = 0
+
+    if backend == "vllm":
+        number_of_output_tokens = 0  # to be added
+    elif backend == "tgi":
+        number_of_output_tokens = 0  # to be added
+    elif backend == "tensorrt_llm_triton":
+        resp_dict = json.loads(resp.content.decode('utf-8'))
+        number_of_output_tokens = len(tokenizer.encode(resp_dict['text_output']))
+    elif backend == "sax":
+        number_of_output_tokens = 0  # to be added
+    else:
+        raise ValueError(f"Unknown backend: {backend}")
+    return number_of_input_tokens, number_of_output_tokens
+
+
 class BenchmarkUser(FastHttpUser):
     weight = 1
@@ -97,7 +129,9 @@
     def lm_generate(self):
         global test_data
         global model_params
+        global tokenizer
+
 
         if not test_data:
             logging.error("No test data configured.")
             logging.error("Stopping the runner")
@@ -106,15 +140,81 @@
         prompt = test_data[random.randrange(0, len(test_data))]
-        request = generate_request(model_params, prompt)
+        request = generate_request(prompt)
         headers = {"User-Agent": "Benchmark Client", "Connection": "close"}
         logging.info(f"Sending request: {request}")
+        test_start_time = time.time()
         with self.client.post("/generate", headers=headers, json=request, catch_response=True) as resp:
-            if resp.status_code != 200:
-                # Locust considers response code < 400 as success, if not 200 mark as otherwise.
-                resp.failure("Got unexpected response")
-                logging.error(
-                    f"request {request} failed with: {resp.status_code}")
+            if resp.status_code == 200:
+                self.handle_successful_response(prompt, resp, test_start_time)
+            else:
+                self.handle_failed_response(request, resp)
+
+    def handle_successful_response(self, prompt, response, start_time):
+        global model_params
+        if model_params['enable_custom_metrics'] == 'true':
+            test_time = time.time() - start_time
+            request_successful_bool = 1
+            tokens_sent, tokens_received = get_token_count(prompt, response)
+
+            logging.info(f'sending to master: metric_update: {[tokens_sent, tokens_received, test_time, request_successful_bool]}')
+            self.environment.runner.send_message("metric_update", [tokens_sent, tokens_received, test_time, request_successful_bool])
+
+    def handle_failed_response(self, request, response):
+        global model_params
+        response.failure("Got unexpected response")
+        logging.error(f"request {request} failed with: {response.status_code}")
+        if model_params['enable_custom_metrics'] == 'true':
+            tokens_sent = -1
+            tokens_received = -1
+            test_time = -1
+            request_successful_bool = 0
+
+            logging.info(f'sending to master: metric_update: {[tokens_sent, tokens_received, test_time, request_successful_bool]}')
+            self.environment.runner.send_message("metric_update", [tokens_sent, tokens_received, test_time, request_successful_bool])
+
+
+"""
+Methods for the Locust master to write custom metrics.
+"""
+
+
+def collect_metrics(msg, **_kwargs):
+    """The Locust master collects the metrics emitted by the workers and updates the metric_collector object."""
+    sent = msg.data[0]
+    received = msg.data[1]
+    test_time = msg.data[2]
+    request_successful_bool = msg.data[3]
+    logging.info(f'received from worker {msg.data}')
+    metric_collector.add_metric(sent, received, test_time, request_successful_bool)
+
+
+def periodically_write_metrics(environment):
+    metric_collector.write_to_csv()
+    threading.Timer(environment.parsed_options.csv_upload_frequency, periodically_write_metrics, args=(environment,)).start()
+
+
+def setup_periodic_metrics_writer(environment, **_kwargs):
+    """The Locust master periodically writes the collected metrics to CSV."""
+    periodically_write_metrics(environment)
+
+
+def setup_custom_route(environment, **_kwargs):
+    """Sets up custom routes in the Locust master for serving CSV files."""
+    directory = os.path.dirname('/')  # Directory where the file is located
+
+    @environment.web_ui.app.route("/custom_metrics/<filename>")
+    def custom_metrics(filename):
+        if filename not in ['custom_metrics.csv', 'custom_metrics_final.csv']:
+            return "File not found.", 404  # Basic validation to prevent unauthorized file access
+        return send_from_directory(directory, filename, as_attachment=True)
+
+
+@events.test_stop.add_listener
+def on_test_stop(environment, **kwargs):
+    """On test stop, the Locust master writes the output to custom_metrics_final.csv and resets the metric_collector for the next test."""
+    if isinstance(environment.runner, MasterRunner) and environment.parsed_options.enable_custom_metrics == 'true':
+        logging.info('writing final metrics and resetting metric_collector')
+        metric_collector.write_to_csv('custom_metrics_final.csv')
+        metric_collector.__init__()
+        metric_collector.write_to_csv()
+
+
 @events.init_command_line_parser.add_listener
@@ -129,6 +229,12 @@ def _(parser):
                         include_in_web_ui=True, default="", help="Required for sax backend. Used only for sax backend. Model name to send request to at API server for SAX model server.")
     parser.add_argument("--use_beam_search", action="store_true", env_var="USE_BEAM_SEARCH",
                         include_in_web_ui=True, help="Whether to use beam search instead of sampling.")
+    parser.add_argument("--tokenizer", type=str, env_var="TOKENIZER",
+                        include_in_web_ui=False, default="", help="Tokenizer to use for token count calculations.")
+    parser.add_argument("--enable_custom_metrics", type=str, env_var="ENABLE_CUSTOM_METRICS",
+                        include_in_web_ui=True, default="false", help="Enable collection of custom metrics.")
+    parser.add_argument("--csv_upload_frequency", type=int, env_var="CSV_UPLOAD_FREQUENCY",
+                        include_in_web_ui=True, default=10, help="How often, in seconds, to write custom metrics to CSV.")
@@ -136,6 +242,10 @@ def _(environment, **kwargs):
     if not isinstance(environment.runner, MasterRunner):
         global model_params
         global test_data
+        global metric_collector
+        global tokenizer
+
+        tokenizer = AutoTokenizer.from_pretrained(environment.parsed_options.tokenizer)
 
         logging.info(
             "Loading test prompts from locust-tasks/filtered_prompts.txt.")
@@ -152,6 +262,15 @@ def _(environment, **kwargs):
             "max_output_len": environment.parsed_options.max_output_len,
             "sax_model": environment.parsed_options.sax_model,
             "use_beam_search": environment.parsed_options.use_beam_search,
+            "tokenizer": environment.parsed_options.tokenizer,
+            "enable_custom_metrics": environment.parsed_options.enable_custom_metrics,
+            "csv_upload_frequency": environment.parsed_options.csv_upload_frequency,
         }
         logging.info(
             f"Using the following benchmark parameters:\n {model_params}")
+
+    elif environment.parsed_options.enable_custom_metrics == 'true':
+        # Set up the Locust master to collect and serve the custom metrics.
+        setup_periodic_metrics_writer(environment)
+        setup_custom_route(environment)
+        environment.runner.register_message("metric_update", collect_metrics)
diff --git a/benchmarks/benchmark/tools/locust-load-inference/main.tf b/benchmarks/benchmark/tools/locust-load-inference/main.tf
index 9002e99ab..6fab0603b 100644
--- a/benchmarks/benchmark/tools/locust-load-inference/main.tf
+++ b/benchmarks/benchmark/tools/locust-load-inference/main.tf
@@ -41,6 +41,9 @@ locals {
      sax_model             = var.sax_model
      tokenizer             = var.tokenizer
      use_beam_search       = var.use_beam_search
+      enable_custom_metrics = var.enable_custom_metrics
+      huggingface_secret    = var.huggingface_secret
+      csv_upload_frequency  = var.csv_upload_frequency
     })) : data]
   ])
 }
diff --git a/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-master-controller.yaml.tpl b/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-master-controller.yaml.tpl
index 64be13170..bcef25201 100644
--- a/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-master-controller.yaml.tpl
+++ b/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-master-controller.yaml.tpl
@@ -26,6 +26,8 @@ spec:
           value: http://${inference_server_service}
         - name: BACKEND
           value: ${inference_server_framework}
+        - name: ENABLE_CUSTOM_METRICS
+          value: ${enable_custom_metrics}
         ports:
         - name: loc-master-web
           containerPort: 8089
diff --git a/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-worker-controller.yaml.tpl b/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-worker-controller.yaml.tpl
index e14fcb61f..d8a09dab5 100644
--- a/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-worker-controller.yaml.tpl
+++ b/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-worker-controller.yaml.tpl
@@ -44,3 +44,12 @@ spec:
           value: ${tokenizer}
         - name: USE_BEAM_SEARCH
           value: ${use_beam_search}
+        - name: ENABLE_CUSTOM_METRICS
+          value: ${enable_custom_metrics}
+        - name: CSV_UPLOAD_FREQUENCY
+          value: ${csv_upload_frequency}
+        - name: HUGGINGFACE_TOKEN
+          valueFrom:
+            secretKeyRef:
+              name: ${huggingface_secret}  # Populated from the huggingface_secret Terraform variable
+              key: token
diff --git a/benchmarks/benchmark/tools/locust-load-inference/variables.tf b/benchmarks/benchmark/tools/locust-load-inference/variables.tf
index 920013b31..e8000c7f5 100644
--- a/benchmarks/benchmark/tools/locust-load-inference/variables.tf
+++ b/benchmarks/benchmark/tools/locust-load-inference/variables.tf
@@ -189,4 +189,24 @@ variable "run_test_automatically" {
   description = "Run the test after deployment"
   type        = bool
   default     = false
-}
\ No newline at end of file
+}
+
+variable "enable_custom_metrics" {
+  description = "Enable custom metrics output in Locust"
+  type        = bool
+  default     = false
+}
+
+variable "huggingface_secret" {
+  description = "Name of the Kubernetes Secret that holds the Hugging Face token"
+  type        = string
+  nullable    = true
+  default     = "huggingface-secret"
+}
+
+variable "csv_upload_frequency" {
+  description = "How frequently, in seconds, to write the custom metrics CSV when custom metrics are enabled"
+  type        = number
+  nullable    = true
+  default     = 10
+}
\ No newline at end of file
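
Note: as a quick way to see what the custom-metrics CSV served at `/custom_metrics/custom_metrics.csv` looks like, the sketch below drives `TokenMetricCollector` from `custom_metric_aggregator.py` directly, outside of Locust. It is illustrative only: it assumes the script is run from the `locust-tasks` directory so the module is importable, and the sample token counts and timings are made up.

```python
# Standalone sketch: feed a few fake request results into TokenMetricCollector
# and print the same CSV layout that the Locust master writes and serves.
from custom_metric_aggregator import TokenMetricCollector

collector = TokenMetricCollector()

# Each tuple is (tokens_sent, tokens_received, test_time_seconds, request_successful_bool);
# a failed request is reported as (-1, -1, -1, 0), mirroring handle_failed_response in tasks.py.
sample_requests = [
    (512, 128, 1.8, 1),
    (490, 131, 2.1, 1),
    (-1, -1, -1, 0),
]

for sent, received, test_time, ok in sample_requests:
    collector.add_metric(sent, received, test_time, ok)

collector.write_to_csv('custom_metrics.csv')
print(open('custom_metrics.csv').read())
```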