diff --git a/benchmarks/benchmark/tools/locust-load-inference/README.md b/benchmarks/benchmark/tools/locust-load-inference/README.md
index 043125d3e..f36de1daf 100644
--- a/benchmarks/benchmark/tools/locust-load-inference/README.md
+++ b/benchmarks/benchmark/tools/locust-load-inference/README.md
@@ -10,12 +10,14 @@
 - [Step 4: create and give service account access to write to output gcs bucket](#step-4-create-and-give-service-account-access-to-write-to-output-gcs-bucket)
 - [Step 5: create artifact repository for automated Locust docker build](#step-5-create-artifact-repository-for-automated-locust-docker-build)
 - [Step 6: create and configure terraform.tfvars](#step-6-create-and-configure-terraformtfvars)
-  - [\[optional\] set-up credentials config with kubeconfig](#optional-set-up-credentials-config-with-kubeconfig)
+  - [optional: set-up credentials config with kubeconfig](#optional-set-up-credentials-config-with-kubeconfig)
+  - [optional: set up secret token in secret manager](#optional-set-up-secret-token-in-secret-manager)
 - [Step 7: login to gcloud](#step-7-login-to-gcloud)
 - [Step 8: terraform initialize, plan and apply](#step-8-terraform-initialize-plan-and-apply)
 - [Step 9: start an end to end benchmark](#step-9-start-an-end-to-end-benchmark)
   - [option 1: initiate a single end to end Locust benchmark run via curl command](#option-1-initiate-a-single-end-to-end-locust-benchmark-run-via-curl-command)
   - [option 2: interactive benchmark with locust web ui](#option-2-interactive-benchmark-with-locust-web-ui)
+    - [writing custom metrics](#writing-custom-metrics)
 - [Additional Tips](#additional-tips)
 - [Variables](#variables)
@@ -142,6 +144,18 @@ credentials_config = {
 }
 ```
 
+#### [optional] set up secret token in Secret Manager
+
+A model may require a security token to access it. For example, Llama2 from HuggingFace is a gated model that requires a [user access token](https://huggingface.co/docs/hub/en/security-tokens). If the model you want to run does not require this, skip this step.
+
+If you followed the steps in `.../../infra/`, Secret Manager and the user access token should already be set up. Alternatively, you can create a Kubernetes Secret to store your Hugging Face CLI token. You can do this from the command line with kubectl:
+```bash
+kubectl create secret generic huggingface-secret --from-literal=token='************'
+```
+
+This command creates a Secret named `huggingface-secret` with a key `token` that contains your Hugging Face CLI token.
+Note that for production or shared environments, passing user access tokens directly as command-line literals is not advisable.
+
 ### Step 7: login to gcloud
 
 Run the following gcloud command for authorization:
@@ -208,6 +222,17 @@ In a web browser, visit the following website:
 ```
 http://$LOCUST_SERVICE_IP:8089
 ```
+#### writing custom metrics
+
+If the variable `enable_custom_metrics` is set to `true`, the custom metrics collected by the Locust master are available at the following endpoints:
+* While the test is running:
+```
+http://$LOCUST_SERVICE_IP:8089/custom_metrics/custom_metrics.csv
+```
+* After a test ends:
+```
+http://$LOCUST_SERVICE_IP:8089/custom_metrics/custom_metrics_final.csv
+```
 
 ### Additional Tips
 
@@ -237,4 +262,6 @@ To change the benchmark configuration, you will have to rerun terraform destroy
 | [sax\_model](#input\_sax\_model) | Benchmark server configuration for sax model. Only required if framework is sax. | `string` | `""` | no |
 | [tokenizer](#input\_tokenizer) | Benchmark server configuration for tokenizer. | `string` | `"tiiuae/falcon-7b"` | yes |
 | [use\_beam\_search](#input\_use\_beam\_search) | Benchmark server configuration for use beam search. | `bool` | `false` | no |
+| [enable\_custom\_metrics](#input\_enable\_custom\_metrics) | Whether to collect custom metrics, such as the number of tokens sent and received. | `bool` | `false` | no |
+| [huggingface\_secret](#input\_huggingface\_secret) | Name of the Kubernetes Secret that holds the Hugging Face token. | `string` | `"huggingface-secret"` | no |
diff --git a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/custom_metric_aggregator.py b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/custom_metric_aggregator.py
new file mode 100644
index 000000000..773be423a
--- /dev/null
+++ b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/custom_metric_aggregator.py
@@ -0,0 +1,42 @@
+import csv
+import datetime
+import logging
+
+
+class TokenMetricCollector:
+    def __init__(self):
+        self.tokens_sent = []
+        self.tokens_received = []
+        self.test_time = []
+        self.success_count = 0
+        self.failure_count = 0
+
+    def add_metric(self, sent, received, test_time, request_successful_bool):
+        if request_successful_bool == 1:
+            self.tokens_sent.append(sent)
+            self.tokens_received.append(received)
+            self.test_time.append(test_time)
+            self.success_count += 1
+        else:
+            self.failure_count += 1
+
+    def calculate_average_tokens(self):
+        avg_sent = sum(self.tokens_sent) / len(self.tokens_sent) if self.tokens_sent else 0
+        avg_received = sum(self.tokens_received) / len(self.tokens_received) if self.tokens_received else 0
+        avg_test_time = sum(self.test_time) / len(self.test_time) if self.test_time else 0
+
+        return avg_sent, avg_received, avg_test_time
+
+    def write_to_csv(self, file_path='custom_metrics.csv'):
+        avg_sent, avg_received, avg_test_time = self.calculate_average_tokens()
+        with open(file_path, mode='w', newline='') as file:
+            writer = csv.writer(file)
+            writer.writerow(['Metric', 'Average Value'])
+            writer.writerow(['# of Successful Req', self.success_count])
+            writer.writerow(['# of Failed Req', self.failure_count])
+            writer.writerow(['Avg Tokens Sent Per Req', avg_sent])
+            writer.writerow(['Avg Tokens Received Per Req', avg_received])
+            writer.writerow(['Avg Test Time', avg_test_time])
+            writer.writerow(['Timestamp', datetime.datetime.now()])
+
diff --git a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/requirements.txt b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/requirements.txt
index 6620a44e9..296c79c88 100644
--- a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/requirements.txt
+++ b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/requirements.txt
@@ -31,4 +31,4 @@ Werkzeug==2.3.8
 zipp==3.8.0
 zope.event==4.5.0
 zope.interface==5.4.0
-TensorFlow >= 2.0
+TensorFlow >= 2.0
\ No newline at end of file
diff --git a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/run.sh b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/run.sh
index 9d7d8586f..b405ba557 100644
--- a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/run.sh
+++ b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/run.sh
@@ -21,6 +21,7 @@ LOCUST_MODE=${LOCUST_MODE:-standalone}
 if [[ "$LOCUST_MODE" = "master" ]]; then
     LOCUS_OPTS="$LOCUS_OPTS --master --stop-timeout 300"
 elif [[ "$LOCUST_MODE" = "worker" ]]; then
+    huggingface-cli login --token "$HUGGINGFACE_TOKEN"
     FILTER_PROMPTS="python /locust-tasks/load_data.py"
     FILTER_PROMPTS_OPTS="--gcs_path=$GCS_PATH --tokenizer=$TOKENIZER --max_prompt_len=$MAX_PROMPT_LEN --max_num_prompts=$MAX_NUM_PROMPTS"
     echo "$FILTER_PROMPTS $FILTER_PROMPTS_OPTS"
diff --git a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py
index 99643a757..d85840488 100644
--- a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py
+++ b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py
@@ -14,15 +14,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import json
 import logging
+import os
 import random
-
+import threading
+import time
+from locust import web  # Import the web module from Locust
+from flask import send_from_directory
+from typing import Callable, List
 from locust import FastHttpUser, task, events
 from locust.runners import MasterRunner
+from transformers import AutoTokenizer, PreTrainedTokenizerBase
 
-logging.basicConfig(level=logging.INFO)
+from custom_metric_aggregator import TokenMetricCollector
+metric_collector = TokenMetricCollector()
 
+logging.basicConfig(level=logging.INFO)
 
 def load_test_prompts():
     """Loads test prompts from a local file location."""
@@ -31,8 +40,9 @@
     return test_data
 
 
-def generate_request(model_params, prompt):
+def generate_request(prompt):
     """Generates request for given model server"""
+    global model_params
     backend = model_params["backend"]
     best_of = model_params["best_of"]
     output_len = model_params["max_output_len"]
@@ -89,6 +99,28 @@
         raise ValueError(f"Unknown backend: {backend}")
     return pload
 
 
+def get_token_count(prompt, resp):
+    """Get the number of tokens in the prompt and the response using the tokenizer."""
+    global tokenizer
+    backend = model_params["backend"]
+
+    number_of_input_tokens = len(tokenizer.encode(prompt))
+    number_of_output_tokens = 0
+
+    if backend == "vllm":
+        number_of_output_tokens = 0  # to be added
+    elif backend == "tgi":
+        number_of_output_tokens = 0  # to be added
+    elif backend == "tensorrt_llm_triton":
+        resp_dict = json.loads(resp.content.decode('utf-8'))
+        number_of_output_tokens = len(tokenizer.encode(resp_dict['text_output']))
+    elif backend == "sax":
+        number_of_output_tokens = 0  # to be added
+    else:
+        raise ValueError(f"Unknown backend: {backend}")
+    return number_of_input_tokens, number_of_output_tokens
+
+
 class BenchmarkUser(FastHttpUser):
     weight = 1
@@ -97,7 +129,9 @@
     def lm_generate(self):
         global test_data
         global model_params
+        global tokenizer
+
 
         if not test_data:
             logging.error("No test data configured.")
             logging.error("Stopping the runner")
@@ -106,15 +140,81 @@
         prompt = test_data[random.randrange(0, len(test_data))]
-        request = generate_request(model_params, prompt)
+        request = generate_request(prompt)
         headers = {"User-Agent": "Benchmark Client", "Connection": "close"}
         logging.info(f"Sending request: {request}")
+        test_start_time = time.time()
         with self.client.post("/generate", headers=headers, json=request, catch_response=True) as resp:
-            if resp.status_code != 200:
-                # Locust considers response code < 400 as success, if not 200 mark as otherwise.
-                resp.failure("Got unexpected response")
-                logging.error(
-                    f"request {request} failed with: {resp.status_code}")
+            if resp.status_code == 200:
+                self.handle_successful_response(prompt, resp, test_start_time)
+            else:
+                self.handle_failed_response(request, resp)
+
+    def handle_successful_response(self, prompt, response, start_time):
+        global model_params
+        if model_params['enable_custom_metrics'] == 'true':
+            test_time = time.time() - start_time
+            request_successful_bool = 1
+            tokens_sent, tokens_received = get_token_count(prompt, response)
+
+            logging.info(f'sending to master: metric_update: {[tokens_sent, tokens_received, test_time, request_successful_bool]}')
+            self.environment.runner.send_message("metric_update", [tokens_sent, tokens_received, test_time, request_successful_bool])
+
+    def handle_failed_response(self, request, response):
+        global model_params
+        response.failure("Got unexpected response")
+        logging.error(f"request {request} failed with: {response.status_code}")
+        if model_params['enable_custom_metrics'] == 'true':
+            tokens_sent = -1
+            tokens_received = -1
+            test_time = -1
+            request_successful_bool = 0
+
+            logging.info(f'sending to master: metric_update: {[tokens_sent, tokens_received, test_time, request_successful_bool]}')
+            self.environment.runner.send_message("metric_update", [tokens_sent, tokens_received, test_time, request_successful_bool])
+
+
+"""
+Methods for the Locust master to write custom metrics.
+"""
+
+
+def collect_metrics(msg, **_kwargs):
+    """The Locust master collects the metrics emitted by the workers and updates the metric_collector object."""
+    sent = msg.data[0]
+    received = msg.data[1]
+    test_time = msg.data[2]
+    request_successful_bool = msg.data[3]
+    logging.info(f'received from worker {msg.data}')
+    metric_collector.add_metric(sent, received, test_time, request_successful_bool)
+
+
+def periodically_write_metrics(environment):
+    metric_collector.write_to_csv()
+    threading.Timer(environment.parsed_options.csv_upload_frequency, periodically_write_metrics, args=(environment,)).start()
+
+
+def setup_periodic_metrics_writer(environment, **_kwargs):
+    """The Locust master periodically writes the collected metrics to CSV."""
+    periodically_write_metrics(environment)
+
+
+def setup_custom_route(environment, **_kwargs):
+    """Sets up custom routes in the Locust master for serving CSV files."""
+    directory = os.path.dirname('/')  # Directory where the file is located
+
+    @environment.web_ui.app.route("/custom_metrics/<filename>")
+    def custom_metrics(filename):
+        if filename not in ['custom_metrics.csv', 'custom_metrics_final.csv']:
+            return "File not found.", 404  # Basic validation to prevent unauthorized file access
+        return send_from_directory(directory, filename, as_attachment=True)
+
+
+@events.test_stop.add_listener
+def on_test_stop(environment, **kwargs):
+    """On test stop, the Locust master writes the output to custom_metrics_final.csv and resets the metric_collector for the next test."""
+    if isinstance(environment.runner, MasterRunner) and environment.parsed_options.enable_custom_metrics == 'true':
+        logging.info('writing final metrics and resetting metric_collector')
+        metric_collector.write_to_csv('custom_metrics_final.csv')
+        metric_collector.__init__()
+        metric_collector.write_to_csv()
+
+
 @events.init_command_line_parser.add_listener
@@ -129,6 +229,12 @@ def _(parser):
                         include_in_web_ui=True, default="", help="Required for sax backend. Used only for sax backend. Model name to send request to at API server for SAX model server.")
     parser.add_argument("--use_beam_search", action="store_true", env_var="USE_BEAM_SEARCH",
                         include_in_web_ui=True, help="Whether to use beam search instead of sampling.")
+    parser.add_argument("--tokenizer", type=str, env_var="TOKENIZER",
+                        include_in_web_ui=False, default="", help="Tokenizer to use for token count calculations.")
+    parser.add_argument("--enable_custom_metrics", type=str, env_var="ENABLE_CUSTOM_METRICS",
+                        include_in_web_ui=True, default="false", help="Enable collection of custom metrics.")
+    parser.add_argument("--csv_upload_frequency", type=int, env_var="CSV_UPLOAD_FREQUENCY",
+                        include_in_web_ui=True, default=10, help="How often, in seconds, to write custom metrics to CSV.")
@@ -136,6 +242,10 @@ def _(environment, **kwargs):
     if not isinstance(environment.runner, MasterRunner):
         global model_params
         global test_data
+        global metric_collector
+        global tokenizer
+
+        tokenizer = AutoTokenizer.from_pretrained(environment.parsed_options.tokenizer)
 
         logging.info(
             "Loading test prompts from locust-tasks/filtered_prompts.txt.")
@@ -152,6 +262,15 @@ def _(environment, **kwargs):
             "max_output_len": environment.parsed_options.max_output_len,
             "sax_model": environment.parsed_options.sax_model,
             "use_beam_search": environment.parsed_options.use_beam_search,
+            "tokenizer": environment.parsed_options.tokenizer,
+            "enable_custom_metrics": environment.parsed_options.enable_custom_metrics,
+            "csv_upload_frequency": environment.parsed_options.csv_upload_frequency,
         }
         logging.info(
             f"Using the following benchmark parameters:\n {model_params}")
+
+    elif environment.parsed_options.enable_custom_metrics == 'true':
+        # Set up the Locust master to collect and serve the custom metrics.
+        setup_periodic_metrics_writer(environment)
+        setup_custom_route(environment)
+        environment.runner.register_message("metric_update", collect_metrics)
diff --git a/benchmarks/benchmark/tools/locust-load-inference/main.tf b/benchmarks/benchmark/tools/locust-load-inference/main.tf
index 9002e99ab..6fab0603b 100644
--- a/benchmarks/benchmark/tools/locust-load-inference/main.tf
+++ b/benchmarks/benchmark/tools/locust-load-inference/main.tf
@@ -41,6 +41,9 @@ locals {
      sax_model             = var.sax_model
      tokenizer             = var.tokenizer
      use_beam_search       = var.use_beam_search
+      enable_custom_metrics = var.enable_custom_metrics
+      huggingface_secret    = var.huggingface_secret
+      csv_upload_frequency  = var.csv_upload_frequency
     })) : data]
   ])
 }
diff --git a/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-master-controller.yaml.tpl b/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-master-controller.yaml.tpl
index 64be13170..bcef25201 100644
--- a/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-master-controller.yaml.tpl
+++ b/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-master-controller.yaml.tpl
@@ -26,6 +26,8 @@ spec:
           value: http://${inference_server_service}
         - name: BACKEND
           value: ${inference_server_framework}
+        - name: ENABLE_CUSTOM_METRICS
+          value: ${enable_custom_metrics}
         ports:
         - name: loc-master-web
           containerPort: 8089
diff --git a/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-worker-controller.yaml.tpl b/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-worker-controller.yaml.tpl
index e14fcb61f..d8a09dab5 100644
--- a/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-worker-controller.yaml.tpl
+++ b/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-worker-controller.yaml.tpl
@@ -44,3 +44,12 @@ spec:
           value: ${tokenizer}
         - name: USE_BEAM_SEARCH
           value: ${use_beam_search}
+        - name: ENABLE_CUSTOM_METRICS
+          value: ${enable_custom_metrics}
+        - name: CSV_UPLOAD_FREQUENCY
+          value: ${csv_upload_frequency}
+        - name: HUGGINGFACE_TOKEN
+          valueFrom:
+            secretKeyRef:
+              name: ${huggingface_secret}  # Populated from the huggingface_secret Terraform variable
+              key: token
diff --git a/benchmarks/benchmark/tools/locust-load-inference/variables.tf b/benchmarks/benchmark/tools/locust-load-inference/variables.tf
index 920013b31..e8000c7f5 100644
--- a/benchmarks/benchmark/tools/locust-load-inference/variables.tf
+++ b/benchmarks/benchmark/tools/locust-load-inference/variables.tf
@@ -189,4 +189,24 @@ variable "run_test_automatically" {
   description = "Run the test after deployment"
   type        = bool
   default     = false
-}
\ No newline at end of file
+}
+
+variable "enable_custom_metrics" {
+  description = "Enable custom metrics output in Locust"
+  type        = bool
+  default     = false
+}
+
+variable "huggingface_secret" {
+  description = "Name of the Kubernetes Secret that holds the Hugging Face token"
+  type        = string
+  nullable    = true
+  default     = "huggingface-secret"
+}
+
+variable "csv_upload_frequency" {
+  description = "How frequently, in seconds, to write the custom metrics CSV when custom metrics are enabled"
+  type        = number
+  nullable    = true
+  default     = 10
+}
\ No newline at end of file
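
Note: as a quick way to see what the custom-metrics CSV served at `/custom_metrics/custom_metrics.csv` looks like, the sketch below drives `TokenMetricCollector` from `custom_metric_aggregator.py` directly, outside of Locust. It is illustrative only: it assumes the script is run from the `locust-tasks` directory so the module is importable, and the sample token counts and timings are made up.

```python
# Standalone sketch: feed a few fake request results into TokenMetricCollector
# and print the same CSV layout that the Locust master writes and serves.
from custom_metric_aggregator import TokenMetricCollector

collector = TokenMetricCollector()

# Each tuple is (tokens_sent, tokens_received, test_time_seconds, request_successful_bool);
# a failed request is reported as (-1, -1, -1, 0), mirroring handle_failed_response in tasks.py.
sample_requests = [
    (512, 128, 1.8, 1),
    (490, 131, 2.1, 1),
    (-1, -1, -1, 0),
]

for sent, received, test_time, ok in sample_requests:
    collector.add_metric(sent, received, test_time, ok)

collector.write_to_csv('custom_metrics.csv')
print(open('custom_metrics.csv').read())
```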