From a58d42b0d66760096dbad2849ae8d7f324ac561e Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Mon, 16 Sep 2024 17:39:00 +0200
Subject: [PATCH 1/3] Update `image` to Hugging Face DLC for TGI

---
 .../manifest-templates/text-generation-inference.tftpl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/inference-server/text-generation-inference/manifest-templates/text-generation-inference.tftpl b/benchmarks/inference-server/text-generation-inference/manifest-templates/text-generation-inference.tftpl
index f2db5fbfd..377b5bce0 100644
--- a/benchmarks/inference-server/text-generation-inference/manifest-templates/text-generation-inference.tftpl
+++ b/benchmarks/inference-server/text-generation-inference/manifest-templates/text-generation-inference.tftpl
@@ -51,7 +51,7 @@ spec:
       - name: text-generation-inference
         ports:
         - containerPort: 80
-        image: "ghcr.io/huggingface/text-generation-inference:1.4.2"
+        image: "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310"
         args: ["--model-id", "${model_id}", "--num-shard", "${gpu_count}", "--max-concurrent-requests", "${max_concurrent_requests}"]
         env:
 %{ for hugging_face_token_secret in hugging_face_token_secret_list ~}

From f7462e652dd52190b0b4f9cba883c02302f6af72 Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Mon, 16 Sep 2024 18:10:39 +0200
Subject: [PATCH 2/3] Update a bunch `image` refs to Hugging Face DLC for TGI

---
 modules/inference-service/main.tf | 2 +-
 .../deploying-mistral-7b-instruct-L4gpus/README.md | 6 +++---
 .../deploying-mistral-7b-instruct-L4gpus/mistral-7b.yaml | 2 +-
 .../deploying-mixtral-8x7b-instruct-L4-gpus/README.md | 4 ++--
 .../mixtral-8x7b.yaml | 2 +-
 .../genAI-LLM/serving-llama2-70b-on-l4-gpus/README.md | 2 +-
 .../text-generation-interface.yaml | 2 +-
 7 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/modules/inference-service/main.tf b/modules/inference-service/main.tf
index 91e558369..44655fc23 100644
--- a/modules/inference-service/main.tf
+++ b/modules/inference-service/main.tf
@@ -89,7 +89,7 @@ resource "kubernetes_deployment" "inference_deployment" {
         }
       }
       container {
-        image = "ghcr.io/huggingface/text-generation-inference:1.1.0"
+        image = "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310"
         name  = "mistral-7b-instruct"

         port {
diff --git a/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/README.md b/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/README.md
index c572a3f85..6362c35a6 100644
--- a/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/README.md
+++ b/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/README.md
@@ -8,7 +8,7 @@ Learn how to serve the Mistral 7B instruct v0.1 chat model on GKE using just 1 x
 * GPU Quota: Confirm you have the quota for at least one L4 GPU in your Google Cloud account.
 * Model Access: Secure access to the Mistral 7B model by agreeing to the terms on Hugging Face, which typically involves creating an account and accepting the model's use conditions.
 * Ensure you currently have installed a stable version of Transformers, 4.34.0 or newer.
-* (OPTIONAL) If you intend to utlize the HPA, (horizontal pod autoscaler) in order to scale for incoming requests please make sure that the 'maxReplicas' assignment in your mistral-7b.yaml HorizontalPodAutoscaler section is configured to equal or be less than the number of GPUs you have available for the deployment. Additionally, ensure that you have a DCGM (Data Center GPU Manager) NVIDIA pod configured within your Kubernetes cluster to collect GPU metrics. Look at DCGM documentation for guidance on setting up and configuring this pod properly. This is essential for the Horizontal Pod Autoscaler (HPA) to accurately scale based on GPU utilization. Without proper GPU metrics, the autoscaler won't be able to make informed scaling decisions, potentially leading to under or over-provisioning of resources. Integrate the DCGM pod within your cluster's monitoring system to provide real-time GPU performance data to the HPA.+
+* (OPTIONAL) If you intend to utilize the HPA, (horizontal pod autoscaler) in order to scale for incoming requests please make sure that the 'maxReplicas' assignment in your mistral-7b.yaml HorizontalPodAutoscaler section is configured to equal or be less than the number of GPUs you have available for the deployment. Additionally, ensure that you have a DCGM (Data Center GPU Manager) NVIDIA pod configured within your Kubernetes cluster to collect GPU metrics. Look at DCGM documentation for guidance on setting up and configuring this pod properly. This is essential for the Horizontal Pod Autoscaler (HPA) to accurately scale based on GPU utilization. Without proper GPU metrics, the autoscaler won't be able to make informed scaling decisions, potentially leading to under or over-provisioning of resources. Integrate the DCGM pod within your cluster's monitoring system to provide real-time GPU performance data to the HPA.+

 ### GPU-Memory Allocation

@@ -104,7 +104,7 @@ Pod Template:
   Labels:  app=mistral-7b
   Containers:
    mistral-7b:
-    Image:      ghcr.io/huggingface/text-generation-inference:1.1.1
+    Image:      us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
     Port:       8080/TCP
     Host Port:  0/TCP
     Limits:
@@ -317,4 +317,4 @@ echo "Latency per Generated Token: $latency_per_token seconds"

 Visit the API docs at http://localhost:8080/docs for more details.

-This README provides a concise guide to deploying the Mistral 7B instruct v.01 model, listed above are key steps and adjustments needed for a general sample deployment. Ensure to replace placeholders and commands with the specific details of your GKE setup and Mistralv01-instruct model deployment.
\ No newline at end of file
+This README provides a concise guide to deploying the Mistral 7B instruct v.01 model, listed above are key steps and adjustments needed for a general sample deployment. Ensure to replace placeholders and commands with the specific details of your GKE setup and Mistralv01-instruct model deployment.
diff --git a/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/mistral-7b.yaml b/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/mistral-7b.yaml
index 22b892f23..1828472a8 100644
--- a/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/mistral-7b.yaml
+++ b/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/mistral-7b.yaml
@@ -28,7 +28,7 @@ spec:
     spec:
       containers:
       - name: mistral-7b
-        image: ghcr.io/huggingface/text-generation-inference:1.1.1
+        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
         resources:
           limits:
             nvidia.com/gpu: 1
diff --git a/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/README.md b/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/README.md
index 06f085e58..60739ffc6 100644
--- a/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/README.md
+++ b/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/README.md
@@ -10,7 +10,7 @@ This guide walks you through the process of serving the Mixtral 8x7 model on Goo
 Transformers Library: Ensure you have installed a stable version of the Transformers library, version 4.34.0 or newer.
 * HPA (Optional): If you plan to use the Horizontal Pod Autoscaler (HPA) to scale for incoming requests, ensure the 'maxReplicas' assignment in your mixtral-8x7.yaml HorizontalPodAutoscaler section is set to equal or be less than the number of GPUs available for deployment.

-### GPU-Memory Allication and Quantization Strategy
+### GPU-Memory Allocation and Quantization Strategy
 GPU-Memory Allocation and Quantization Strategy
 When deploying the Mixtral 8x7 model, it's crucial to assess both the memory requirements and the computational efficiency, especially when leveraging Nvidia L4 GPUs, each with 24 GB of GPU memory. A key factor in this consideration is the use of quantization techniques to optimize model performance and memory usage.
@@ -127,7 +127,7 @@ Pod Template:
   Labels:  app=mixtral8x7b
   Containers:
    mixtral8x7b:
-    Image:      ghcr.io/huggingface/text-generation-inference:1.4.3
+    Image:      us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
     Port:       8080/TCP
     Host Port:  0/TCP
     Limits:
diff --git a/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/mixtral-8x7b.yaml b/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/mixtral-8x7b.yaml
index 4850aba50..72a7e61d6 100644
--- a/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/mixtral-8x7b.yaml
+++ b/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/mixtral-8x7b.yaml
@@ -30,7 +30,7 @@ spec:
         cloud.google.com/gke-accelerator: "nvidia-l4"
       containers:
      - name: mixtral8x7b
-        image: ghcr.io/huggingface/text-generation-inference:1.4.3
+        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
         ports:
         - name: server-port
           containerPort: 8080
diff --git a/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/README.md b/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/README.md
index 5c9a72f9a..617e4072c 100644
--- a/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/README.md
+++ b/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/README.md
@@ -76,7 +76,7 @@ spec:
     spec:
       containers:
       - name: llama-2-70b
-        image: ghcr.io/huggingface/text-generation-inference:1.0.3
+        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
         resources:
           limits:
             nvidia.com/gpu: 2
diff --git a/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/text-generation-interface.yaml b/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/text-generation-interface.yaml
index a592d9433..a9963a719 100644
--- a/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/text-generation-interface.yaml
+++ b/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/text-generation-interface.yaml
@@ -28,7 +28,7 @@ spec:
     spec:
       containers:
      - name: llama-2-70b
-        image: ghcr.io/huggingface/text-generation-inference:1.0.3
+        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
         resources:
           limits:
             nvidia.com/gpu: 2

From c5a46693925c9638d288b05eb758bcf8fd22b599 Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Tue, 17 Sep 2024 11:14:43 +0200
Subject: [PATCH 3/3] Revert `image` on benchmarks until verified

Included a harmless TODO note so that we remember to come back to this
once the benchmark is verified with the latest Hugging Face DLC for TGI

cc @annapendleton
---
 .../manifest-templates/text-generation-inference.tftpl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/inference-server/text-generation-inference/manifest-templates/text-generation-inference.tftpl b/benchmarks/inference-server/text-generation-inference/manifest-templates/text-generation-inference.tftpl
index 377b5bce0..eb7b46071 100644
--- a/benchmarks/inference-server/text-generation-inference/manifest-templates/text-generation-inference.tftpl
+++ b/benchmarks/inference-server/text-generation-inference/manifest-templates/text-generation-inference.tftpl
@@ -51,7 +51,7 @@ spec:
       - name: text-generation-inference
         ports:
         - containerPort: 80
-        image: "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310"
+        image: "ghcr.io/huggingface/text-generation-inference:1.4.2" # TODO(annapendleton,alvarobartt): update to Hugging Face DLC once verified
         args: ["--model-id", "${model_id}", "--num-shard", "${gpu_count}", "--max-concurrent-requests", "${max_concurrent_requests}"]
         env:
 %{ for hugging_face_token_secret in hugging_face_token_secret_list ~}