Update references to Hugging Face DLC for TGI #816

Open · wants to merge 3 commits into main
@@ -51,7 +51,7 @@ spec:
- name: text-generation-inference
ports:
- containerPort: 80
image: "ghcr.io/huggingface/text-generation-inference:1.4.2"
image: "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310"
args: ["--model-id", "${model_id}", "--num-shard", "${gpu_count}", "--max-concurrent-requests", "${max_concurrent_requests}"]
env:
%{ for hugging_face_token_secret in hugging_face_token_secret_list ~}
modules/inference-service/main.tf: 1 addition & 1 deletion
@@ -89,7 +89,7 @@ resource "kubernetes_deployment" "inference_deployment" {
}
}
container {
image = "ghcr.io/huggingface/text-generation-inference:1.1.0"
image = "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310"
name = "mistral-7b-instruct"

port {
@@ -8,7 +8,7 @@ Learn how to serve the Mistral 7B instruct v0.1 chat model on GKE using just 1 x
* GPU Quota: Confirm you have the quota for at least one L4 GPU in your Google Cloud account.
* Model Access: Secure access to the Mistral 7B model by agreeing to the terms on Hugging Face, which typically involves creating an account and accepting the model's use conditions.
* Ensure you currently have installed a stable version of Transformers, 4.34.0 or newer.
-* (OPTIONAL) If you intend to utlize the HPA, (horizontal pod autoscaler) in order to scale for incoming requests please make sure that the 'maxReplicas' assignment in your mistral-7b.yaml HorizontalPodAutoscaler section is configured to equal or be less than the number of GPUs you have available for the deployment. Additionally, ensure that you have a DCGM (Data Center GPU Manager) NVIDIA pod configured within your Kubernetes cluster to collect GPU metrics. Look at DCGM documentation for guidance on setting up and configuring this pod properly. This is essential for the Horizontal Pod Autoscaler (HPA) to accurately scale based on GPU utilization. Without proper GPU metrics, the autoscaler won't be able to make informed scaling decisions, potentially leading to under or over-provisioning of resources. Integrate the DCGM pod within your cluster's monitoring system to provide real-time GPU performance data to the HPA.+
+* (OPTIONAL) If you intend to utilize the HPA, (horizontal pod autoscaler) in order to scale for incoming requests please make sure that the 'maxReplicas' assignment in your mistral-7b.yaml HorizontalPodAutoscaler section is configured to equal or be less than the number of GPUs you have available for the deployment. Additionally, ensure that you have a DCGM (Data Center GPU Manager) NVIDIA pod configured within your Kubernetes cluster to collect GPU metrics. Look at DCGM documentation for guidance on setting up and configuring this pod properly. This is essential for the Horizontal Pod Autoscaler (HPA) to accurately scale based on GPU utilization. Without proper GPU metrics, the autoscaler won't be able to make informed scaling decisions, potentially leading to under or over-provisioning of resources. Integrate the DCGM pod within your cluster's monitoring system to provide real-time GPU performance data to the HPA.+
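
For reference, here is a minimal HorizontalPodAutoscaler sketch along the lines of the prerequisite above. It assumes a DCGM exporter plus a metrics adapter already publish DCGM_FI_DEV_GPU_UTIL through the custom metrics API; the metric name, deployment name, and replica counts are illustrative placeholders, not values taken from this repository.

apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: mistral-7b
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: mistral-7b
  minReplicas: 1
  maxReplicas: 2  # keep this at or below the number of L4 GPUs available to the deployment
  metrics:
  - type: Pods
    pods:
      metric:
        name: DCGM_FI_DEV_GPU_UTIL  # assumes this DCGM metric is exposed via the custom metrics API
      target:
        type: AverageValue
        averageValue: "80"  # target average GPU utilization (%) per pod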


### GPU-Memory Allocation
@@ -104,7 +104,7 @@ Pod Template:
Labels: app=mistral-7b
Containers:
mistral-7b:
-Image: ghcr.io/huggingface/text-generation-inference:1.1.1
+Image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
Port: 8080/TCP
Host Port: 0/TCP
Limits:
@@ -317,4 +317,4 @@ echo "Latency per Generated Token: $latency_per_token seconds"

Visit the API docs at http://localhost:8080/docs for more details.

This README provides a concise guide to deploying the Mistral 7B Instruct v0.1 model; the key steps and adjustments needed for a general sample deployment are listed above. Be sure to replace placeholders and commands with the specific details of your GKE setup and Mistral 7B Instruct v0.1 deployment.
@@ -28,7 +28,7 @@ spec:
spec:
containers:
- name: mistral-7b
-image: ghcr.io/huggingface/text-generation-inference:1.1.1
+image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
resources:
limits:
nvidia.com/gpu: 1
@@ -10,7 +10,7 @@ This guide walks you through the process of serving the Mixtral 8x7 model on Goo
Transformers Library: Ensure you have installed a stable version of the Transformers library, version 4.34.0 or newer.
* HPA (Optional): If you plan to use the Horizontal Pod Autoscaler (HPA) to scale for incoming requests, ensure the 'maxReplicas' assignment in your mixtral-8x7.yaml HorizontalPodAutoscaler section is set to equal or be less than the number of GPUs available for deployment.

-### GPU-Memory Allication and Quantization Strategy
+### GPU-Memory Allocation and Quantization Strategy
GPU-Memory Allocation and Quantization Strategy
When deploying the Mixtral 8x7 model, it's crucial to assess both the memory requirements and the computational efficiency, especially when leveraging Nvidia L4 GPUs, each with 24 GB of GPU memory. A key factor in this consideration is the use of quantization techniques to optimize model performance and memory usage.
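
As a rough sizing sketch (assuming the commonly cited figure of about 46.7B total parameters for Mixtral 8x7B): at 16-bit precision the weights alone need roughly 46.7B × 2 bytes, about 93 GB, far beyond a single 24 GB L4, while 4-bit quantization shrinks the weights to roughly 23 GB. These figures are approximate and exclude KV-cache and activation memory, which is why quantization and/or sharding the model across multiple L4 GPUs is central to this strategy.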

@@ -127,7 +127,7 @@ Pod Template:
Labels: app=mixtral8x7b
Containers:
mixtral8x7b:
-Image: ghcr.io/huggingface/text-generation-inference:1.4.3
+Image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
Port: 8080/TCP
Host Port: 0/TCP
Limits:
@@ -30,7 +30,7 @@ spec:
cloud.google.com/gke-accelerator: "nvidia-l4"
containers:
- name: mixtral8x7b
-image: ghcr.io/huggingface/text-generation-inference:1.4.3
+image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
ports:
- name: server-port
containerPort: 8080
@@ -76,7 +76,7 @@ spec:
spec:
containers:
- name: llama-2-70b
-image: ghcr.io/huggingface/text-generation-inference:1.0.3
+image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
resources:
limits:
nvidia.com/gpu: 2
@@ -28,7 +28,7 @@ spec:
spec:
containers:
- name: llama-2-70b
-image: ghcr.io/huggingface/text-generation-inference:1.0.3
+image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
resources:
limits:
nvidia.com/gpu: 2