From a58d42b0d66760096dbad2849ae8d7f324ac561e Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Mon, 16 Sep 2024 17:39:00 +0200
Subject: [PATCH 1/3] Update `image` to Hugging Face DLC for TGI

---
 .../manifest-templates/text-generation-inference.tftpl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/inference-server/text-generation-inference/manifest-templates/text-generation-inference.tftpl b/benchmarks/inference-server/text-generation-inference/manifest-templates/text-generation-inference.tftpl
index f2db5fbfd..377b5bce0 100644
--- a/benchmarks/inference-server/text-generation-inference/manifest-templates/text-generation-inference.tftpl
+++ b/benchmarks/inference-server/text-generation-inference/manifest-templates/text-generation-inference.tftpl
@@ -51,7 +51,7 @@ spec:
       - name: text-generation-inference
         ports:
         - containerPort: 80
-        image: "ghcr.io/huggingface/text-generation-inference:1.4.2"
+        image: "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310"
         args: ["--model-id", "${model_id}", "--num-shard", "${gpu_count}", "--max-concurrent-requests", "${max_concurrent_requests}"]
         env:
 %{ for hugging_face_token_secret in hugging_face_token_secret_list ~}

From f7462e652dd52190b0b4f9cba883c02302f6af72 Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Mon, 16 Sep 2024 18:10:39 +0200
Subject: [PATCH 2/3] Update a bunch `image` refs to Hugging Face DLC for TGI

---
 modules/inference-service/main.tf | 2 +-
 .../deploying-mistral-7b-instruct-L4gpus/README.md | 6 +++---
 .../deploying-mistral-7b-instruct-L4gpus/mistral-7b.yaml | 2 +-
 .../deploying-mixtral-8x7b-instruct-L4-gpus/README.md | 4 ++--
 .../mixtral-8x7b.yaml | 2 +-
 .../genAI-LLM/serving-llama2-70b-on-l4-gpus/README.md | 2 +-
 .../text-generation-interface.yaml | 2 +-
 7 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/modules/inference-service/main.tf b/modules/inference-service/main.tf
index 91e558369..44655fc23 100644
--- a/modules/inference-service/main.tf
+++ b/modules/inference-service/main.tf
@@ -89,7 +89,7 @@ resource "kubernetes_deployment" "inference_deployment" {
         }
       }
       container {
-        image = "ghcr.io/huggingface/text-generation-inference:1.1.0"
+        image = "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310"
         name  = "mistral-7b-instruct"

         port {
diff --git a/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/README.md b/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/README.md
index c572a3f85..6362c35a6 100644
--- a/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/README.md
+++ b/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/README.md
@@ -8,7 +8,7 @@ Learn how to serve the Mistral 7B instruct v0.1 chat model on GKE using just 1 x
 * GPU Quota: Confirm you have the quota for at least one L4 GPU in your Google Cloud account.
 * Model Access: Secure access to the Mistral 7B model by agreeing to the terms on Hugging Face, which typically involves creating an account and accepting the model's use conditions.
 * Ensure you currently have installed a stable version of Transformers, 4.34.0 or newer.
-* (OPTIONAL) If you intend to utlize the HPA, (horizontal pod autoscaler) in order to scale for incoming requests please make sure that the 'maxReplicas' assignment in your mistral-7b.yaml HorizontalPodAutoscaler section is configured to equal or be less than the number of GPUs you have available for the deployment. Additionally, ensure that you have a DCGM (Data Center GPU Manager) NVIDIA pod configured within your Kubernetes cluster to collect GPU metrics. Look at DCGM documentation for guidance on setting up and configuring this pod properly. This is essential for the Horizontal Pod Autoscaler (HPA) to accurately scale based on GPU utilization. Without proper GPU metrics, the autoscaler won't be able to make informed scaling decisions, potentially leading to under or over-provisioning of resources. Integrate the DCGM pod within your cluster's monitoring system to provide real-time GPU performance data to the HPA.+
+* (OPTIONAL) If you intend to utilize the HPA, (horizontal pod autoscaler) in order to scale for incoming requests please make sure that the 'maxReplicas' assignment in your mistral-7b.yaml HorizontalPodAutoscaler section is configured to equal or be less than the number of GPUs you have available for the deployment. Additionally, ensure that you have a DCGM (Data Center GPU Manager) NVIDIA pod configured within your Kubernetes cluster to collect GPU metrics. Look at DCGM documentation for guidance on setting up and configuring this pod properly. This is essential for the Horizontal Pod Autoscaler (HPA) to accurately scale based on GPU utilization. Without proper GPU metrics, the autoscaler won't be able to make informed scaling decisions, potentially leading to under or over-provisioning of resources. Integrate the DCGM pod within your cluster's monitoring system to provide real-time GPU performance data to the HPA.+

 ### GPU-Memory Allocation

@@ -104,7 +104,7 @@ Pod Template:
   Labels:  app=mistral-7b
   Containers:
    mistral-7b:
-    Image:      ghcr.io/huggingface/text-generation-inference:1.1.1
+    Image:      us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
     Port:       8080/TCP
     Host Port:  0/TCP
     Limits:
@@ -317,4 +317,4 @@ echo "Latency per Generated Token: $latency_per_token seconds"

 Visit the API docs at http://localhost:8080/docs for more details.

-This README provides a concise guide to deploying the Mistral 7B instruct v.01 model, listed above are key steps and adjustments needed for a general sample deployment. Ensure to replace placeholders and commands with the specific details of your GKE setup and Mistralv01-instruct model deployment.
\ No newline at end of file
+This README provides a concise guide to deploying the Mistral 7B instruct v.01 model, listed above are key steps and adjustments needed for a general sample deployment. Ensure to replace placeholders and commands with the specific details of your GKE setup and Mistralv01-instruct model deployment.
diff --git a/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/mistral-7b.yaml b/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/mistral-7b.yaml
index 22b892f23..1828472a8 100644
--- a/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/mistral-7b.yaml
+++ b/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/mistral-7b.yaml
@@ -28,7 +28,7 @@ spec:
     spec:
       containers:
       - name: mistral-7b
-        image: ghcr.io/huggingface/text-generation-inference:1.1.1
+        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
         resources:
           limits:
             nvidia.com/gpu: 1
diff --git a/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/README.md b/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/README.md
index 06f085e58..60739ffc6 100644
--- a/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/README.md
+++ b/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/README.md
@@ -10,7 +10,7 @@ This guide walks you through the process of serving the Mixtral 8x7 model on Goo
 Transformers Library: Ensure you have installed a stable version of the Transformers library, version 4.34.0 or newer.
 * HPA (Optional): If you plan to use the Horizontal Pod Autoscaler (HPA) to scale for incoming requests, ensure the 'maxReplicas' assignment in your mixtral-8x7.yaml HorizontalPodAutoscaler section is set to equal or be less than the number of GPUs available for deployment.

-### GPU-Memory Allication and Quantization Strategy
+### GPU-Memory Allocation and Quantization Strategy
 GPU-Memory Allocation and Quantization Strategy
 When deploying the Mixtral 8x7 model, it's crucial to assess both the memory requirements and the computational efficiency, especially when leveraging Nvidia L4 GPUs, each with 24 GB of GPU memory. A key factor in this consideration is the use of quantization techniques to optimize model performance and memory usage.
@@ -127,7 +127,7 @@ Pod Template:
   Labels:  app=mixtral8x7b
   Containers:
    mixtral8x7b:
-    Image:      ghcr.io/huggingface/text-generation-inference:1.4.3
+    Image:      us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
     Port:       8080/TCP
     Host Port:  0/TCP
     Limits:
diff --git a/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/mixtral-8x7b.yaml b/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/mixtral-8x7b.yaml
index 4850aba50..72a7e61d6 100644
--- a/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/mixtral-8x7b.yaml
+++ b/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/mixtral-8x7b.yaml
@@ -30,7 +30,7 @@ spec:
         cloud.google.com/gke-accelerator: "nvidia-l4"
       containers:
      - name: mixtral8x7b
-        image: ghcr.io/huggingface/text-generation-inference:1.4.3
+        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
         ports:
         - name: server-port
           containerPort: 8080
diff --git a/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/README.md b/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/README.md
index 5c9a72f9a..617e4072c 100644
--- a/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/README.md
+++ b/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/README.md
@@ -76,7 +76,7 @@ spec:
     spec:
       containers:
       - name: llama-2-70b
-        image: ghcr.io/huggingface/text-generation-inference:1.0.3
+        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
         resources:
           limits:
             nvidia.com/gpu: 2
diff --git a/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/text-generation-interface.yaml b/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/text-generation-interface.yaml
index a592d9433..a9963a719 100644
--- a/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/text-generation-interface.yaml
+++ b/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/text-generation-interface.yaml
@@ -28,7 +28,7 @@ spec:
     spec:
       containers:
      - name: llama-2-70b
-        image: ghcr.io/huggingface/text-generation-inference:1.0.3
+        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
         resources:
           limits:
             nvidia.com/gpu: 2

From c5a46693925c9638d288b05eb758bcf8fd22b599 Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Tue, 17 Sep 2024 11:14:43 +0200
Subject: [PATCH 3/3] Revert `image` on benchmarks until verified

Included a harmless TODO note so that we remember to come back to this
once the benchmark is verified with the latest Hugging Face DLC for TGI

cc @annapendleton
---
 .../manifest-templates/text-generation-inference.tftpl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/inference-server/text-generation-inference/manifest-templates/text-generation-inference.tftpl b/benchmarks/inference-server/text-generation-inference/manifest-templates/text-generation-inference.tftpl
index 377b5bce0..eb7b46071 100644
--- a/benchmarks/inference-server/text-generation-inference/manifest-templates/text-generation-inference.tftpl
+++ b/benchmarks/inference-server/text-generation-inference/manifest-templates/text-generation-inference.tftpl
@@ -51,7 +51,7 @@ spec:
       - name: text-generation-inference
         ports:
         - containerPort: 80
-        image: "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310"
+        image: "ghcr.io/huggingface/text-generation-inference:1.4.2" # TODO(annapendleton,alvarobartt): update to Hugging Face DLC once verified
         args: ["--model-id", "${model_id}", "--num-shard", "${gpu_count}", "--max-concurrent-requests", "${max_concurrent_requests}"]
         env:
 %{ for hugging_face_token_secret in hugging_face_token_secret_list ~}