From dfd04ddffba8063ab264979c855d489730e9d9da Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Wed, 4 Sep 2024 20:58:47 +0000 Subject: [PATCH 01/16] Add RayService vLLM TPU Inference script Signed-off-by: Ryan O'Leary bug fixes Signed-off-by: Ryan O'Leary remove extra ray init Signed-off-by: Ryan O'Leary Read hf token from os Signed-off-by: Ryan O'Leary Fix bugs Signed-off-by: Ryan O'Leary Remove hf token logic Signed-off-by: Ryan O'Leary Fix serve script Signed-off-by: Ryan O'Leary --- ai-ml/gke-ray/rayserve/llm/serve_tpu.py | 105 ++++++++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 ai-ml/gke-ray/rayserve/llm/serve_tpu.py diff --git a/ai-ml/gke-ray/rayserve/llm/serve_tpu.py b/ai-ml/gke-ray/rayserve/llm/serve_tpu.py new file mode 100644 index 0000000000..e42383e0d1 --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/serve_tpu.py @@ -0,0 +1,105 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: this file was inspired from: https://github.com/richardsliu/vllm/blob/rayserve/examples/rayserve_tpu.py + +import os + +import json +import logging +from typing import Dict, List, Optional + +import ray +from fastapi import FastAPI +from ray import serve +from starlette.requests import Request +from starlette.responses import Response + +from vllm import LLM, SamplingParams + +logger = logging.getLogger("ray.serve") + +model_id = "meta-llama/Meta-Llama-3-70B" + +app = FastAPI() + +@serve.deployment(name="VLLMDeployment") +@serve.ingress(app) +class VLLMDeployment: + def __init__( + self, + num_tpu_chips, + ): + self.llm = LLM( + model=model_id, + tensor_parallel_size=num_tpu_chips, + enforce_eager=True, + ) + + @app.post("/v1/generate") + async def generate(self, request: Request): + request_dict = await request.json() + prompts = request_dict.pop("prompt") + print("Processing prompt ", prompts) + sampling_params = SamplingParams(temperature=0.7, + top_p=1.0, + n=1, + max_tokens=1000) + + outputs = self.llm.generate(prompts, sampling_params) + for output in outputs: + prompt = output.prompt + generated_text = "" + token_ids = [] + for completion_output in output.outputs: + generated_text += completion_output.text + token_ids.extend(list(completion_output.token_ids)) + + print("Generated text: ", generated_text) + ret = { + "prompt": prompt, + "text": generated_text, + "token_ids": token_ids, + } + + return Response(content=json.dumps(ret)) + +def get_num_tpu_chips() -> int: + if "TPU" not in ray.cluster_resources(): + # Pass in TPU chips when the current Ray cluster resources can't be auto-detected (i.e for autoscaling). + if os.environ.get('TPU_CHIPS') is not None: + return int(os.environ.get('TPU_CHIPS')) + return 0 + return int(ray.cluster_resources()["TPU"]) + +def build_app(cli_args: Dict[str, str]) -> serve.Application: + """Builds the Serve app based on CLI arguments.""" + ray.init(ignore_reinit_error=True) + + # Set the model to use, defaults to Llama-3-70B. 
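+    # NOTE: this assignment rebinds a function-local model_id rather than
+    # the module-level one read by VLLMDeployment (there is no
+    # `global model_id` declaration), so the override never reaches the
+    # deployment; patch 02 below removes this block and reads MODEL_ID
+    # from the environment directly in __init__.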
+ if 'MODEL_ID' in os.environ: + model_id = os.environ.get('MODEL_ID') + + num_tpu_chips = get_num_tpu_chips() + pg_resources = [] + pg_resources.append({"CPU": 1}) # for the deployment replica + for i in range(num_tpu_chips): + pg_resources.append({"CPU": 1, "TPU": 1}) # for the vLLM actors + + # Use PACK strategy since the deployment may use more than one TPU node. + return VLLMDeployment.options( + placement_group_bundles=pg_resources, + placement_group_strategy="PACK").bind(num_tpu_chips) + +model = build_app({}) \ No newline at end of file From 335e75c1656947c45eee2b26a99605e8e12211b8 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Fri, 27 Sep 2024 08:11:27 +0000 Subject: [PATCH 02/16] Fix inference script Signed-off-by: Ryan O'Leary --- ai-ml/gke-ray/rayserve/llm/serve_tpu.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/ai-ml/gke-ray/rayserve/llm/serve_tpu.py b/ai-ml/gke-ray/rayserve/llm/serve_tpu.py index e42383e0d1..d4d1a5624e 100644 --- a/ai-ml/gke-ray/rayserve/llm/serve_tpu.py +++ b/ai-ml/gke-ray/rayserve/llm/serve_tpu.py @@ -30,8 +30,6 @@ logger = logging.getLogger("ray.serve") -model_id = "meta-llama/Meta-Llama-3-70B" - app = FastAPI() @serve.deployment(name="VLLMDeployment") @@ -39,10 +37,10 @@ class VLLMDeployment: def __init__( self, - num_tpu_chips, + num_tpu_chips, ): self.llm = LLM( - model=model_id, + model=os.environ['MODEL_ID'], tensor_parallel_size=num_tpu_chips, enforce_eager=True, ) @@ -87,10 +85,6 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application: """Builds the Serve app based on CLI arguments.""" ray.init(ignore_reinit_error=True) - # Set the model to use, defaults to Llama-3-70B. - if 'MODEL_ID' in os.environ: - model_id = os.environ.get('MODEL_ID') - num_tpu_chips = get_num_tpu_chips() pg_resources = [] pg_resources.append({"CPU": 1}) # for the deployment replica @@ -102,4 +96,4 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application: placement_group_bundles=pg_resources, placement_group_strategy="PACK").bind(num_tpu_chips) -model = build_app({}) \ No newline at end of file +model = build_app({}) From fe6440c4329383dc20e2c3c2d65303c363b32ee1 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Fri, 27 Sep 2024 09:57:16 +0000 Subject: [PATCH 03/16] Add RayCluster and RayService CRs Signed-off-by: Ryan O'Leary --- .../llm/llama-3-8b-it/ray-cluster-tpu.yaml | 94 ++++++++++++++++++ .../llm/llama-3-8b-it/ray-service-tpu.yaml | 98 +++++++++++++++++++ .../llm/llama-3.1-70b/ray-cluster-tpu.yaml | 94 ++++++++++++++++++ .../llm/llama-3.1-70b/ray-service-tpu.yaml | 98 +++++++++++++++++++ 4 files changed, 384 insertions(+) create mode 100644 ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-tpu.yaml create mode 100644 ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-tpu.yaml create mode 100644 ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-tpu.yaml create mode 100644 ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-tpu.yaml diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-tpu.yaml new file mode 100644 index 0000000000..b40a79cfb5 --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-tpu.yaml @@ -0,0 +1,94 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_raycluster_tpu] +apiVersion: ray.io/v1 +kind: RayCluster +metadata: + name: vllm-tpu +spec: + headGroupSpec: + rayStartParams: {} + template: + spec: + containers: + - name: ray-head + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "8" + ephemeral-storage: 5Gi + memory: 40G + requests: + cpu: "8" + ephemeral-storage: 5Gi + memory: 40G + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: MODEL_ID + value: "meta-llama/Llama-3-8B-Instruct" + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + - containerPort: 8471 + name: slicebuilder + - containerPort: 8081 + name: mxla + workerGroupSpecs: + - groupName: tpu-group + replicas: 1 + minReplicas: 0 + maxReplicas: 1 + numOfHosts: 2 + rayStartParams: {} + template: + spec: + containers: + - name: ray-worker + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 150Gi + memory: 200G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 150Gi + memory: 200G + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: MODEL_ID + value: "meta-llama/Llama-3-8B-Instruct" + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice + cloud.google.com/gke-tpu-topology: 2x2x2 +# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_raycluster_tpu] diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-tpu.yaml new file mode 100644 index 0000000000..a9e57033a1 --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-tpu.yaml @@ -0,0 +1,98 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
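+# The Serve app below is fetched from the repository zip named in
+# runtime_env.working_dir, and import_path resolves to the `model`
+# object in ai-ml/gke-ray/rayserve/llm/serve_tpu.py. TPU_CHIPS tells
+# that script how many chips to shard across when Ray cannot
+# auto-detect TPU resources, e.g. while the TPU worker group is
+# scaled down (minReplicas: 0).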
+ +# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_rayservice_tpu] +apiVersion: ray.io/v1 +kind: RayService +metadata: + name: vllm-tpu +spec: + serveConfigV2: | + applications: + - name: llm + import_path: ai-ml.gke-ray.rayserve.llm.serve_tpu:model + deployments: + - name: VLLMDeployment + num_replicas: 1 + runtime_env: + working_dir: "https://github.com/ryanaoleary/kubernetes-engine-samples/archive/refs/heads/multihost-example.zip" + env_vars: + MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct" + TPU_CHIPS: "8" + rayClusterConfig: + rayVersion: 2.34.0 + headGroupSpec: + rayStartParams: {} + template: + spec: + containers: + - name: ray-head + image: ryanaoleary/vllm:tpu + imagePullPolicy: IfNotPresent + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + resources: + limits: + cpu: "8" + memory: 40G + requests: + cpu: "8" + memory: 40G + workerGroupSpecs: + - groupName: tpu-group + replicas: 1 + minReplicas: 0 + maxReplicas: 2 + numOfHosts: 2 + rayStartParams: {} + template: + spec: + containers: + - name: ray-worker + image: ryanaoleary/vllm:tpu + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 150Gi + memory: 100G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 150Gi + memory: 100G + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice + cloud.google.com/gke-tpu-topology: 2x2x2 +# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_rayservice_tpu] diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-tpu.yaml new file mode 100644 index 0000000000..9198f6b393 --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-tpu.yaml @@ -0,0 +1,94 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
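+# A v4 2x2x2 slice has 8 chips spread over two hosts, so the worker
+# group below pairs numOfHosts: 2 with google.com/tpu: "4" per pod;
+# serve_tpu.py then sets vLLM's tensor_parallel_size to the total
+# chip count.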
+ +# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_raycluster_tpu] +apiVersion: ray.io/v1 +kind: RayCluster +metadata: + name: vllm-tpu +spec: + headGroupSpec: + rayStartParams: {} + template: + spec: + containers: + - name: ray-head + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "8" + ephemeral-storage: 5Gi + memory: 40G + requests: + cpu: "8" + ephemeral-storage: 5Gi + memory: 40G + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: MODEL_ID + value: "meta-llama/Llama-3.1-70B" + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + - containerPort: 8471 + name: slicebuilder + - containerPort: 8081 + name: mxla + workerGroupSpecs: + - groupName: tpu-group + replicas: 1 + minReplicas: 0 + maxReplicas: 1 + numOfHosts: 2 + rayStartParams: {} + template: + spec: + containers: + - name: ray-worker + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 150Gi + memory: 200G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 150Gi + memory: 200G + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: MODEL_ID + value: "meta-llama/Llama-3.1-70B" + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice + cloud.google.com/gke-tpu-topology: 2x2x2 +# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_raycluster_tpu] diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-tpu.yaml new file mode 100644 index 0000000000..b6406df6f6 --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-tpu.yaml @@ -0,0 +1,98 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
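+# serve_tpu.py packs one bundle for the deployment replica plus one
+# {CPU, TPU} bundle per chip into a PACK placement group, letting a
+# single VLLMDeployment span every host in the TPU worker group.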
+ +# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_rayservice_tpu] +apiVersion: ray.io/v1 +kind: RayService +metadata: + name: vllm-tpu +spec: + serveConfigV2: | + applications: + - name: llm + import_path: ai-ml.gke-ray.rayserve.llm.serve_tpu:model + deployments: + - name: VLLMDeployment + num_replicas: 1 + runtime_env: + working_dir: "https://github.com/ryanaoleary/kubernetes-engine-samples/archive/refs/heads/multihost-example.zip" + env_vars: + MODEL_ID: "meta-llama/Meta-Llama-3.1-70B" + TPU_CHIPS: "8" + rayClusterConfig: + rayVersion: 2.34.0 + headGroupSpec: + rayStartParams: {} + template: + spec: + containers: + - name: ray-head + image: ryanaoleary/vllm:tpu + imagePullPolicy: IfNotPresent + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + resources: + limits: + cpu: "8" + memory: 40G + requests: + cpu: "8" + memory: 40G + workerGroupSpecs: + - groupName: tpu-group + replicas: 1 + minReplicas: 0 + maxReplicas: 2 + numOfHosts: 2 + rayStartParams: {} + template: + spec: + containers: + - name: ray-worker + image: ryanaoleary/vllm:tpu + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 150Gi + memory: 100G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 150Gi + memory: 100G + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice + cloud.google.com/gke-tpu-topology: 2x2x2 +# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_rayservice_tpu] From ee804fd7c72c1a87a220b938e5bb19fa9f6f2547 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Fri, 27 Sep 2024 10:09:47 +0000 Subject: [PATCH 04/16] Fix working_dir link Signed-off-by: Ryan O'Leary --- ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-tpu.yaml | 2 +- ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-tpu.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-tpu.yaml index a9e57033a1..df90c0bf5e 100644 --- a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-tpu.yaml @@ -26,7 +26,7 @@ spec: - name: VLLMDeployment num_replicas: 1 runtime_env: - working_dir: "https://github.com/ryanaoleary/kubernetes-engine-samples/archive/refs/heads/multihost-example.zip" + working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" env_vars: MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct" TPU_CHIPS: "8" diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-tpu.yaml index b6406df6f6..dd48a67ccc 100644 --- a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-tpu.yaml @@ -26,7 +26,7 @@ spec: - name: VLLMDeployment num_replicas: 1 runtime_env: - working_dir: "https://github.com/ryanaoleary/kubernetes-engine-samples/archive/refs/heads/multihost-example.zip" + working_dir: "htthttps://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" env_vars: MODEL_ID: "meta-llama/Meta-Llama-3.1-70B" TPU_CHIPS: "8" From 
37f028079f92370c62e1988fc3b1d4343d52644e Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Fri, 4 Oct 2024 00:15:36 +0000 Subject: [PATCH 05/16] Set max model length Signed-off-by: Ryan O'Leary --- ai-ml/gke-ray/rayserve/llm/serve_tpu.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ai-ml/gke-ray/rayserve/llm/serve_tpu.py b/ai-ml/gke-ray/rayserve/llm/serve_tpu.py index d4d1a5624e..65d7a4c039 100644 --- a/ai-ml/gke-ray/rayserve/llm/serve_tpu.py +++ b/ai-ml/gke-ray/rayserve/llm/serve_tpu.py @@ -42,6 +42,7 @@ def __init__( self.llm = LLM( model=os.environ['MODEL_ID'], tensor_parallel_size=num_tpu_chips, + max_model_len=1024, enforce_eager=True, ) From 11c7ee6b7ab7693ee26efca22cc7ca3ec4e8c4ad Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Fri, 4 Oct 2024 02:48:25 +0000 Subject: [PATCH 06/16] pass in max model len as env var Signed-off-by: Ryan O'Leary --- ai-ml/gke-ray/rayserve/llm/serve_tpu.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ai-ml/gke-ray/rayserve/llm/serve_tpu.py b/ai-ml/gke-ray/rayserve/llm/serve_tpu.py index 65d7a4c039..1ac693d520 100644 --- a/ai-ml/gke-ray/rayserve/llm/serve_tpu.py +++ b/ai-ml/gke-ray/rayserve/llm/serve_tpu.py @@ -42,7 +42,7 @@ def __init__( self.llm = LLM( model=os.environ['MODEL_ID'], tensor_parallel_size=num_tpu_chips, - max_model_len=1024, + max_model_len=int(os.environ.get('MAX_MODEL_LEN')), enforce_eager=True, ) @@ -50,11 +50,12 @@ def __init__( async def generate(self, request: Request): request_dict = await request.json() prompts = request_dict.pop("prompt") + max_toks = int(request_dict.pop("max_tokens")) print("Processing prompt ", prompts) sampling_params = SamplingParams(temperature=0.7, top_p=1.0, n=1, - max_tokens=1000) + max_tokens=max_toks) outputs = self.llm.generate(prompts, sampling_params) for output in outputs: From deeee5653cc072206b96015039c5d9fd663d3f63 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Mon, 7 Oct 2024 19:20:48 +0000 Subject: [PATCH 07/16] Add Ray CR manifests and fix inference script Signed-off-by: Ryan O'Leary --- ...uster-tpu.yaml => ray-cluster-v4-tpu.yaml} | 16 +-- .../llama-3-8b-it/ray-cluster-v5e-tpu.yaml | 96 ++++++++++++++++ ...rvice-tpu.yaml => ray-service-v4-tpu.yaml} | 14 ++- .../llama-3-8b-it/ray-service-v5e-tpu.yaml | 100 +++++++++++++++++ ...uster-tpu.yaml => ray-cluster-v4-tpu.yaml} | 28 +++-- .../llama-3.1-70b/ray-cluster-v5e-tpu.yaml | 100 +++++++++++++++++ ...rvice-tpu.yaml => ray-service-v4-tpu.yaml} | 29 +++-- .../llama-3.1-70b/ray-service-v5e-tpu.yaml | 103 ++++++++++++++++++ .../llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml | 100 +++++++++++++++++ .../llava-1.5-13b/ray-cluster-v5e-tpu.yaml | 100 +++++++++++++++++ .../llm/llava-1.5-13b/ray-service-v4-tpu.yaml | 103 ++++++++++++++++++ .../llava-1.5-13b/ray-service-v5e-tpu.yaml | 103 ++++++++++++++++++ .../llm/mistral-7b/ray-cluster-v4-tpu.yaml | 100 +++++++++++++++++ .../llm/mistral-7b/ray-cluster-v5e-tpu.yaml | 100 +++++++++++++++++ .../llm/mistral-7b/ray-service-v4-tpu.yaml | 103 ++++++++++++++++++ .../llm/mistral-7b/ray-service-v5e-tpu.yaml | 103 ++++++++++++++++++ ai-ml/gke-ray/rayserve/llm/serve_tpu.py | 26 ++++- 17 files changed, 1285 insertions(+), 39 deletions(-) rename ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/{ray-cluster-tpu.yaml => ray-cluster-v4-tpu.yaml} (90%) create mode 100644 ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml rename ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/{ray-service-tpu.yaml => ray-service-v4-tpu.yaml} (92%) create mode 100644 
ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-v5e-tpu.yaml rename ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/{ray-cluster-tpu.yaml => ray-cluster-v4-tpu.yaml} (83%) create mode 100644 ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v5e-tpu.yaml rename ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/{ray-service-tpu.yaml => ray-service-v4-tpu.yaml} (81%) create mode 100644 ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-v5e-tpu.yaml create mode 100644 ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml create mode 100644 ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v5e-tpu.yaml create mode 100644 ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-service-v4-tpu.yaml create mode 100644 ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-service-v5e-tpu.yaml create mode 100644 ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v4-tpu.yaml create mode 100644 ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v5e-tpu.yaml create mode 100644 ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-service-v4-tpu.yaml create mode 100644 ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-service-v5e-tpu.yaml diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml similarity index 90% rename from ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-tpu.yaml rename to ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml index b40a79cfb5..eb54cc59e8 100644 --- a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_raycluster_tpu] +# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_raycluster_tpu_v4] apiVersion: ray.io/v1 kind: RayCluster metadata: @@ -73,22 +73,24 @@ spec: limits: cpu: "100" google.com/tpu: "4" - ephemeral-storage: 150Gi - memory: 200G + ephemeral-storage: 50Gi + memory: 100G requests: cpu: "100" google.com/tpu: "4" - ephemeral-storage: 150Gi - memory: 200G + ephemeral-storage: 50Gi + memory: 100G env: + - name: JAX_PLATFORMS + value: "tpu" - name: HUGGING_FACE_HUB_TOKEN valueFrom: secretKeyRef: name: hf-secret key: hf_api_token - name: MODEL_ID - value: "meta-llama/Llama-3-8B-Instruct" + value: "meta-llama/Meta-Llama-3-8B-Instruct" nodeSelector: cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice cloud.google.com/gke-tpu-topology: 2x2x2 -# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_raycluster_tpu] +# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_raycluster_tpu_v4] diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml new file mode 100644 index 0000000000..48f510296e --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml @@ -0,0 +1,96 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
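+# The v5e equivalent of the 8-chip v4 slice above: a
+# tpu-v5-lite-podslice with 2x4 topology is also served by two hosts
+# with four chips each. JAX_PLATFORMS is set to "tpu" on the workers,
+# restricting JAX to the TPU backend.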
+ +# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_raycluster_tpu_v5e] +apiVersion: ray.io/v1 +kind: RayCluster +metadata: + name: vllm-tpu +spec: + headGroupSpec: + rayStartParams: {} + template: + spec: + containers: + - name: ray-head + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "8" + ephemeral-storage: 5Gi + memory: 40G + requests: + cpu: "8" + ephemeral-storage: 5Gi + memory: 40G + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: MODEL_ID + value: "meta-llama/Meta-Llama-3-8B-Instruct" + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + - containerPort: 8471 + name: slicebuilder + - containerPort: 8081 + name: mxla + workerGroupSpecs: + - groupName: tpu-group + replicas: 1 + minReplicas: 0 + maxReplicas: 1 + numOfHosts: 2 + rayStartParams: {} + template: + spec: + containers: + - name: ray-worker + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 50Gi + memory: 100G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 50Gi + memory: 100G + env: + - name: JAX_PLATFORMS + value: "tpu" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: MODEL_ID + value: "meta-llama/Meta-Llama-3-8B-Instruct" + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice + cloud.google.com/gke-tpu-topology: 2x4 +# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_raycluster_tpu_v5e] diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-v4-tpu.yaml similarity index 92% rename from ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-tpu.yaml rename to ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-v4-tpu.yaml index df90c0bf5e..6e95df89f7 100644 --- a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-v4-tpu.yaml @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_rayservice_tpu] +# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_rayservice_tpu_v4] apiVersion: ray.io/v1 kind: RayService metadata: @@ -38,7 +38,7 @@ spec: spec: containers: - name: ray-head - image: ryanaoleary/vllm:tpu + image: $VLLM_IMAGE imagePullPolicy: IfNotPresent ports: - containerPort: 6379 @@ -73,20 +73,22 @@ spec: spec: containers: - name: ray-worker - image: ryanaoleary/vllm:tpu + image: $VLLM_IMAGE imagePullPolicy: IfNotPresent resources: limits: cpu: "100" google.com/tpu: "4" - ephemeral-storage: 150Gi + ephemeral-storage: 50Gi memory: 100G requests: cpu: "100" google.com/tpu: "4" - ephemeral-storage: 150Gi + ephemeral-storage: 50Gi memory: 100G env: + - name: JAX_PLATFORMS + value: "tpu" - name: HUGGING_FACE_HUB_TOKEN valueFrom: secretKeyRef: @@ -95,4 +97,4 @@ spec: nodeSelector: cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice cloud.google.com/gke-tpu-topology: 2x2x2 -# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_rayservice_tpu] +# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_rayservice_tpu_v4] diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-v5e-tpu.yaml new file mode 100644 index 0000000000..70aaf5938c --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-v5e-tpu.yaml @@ -0,0 +1,100 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
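+# A minimal client sketch for this service, assuming the Serve port
+# (8000) has been forwarded to localhost; the request shape matches
+# serve_tpu.py's /v1/generate handler, which requires "max_tokens"
+# as of patch 06:
+#
+#   import requests
+#
+#   resp = requests.post(
+#       "http://localhost:8000/v1/generate",
+#       json={"prompt": "What is a TPU?", "max_tokens": 128},
+#   )
+#   print(resp.json()["text"])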
+ +# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_rayservice_tpu_v5e] +apiVersion: ray.io/v1 +kind: RayService +metadata: + name: vllm-tpu +spec: + serveConfigV2: | + applications: + - name: llm + import_path: ai-ml.gke-ray.rayserve.llm.serve_tpu:model + deployments: + - name: VLLMDeployment + num_replicas: 1 + runtime_env: + working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" + env_vars: + MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct" + TPU_CHIPS: "8" + rayClusterConfig: + rayVersion: 2.34.0 + headGroupSpec: + rayStartParams: {} + template: + spec: + containers: + - name: ray-head + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + resources: + limits: + cpu: "8" + memory: 40G + requests: + cpu: "8" + memory: 40G + workerGroupSpecs: + - groupName: tpu-group + replicas: 1 + minReplicas: 0 + maxReplicas: 2 + numOfHosts: 2 + rayStartParams: {} + template: + spec: + containers: + - name: ray-worker + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 50Gi + memory: 100G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 50Gi + memory: 100G + env: + - name: JAX_PLATFORMS + value: "tpu" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice + cloud.google.com/gke-tpu-topology: 2x4 +# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_rayservice_tpu_v5e] diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v4-tpu.yaml similarity index 83% rename from ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-tpu.yaml rename to ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v4-tpu.yaml index 9198f6b393..ea469084e9 100644 --- a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v4-tpu.yaml @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_raycluster_tpu] +# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_raycluster_tpu_v4] apiVersion: ray.io/v1 kind: RayCluster metadata: @@ -29,11 +29,11 @@ spec: resources: limits: cpu: "8" - ephemeral-storage: 5Gi + ephemeral-storage: 8Gi memory: 40G requests: cpu: "8" - ephemeral-storage: 5Gi + ephemeral-storage: 8Gi memory: 40G env: - name: HUGGING_FACE_HUB_TOKEN @@ -43,6 +43,8 @@ spec: key: hf_api_token - name: MODEL_ID value: "meta-llama/Llama-3.1-70B" + - name: MAX_MODEL_LEN + value: "4096" ports: - containerPort: 6379 name: gcs @@ -60,8 +62,8 @@ spec: - groupName: tpu-group replicas: 1 minReplicas: 0 - maxReplicas: 1 - numOfHosts: 2 + maxReplicas: 2 + numOfHosts: 4 rayStartParams: {} template: spec: @@ -73,14 +75,16 @@ spec: limits: cpu: "100" google.com/tpu: "4" - ephemeral-storage: 150Gi - memory: 200G + ephemeral-storage: 200Gi + memory: 150G requests: cpu: "100" google.com/tpu: "4" - ephemeral-storage: 150Gi - memory: 200G + ephemeral-storage: 200Gi + memory: 150G env: + - name: JAX_PLATFORMS + value: "tpu" - name: HUGGING_FACE_HUB_TOKEN valueFrom: secretKeyRef: @@ -88,7 +92,9 @@ spec: key: hf_api_token - name: MODEL_ID value: "meta-llama/Llama-3.1-70B" + - name: MAX_MODEL_LEN + value: "4096" nodeSelector: cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice - cloud.google.com/gke-tpu-topology: 2x2x2 -# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_raycluster_tpu] + cloud.google.com/gke-tpu-topology: 2x2x4 +# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_raycluster_tpu_v4] diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v5e-tpu.yaml new file mode 100644 index 0000000000..d54cb0985b --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v5e-tpu.yaml @@ -0,0 +1,100 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
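+# Serving the 70B model on v5e uses a 16-chip 4x4 slice: four hosts
+# with four chips each. MAX_MODEL_LEN: "4096" is forwarded to vLLM's
+# max_model_len, which caps the model's context length.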
+ +# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_raycluster_tpu_v5e] +apiVersion: ray.io/v1 +kind: RayCluster +metadata: + name: vllm-tpu +spec: + headGroupSpec: + rayStartParams: {} + template: + spec: + containers: + - name: ray-head + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + requests: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: MODEL_ID + value: "meta-llama/Llama-3.1-70B" + - name: MAX_MODEL_LEN + value: "4096" + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + - containerPort: 8471 + name: slicebuilder + - containerPort: 8081 + name: mxla + workerGroupSpecs: + - groupName: tpu-group + replicas: 1 + minReplicas: 0 + maxReplicas: 2 + numOfHosts: 4 + rayStartParams: {} + template: + spec: + containers: + - name: ray-worker + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 200Gi + memory: 150G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 200Gi + memory: 150G + env: + - name: JAX_PLATFORMS + value: "tpu" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: MODEL_ID + value: "meta-llama/Llama-3.1-70B" + - name: MAX_MODEL_LEN + value: "4096" + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice + cloud.google.com/gke-tpu-topology: 4x4 +# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_raycluster_tpu_v5e] diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-v4-tpu.yaml similarity index 81% rename from ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-tpu.yaml rename to ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-v4-tpu.yaml index dd48a67ccc..04fdb8056c 100644 --- a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-v4-tpu.yaml @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_rayservice_tpu] +# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_rayservice_tpu_v4] apiVersion: ray.io/v1 kind: RayService metadata: @@ -26,10 +26,11 @@ spec: - name: VLLMDeployment num_replicas: 1 runtime_env: - working_dir: "htthttps://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" + working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" env_vars: MODEL_ID: "meta-llama/Meta-Llama-3.1-70B" - TPU_CHIPS: "8" + MAX_MODEL_LEN: "4096" + TPU_CHIPS: "16" rayClusterConfig: rayVersion: 2.34.0 headGroupSpec: @@ -38,7 +39,7 @@ spec: spec: containers: - name: ray-head - image: ryanaoleary/vllm:tpu + image: $VLLM_IMAGE imagePullPolicy: IfNotPresent ports: - containerPort: 6379 @@ -58,35 +59,39 @@ spec: resources: limits: cpu: "8" + ephemeral-storage: 8Gi memory: 40G requests: cpu: "8" + ephemeral-storage: 8Gi memory: 40G workerGroupSpecs: - groupName: tpu-group replicas: 1 minReplicas: 0 maxReplicas: 2 - numOfHosts: 2 + numOfHosts: 4 rayStartParams: {} template: spec: containers: - name: ray-worker - image: ryanaoleary/vllm:tpu + image: $VLLM_IMAGE imagePullPolicy: IfNotPresent resources: limits: cpu: "100" google.com/tpu: "4" - ephemeral-storage: 150Gi - memory: 100G + ephemeral-storage: 200Gi + memory: 150G requests: cpu: "100" google.com/tpu: "4" - ephemeral-storage: 150Gi - memory: 100G + ephemeral-storage: 200Gi + memory: 150G env: + - name: JAX_PLATFORMS + value: "tpu" - name: HUGGING_FACE_HUB_TOKEN valueFrom: secretKeyRef: @@ -94,5 +99,5 @@ spec: key: hf_api_token nodeSelector: cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice - cloud.google.com/gke-tpu-topology: 2x2x2 -# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_rayservice_tpu] + cloud.google.com/gke-tpu-topology: 2x2x4 +# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_rayservice_tpu_v4] diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-v5e-tpu.yaml new file mode 100644 index 0000000000..0d05821316 --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-v5e-tpu.yaml @@ -0,0 +1,103 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
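+# TPU_CHIPS: "16" matches numOfHosts: 4 x 4 chips per worker;
+# serve_tpu.py falls back to this value when the TPU workers are
+# scaled to zero and Ray reports no "TPU" resource to detect.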
+ +# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_rayservice_tpu_v5e] +apiVersion: ray.io/v1 +kind: RayService +metadata: + name: vllm-tpu +spec: + serveConfigV2: | + applications: + - name: llm + import_path: ai-ml.gke-ray.rayserve.llm.serve_tpu:model + deployments: + - name: VLLMDeployment + num_replicas: 1 + runtime_env: + working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" + env_vars: + MODEL_ID: "meta-llama/Meta-Llama-3.1-70B" + MAX_MODEL_LEN: "4096" + TPU_CHIPS: "16" + rayClusterConfig: + rayVersion: 2.34.0 + headGroupSpec: + rayStartParams: {} + template: + spec: + containers: + - name: ray-head + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + resources: + limits: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + requests: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + workerGroupSpecs: + - groupName: tpu-group + replicas: 1 + minReplicas: 0 + maxReplicas: 2 + numOfHosts: 4 + rayStartParams: {} + template: + spec: + containers: + - name: ray-worker + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 200Gi + memory: 150G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 200Gi + memory: 150G + env: + - name: JAX_PLATFORMS + value: "tpu" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice + cloud.google.com/gke-tpu-topology: 4x4 +# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_rayservice_tpu_v5e] diff --git a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml new file mode 100644 index 0000000000..76b95c4b4d --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml @@ -0,0 +1,100 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
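+# LLaVA is served with DTYPE: "bfloat16", which serve_tpu.py forwards
+# to vLLM's dtype argument (the script defaults to "auto" when the
+# variable is unset).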
+ +# [START gke_ai_ml_gke_ray_rayserve_llm_llava_1_5_13b_raycluster_tpu_v4] +apiVersion: ray.io/v1 +kind: RayCluster +metadata: + name: vllm-tpu +spec: + headGroupSpec: + rayStartParams: {} + template: + spec: + containers: + - name: ray-head + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + requests: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: MODEL_ID + value: "llava-hf/llava-1.5-13b-hf" + - name: DTYPE + value: "bfloat16" + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + - containerPort: 8471 + name: slicebuilder + - containerPort: 8081 + name: mxla + workerGroupSpecs: + - groupName: tpu-group + replicas: 1 + minReplicas: 0 + maxReplicas: 2 + numOfHosts: 2 + rayStartParams: {} + template: + spec: + containers: + - name: ray-worker + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 200Gi + memory: 150G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 200Gi + memory: 150G + env: + - name: JAX_PLATFORMS + value: "tpu" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: MODEL_ID + value: "llava-hf/llava-1.5-13b-hf" + - name: DTYPE + value: "bfloat16" + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice + cloud.google.com/gke-tpu-topology: 2x2x2 +# [END gke_ai_ml_gke_ray_rayserve_llm_llava_1_5_13b_raycluster_tpu_v4] diff --git a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v5e-tpu.yaml new file mode 100644 index 0000000000..1aad70911b --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v5e-tpu.yaml @@ -0,0 +1,100 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# [START gke_ai_ml_gke_ray_rayserve_llm_llava_1_5_13b_raycluster_tpu_v5e] +apiVersion: ray.io/v1 +kind: RayCluster +metadata: + name: vllm-tpu +spec: + headGroupSpec: + rayStartParams: {} + template: + spec: + containers: + - name: ray-head + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + requests: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: MODEL_ID + value: "llava-hf/llava-1.5-13b-hf" + - name: DTYPE + value: "bfloat16" + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + - containerPort: 8471 + name: slicebuilder + - containerPort: 8081 + name: mxla + workerGroupSpecs: + - groupName: tpu-group + replicas: 1 + minReplicas: 0 + maxReplicas: 2 + numOfHosts: 2 + rayStartParams: {} + template: + spec: + containers: + - name: ray-worker + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 200Gi + memory: 150G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 200Gi + memory: 150G + env: + - name: JAX_PLATFORMS + value: "tpu" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: MODEL_ID + value: "llava-hf/llava-1.5-13b-hf" + - name: DTYPE + value: "bfloat16" + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice + cloud.google.com/gke-tpu-topology: 2x4 +# [END gke_ai_ml_gke_ray_rayserve_llm_llava_1_5_13b_raycluster_tpu_v5e] diff --git a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-service-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-service-v4-tpu.yaml new file mode 100644 index 0000000000..e9cf9dec32 --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-service-v4-tpu.yaml @@ -0,0 +1,103 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# [START gke_ai_ml_gke_ray_rayserve_llm_llava_1_5_13b_rayservice_tpu_v4] +apiVersion: ray.io/v1 +kind: RayService +metadata: + name: vllm-tpu +spec: + serveConfigV2: | + applications: + - name: llm + import_path: ai-ml.gke-ray.rayserve.llm.serve_tpu:model + deployments: + - name: VLLMDeployment + num_replicas: 1 + runtime_env: + working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" + env_vars: + MODEL_ID: "llava-hf/llava-1.5-13b-hf" + TPU_CHIPS: "8" + DTYPE: "bfloat16" + rayClusterConfig: + rayVersion: 2.34.0 + headGroupSpec: + rayStartParams: {} + template: + spec: + containers: + - name: ray-head + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + resources: + limits: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + requests: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + workerGroupSpecs: + - groupName: tpu-group + replicas: 1 + minReplicas: 0 + maxReplicas: 2 + numOfHosts: 2 + rayStartParams: {} + template: + spec: + containers: + - name: ray-worker + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 200Gi + memory: 150G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 200Gi + memory: 150G + env: + - name: JAX_PLATFORMS + value: "tpu" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice + cloud.google.com/gke-tpu-topology: 2x2x2 +# [END gke_ai_ml_gke_ray_rayserve_llm_llava_1_5_13b_rayservice_tpu_v4] diff --git a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-service-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-service-v5e-tpu.yaml new file mode 100644 index 0000000000..ef1db10dee --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-service-v5e-tpu.yaml @@ -0,0 +1,103 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# [START gke_ai_ml_gke_ray_rayserve_llm_llava_1_5_13b_rayservice_tpu_v5e] +apiVersion: ray.io/v1 +kind: RayService +metadata: + name: vllm-tpu +spec: + serveConfigV2: | + applications: + - name: llm + import_path: ai-ml.gke-ray.rayserve.llm.serve_tpu:model + deployments: + - name: VLLMDeployment + num_replicas: 1 + runtime_env: + working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" + env_vars: + MODEL_ID: "llava-hf/llava-1.5-13b-hf" + TPU_CHIPS: "8" + DTYPE: "bfloat16" + rayClusterConfig: + rayVersion: 2.34.0 + headGroupSpec: + rayStartParams: {} + template: + spec: + containers: + - name: ray-head + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + resources: + limits: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + requests: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + workerGroupSpecs: + - groupName: tpu-group + replicas: 1 + minReplicas: 0 + maxReplicas: 2 + numOfHosts: 2 + rayStartParams: {} + template: + spec: + containers: + - name: ray-worker + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 200Gi + memory: 150G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 200Gi + memory: 150G + env: + - name: JAX_PLATFORMS + value: "tpu" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice + cloud.google.com/gke-tpu-topology: 2x4 +# [END gke_ai_ml_gke_ray_rayserve_llm_llava_1_5_13b_rayservice_tpu_v5e] diff --git a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v4-tpu.yaml new file mode 100644 index 0000000000..85165c6ca2 --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v4-tpu.yaml @@ -0,0 +1,100 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
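+# Mistral sets TOKENIZER_MODE: "mistral", which serve_tpu.py passes
+# through to vLLM's tokenizer_mode argument (defaulting to "auto"
+# when unset).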
+ +# [START gke_ai_ml_gke_ray_rayserve_mistral_7b_raycluster_tpu_v4] +apiVersion: ray.io/v1 +kind: RayCluster +metadata: + name: vllm-tpu +spec: + headGroupSpec: + rayStartParams: {} + template: + spec: + containers: + - name: ray-head + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + requests: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: MODEL_ID + value: "mistralai/Mistral-7B-Instruct-v0.3" + - name: TOKENIZER_MODE + value: "mistral" + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + - containerPort: 8471 + name: slicebuilder + - containerPort: 8081 + name: mxla + workerGroupSpecs: + - groupName: tpu-group + replicas: 1 + minReplicas: 0 + maxReplicas: 2 + numOfHosts: 2 + rayStartParams: {} + template: + spec: + containers: + - name: ray-worker + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 50Gi + memory: 100G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 50Gi + memory: 100G + env: + - name: JAX_PLATFORMS + value: "tpu" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: MODEL_ID + value: "mistralai/Mistral-7B-Instruct-v0.3" + - name: TOKENIZER_MODE + value: "mistral" + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice + cloud.google.com/gke-tpu-topology: 2x2x2 +# [END gke_ai_ml_gke_ray_rayserve_mistral_7b_raycluster_tpu_v4] diff --git a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v5e-tpu.yaml new file mode 100644 index 0000000000..cd0e3a1746 --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v5e-tpu.yaml @@ -0,0 +1,100 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# [START gke_ai_ml_gke_ray_rayserve_mistral_7b_raycluster_tpu_v5e] +apiVersion: ray.io/v1 +kind: RayCluster +metadata: + name: vllm-tpu +spec: + headGroupSpec: + rayStartParams: {} + template: + spec: + containers: + - name: ray-head + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + requests: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: MODEL_ID + value: "mistralai/Mistral-7B-Instruct-v0.3" + - name: TOKENIZER_MODE + value: "mistral" + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + - containerPort: 8471 + name: slicebuilder + - containerPort: 8081 + name: mxla + workerGroupSpecs: + - groupName: tpu-group + replicas: 1 + minReplicas: 0 + maxReplicas: 2 + numOfHosts: 2 + rayStartParams: {} + template: + spec: + containers: + - name: ray-worker + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 50Gi + memory: 100G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 50Gi + memory: 100G + env: + - name: JAX_PLATFORMS + value: "tpu" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: MODEL_ID + value: "mistralai/Mistral-7B-Instruct-v0.3" + - name: TOKENIZER_MODE + value: "mistral" + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice + cloud.google.com/gke-tpu-topology: 2x4 +# [END gke_ai_ml_gke_ray_rayserve_mistral_7b_raycluster_tpu_v5e] diff --git a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-service-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-service-v4-tpu.yaml new file mode 100644 index 0000000000..65a3958b52 --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-service-v4-tpu.yaml @@ -0,0 +1,103 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# [START gke_ai_ml_gke_ray_rayserve_llm_mistral_7b_rayservice_tpu_v4] +apiVersion: ray.io/v1 +kind: RayService +metadata: + name: vllm-tpu +spec: + serveConfigV2: | + applications: + - name: llm + import_path: ai-ml.gke-ray.rayserve.llm.serve_tpu:model + deployments: + - name: VLLMDeployment + num_replicas: 1 + runtime_env: + working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" + env_vars: + MODEL_ID: "mistralai/Mistral-7B-Instruct-v0.3" + TPU_CHIPS: "8" + TOKENIZER_MODE: "mistral" + rayClusterConfig: + rayVersion: 2.34.0 + headGroupSpec: + rayStartParams: {} + template: + spec: + containers: + - name: ray-head + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + resources: + limits: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + requests: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + workerGroupSpecs: + - groupName: tpu-group + replicas: 1 + minReplicas: 0 + maxReplicas: 2 + numOfHosts: 2 + rayStartParams: {} + template: + spec: + containers: + - name: ray-worker + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 50Gi + memory: 100G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 50Gi + memory: 100G + env: + - name: JAX_PLATFORMS + value: "tpu" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice + cloud.google.com/gke-tpu-topology: 2x2x2 +# [END gke_ai_ml_gke_ray_rayserve_llm_mistral_7b_rayservice_tpu_v4] diff --git a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-service-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-service-v5e-tpu.yaml new file mode 100644 index 0000000000..184c99ccf1 --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-service-v5e-tpu.yaml @@ -0,0 +1,103 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
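+#
+# Sizing note: the TPU_CHIPS value in serveConfigV2 should match the slice
+# capacity, i.e. numOfHosts x google.com/tpu per host (2 x 4 = 8 here). The
+# serve script falls back to this value when Ray's TPU resources can't be
+# auto-detected, e.g. while the worker group is still scaling up from zero.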
+ +# [START gke_ai_ml_gke_ray_rayserve_mistral_7b_rayservice_tpu_v5e] +apiVersion: ray.io/v1 +kind: RayService +metadata: + name: vllm-tpu +spec: + serveConfigV2: | + applications: + - name: llm + import_path: ai-ml.gke-ray.rayserve.llm.serve_tpu:model + deployments: + - name: VLLMDeployment + num_replicas: 1 + runtime_env: + working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" + env_vars: + MODEL_ID: "mistralai/Mistral-7B-Instruct-v0.3" + TPU_CHIPS: "8" + TOKENIZER_MODE: "mistral" + rayClusterConfig: + rayVersion: 2.34.0 + headGroupSpec: + rayStartParams: {} + template: + spec: + containers: + - name: ray-head + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + resources: + limits: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + requests: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + workerGroupSpecs: + - groupName: tpu-group + replicas: 1 + minReplicas: 0 + maxReplicas: 2 + numOfHosts: 2 + rayStartParams: {} + template: + spec: + containers: + - name: ray-worker + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 50Gi + memory: 100G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 50Gi + memory: 100G + env: + - name: JAX_PLATFORMS + value: "tpu" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice + cloud.google.com/gke-tpu-topology: 2x4 +# [END gke_ai_ml_gke_ray_rayserve_mistral_7b_rayservice_tpu_v5e] diff --git a/ai-ml/gke-ray/rayserve/llm/serve_tpu.py b/ai-ml/gke-ray/rayserve/llm/serve_tpu.py index 1ac693d520..347a402e05 100644 --- a/ai-ml/gke-ray/rayserve/llm/serve_tpu.py +++ b/ai-ml/gke-ray/rayserve/llm/serve_tpu.py @@ -38,11 +38,16 @@ class VLLMDeployment: def __init__( self, num_tpu_chips, + max_model_len, + tokenizer_mode, + dtype, ): self.llm = LLM( - model=os.environ['MODEL_ID'], + model=os.environ['MODEL_ID'], # Error if not provided. tensor_parallel_size=num_tpu_chips, - max_model_len=int(os.environ.get('MAX_MODEL_LEN')), + max_model_len=max_model_len, + dtype=dtype, + tokenizer_mode=tokenizer_mode, enforce_eager=True, ) @@ -83,6 +88,21 @@ def get_num_tpu_chips() -> int: return 0 return int(ray.cluster_resources()["TPU"]) +def get_max_model_len() -> Optional[int]: + if 'MAX_MODEL_LEN' in os.environ: + return int(os.environ['MAX_MODEL_LEN']) + return None + +def get_tokenizer_mode() -> str: + if 'TOKENIZER_MODE' in os.environ: + return os.environ['TOKENIZER_MODE'] + return "auto" + +def get_dtype() -> str: + if 'DTYPE' in os.environ: + return os.environ['DTYPE'] + return "auto" + def build_app(cli_args: Dict[str, str]) -> serve.Application: """Builds the Serve app based on CLI arguments.""" ray.init(ignore_reinit_error=True) @@ -96,6 +116,6 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application: # Use PACK strategy since the deployment may use more than one TPU node. 
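    # Illustrative bundle shape (assuming 8 chips): [{"CPU": 1}] followed by
    # eight {"CPU": 1, "TPU": 1} bundles, packed so the deployment replica and
    # its vLLM workers share the same TPU slice.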
return VLLMDeployment.options( placement_group_bundles=pg_resources, - placement_group_strategy="PACK").bind(num_tpu_chips) + placement_group_strategy="PACK").bind(num_tpu_chips, get_max_model_len(), get_tokenizer_mode(), get_dtype()) model = build_app({}) From b12ffb7476d5205a24dd8ce3b2877edce63eb80d Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Mon, 7 Oct 2024 19:22:27 +0000 Subject: [PATCH 08/16] Fix model id Signed-off-by: Ryan O'Leary --- .../gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml index eb54cc59e8..b53039590a 100644 --- a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml @@ -42,7 +42,7 @@ spec: name: hf-secret key: hf_api_token - name: MODEL_ID - value: "meta-llama/Llama-3-8B-Instruct" + value: "meta-llama/Meta-Llama-3-8B-Instruct" ports: - containerPort: 6379 name: gcs From cbce28377a55670af3dc62be8870994d5f760dde Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Mon, 7 Oct 2024 21:33:16 +0000 Subject: [PATCH 09/16] Don't specify mxla and slicebuilder ports Signed-off-by: Ryan O'Leary --- .../rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml | 4 ---- .../rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml | 4 ---- .../rayserve/llm/llama-3.1-70b/ray-cluster-v4-tpu.yaml | 4 ---- .../rayserve/llm/llama-3.1-70b/ray-cluster-v5e-tpu.yaml | 4 ---- .../rayserve/llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml | 4 ---- .../rayserve/llm/llava-1.5-13b/ray-cluster-v5e-tpu.yaml | 4 ---- ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v4-tpu.yaml | 4 ---- .../gke-ray/rayserve/llm/mistral-7b/ray-cluster-v5e-tpu.yaml | 4 ---- 8 files changed, 32 deletions(-) diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml index b53039590a..6e16216651 100644 --- a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml @@ -52,10 +52,6 @@ spec: name: client - containerPort: 8000 name: serve - - containerPort: 8471 - name: slicebuilder - - containerPort: 8081 - name: mxla workerGroupSpecs: - groupName: tpu-group replicas: 1 diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml index 48f510296e..f09d16a3db 100644 --- a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml @@ -52,10 +52,6 @@ spec: name: client - containerPort: 8000 name: serve - - containerPort: 8471 - name: slicebuilder - - containerPort: 8081 - name: mxla workerGroupSpecs: - groupName: tpu-group replicas: 1 diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v4-tpu.yaml index ea469084e9..9b3df8452a 100644 --- a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v4-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v4-tpu.yaml @@ -54,10 +54,6 @@ spec: name: client - containerPort: 8000 name: serve - - containerPort: 8471 - name: slicebuilder - - containerPort: 8081 - name: mxla workerGroupSpecs: - groupName: tpu-group replicas: 1 diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v5e-tpu.yaml 
b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v5e-tpu.yaml index d54cb0985b..939a395d56 100644 --- a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v5e-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v5e-tpu.yaml @@ -54,10 +54,6 @@ spec: name: client - containerPort: 8000 name: serve - - containerPort: 8471 - name: slicebuilder - - containerPort: 8081 - name: mxla workerGroupSpecs: - groupName: tpu-group replicas: 1 diff --git a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml index 76b95c4b4d..0d64de23ab 100644 --- a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml @@ -54,10 +54,6 @@ spec: name: client - containerPort: 8000 name: serve - - containerPort: 8471 - name: slicebuilder - - containerPort: 8081 - name: mxla workerGroupSpecs: - groupName: tpu-group replicas: 1 diff --git a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v5e-tpu.yaml index 1aad70911b..d7214d7a7c 100644 --- a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v5e-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v5e-tpu.yaml @@ -54,10 +54,6 @@ spec: name: client - containerPort: 8000 name: serve - - containerPort: 8471 - name: slicebuilder - - containerPort: 8081 - name: mxla workerGroupSpecs: - groupName: tpu-group replicas: 1 diff --git a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v4-tpu.yaml index 85165c6ca2..3f539689c8 100644 --- a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v4-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v4-tpu.yaml @@ -54,10 +54,6 @@ spec: name: client - containerPort: 8000 name: serve - - containerPort: 8471 - name: slicebuilder - - containerPort: 8081 - name: mxla workerGroupSpecs: - groupName: tpu-group replicas: 1 diff --git a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v5e-tpu.yaml index cd0e3a1746..27983bd6eb 100644 --- a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v5e-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v5e-tpu.yaml @@ -54,10 +54,6 @@ spec: name: client - containerPort: 8000 name: serve - - containerPort: 8471 - name: slicebuilder - - containerPort: 8081 - name: mxla workerGroupSpecs: - groupName: tpu-group replicas: 1 From aa21d2ca1c031e91a8a1b5ec192478cf19beb6a5 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Mon, 7 Oct 2024 21:37:09 +0000 Subject: [PATCH 10/16] Pass env vars to runtime env Signed-off-by: Ryan O'Leary --- .../rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml | 4 ---- .../rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml | 4 ---- .../rayserve/llm/llama-3.1-70b/ray-cluster-v4-tpu.yaml | 8 -------- .../rayserve/llm/llama-3.1-70b/ray-cluster-v5e-tpu.yaml | 8 -------- .../rayserve/llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml | 8 -------- .../rayserve/llm/llava-1.5-13b/ray-cluster-v5e-tpu.yaml | 8 -------- .../rayserve/llm/mistral-7b/ray-cluster-v4-tpu.yaml | 8 -------- .../rayserve/llm/mistral-7b/ray-cluster-v5e-tpu.yaml | 8 -------- 8 files changed, 56 deletions(-) diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml index 6e16216651..1faecb6860 100644 --- 
a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml @@ -41,8 +41,6 @@ spec: secretKeyRef: name: hf-secret key: hf_api_token - - name: MODEL_ID - value: "meta-llama/Meta-Llama-3-8B-Instruct" ports: - containerPort: 6379 name: gcs @@ -84,8 +82,6 @@ spec: secretKeyRef: name: hf-secret key: hf_api_token - - name: MODEL_ID - value: "meta-llama/Meta-Llama-3-8B-Instruct" nodeSelector: cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice cloud.google.com/gke-tpu-topology: 2x2x2 diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml index f09d16a3db..a41106e3aa 100644 --- a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml @@ -41,8 +41,6 @@ spec: secretKeyRef: name: hf-secret key: hf_api_token - - name: MODEL_ID - value: "meta-llama/Meta-Llama-3-8B-Instruct" ports: - containerPort: 6379 name: gcs @@ -84,8 +82,6 @@ spec: secretKeyRef: name: hf-secret key: hf_api_token - - name: MODEL_ID - value: "meta-llama/Meta-Llama-3-8B-Instruct" nodeSelector: cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice cloud.google.com/gke-tpu-topology: 2x4 diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v4-tpu.yaml index 9b3df8452a..b7b3f42643 100644 --- a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v4-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v4-tpu.yaml @@ -41,10 +41,6 @@ spec: secretKeyRef: name: hf-secret key: hf_api_token - - name: MODEL_ID - value: "meta-llama/Llama-3.1-70B" - - name: MAX_MODEL_LEN - value: "4096" ports: - containerPort: 6379 name: gcs @@ -86,10 +82,6 @@ spec: secretKeyRef: name: hf-secret key: hf_api_token - - name: MODEL_ID - value: "meta-llama/Llama-3.1-70B" - - name: MAX_MODEL_LEN - value: "4096" nodeSelector: cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice cloud.google.com/gke-tpu-topology: 2x2x4 diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v5e-tpu.yaml index 939a395d56..7e5aa17b97 100644 --- a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v5e-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v5e-tpu.yaml @@ -41,10 +41,6 @@ spec: secretKeyRef: name: hf-secret key: hf_api_token - - name: MODEL_ID - value: "meta-llama/Llama-3.1-70B" - - name: MAX_MODEL_LEN - value: "4096" ports: - containerPort: 6379 name: gcs @@ -86,10 +82,6 @@ spec: secretKeyRef: name: hf-secret key: hf_api_token - - name: MODEL_ID - value: "meta-llama/Llama-3.1-70B" - - name: MAX_MODEL_LEN - value: "4096" nodeSelector: cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice cloud.google.com/gke-tpu-topology: 4x4 diff --git a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml index 0d64de23ab..2ff2780fea 100644 --- a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml @@ -41,10 +41,6 @@ spec: secretKeyRef: name: hf-secret key: hf_api_token - - name: MODEL_ID - value: "llava-hf/llava-1.5-13b-hf" - - name: DTYPE - value: "bfloat16" ports: - containerPort: 6379 name: gcs @@ -86,10 +82,6 @@ spec: secretKeyRef: name: hf-secret key: hf_api_token - - name: 
MODEL_ID - value: "llava-hf/llava-1.5-13b-hf" - - name: DTYPE - value: "bfloat16" nodeSelector: cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice cloud.google.com/gke-tpu-topology: 2x2x2 diff --git a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v5e-tpu.yaml index d7214d7a7c..a1c10c73d7 100644 --- a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v5e-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v5e-tpu.yaml @@ -41,10 +41,6 @@ spec: secretKeyRef: name: hf-secret key: hf_api_token - - name: MODEL_ID - value: "llava-hf/llava-1.5-13b-hf" - - name: DTYPE - value: "bfloat16" ports: - containerPort: 6379 name: gcs @@ -86,10 +82,6 @@ spec: secretKeyRef: name: hf-secret key: hf_api_token - - name: MODEL_ID - value: "llava-hf/llava-1.5-13b-hf" - - name: DTYPE - value: "bfloat16" nodeSelector: cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice cloud.google.com/gke-tpu-topology: 2x4 diff --git a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v4-tpu.yaml index 3f539689c8..afa639fe35 100644 --- a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v4-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v4-tpu.yaml @@ -41,10 +41,6 @@ spec: secretKeyRef: name: hf-secret key: hf_api_token - - name: MODEL_ID - value: "mistralai/Mistral-7B-Instruct-v0.3" - - name: TOKENIZER_MODE - value: "mistral" ports: - containerPort: 6379 name: gcs @@ -86,10 +82,6 @@ spec: secretKeyRef: name: hf-secret key: hf_api_token - - name: MODEL_ID - value: "mistralai/Mistral-7B-Instruct-v0.3" - - name: TOKENIZER_MODE - value: "mistral" nodeSelector: cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice cloud.google.com/gke-tpu-topology: 2x2x2 diff --git a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v5e-tpu.yaml index 27983bd6eb..290c6cd9a8 100644 --- a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v5e-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v5e-tpu.yaml @@ -41,10 +41,6 @@ spec: secretKeyRef: name: hf-secret key: hf_api_token - - name: MODEL_ID - value: "mistralai/Mistral-7B-Instruct-v0.3" - - name: TOKENIZER_MODE - value: "mistral" ports: - containerPort: 6379 name: gcs @@ -86,10 +82,6 @@ spec: secretKeyRef: name: hf-secret key: hf_api_token - - name: MODEL_ID - value: "mistralai/Mistral-7B-Instruct-v0.3" - - name: TOKENIZER_MODE - value: "mistral" nodeSelector: cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice cloud.google.com/gke-tpu-topology: 2x4 From 5b42cb8dd9c2bdcd2f6b1439a844462ec004d08d Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Thu, 10 Oct 2024 17:59:45 +0000 Subject: [PATCH 11/16] Add model composition example Signed-off-by: Ryan O'Leary --- .../model-composition/ray-service-tpu.yaml | 97 ++++++++ .../llm/model-composition/serve_tpu.py | 218 ++++++++++++++++++ 2 files changed, 315 insertions(+) create mode 100644 ai-ml/gke-ray/rayserve/llm/model-composition/ray-service-tpu.yaml create mode 100644 ai-ml/gke-ray/rayserve/llm/model-composition/serve_tpu.py diff --git a/ai-ml/gke-ray/rayserve/llm/model-composition/ray-service-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/model-composition/ray-service-tpu.yaml new file mode 100644 index 0000000000..42c8d3723a --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/model-composition/ray-service-tpu.yaml @@ -0,0 +1,97 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, 
Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# [START gke_ai_ml_gke_ray_rayserve_llm_model_composition_tpu] +apiVersion: ray.io/v1 +kind: RayService +metadata: + name: model-composition +spec: + serveConfigV2: | + applications: + - name: llm + route_prefix: / + import_path: ai-ml.gke-ray.rayserve.llm.model-composition.serve_tpu:multi_model + deployments: + - name: MultiModelDeployment + num_replicas: 1 + runtime_env: + working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" + env_vars: + ASSIST_MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct" + SUMMARIZER_MODEL_ID: "mistralai/Mistral-7B-Instruct-v0.3" + rayClusterConfig: + headGroupSpec: + rayStartParams: + dashboard-host: '0.0.0.0' + template: + spec: + containers: + - name: ray-head + image: $VLLM_IMAGE + resources: + limits: + cpu: "8" + memory: 40G + requests: + cpu: "8" + memory: 40G + ports: + - containerPort: 6379 + name: gcs-server + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + workerGroupSpecs: + - replicas: 2 + minReplicas: 0 + maxReplicas: 4 + numOfHosts: 2 + groupName: tpu-group + rayStartParams: {} + template: + spec: + containers: + - name: llm + image: $VLLM_IMAGE + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 50Gi + memory: 100G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 50Gi + memory: 100G + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice + cloud.google.com/gke-tpu-topology: 2x4 +# [END gke_ai_ml_gke_ray_rayserve_llm_model_composition_tpu] diff --git a/ai-ml/gke-ray/rayserve/llm/model-composition/serve_tpu.py b/ai-ml/gke-ray/rayserve/llm/model-composition/serve_tpu.py new file mode 100644 index 0000000000..ac138fad12 --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/model-composition/serve_tpu.py @@ -0,0 +1,218 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
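+#
+# Flow sketch: MultiModelDeployment takes the HTTP request, the assistant
+# model generates a full response, and the summarizer model condenses it to
+# one sentence. A hypothetical end-to-end call (route_prefix "/" per the CR):
+#
+#   curl -X POST http://localhost:8000/ \
+#     -d '{"prompt": "Explain Kubernetes in one paragraph.", "max_tokens": 200}'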
+
+# NOTE: this file was inspired by: https://github.com/ray-project/ray/blob//master/doc/source/serve/doc_code/vllm_example.py
+
+import json
+import os
+from typing import AsyncGenerator, Dict, List, Optional
+import random
+
+from fastapi import BackgroundTasks
+from starlette.requests import Request
+from starlette.responses import Response, StreamingResponse
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.sampling_params import SamplingParams
+from vllm.utils import random_uuid
+
+import ray
+from ray import serve
+from ray.serve.handle import DeploymentHandle
+
+
+@serve.deployment(name="VLLMDeployment")
+class VLLMDeployment:
+    def __init__(self, **kwargs):
+        """
+        Construct a VLLM deployment.
+
+        Refer to https://github.com/vllm-project/vllm/blob/main/vllm/engine/arg_utils.py
+        for the full list of arguments.
+
+        Args:
+            model: name or path of the huggingface model to use
+            download_dir: directory to download and load the weights,
+                defaults to the default cache dir of huggingface.
+            use_np_weights: save a numpy copy of model weights for
+                faster loading. This can increase the disk usage by up to 2x.
+            use_dummy_weights: use dummy values for model weights.
+            dtype: data type for model weights and activations.
+                The "auto" option will use FP16 precision
+                for FP32 and FP16 models, and BF16 precision
+                for BF16 models.
+            seed: random seed.
+            worker_use_ray: use Ray for distributed serving, will be
+                automatically set when using more than 1 GPU.
+            pipeline_parallel_size: number of pipeline stages.
+            tensor_parallel_size: number of tensor parallel replicas.
+            block_size: token block size.
+            swap_space: CPU swap space size (GiB) per GPU.
+            gpu_memory_utilization: the percentage of GPU memory to be used for
+                the model executor
+            max_num_batched_tokens: maximum number of batched tokens per iteration
+            max_num_seqs: maximum number of sequences per iteration.
+            disable_log_stats: disable logging statistics.
+            engine_use_ray: use Ray to start the LLM engine in a separate
+                process as the server process.
+            disable_log_requests: disable logging requests.
+        """
+        args = AsyncEngineArgs(**kwargs)
+        self.engine = AsyncLLMEngine.from_engine_args(args)
+
+    async def stream_results(self, results_generator) -> AsyncGenerator[bytes, None]:
+        num_returned = 0
+        async for request_output in results_generator:
+            text_outputs = [output.text for output in request_output.outputs]
+            assert len(text_outputs) == 1
+            text_output = text_outputs[0][num_returned:]
+            ret = {"text": text_output}
+            yield (json.dumps(ret) + "\n").encode("utf-8")
+            num_returned += len(text_output)
+
+    async def may_abort_request(self, request_id) -> None:
+        await self.engine.abort(request_id)
+
+    async def __call__(self, request_dict: dict) -> str:
+        """Generate completion for the request.
+
+        The request should be a JSON object with the following fields:
+        - prompt: the prompt to use for the generation.
+        - stream: whether to stream the results or not.
+        - other fields: the sampling parameters (See `SamplingParams` for details).
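+
+        A hypothetical example body combining these fields:
+        {"prompt": "Tell me a story", "stream": false, "max_tokens": 200, "temperature": 0.7}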
+        """
+        # request_dict = await request.json()
+        prompt = request_dict.pop("prompt")
+        stream = request_dict.pop("stream", False)
+        max_tokens = request_dict.pop("max_tokens", 1000)
+        sampling_params = SamplingParams(**request_dict)
+        request_id = random_uuid()
+        results_generator = self.engine.generate(
+            prompt, sampling_params, request_id)
+        if stream:
+            background_tasks = BackgroundTasks()
+            # Using background_tasks to abort the request
+            # if the client disconnects.
+            background_tasks.add_task(self.may_abort_request, request_id)
+            return StreamingResponse(
+                self.stream_results(results_generator), background=background_tasks
+            )
+
+        final_output = None
+        async for request_output in results_generator:
+            final_output = request_output
+
+        assert final_output is not None
+        prompt = final_output.prompt
+        text_outputs = [
+            output.text for output in final_output.outputs]
+        ret = {"text": text_outputs, "max_tokens": max_tokens}
+        return json.dumps(ret)
+
+
+@serve.deployment
+class VLLMSummarizerDeployment:
+    def __init__(self, **kwargs):
+        args = AsyncEngineArgs(**kwargs)
+        self.engine = AsyncLLMEngine.from_engine_args(args)
+
+    async def __call__(self, response: str) -> str:
+        """Generates summarization of a response from another model.
+
+        The response should be a JSON object with the following fields:
+        - text: the response returned from another model to summarize
+        """
+        request_dict = json.loads(response)
+        text = request_dict.pop("text")
+        prompt = f"Summarize the following text into a single sentence: {text}"
+        sampling_params = SamplingParams(**request_dict)
+        request_id = random_uuid()
+        results_generator = self.engine.generate(
+            prompt, sampling_params, request_id)
+
+        final_output = None
+        async for request_output in results_generator:
+            final_output = request_output
+
+        assert final_output is not None
+        prompt = final_output.prompt
+        text_outputs = [
+            output.text for output in final_output.outputs]
+        ret = {"text": text_outputs}
+        return json.dumps(ret)
+
+
+@serve.deployment
+class MultiModelDeployment:
+    def __init__(self, assist_model: DeploymentHandle, summarizer_model: DeploymentHandle):
+        self.assistant_model = assist_model
+        self.summarizer_model = summarizer_model
+
+    async def __call__(self, request: Request) -> Response:
+        model_request = await request.json()
+        assistant_response = self.assistant_model.remote(model_request)
+        summarizer_response = await self.summarizer_model.remote(assistant_response)
+        return Response(content=summarizer_response)
+
+def get_num_tpu_chips() -> int:
+    if "TPU" not in ray.cluster_resources():
+        # Pass in TPU chips when the current Ray cluster resources can't be auto-detected (i.e for autoscaling).
+        if os.environ.get('TPU_CHIPS') is not None:
+            return int(os.environ.get('TPU_CHIPS'))
+        return 0
+    return int(ray.cluster_resources()["TPU"])
+
+def get_tpu_head() -> Optional[str]:
+    # return the TPU-{accelerator}-head resource
+    for key, _ in ray.cluster_resources().items():
+        if key.endswith("head"):
+            return key
+    return None
+
+def build_app(cli_args: Dict[str, str]) -> serve.Application:
+    """Builds the Serve app based on CLI arguments."""
+    ray.init(ignore_reinit_error=True)
+
+    num_tpu_chips = get_num_tpu_chips()
+    tpu_head = get_tpu_head()
+    tpu_slices = 1
+    if tpu_head is not None:
+        tpu_slices = ray.cluster_resources()[tpu_head]
+    num_tpu_chips_per_slice = int(num_tpu_chips/tpu_slices)
+    # Construct a placement group for 1 TPU slice. Each model should run on its own slice.
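+    # Worked example (assuming the values from the sample RayService CR,
+    # TPU_CHIPS=16 across TPU_HEADS=2 slices): 8 chips per slice, so each
+    # model binds one {"CPU": 1} bundle plus eight {"CPU": 1, "TPU": 1}
+    # bundles and one TPU-head bundle, all packed onto a single slice.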
+    pg_resources = []
+    pg_resources.append({"CPU": 1}) # for the deployment replica
+    for i in range(num_tpu_chips_per_slice):
+        pg_resources.append({"CPU": 1, "TPU": 1}) # for the vLLM actors
+    # Add a TPU head to the placement group to ensure Ray workers are not placed across slices.
+    pg_resources.append({tpu_head: 1})
+
+    return MultiModelDeployment.bind(
+        VLLMDeployment.options(
+            placement_group_bundles=pg_resources,
+            placement_group_strategy="PACK").bind(
+            model=os.environ['ASSIST_MODEL_ID'],
+            tensor_parallel_size=num_tpu_chips_per_slice,
+            enforce_eager=True,
+        ),
+        VLLMSummarizerDeployment.options(
+            placement_group_bundles=pg_resources,
+            placement_group_strategy="PACK").bind(
+            model=os.environ['SUMMARIZER_MODEL_ID'],
+            tensor_parallel_size=num_tpu_chips_per_slice,
+            enforce_eager=True,
+        ),
+    )
+
+multi_model = build_app({})

From ced331d74a7eb6a9490a0cfebe0265c2e4a6069f Mon Sep 17 00:00:00 2001
From: Ryan O'Leary
Date: Thu, 10 Oct 2024 18:02:43 +0000
Subject: [PATCH 12/16] Update name

Signed-off-by: Ryan O'Leary
---
 .../gke-ray/rayserve/llm/model-composition/ray-service-tpu.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ai-ml/gke-ray/rayserve/llm/model-composition/ray-service-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/model-composition/ray-service-tpu.yaml
index 42c8d3723a..cc194af015 100644
--- a/ai-ml/gke-ray/rayserve/llm/model-composition/ray-service-tpu.yaml
+++ b/ai-ml/gke-ray/rayserve/llm/model-composition/ray-service-tpu.yaml
@@ -16,7 +16,7 @@
 apiVersion: ray.io/v1
 kind: RayService
 metadata:
-  name: model-composition
+  name: vllm-tpu
 spec:
   serveConfigV2: |
     applications:

From 9e9d46602e8f9a05725532b93fb7348821e07565 Mon Sep 17 00:00:00 2001
From: Ryan O'Leary
Date: Thu, 10 Oct 2024 18:17:17 +0000
Subject: [PATCH 13/16] Support passing TPU_HEADS as env var

Signed-off-by: Ryan O'Leary
---
 ai-ml/gke-ray/rayserve/llm/model-composition/serve_tpu.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/ai-ml/gke-ray/rayserve/llm/model-composition/serve_tpu.py b/ai-ml/gke-ray/rayserve/llm/model-composition/serve_tpu.py
index ac138fad12..36caadb663 100644
--- a/ai-ml/gke-ray/rayserve/llm/model-composition/serve_tpu.py
+++ b/ai-ml/gke-ray/rayserve/llm/model-composition/serve_tpu.py
@@ -174,6 +174,10 @@ def get_num_tpu_chips() -> int:
     return int(ray.cluster_resources()["TPU"])
 
 def get_tpu_head() -> Optional[str]:
+    if "TPU" not in ray.cluster_resources():
+        # Pass in the number of TPU heads when the current Ray cluster resources can't be auto-detected.
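+        # (e.g. TPU_HEADS=2 for a deployment spread over two multi-host slices)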
+ if os.environ.get('TPU_HEADS') is not None: + return int(os.environ.get('TPU_HEADS')) # return the TPU-{accelerator}-head resource for key, _ in ray.cluster_resources().items(): if key.endswith("head"): From 06ed4fcb1d3d9bad089e24f89a01a29ff982b908 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Thu, 10 Oct 2024 18:38:46 +0000 Subject: [PATCH 14/16] Update RayService TPU CR Signed-off-by: Ryan O'Leary --- .../rayserve/llm/model-composition/ray-service-tpu.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ai-ml/gke-ray/rayserve/llm/model-composition/ray-service-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/model-composition/ray-service-tpu.yaml index cc194af015..b69b8deb27 100644 --- a/ai-ml/gke-ray/rayserve/llm/model-composition/ray-service-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/model-composition/ray-service-tpu.yaml @@ -30,7 +30,9 @@ spec: working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" env_vars: ASSIST_MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct" - SUMMARIZER_MODEL_ID: "mistralai/Mistral-7B-Instruct-v0.3" + SUMMARIZER_MODEL_ID: "google/gemma-7b-it" + TPU_CHIPS: "16" + TPU_HEADS: "2" rayClusterConfig: headGroupSpec: rayStartParams: From 41ed84a1ce0963e08297ee1ba84fb0de1c27bcc6 Mon Sep 17 00:00:00 2001 From: ryanaoleary Date: Thu, 14 Nov 2024 01:43:05 +0000 Subject: [PATCH 15/16] Rescope PR to v5e and v6e for Llama-3.1-405B Signed-off-by: ryanaoleary --- .../llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml | 88 ------- .../llama-3-8b-it/ray-cluster-v5e-tpu.yaml | 88 ------- .../llm/llama-3-8b-it/ray-service-v4-tpu.yaml | 100 -------- .../llama-3-8b-it/ray-service-v5e-tpu.yaml | 100 -------- .../llm/llama-3.1-70b/ray-cluster-v4-tpu.yaml | 88 ------- .../llama-3.1-70b/ray-cluster-v5e-tpu.yaml | 88 ------- .../llm/llama-3.1-70b/ray-service-v4-tpu.yaml | 103 -------- .../llama-3.1-70b/ray-service-v5e-tpu.yaml | 103 -------- .../llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml | 88 ------- .../llava-1.5-13b/ray-cluster-v5e-tpu.yaml | 88 ------- .../llm/llava-1.5-13b/ray-service-v4-tpu.yaml | 103 -------- .../llava-1.5-13b/ray-service-v5e-tpu.yaml | 103 -------- .../llm/mistral-7b/ray-cluster-v4-tpu.yaml | 88 ------- .../llm/mistral-7b/ray-cluster-v5e-tpu.yaml | 88 ------- .../llm/mistral-7b/ray-service-v4-tpu.yaml | 103 -------- .../llm/mistral-7b/ray-service-v5e-tpu.yaml | 103 -------- .../model-composition/ray-service-tpu.yaml | 99 -------- .../llm/model-composition/serve_tpu.py | 222 ------------------ ai-ml/gke-ray/rayserve/llm/serve_tpu.py | 3 +- .../tpu/ray-cluster.tpu-v5e-multihost.yaml | 150 ++++++++++++ .../tpu/ray-cluster.tpu-v6e-multihost.yaml | 146 ++++++++++++ 21 files changed, 298 insertions(+), 1844 deletions(-) delete mode 100644 ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml delete mode 100644 ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml delete mode 100644 ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-v4-tpu.yaml delete mode 100644 ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-v5e-tpu.yaml delete mode 100644 ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v4-tpu.yaml delete mode 100644 ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v5e-tpu.yaml delete mode 100644 ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-v4-tpu.yaml delete mode 100644 ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-v5e-tpu.yaml delete mode 100644 ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml delete mode 100644 
ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v5e-tpu.yaml delete mode 100644 ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-service-v4-tpu.yaml delete mode 100644 ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-service-v5e-tpu.yaml delete mode 100644 ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v4-tpu.yaml delete mode 100644 ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v5e-tpu.yaml delete mode 100644 ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-service-v4-tpu.yaml delete mode 100644 ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-service-v5e-tpu.yaml delete mode 100644 ai-ml/gke-ray/rayserve/llm/model-composition/ray-service-tpu.yaml delete mode 100644 ai-ml/gke-ray/rayserve/llm/model-composition/serve_tpu.py create mode 100644 ai-ml/gke-ray/rayserve/llm/tpu/ray-cluster.tpu-v5e-multihost.yaml create mode 100644 ai-ml/gke-ray/rayserve/llm/tpu/ray-cluster.tpu-v6e-multihost.yaml diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml deleted file mode 100644 index 1faecb6860..0000000000 --- a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_raycluster_tpu_v4] -apiVersion: ray.io/v1 -kind: RayCluster -metadata: - name: vllm-tpu -spec: - headGroupSpec: - rayStartParams: {} - template: - spec: - containers: - - name: ray-head - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "8" - ephemeral-storage: 5Gi - memory: 40G - requests: - cpu: "8" - ephemeral-storage: 5Gi - memory: 40G - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - - containerPort: 8000 - name: serve - workerGroupSpecs: - - groupName: tpu-group - replicas: 1 - minReplicas: 0 - maxReplicas: 1 - numOfHosts: 2 - rayStartParams: {} - template: - spec: - containers: - - name: ray-worker - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 50Gi - memory: 100G - requests: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 50Gi - memory: 100G - env: - - name: JAX_PLATFORMS - value: "tpu" - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice - cloud.google.com/gke-tpu-topology: 2x2x2 -# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_raycluster_tpu_v4] diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml deleted file mode 100644 index a41106e3aa..0000000000 --- a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright 
2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_raycluster_tpu_v5e] -apiVersion: ray.io/v1 -kind: RayCluster -metadata: - name: vllm-tpu -spec: - headGroupSpec: - rayStartParams: {} - template: - spec: - containers: - - name: ray-head - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "8" - ephemeral-storage: 5Gi - memory: 40G - requests: - cpu: "8" - ephemeral-storage: 5Gi - memory: 40G - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - - containerPort: 8000 - name: serve - workerGroupSpecs: - - groupName: tpu-group - replicas: 1 - minReplicas: 0 - maxReplicas: 1 - numOfHosts: 2 - rayStartParams: {} - template: - spec: - containers: - - name: ray-worker - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 50Gi - memory: 100G - requests: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 50Gi - memory: 100G - env: - - name: JAX_PLATFORMS - value: "tpu" - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice - cloud.google.com/gke-tpu-topology: 2x4 -# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_raycluster_tpu_v5e] diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-v4-tpu.yaml deleted file mode 100644 index 6e95df89f7..0000000000 --- a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-v4-tpu.yaml +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_rayservice_tpu_v4] -apiVersion: ray.io/v1 -kind: RayService -metadata: - name: vllm-tpu -spec: - serveConfigV2: | - applications: - - name: llm - import_path: ai-ml.gke-ray.rayserve.llm.serve_tpu:model - deployments: - - name: VLLMDeployment - num_replicas: 1 - runtime_env: - working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" - env_vars: - MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct" - TPU_CHIPS: "8" - rayClusterConfig: - rayVersion: 2.34.0 - headGroupSpec: - rayStartParams: {} - template: - spec: - containers: - - name: ray-head - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - - containerPort: 8000 - name: serve - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - resources: - limits: - cpu: "8" - memory: 40G - requests: - cpu: "8" - memory: 40G - workerGroupSpecs: - - groupName: tpu-group - replicas: 1 - minReplicas: 0 - maxReplicas: 2 - numOfHosts: 2 - rayStartParams: {} - template: - spec: - containers: - - name: ray-worker - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 50Gi - memory: 100G - requests: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 50Gi - memory: 100G - env: - - name: JAX_PLATFORMS - value: "tpu" - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice - cloud.google.com/gke-tpu-topology: 2x2x2 -# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_rayservice_tpu_v4] diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-v5e-tpu.yaml deleted file mode 100644 index 70aaf5938c..0000000000 --- a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-v5e-tpu.yaml +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_rayservice_tpu_v5e] -apiVersion: ray.io/v1 -kind: RayService -metadata: - name: vllm-tpu -spec: - serveConfigV2: | - applications: - - name: llm - import_path: ai-ml.gke-ray.rayserve.llm.serve_tpu:model - deployments: - - name: VLLMDeployment - num_replicas: 1 - runtime_env: - working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" - env_vars: - MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct" - TPU_CHIPS: "8" - rayClusterConfig: - rayVersion: 2.34.0 - headGroupSpec: - rayStartParams: {} - template: - spec: - containers: - - name: ray-head - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - - containerPort: 8000 - name: serve - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - resources: - limits: - cpu: "8" - memory: 40G - requests: - cpu: "8" - memory: 40G - workerGroupSpecs: - - groupName: tpu-group - replicas: 1 - minReplicas: 0 - maxReplicas: 2 - numOfHosts: 2 - rayStartParams: {} - template: - spec: - containers: - - name: ray-worker - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 50Gi - memory: 100G - requests: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 50Gi - memory: 100G - env: - - name: JAX_PLATFORMS - value: "tpu" - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice - cloud.google.com/gke-tpu-topology: 2x4 -# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_rayservice_tpu_v5e] diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v4-tpu.yaml deleted file mode 100644 index b7b3f42643..0000000000 --- a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v4-tpu.yaml +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_raycluster_tpu_v4] -apiVersion: ray.io/v1 -kind: RayCluster -metadata: - name: vllm-tpu -spec: - headGroupSpec: - rayStartParams: {} - template: - spec: - containers: - - name: ray-head - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - requests: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - - containerPort: 8000 - name: serve - workerGroupSpecs: - - groupName: tpu-group - replicas: 1 - minReplicas: 0 - maxReplicas: 2 - numOfHosts: 4 - rayStartParams: {} - template: - spec: - containers: - - name: ray-worker - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 200Gi - memory: 150G - requests: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 200Gi - memory: 150G - env: - - name: JAX_PLATFORMS - value: "tpu" - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice - cloud.google.com/gke-tpu-topology: 2x2x4 -# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_raycluster_tpu_v4] diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v5e-tpu.yaml deleted file mode 100644 index 7e5aa17b97..0000000000 --- a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v5e-tpu.yaml +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_raycluster_tpu_v5e] -apiVersion: ray.io/v1 -kind: RayCluster -metadata: - name: vllm-tpu -spec: - headGroupSpec: - rayStartParams: {} - template: - spec: - containers: - - name: ray-head - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - requests: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - - containerPort: 8000 - name: serve - workerGroupSpecs: - - groupName: tpu-group - replicas: 1 - minReplicas: 0 - maxReplicas: 2 - numOfHosts: 4 - rayStartParams: {} - template: - spec: - containers: - - name: ray-worker - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 200Gi - memory: 150G - requests: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 200Gi - memory: 150G - env: - - name: JAX_PLATFORMS - value: "tpu" - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice - cloud.google.com/gke-tpu-topology: 4x4 -# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_raycluster_tpu_v5e] diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-v4-tpu.yaml deleted file mode 100644 index 04fdb8056c..0000000000 --- a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-v4-tpu.yaml +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_rayservice_tpu_v4] -apiVersion: ray.io/v1 -kind: RayService -metadata: - name: vllm-tpu -spec: - serveConfigV2: | - applications: - - name: llm - import_path: ai-ml.gke-ray.rayserve.llm.serve_tpu:model - deployments: - - name: VLLMDeployment - num_replicas: 1 - runtime_env: - working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" - env_vars: - MODEL_ID: "meta-llama/Meta-Llama-3.1-70B" - MAX_MODEL_LEN: "4096" - TPU_CHIPS: "16" - rayClusterConfig: - rayVersion: 2.34.0 - headGroupSpec: - rayStartParams: {} - template: - spec: - containers: - - name: ray-head - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - - containerPort: 8000 - name: serve - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - resources: - limits: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - requests: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - workerGroupSpecs: - - groupName: tpu-group - replicas: 1 - minReplicas: 0 - maxReplicas: 2 - numOfHosts: 4 - rayStartParams: {} - template: - spec: - containers: - - name: ray-worker - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 200Gi - memory: 150G - requests: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 200Gi - memory: 150G - env: - - name: JAX_PLATFORMS - value: "tpu" - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice - cloud.google.com/gke-tpu-topology: 2x2x4 -# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_rayservice_tpu_v4] diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-v5e-tpu.yaml deleted file mode 100644 index 0d05821316..0000000000 --- a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-v5e-tpu.yaml +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_rayservice_tpu_v5e] -apiVersion: ray.io/v1 -kind: RayService -metadata: - name: vllm-tpu -spec: - serveConfigV2: | - applications: - - name: llm - import_path: ai-ml.gke-ray.rayserve.llm.serve_tpu:model - deployments: - - name: VLLMDeployment - num_replicas: 1 - runtime_env: - working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" - env_vars: - MODEL_ID: "meta-llama/Meta-Llama-3.1-70B" - MAX_MODEL_LEN: "4096" - TPU_CHIPS: "16" - rayClusterConfig: - rayVersion: 2.34.0 - headGroupSpec: - rayStartParams: {} - template: - spec: - containers: - - name: ray-head - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - - containerPort: 8000 - name: serve - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - resources: - limits: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - requests: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - workerGroupSpecs: - - groupName: tpu-group - replicas: 1 - minReplicas: 0 - maxReplicas: 2 - numOfHosts: 4 - rayStartParams: {} - template: - spec: - containers: - - name: ray-worker - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 200Gi - memory: 150G - requests: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 200Gi - memory: 150G - env: - - name: JAX_PLATFORMS - value: "tpu" - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice - cloud.google.com/gke-tpu-topology: 4x4 -# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_rayservice_tpu_v5e] diff --git a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml deleted file mode 100644 index 2ff2780fea..0000000000 --- a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# [START gke_ai_ml_gke_ray_rayserve_llm_llava_1_5_13b_raycluster_tpu_v4] -apiVersion: ray.io/v1 -kind: RayCluster -metadata: - name: vllm-tpu -spec: - headGroupSpec: - rayStartParams: {} - template: - spec: - containers: - - name: ray-head - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - requests: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - - containerPort: 8000 - name: serve - workerGroupSpecs: - - groupName: tpu-group - replicas: 1 - minReplicas: 0 - maxReplicas: 2 - numOfHosts: 2 - rayStartParams: {} - template: - spec: - containers: - - name: ray-worker - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 200Gi - memory: 150G - requests: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 200Gi - memory: 150G - env: - - name: JAX_PLATFORMS - value: "tpu" - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice - cloud.google.com/gke-tpu-topology: 2x2x2 -# [END gke_ai_ml_gke_ray_rayserve_llm_llava_1_5_13b_raycluster_tpu_v4] diff --git a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v5e-tpu.yaml deleted file mode 100644 index a1c10c73d7..0000000000 --- a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v5e-tpu.yaml +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# [START gke_ai_ml_gke_ray_rayserve_llm_llava_1_5_13b_raycluster_tpu_v5e] -apiVersion: ray.io/v1 -kind: RayCluster -metadata: - name: vllm-tpu -spec: - headGroupSpec: - rayStartParams: {} - template: - spec: - containers: - - name: ray-head - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - requests: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - - containerPort: 8000 - name: serve - workerGroupSpecs: - - groupName: tpu-group - replicas: 1 - minReplicas: 0 - maxReplicas: 2 - numOfHosts: 2 - rayStartParams: {} - template: - spec: - containers: - - name: ray-worker - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 200Gi - memory: 150G - requests: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 200Gi - memory: 150G - env: - - name: JAX_PLATFORMS - value: "tpu" - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice - cloud.google.com/gke-tpu-topology: 2x4 -# [END gke_ai_ml_gke_ray_rayserve_llm_llava_1_5_13b_raycluster_tpu_v5e] diff --git a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-service-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-service-v4-tpu.yaml deleted file mode 100644 index e9cf9dec32..0000000000 --- a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-service-v4-tpu.yaml +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# [START gke_ai_ml_gke_ray_rayserve_llm_llava_1_5_13b_rayservice_tpu_v4] -apiVersion: ray.io/v1 -kind: RayService -metadata: - name: vllm-tpu -spec: - serveConfigV2: | - applications: - - name: llm - import_path: ai-ml.gke-ray.rayserve.llm.serve_tpu:model - deployments: - - name: VLLMDeployment - num_replicas: 1 - runtime_env: - working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" - env_vars: - MODEL_ID: "llava-hf/llava-1.5-13b-hf" - TPU_CHIPS: "8" - DTYPE: "bfloat16" - rayClusterConfig: - rayVersion: 2.34.0 - headGroupSpec: - rayStartParams: {} - template: - spec: - containers: - - name: ray-head - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - - containerPort: 8000 - name: serve - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - resources: - limits: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - requests: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - workerGroupSpecs: - - groupName: tpu-group - replicas: 1 - minReplicas: 0 - maxReplicas: 2 - numOfHosts: 2 - rayStartParams: {} - template: - spec: - containers: - - name: ray-worker - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 200Gi - memory: 150G - requests: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 200Gi - memory: 150G - env: - - name: JAX_PLATFORMS - value: "tpu" - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice - cloud.google.com/gke-tpu-topology: 2x2x2 -# [END gke_ai_ml_gke_ray_rayserve_llm_llava_1_5_13b_rayservice_tpu_v4] diff --git a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-service-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-service-v5e-tpu.yaml deleted file mode 100644 index ef1db10dee..0000000000 --- a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-service-v5e-tpu.yaml +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# [START gke_ai_ml_gke_ray_rayserve_llm_llava_1_5_13b_rayservice_tpu_v5e] -apiVersion: ray.io/v1 -kind: RayService -metadata: - name: vllm-tpu -spec: - serveConfigV2: | - applications: - - name: llm - import_path: ai-ml.gke-ray.rayserve.llm.serve_tpu:model - deployments: - - name: VLLMDeployment - num_replicas: 1 - runtime_env: - working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" - env_vars: - MODEL_ID: "llava-hf/llava-1.5-13b-hf" - TPU_CHIPS: "8" - DTYPE: "bfloat16" - rayClusterConfig: - rayVersion: 2.34.0 - headGroupSpec: - rayStartParams: {} - template: - spec: - containers: - - name: ray-head - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - - containerPort: 8000 - name: serve - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - resources: - limits: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - requests: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - workerGroupSpecs: - - groupName: tpu-group - replicas: 1 - minReplicas: 0 - maxReplicas: 2 - numOfHosts: 2 - rayStartParams: {} - template: - spec: - containers: - - name: ray-worker - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 200Gi - memory: 150G - requests: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 200Gi - memory: 150G - env: - - name: JAX_PLATFORMS - value: "tpu" - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice - cloud.google.com/gke-tpu-topology: 2x4 -# [END gke_ai_ml_gke_ray_rayserve_llm_llava_1_5_13b_rayservice_tpu_v5e] diff --git a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v4-tpu.yaml deleted file mode 100644 index afa639fe35..0000000000 --- a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v4-tpu.yaml +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# [START gke_ai_ml_gke_ray_rayserve_mistral_7b_raycluster_tpu_v4] -apiVersion: ray.io/v1 -kind: RayCluster -metadata: - name: vllm-tpu -spec: - headGroupSpec: - rayStartParams: {} - template: - spec: - containers: - - name: ray-head - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - requests: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - - containerPort: 8000 - name: serve - workerGroupSpecs: - - groupName: tpu-group - replicas: 1 - minReplicas: 0 - maxReplicas: 2 - numOfHosts: 2 - rayStartParams: {} - template: - spec: - containers: - - name: ray-worker - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 50Gi - memory: 100G - requests: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 50Gi - memory: 100G - env: - - name: JAX_PLATFORMS - value: "tpu" - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice - cloud.google.com/gke-tpu-topology: 2x2x2 -# [END gke_ai_ml_gke_ray_rayserve_mistral_7b_raycluster_tpu_v4] diff --git a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v5e-tpu.yaml deleted file mode 100644 index 290c6cd9a8..0000000000 --- a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v5e-tpu.yaml +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# [START gke_ai_ml_gke_ray_rayserve_mistral_7b_raycluster_tpu_v5e] -apiVersion: ray.io/v1 -kind: RayCluster -metadata: - name: vllm-tpu -spec: - headGroupSpec: - rayStartParams: {} - template: - spec: - containers: - - name: ray-head - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - requests: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - - containerPort: 8000 - name: serve - workerGroupSpecs: - - groupName: tpu-group - replicas: 1 - minReplicas: 0 - maxReplicas: 2 - numOfHosts: 2 - rayStartParams: {} - template: - spec: - containers: - - name: ray-worker - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 50Gi - memory: 100G - requests: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 50Gi - memory: 100G - env: - - name: JAX_PLATFORMS - value: "tpu" - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice - cloud.google.com/gke-tpu-topology: 2x4 -# [END gke_ai_ml_gke_ray_rayserve_mistral_7b_raycluster_tpu_v5e] diff --git a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-service-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-service-v4-tpu.yaml deleted file mode 100644 index 65a3958b52..0000000000 --- a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-service-v4-tpu.yaml +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# [START gke_ai_ml_gke_ray_rayserve_llm_mistral_7b_rayservice_tpu_v4] -apiVersion: ray.io/v1 -kind: RayService -metadata: - name: vllm-tpu -spec: - serveConfigV2: | - applications: - - name: llm - import_path: ai-ml.gke-ray.rayserve.llm.serve_tpu:model - deployments: - - name: VLLMDeployment - num_replicas: 1 - runtime_env: - working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" - env_vars: - MODEL_ID: "mistralai/Mistral-7B-Instruct-v0.3" - TPU_CHIPS: "8" - TOKENIZER_MODE: "mistral" - rayClusterConfig: - rayVersion: 2.34.0 - headGroupSpec: - rayStartParams: {} - template: - spec: - containers: - - name: ray-head - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - - containerPort: 8000 - name: serve - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - resources: - limits: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - requests: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - workerGroupSpecs: - - groupName: tpu-group - replicas: 1 - minReplicas: 0 - maxReplicas: 2 - numOfHosts: 2 - rayStartParams: {} - template: - spec: - containers: - - name: ray-worker - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 50Gi - memory: 100G - requests: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 50Gi - memory: 100G - env: - - name: JAX_PLATFORMS - value: "tpu" - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice - cloud.google.com/gke-tpu-topology: 2x2x2 -# [END gke_ai_ml_gke_ray_rayserve_llm_mistral_7b_rayservice_tpu_v4] diff --git a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-service-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-service-v5e-tpu.yaml deleted file mode 100644 index 184c99ccf1..0000000000 --- a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-service-v5e-tpu.yaml +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# [START gke_ai_ml_gke_ray_rayserve_mistral_7b_rayservice_tpu_v5e] -apiVersion: ray.io/v1 -kind: RayService -metadata: - name: vllm-tpu -spec: - serveConfigV2: | - applications: - - name: llm - import_path: ai-ml.gke-ray.rayserve.llm.serve_tpu:model - deployments: - - name: VLLMDeployment - num_replicas: 1 - runtime_env: - working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" - env_vars: - MODEL_ID: "mistralai/Mistral-7B-Instruct-v0.3" - TPU_CHIPS: "8" - TOKENIZER_MODE: "mistral" - rayClusterConfig: - rayVersion: 2.34.0 - headGroupSpec: - rayStartParams: {} - template: - spec: - containers: - - name: ray-head - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - - containerPort: 8000 - name: serve - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - resources: - limits: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - requests: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - workerGroupSpecs: - - groupName: tpu-group - replicas: 1 - minReplicas: 0 - maxReplicas: 2 - numOfHosts: 2 - rayStartParams: {} - template: - spec: - containers: - - name: ray-worker - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 50Gi - memory: 100G - requests: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 50Gi - memory: 100G - env: - - name: JAX_PLATFORMS - value: "tpu" - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice - cloud.google.com/gke-tpu-topology: 2x4 -# [END gke_ai_ml_gke_ray_rayserve_mistral_7b_rayservice_tpu_v5e] diff --git a/ai-ml/gke-ray/rayserve/llm/model-composition/ray-service-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/model-composition/ray-service-tpu.yaml deleted file mode 100644 index b69b8deb27..0000000000 --- a/ai-ml/gke-ray/rayserve/llm/model-composition/ray-service-tpu.yaml +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# [START gke_ai_ml_gke_ray_rayserve_llm_model_composition_tpu] -apiVersion: ray.io/v1 -kind: RayService -metadata: - name: vllm-tpu -spec: - serveConfigV2: | - applications: - - name: llm - route_prefix: / - import_path: ai-ml.gke-ray.rayserve.llm.model-composition.serve_tpu:multi_model - deployments: - - name: MultiModelDeployment - num_replicas: 1 - runtime_env: - working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" - env_vars: - ASSIST_MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct" - SUMMARIZER_MODEL_ID: "google/gemma-7b-it" - TPU_CHIPS: "16" - TPU_HEADS: "2" - rayClusterConfig: - headGroupSpec: - rayStartParams: - dashboard-host: '0.0.0.0' - template: - spec: - containers: - - name: ray-head - image: $VLLM_IMAGE - resources: - limits: - cpu: "8" - memory: 40G - requests: - cpu: "8" - memory: 40G - ports: - - containerPort: 6379 - name: gcs-server - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - - containerPort: 8000 - name: serve - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - workerGroupSpecs: - - replicas: 2 - minReplicas: 0 - maxReplicas: 4 - numOfHosts: 2 - groupName: tpu-group - rayStartParams: {} - template: - spec: - containers: - - name: llm - image: $VLLM_IMAGE - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - resources: - limits: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 50Gi - memory: 100G - requests: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 50Gi - memory: 100G - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice - cloud.google.com/gke-tpu-topology: 2x4 -# [END gke_ai_ml_gke_ray_rayserve_llm_model_composition_tpu] diff --git a/ai-ml/gke-ray/rayserve/llm/model-composition/serve_tpu.py b/ai-ml/gke-ray/rayserve/llm/model-composition/serve_tpu.py deleted file mode 100644 index 36caadb663..0000000000 --- a/ai-ml/gke-ray/rayserve/llm/model-composition/serve_tpu.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# NOTE: this file was inspired from: https://github.com/ray-project/ray/blob//master/doc/source/serve/doc_code/vllm_example.py - -import json -import os -from typing import AsyncGenerator, Dict, List, Optional -import random - -from fastapi import BackgroundTasks -from starlette.requests import Request -from starlette.responses import Response, StreamingResponse -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.sampling_params import SamplingParams -from vllm.utils import random_uuid - -import ray -from ray import serve -from ray.serve.handle import DeploymentHandle - - -@serve.deployment(name="VLLMDeployment") -class VLLMDeployment: - def __init__(self, **kwargs): - """ - Construct a VLLM deployment. 
-
-        Refer to https://github.com/vllm-project/vllm/blob/main/vllm/engine/arg_utils.py
-        for the full list of arguments.
-
-        Args:
-            model: name or path of the huggingface model to use
-            download_dir: directory to download and load the weights,
-                defaults to the default cache dir of huggingface.
-            use_np_weights: save a numpy copy of model weights for
-                faster loading. This can increase the disk usage by up to 2x.
-            use_dummy_weights: use dummy values for model weights.
-            dtype: data type for model weights and activations.
-                The "auto" option will use FP16 precision
-                for FP32 and FP16 models, and BF16 precision
-                for BF16 models.
-            seed: random seed.
-            worker_use_ray: use Ray for distributed serving; will be
-                set automatically when using more than 1 GPU.
-            pipeline_parallel_size: number of pipeline stages.
-            tensor_parallel_size: number of tensor parallel replicas.
-            block_size: token block size.
-            swap_space: CPU swap space size (GiB) per GPU.
-            gpu_memory_utilization: the percentage of GPU memory to be used for
-                the model executor.
-            max_num_batched_tokens: maximum number of batched tokens per iteration.
-            max_num_seqs: maximum number of sequences per iteration.
-            disable_log_stats: disable logging statistics.
-            engine_use_ray: use Ray to start the LLM engine in a separate
-                process from the server process.
-            disable_log_requests: disable logging requests.
-        """
-        args = AsyncEngineArgs(**kwargs)
-        self.engine = AsyncLLMEngine.from_engine_args(args)
-
-    async def stream_results(self, results_generator) -> AsyncGenerator[bytes, None]:
-        num_returned = 0
-        async for request_output in results_generator:
-            text_outputs = [output.text for output in request_output.outputs]
-            assert len(text_outputs) == 1
-            text_output = text_outputs[0][num_returned:]
-            ret = {"text": text_output}
-            yield (json.dumps(ret) + "\n").encode("utf-8")
-            num_returned += len(text_output)
-
-    async def may_abort_request(self, request_id) -> None:
-        await self.engine.abort(request_id)
-
-    async def __call__(self, request_dict: dict) -> str:
-        """Generate completion for the request.
-
-        The request should be a JSON object with the following fields:
-        - prompt: the prompt to use for the generation.
-        - stream: whether to stream the results or not.
-        - other fields: the sampling parameters (See `SamplingParams` for details).
-        """
-        # request_dict = await request.json()
-        prompt = request_dict.pop("prompt")
-        stream = request_dict.pop("stream", False)
-        max_tokens = request_dict.pop("max_tokens", 1000)
-        sampling_params = SamplingParams(**request_dict)
-        request_id = random_uuid()
-        results_generator = self.engine.generate(
-            prompt, sampling_params, request_id)
-        if stream:
-            background_tasks = BackgroundTasks()
-            # Use background_tasks to abort the request
-            # if the client disconnects.
- background_tasks.add_task(self.may_abort_request, request_id) - return StreamingResponse( - self.stream_results(results_generator), background=background_tasks - ) - - final_output = None - async for request_output in results_generator: - final_output = request_output - - assert final_output is not None - prompt = final_output.prompt - text_outputs = [ - output.text for output in final_output.outputs] - ret = {"text": text_outputs, "max_tokens": max_tokens} - return json.dumps(ret) - - -@serve.deployment -class VLLMSummarizerDeployment: - def __init__(self, **kwargs): - args = AsyncEngineArgs(**kwargs) - self.engine = AsyncLLMEngine.from_engine_args(args) - - async def __call__(self, response: str) -> str: - """Generates summarization of a response from another model. - - The response should be a JSON object with the following fields: - - text: the response returned from another model to summarize - """ - request_dict = json.loads(response) - text = request_dict.pop("text") - prompt = f"Summarize the following text into a single sentence: {text}" - sampling_params = SamplingParams(**request_dict) - request_id = random_uuid() - results_generator = self.engine.generate( - prompt, sampling_params, request_id) - - final_output = None - async for request_output in results_generator: - final_output = request_output - - assert final_output is not None - prompt = final_output.prompt - text_outputs = [ - output.text for output in final_output.outputs] - ret = {"text": text_outputs} - return json.dumps(ret) - - -@serve.deployment -class MultiModelDeployment: - def __init__(self, assist_model: DeploymentHandle, summarizer_model: DeploymentHandle): - self.assistant_model = assist_model - self.summarizer_model = summarizer_model - - async def __call__(self, request: Request) -> Response: - model_request = await request.json() - assistant_response = self.assistant_model.remote(model_request) - summarizer_response = await self.summarizer_model.remote(assistant_response) - return Response(content=summarizer_response) - -def get_num_tpu_chips() -> int: - if "TPU" not in ray.cluster_resources(): - # Pass in TPU chips when the current Ray cluster resources can't be auto-detected (i.e for autoscaling). - if os.environ.get('TPU_CHIPS') is not None: - return int(os.environ.get('TPU_CHIPS')) - return 0 - return int(ray.cluster_resources()["TPU"]) - -def get_tpu_head() -> Optional[str]: - if "TPU" not in ray.cluster_resources(): - # Pass in # TPU heads when the current Ray cluster resources can't be auto-detected. - if os.environ.get('TPU_HEADS') is not None: - return int(os.environ.get('TPU_HEADS')) - # return the TPU-{accelerator}-head resource - for key, _ in ray.cluster_resources().items(): - if key.endswith("head"): - return key - return None - -def build_app(cli_args: Dict[str, str]) -> serve.Application: - """Builds the Serve app based on CLI arguments.""" - ray.init(ignore_reinit_error=True) - - num_tpu_chips = get_num_tpu_chips() - tpu_head = get_tpu_head() - tpu_slices = 1 - if tpu_head is not None: - tpu_slices = ray.cluster_resources()[tpu_head] - num_tpu_chips_per_slice = int(num_tpu_chips/tpu_slices) - # Construct a placement group for 1 TPU slice. Each model should run on its own slice. - pg_resources = [] - pg_resources.append({"CPU": 1}) # for the deployment replica - for i in range(num_tpu_chips_per_slice): - pg_resources.append({"CPU": 1, "TPU": 1}) # for the vLLM actors - # Add a TPU head to the placement group to ensure Ray workers are not placed across slices. 
-
-    pg_resources.append({tpu_head: 1})
-
-    return MultiModelDeployment.bind(
-        VLLMDeployment.options(
-            placement_group_bundles=pg_resources,
-            placement_group_strategy="PACK").bind(
-                model=os.environ['ASSIST_MODEL_ID'],
-                tensor_parallel_size=num_tpu_chips_per_slice,
-                enforce_eager=True,
-        ),
-        VLLMSummarizerDeployment.options(
-            placement_group_bundles=pg_resources,
-            placement_group_strategy="PACK").bind(
-                model=os.environ['SUMMARIZER_MODEL_ID'],
-                tensor_parallel_size=num_tpu_chips_per_slice,
-                enforce_eager=True,
-        ),
-    )
-
-multi_model = build_app({})
diff --git a/ai-ml/gke-ray/rayserve/llm/serve_tpu.py b/ai-ml/gke-ray/rayserve/llm/serve_tpu.py
index 347a402e05..1042097fc6 100644
--- a/ai-ml/gke-ray/rayserve/llm/serve_tpu.py
+++ b/ai-ml/gke-ray/rayserve/llm/serve_tpu.py
@@ -43,7 +43,8 @@ def __init__(
         dtype,
     ):
         self.llm = LLM(
-            model=os.environ['MODEL_ID'],  # Error if not provided.
+            model="/data/Meta-Llama-3.1-405B-Instruct",  # Local path on the GCS FUSE volume mounted at /data.
+            served_model_name="meta-llama/Meta-Llama-3.1-405B-Instruct",
             tensor_parallel_size=num_tpu_chips,
             max_model_len=max_model_len,
             dtype=dtype,
diff --git a/ai-ml/gke-ray/rayserve/llm/tpu/ray-cluster.tpu-v5e-multihost.yaml b/ai-ml/gke-ray/rayserve/llm/tpu/ray-cluster.tpu-v5e-multihost.yaml
new file mode 100644
index 0000000000..5547110fae
--- /dev/null
+++ b/ai-ml/gke-ray/rayserve/llm/tpu/ray-cluster.tpu-v5e-multihost.yaml
@@ -0,0 +1,150 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
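+
+# This manifest defines a multi-host TPU v5e RayCluster for vLLM: a CPU-only
+# Ray head pod plus one worker group of 16 hosts with 4 TPU chips each (an
+# 8x8 v5e slice, 64 chips in total). The $GSBUCKET bucket is mounted at /data
+# through the GCS FUSE CSI driver and holds the vLLM XLA compilation cache
+# (VLLM_XLA_CACHE_PATH), so compiled artifacts can be reused across pod
+# restarts.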
+ +# [START gke_ai_ml_gke_ray_rayserve_tpu_raycluster_v5e_multihost] +apiVersion: ray.io/v1 +kind: RayCluster +metadata: + name: vllm-tpu +spec: + headGroupSpec: + rayStartParams: {} + template: + metadata: + annotations: + gke-gcsfuse/volumes: "true" + gke-gcsfuse/cpu-limit: "0" + gke-gcsfuse/memory-limit: "0" + gke-gcsfuse/ephemeral-storage-limit: "0" + spec: + serviceAccountName: $KSA_NAME + containers: + - name: ray-head + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "8" + ephemeral-storage: 5Gi + memory: 40G + requests: + cpu: "8" + ephemeral-storage: 5Gi + memory: 40G + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: VLLM_XLA_CACHE_PATH + value: "/data" + - name: MODEL_ID + value: "meta-llama/Meta-Llama-3.1-405B-Instruct" + - name: MAX_MODEL_LEN + value: "4096" + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + - containerPort: 8471 + name: slicebuilder + - containerPort: 8081 + name: mxla + volumeMounts: + - name: gcs-fuse-csi-ephemeral + mountPath: /data + - name: dshm + mountPath: /dev/shm + volumes: + - name: gke-gcsfuse-cache + emptyDir: + medium: Memory + - name: dshm + emptyDir: + medium: Memory + - name: gcs-fuse-csi-ephemeral + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: $GSBUCKET + mountOptions: "implicit-dirs" + workerGroupSpecs: + - groupName: tpu-group + replicas: 1 + minReplicas: 1 + maxReplicas: 1 + numOfHosts: 16 + rayStartParams: {} + template: + metadata: + annotations: + gke-gcsfuse/volumes: "true" + gke-gcsfuse/cpu-limit: "0" + gke-gcsfuse/memory-limit: "0" + gke-gcsfuse/ephemeral-storage-limit: "0" + spec: + serviceAccountName: $KSA_NAME + containers: + - name: ray-worker + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 40G + memory: 150G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 40G + memory: 150G + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: VLLM_XLA_CACHE_PATH + value: "/data" + securityContext: + privileged: true + volumeMounts: + - name: gcs-fuse-csi-ephemeral + mountPath: /data + - name: dshm + mountPath: /dev/shm + volumes: + - name: gke-gcsfuse-cache + emptyDir: + medium: Memory + - name: dshm + emptyDir: + medium: Memory + - name: gcs-fuse-csi-ephemeral + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: $GSBUCKET + mountOptions: "implicit-dirs" + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice + cloud.google.com/gke-tpu-topology: 8x8 +# [END gke_ai_ml_gke_ray_rayserve_tpu_raycluster_v5e_multihost] diff --git a/ai-ml/gke-ray/rayserve/llm/tpu/ray-cluster.tpu-v6e-multihost.yaml b/ai-ml/gke-ray/rayserve/llm/tpu/ray-cluster.tpu-v6e-multihost.yaml new file mode 100644 index 0000000000..9e015cf9c1 --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/tpu/ray-cluster.tpu-v6e-multihost.yaml @@ -0,0 +1,146 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# [START gke_ai_ml_gke_ray_rayserve_tpu_raycluster_v6e_multihost] +apiVersion: ray.io/v1 +kind: RayCluster +metadata: + name: vllm-tpu +spec: + headGroupSpec: + rayStartParams: {} + template: + metadata: + annotations: + gke-gcsfuse/volumes: "true" + gke-gcsfuse/cpu-limit: "0" + gke-gcsfuse/memory-limit: "0" + gke-gcsfuse/ephemeral-storage-limit: "0" + spec: + serviceAccountName: $KSA_NAME + containers: + - name: ray-head + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "8" + ephemeral-storage: 5Gi + memory: 40G + requests: + cpu: "8" + ephemeral-storage: 5Gi + memory: 40G + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: VLLM_XLA_CACHE_PATH + value: "/data" + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + - containerPort: 8471 + name: slicebuilder + - containerPort: 8081 + name: mxla + volumeMounts: + - name: gcs-fuse-csi-ephemeral + mountPath: /data + - name: dshm + mountPath: /dev/shm + volumes: + - name: gke-gcsfuse-cache + emptyDir: + medium: Memory + - name: dshm + emptyDir: + medium: Memory + - name: gcs-fuse-csi-ephemeral + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: $GSBUCKET + mountOptions: "implicit-dirs" + workerGroupSpecs: + - groupName: tpu-group + replicas: 1 + minReplicas: 1 + maxReplicas: 1 + numOfHosts: 8 + rayStartParams: {} + template: + metadata: + annotations: + gke-gcsfuse/volumes: "true" + gke-gcsfuse/cpu-limit: "0" + gke-gcsfuse/memory-limit: "0" + gke-gcsfuse/ephemeral-storage-limit: "0" + spec: + serviceAccountName: $KSA_NAME + containers: + - name: ray-worker + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 60G + memory: 200G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 60G + memory: 200G + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: VLLM_XLA_CACHE_PATH + value: "/data" + securityContext: + privileged: true + volumeMounts: + - name: gcs-fuse-csi-ephemeral + mountPath: /data + - name: dshm + mountPath: /dev/shm + volumes: + - name: gke-gcsfuse-cache + emptyDir: + medium: Memory + - name: dshm + emptyDir: + medium: Memory + - name: gcs-fuse-csi-ephemeral + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: $GSBUCKET + mountOptions: "implicit-dirs" + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v6e-slice + cloud.google.com/gke-tpu-topology: 4x8 +# [END gke_ai_ml_gke_ray_rayserve_tpu_raycluster_v6e_multihost] From 53bbfeb22eaf5fe9cf232d2713e8090a9fc35e4e Mon Sep 17 00:00:00 2001 From: ryanaoleary Date: Thu, 14 Nov 2024 01:48:30 +0000 Subject: [PATCH 16/16] Add v5e and v6e RayServices Signed-off-by: ryanaoleary --- .../tpu/ray-service.tpu-v5e-multihost.yaml | 156 ++++++++++++++++++ .../tpu/ray-service.tpu-v6e-multihost.yaml | 156 ++++++++++++++++++ 2 files changed, 312 insertions(+) create mode 
100644 ai-ml/gke-ray/rayserve/llm/tpu/ray-service.tpu-v5e-multihost.yaml
 create mode 100644 ai-ml/gke-ray/rayserve/llm/tpu/ray-service.tpu-v6e-multihost.yaml

diff --git a/ai-ml/gke-ray/rayserve/llm/tpu/ray-service.tpu-v5e-multihost.yaml b/ai-ml/gke-ray/rayserve/llm/tpu/ray-service.tpu-v5e-multihost.yaml
new file mode 100644
index 0000000000..b09d6afa4b
--- /dev/null
+++ b/ai-ml/gke-ray/rayserve/llm/tpu/ray-service.tpu-v5e-multihost.yaml
@@ -0,0 +1,156 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# [START gke_ai_ml_gke_ray_rayserve_tpu_rayservice_v5e_multihost]
+apiVersion: ray.io/v1
+kind: RayService
+metadata:
+  name: vllm-tpu
+spec:
+  serveConfigV2: |
+    applications:
+      - name: llm
+        import_path: ai-ml.gke-ray.rayserve.llm.tpu.serve_tpu:model
+        deployments:
+        - name: VLLMDeployment
+          num_replicas: 1
+        runtime_env:
+          working_dir: "https://github.com/ryanaoleary/kubernetes-engine-samples/archive/refs/heads/multihost-example.zip"
+          env_vars:
+            MODEL_ID: "$MODEL_ID"
+            MAX_MODEL_LEN: "$MAX_MODEL_LEN"
+            TPU_CHIPS: "64"
+  rayClusterConfig:
+    headGroupSpec:
+      rayStartParams: {}
+      template:
+        metadata:
+          annotations:
+            gke-gcsfuse/volumes: "true"
+            gke-gcsfuse/cpu-limit: "0"
+            gke-gcsfuse/memory-limit: "0"
+            gke-gcsfuse/ephemeral-storage-limit: "0"
+        spec:
+          serviceAccountName: $KSA_NAME
+          containers:
+          - name: ray-head
+            image: $VLLM_IMAGE
+            imagePullPolicy: IfNotPresent
+            ports:
+            - containerPort: 6379
+              name: gcs
+            - containerPort: 8265
+              name: dashboard
+            - containerPort: 10001
+              name: client
+            - containerPort: 8000
+              name: serve
+            env:
+            - name: HUGGING_FACE_HUB_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-secret
+                  key: hf_api_token
+            - name: VLLM_XLA_CACHE_PATH
+              value: "/data"
+            resources:
+              limits:
+                cpu: "8"
+                ephemeral-storage: 5Gi
+                memory: 40G
+              requests:
+                cpu: "8"
+                ephemeral-storage: 5Gi
+                memory: 40G
+            volumeMounts:
+            - name: gcs-fuse-csi-ephemeral
+              mountPath: /data
+            - name: dshm
+              mountPath: /dev/shm
+          volumes:
+          - name: gke-gcsfuse-cache
+            emptyDir:
+              medium: Memory
+          - name: dshm
+            emptyDir:
+              medium: Memory
+          - name: gcs-fuse-csi-ephemeral
+            csi:
+              driver: gcsfuse.csi.storage.gke.io
+              volumeAttributes:
+                bucketName: $GSBUCKET
+                mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1"
+    workerGroupSpecs:
+    - groupName: tpu-group
+      replicas: 1
+      minReplicas: 1
+      maxReplicas: 1
+      numOfHosts: 16
+      rayStartParams: {}
+      template:
+        metadata:
+          annotations:
+            gke-gcsfuse/volumes: "true"
+            gke-gcsfuse/cpu-limit: "0"
+            gke-gcsfuse/memory-limit: "0"
+            gke-gcsfuse/ephemeral-storage-limit: "0"
+        spec:
+          serviceAccountName: $KSA_NAME
+          containers:
+          - name: ray-worker
+            image: $VLLM_IMAGE
+            imagePullPolicy: IfNotPresent
+            resources:
+              limits:
+                cpu: "100"
+                google.com/tpu: "4"
+                ephemeral-storage: 40G
+                memory: 200G
+              requests:
+                cpu: "100"
+                google.com/tpu: "4"
+                ephemeral-storage: 40G
+                memory: 
200G + env: + - name: JAX_PLATFORMS + value: "tpu" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: VLLM_XLA_CACHE_PATH + value: "/data" + volumeMounts: + - name: gcs-fuse-csi-ephemeral + mountPath: /data + - name: dshm + mountPath: /dev/shm + volumes: + - name: gke-gcsfuse-cache + emptyDir: + medium: Memory + - name: dshm + emptyDir: + medium: Memory + - name: gcs-fuse-csi-ephemeral + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: $GSBUCKET + mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1" + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice + cloud.google.com/gke-tpu-topology: 8x8 +# [END gke_ai_ml_gke_ray_rayserve_tpu_rayservice_v5e_multihost] diff --git a/ai-ml/gke-ray/rayserve/llm/tpu/ray-service.tpu-v6e-multihost.yaml b/ai-ml/gke-ray/rayserve/llm/tpu/ray-service.tpu-v6e-multihost.yaml new file mode 100644 index 0000000000..6c34728bf3 --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/tpu/ray-service.tpu-v6e-multihost.yaml @@ -0,0 +1,156 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
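+
+# This manifest defines a RayService that deploys the serve_tpu.py application
+# (fetched through runtime_env.working_dir) onto a multi-host TPU v6e slice:
+# one worker group of 8 hosts forming a 4x8 topology, matching the
+# TPU_CHIPS: "32" value passed to the app. MODEL_ID and MAX_MODEL_LEN are
+# expected to be substituted from the environment (for example with envsubst)
+# before the manifest is applied.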
+
+# [START gke_ai_ml_gke_ray_rayserve_tpu_rayservice_v6e_multihost]
+apiVersion: ray.io/v1
+kind: RayService
+metadata:
+  name: vllm-tpu
+spec:
+  serveConfigV2: |
+    applications:
+      - name: llm
+        import_path: ai-ml.gke-ray.rayserve.llm.tpu.serve_tpu:model
+        deployments:
+        - name: VLLMDeployment
+          num_replicas: 1
+        runtime_env:
+          working_dir: "https://github.com/ryanaoleary/kubernetes-engine-samples/archive/refs/heads/multihost-example.zip"
+          env_vars:
+            MODEL_ID: "$MODEL_ID"
+            MAX_MODEL_LEN: "$MAX_MODEL_LEN"
+            TPU_CHIPS: "32"
+  rayClusterConfig:
+    headGroupSpec:
+      rayStartParams: {}
+      template:
+        metadata:
+          annotations:
+            gke-gcsfuse/volumes: "true"
+            gke-gcsfuse/cpu-limit: "0"
+            gke-gcsfuse/memory-limit: "0"
+            gke-gcsfuse/ephemeral-storage-limit: "0"
+        spec:
+          serviceAccountName: $KSA_NAME
+          containers:
+          - name: ray-head
+            image: $VLLM_IMAGE
+            imagePullPolicy: IfNotPresent
+            ports:
+            - containerPort: 6379
+              name: gcs
+            - containerPort: 8265
+              name: dashboard
+            - containerPort: 10001
+              name: client
+            - containerPort: 8000
+              name: serve
+            env:
+            - name: HUGGING_FACE_HUB_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-secret
+                  key: hf_api_token
+            - name: VLLM_XLA_CACHE_PATH
+              value: "/data"
+            resources:
+              limits:
+                cpu: "8"
+                ephemeral-storage: 5Gi
+                memory: 40G
+              requests:
+                cpu: "8"
+                ephemeral-storage: 5Gi
+                memory: 40G
+            volumeMounts:
+            - name: gcs-fuse-csi-ephemeral
+              mountPath: /data
+            - name: dshm
+              mountPath: /dev/shm
+          volumes:
+          - name: gke-gcsfuse-cache
+            emptyDir:
+              medium: Memory
+          - name: dshm
+            emptyDir:
+              medium: Memory
+          - name: gcs-fuse-csi-ephemeral
+            csi:
+              driver: gcsfuse.csi.storage.gke.io
+              volumeAttributes:
+                bucketName: $GSBUCKET
+                mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1"
+    workerGroupSpecs:
+    - groupName: tpu-group
+      replicas: 1
+      minReplicas: 1
+      maxReplicas: 1
+      numOfHosts: 8
+      rayStartParams: {}
+      template:
+        metadata:
+          annotations:
+            gke-gcsfuse/volumes: "true"
+            gke-gcsfuse/cpu-limit: "0"
+            gke-gcsfuse/memory-limit: "0"
+            gke-gcsfuse/ephemeral-storage-limit: "0"
+        spec:
+          serviceAccountName: $KSA_NAME
+          containers:
+          - name: ray-worker
+            image: $VLLM_IMAGE
+            imagePullPolicy: IfNotPresent
+            resources:
+              limits:
+                cpu: "100"
+                google.com/tpu: "4"
+                ephemeral-storage: 40G
+                memory: 200G
+              requests:
+                cpu: "100"
+                google.com/tpu: "4"
+                ephemeral-storage: 40G
+                memory: 200G
+            env:
+            - name: JAX_PLATFORMS
+              value: "tpu"
+            - name: HUGGING_FACE_HUB_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-secret
+                  key: hf_api_token
+            - name: VLLM_XLA_CACHE_PATH
+              value: "/data"
+            volumeMounts:
+            - name: gcs-fuse-csi-ephemeral
+              mountPath: /data
+            - name: dshm
+              mountPath: /dev/shm
+          volumes:
+          - name: gke-gcsfuse-cache
+            emptyDir:
+              medium: Memory
+          - name: dshm
+            emptyDir:
+              medium: Memory
+          - name: gcs-fuse-csi-ephemeral
+            csi:
+              driver: gcsfuse.csi.storage.gke.io
+              volumeAttributes:
+                bucketName: $GSBUCKET
+                mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1"
+          nodeSelector:
+            cloud.google.com/gke-tpu-accelerator: tpu-v6e-slice
+            cloud.google.com/gke-tpu-topology: 4x8
+# [END gke_ai_ml_gke_ray_rayserve_tpu_rayservice_v6e_multihost]
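
The RayServices above route requests to the FastAPI app defined in serve_tpu.py, which exposes POST /v1/generate on the Serve port (8000). The following is a minimal client sketch, not part of the samples: it assumes the Serve port has been forwarded locally (for example with `kubectl port-forward svc/vllm-tpu-serve-svc 8000:8000`; the exact Service name depends on how KubeRay names the RayService's serve Service) and that the third-party `requests` package is installed.

# query_vllm_tpu.py -- minimal client sketch for the serve_tpu.py Serve app.
# The host, port, and Service name above are assumptions for illustration.
import json

import requests


def generate(prompt: str, host: str = "http://localhost:8000") -> dict:
    """POST a prompt to the /v1/generate route and return the decoded reply.

    serve_tpu.py responds with a JSON body of the form
    {"prompt": ..., "text": ..., "token_ids": [...]}.
    """
    resp = requests.post(f"{host}/v1/generate", json={"prompt": prompt}, timeout=600)
    resp.raise_for_status()
    return json.loads(resp.text)


if __name__ == "__main__":
    result = generate("What are the top 5 most popular programming languages?")
    print(result["text"])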